Merge tag 'trace-v6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull tracing updates from Steven Rostedt:

- Use READ_ONCE() and WRITE_ONCE() instead of RCU for syscall
tracepoints

Individual system call trace events are pseudo events attached to the
raw_syscall trace events that trace the entry and exit of all system
calls. When one of these individual system call trace events is
enabled, an element in an array indexed by the system call number is
assigned the trace event file that defines how to trace it. When the
trace event triggers, it reads this array, and if the element is set,
it uses that event file to know what to write (the trace file defines
the output format of the corresponding system call).

The issue is that it used rcu_dereference_sched() and marked the
elements of the array as __rcu pointers. This is incorrect: there is
no RCU synchronization here. The event file that is pointed to has a
completely different mechanism to make sure it is freed properly. The
read of the array during the system call trace event only checks
whether an element is set. If it is not, nothing happens (that system
call isn't being traced). If it is, the element is used to record the
system call data.

The RCU usage here can simply be replaced by the READ_ONCE() and
WRITE_ONCE() macros.
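
A minimal sketch of the resulting pattern (condensed from the
trace_syscalls.c diff below): the writer publishes or clears the
pointer under syscall_trace_lock, and the reader does one tolerant
load with no RCU read-side section:

	/* writer, under syscall_trace_lock */
	WRITE_ONCE(tr->enter_syscall_files[num], file);
	/* ... and on unregister ... */
	WRITE_ONCE(tr->enter_syscall_files[num], NULL);

	/* reader, in the tracepoint handler */
	trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
	if (!trace_file)
		return;	/* this system call is not being traced */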

- Have the system call trace events use "0x" for hex values

Some system call trace events display hex values without a leading
"0x". Seeing "count: 44" suggests 44 decimal, when it is actually 0x44
(68 decimal). Display "0x44" instead.
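
For example (hypothetical event output), where a read event used to
print as:

	sys_read(fd: 3, buf: 7ffd4bd38000, count: 44)

it now prints as:

	sys_read(fd: 3, buf: 0x7ffd4bd38000, count: 0x44)

Values below 10, where the decimal and hex digits are identical, are
still printed as plain decimal.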

- Use vmalloc_array() in tracing_map_sort_entries()

The function tracing_map_sort_entries() used array_size() and
vmalloc() when it could have simply used vmalloc_array().
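
The change is a one-liner, and vmalloc_array() also performs the
overflow-checked multiplication internally:

	- entries = vmalloc(array_size(sizeof(sort_entry), map->max_elts));
	+ entries = vmalloc_array(map->max_elts, sizeof(sort_entry));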

- Use for_each_online_cpu() in trace_osnoise.c

Instead of open coding for_each_cpu(cpu, cpu_online_mask), use
for_each_online_cpu().
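
The helper is simply the named shorthand for that exact loop:

	#define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask)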

- Move the buffer field in struct trace_seq to the end

The buffer field in struct trace_seq is architecture-dependent in
size and caused padding for the fields after it. Moving the buffer to
the end of the structure packs trace_seq more tightly.
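
As an illustration (on a typical 64-bit build): a char array needs no
alignment, but when it comes first and its page-size-derived length
is not a multiple of 8, the compiler must insert a hole before the
pointer-containing seq_buf that follows. With the new layout the
oddly sized array sits at the end, where no hole is needed:

	struct trace_seq {
		struct seq_buf	seq;	/* naturally aligned members first */
		size_t		readpos;
		int		full;
		char		buffer[TRACE_SEQ_BUFFER_SIZE];	/* byte array last */
	};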

- Remove redundant zeroing of cmdline_idx field in
saved_cmdlines_buffer()

The structure that contains cmdline_idx is zeroed by memset(); there
is no need to explicitly zero any of its fields after that.
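
A sketch of the shape of the cleanup (not the literal code):

	memset(s, 0, sizeof(*s));	/* already zeroes every field */
	/* ... */
	s->cmdline_idx = 0;		/* redundant, now removed */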

- Use system_percpu_wq instead of system_wq in user_event_mm_remove()

As system_wq is being deprecated, use the new wq.

- Add cond_resched() in ftrace_module_enable()

Some modules have a lot of functions (thousands of them), and the
enabling of those functions can take some time. On non-preemptible
kernels, it was triggering a watchdog timeout. Add a cond_resched()
to prevent that.
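
The shape of the fix is the standard one for long kernel-side loops
(a sketch with hypothetical helpers; the real loop walks the module's
ftrace records, as in the diff below):

	for_each_module_rec(mod, rec) {		/* hypothetical iterator */
		/* let other tasks run on non-preemptible kernels */
		cond_resched();
		enable_rec(rec);		/* hypothetical per-record work */
	}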

- Add a BUILD_BUG_ON() to make sure PID_MAX_DEFAULT is always a power
of 2

There's code that depends on PID_MAX_DEFAULT being a power of 2 and
will break if that ever changes. Should it change in the future, make
the build fail so that the dependent code gets fixed.
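
The check sits next to the code that relies on it, since the mask
trick below is only equivalent to a modulo when the constant is a
power of 2:

	BUILD_BUG_ON(!is_power_of_2(PID_MAX_DEFAULT));

	/* pid % PID_MAX_DEFAULT, valid only for power-of-2 constants */
	tpid = tsk->pid & (PID_MAX_DEFAULT - 1);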

- Take the event_mutex before ever exiting s_start()

The s_start() function is a seq_file start routine. s_stop() is
always called even if s_start() fails, and s_stop() expects the
event_mutex to be held, as it always releases it. That mutex must
therefore be taken in s_start() even when the function fails.
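
The fix (shown in full in the trace_events.c diff below) takes the
mutex before the allocation-failure check, so the unlock in s_stop()
is always balanced:

	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
	mutex_lock(&event_mutex);	/* taken even on the failure path... */
	if (!iter)
		return NULL;		/* ...because s_stop() always unlocks */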

* tag 'trace-v6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
tracing: Fix lock imbalance in s_start() memory allocation failure path
tracing: Ensure optimized hashing works
ftrace: Fix softlockup in ftrace_module_enable
tracing: replace use of system_wq with system_percpu_wq
tracing: Remove redundant 0 value initialization
tracing: Move buffer in trace_seq to end of struct
tracing/osnoise: Use for_each_online_cpu() instead of for_each_cpu()
tracing: Use vmalloc_array() to improve code
tracing: Have syscall trace events show "0x" for values greater than 10
tracing: Replace syscall RCU pointer assignment with READ/WRITE_ONCE()

+27 -21
+1 -1
include/linux/trace_seq.h
···
 	 (sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int)))

 struct trace_seq {
-	char			buffer[TRACE_SEQ_BUFFER_SIZE];
 	struct seq_buf		seq;
 	size_t			readpos;
 	int			full;
+	char			buffer[TRACE_SEQ_BUFFER_SIZE];
 };

 static inline void
+2
kernel/trace/ftrace.c
···
 		if (!within_module(rec->ip, mod))
 			break;

+		cond_resched();
+
 		/* Weak functions should still be ignored */
 		if (!test_for_valid_rec(rec)) {
 			/* Clear all other flags. Should not be enabled anyway */
+2 -2
kernel/trace/trace.h
···
 #ifdef CONFIG_FTRACE_SYSCALLS
 	int			sys_refcount_enter;
 	int			sys_refcount_exit;
-	struct trace_event_file __rcu *enter_syscall_files[NR_syscalls];
-	struct trace_event_file __rcu *exit_syscall_files[NR_syscalls];
+	struct trace_event_file *enter_syscall_files[NR_syscalls];
+	struct trace_event_file *exit_syscall_files[NR_syscalls];
 #endif
 	int			stop_count;
 	int			clock_id;
+1 -2
kernel/trace/trace_events.c
···
 	loff_t l;

 	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	mutex_lock(&event_mutex);
 	if (!iter)
 		return NULL;
-
-	mutex_lock(&event_mutex);

 	iter->type = SET_EVENT_FILE;
 	iter->file = list_entry(&tr->events, struct trace_event_file, list);
+1 -1
kernel/trace/trace_events_user.c
···
 	 * so we use a work queue after call_rcu() to run within.
 	 */
 	INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put);
-	queue_rcu_work(system_wq, &mm->put_rwork);
+	queue_rcu_work(system_percpu_wq, &mm->put_rwork);
 }

 void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm)
+2 -2
kernel/trace/trace_osnoise.c
···
 	 * So far, all the values are initialized as 0, so
 	 * zeroing the structure is perfect.
 	 */
-	for_each_cpu(cpu, cpu_online_mask) {
+	for_each_online_cpu(cpu) {
 		tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
 		if (tlat_var->kthread)
 			hrtimer_cancel(&tlat_var->timer);
···
 	 * So far, all the values are initialized as 0, so
 	 * zeroing the structure is perfect.
 	 */
-	for_each_cpu(cpu, cpu_online_mask) {
+	for_each_online_cpu(cpu) {
 		osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
 		memset(osn_var, 0, sizeof(*osn_var));
 	}
+2 -1
kernel/trace/trace_sched_switch.c
···
 	/* Place map_cmdline_to_pid array right after saved_cmdlines */
 	s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN];

-	s->cmdline_idx = 0;
 	memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
 	       sizeof(s->map_pid_to_cmdline));
 	memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
···
 	/* treat recording of idle task as a success */
 	if (!tsk->pid)
 		return 1;
+
+	BUILD_BUG_ON(!is_power_of_2(PID_MAX_DEFAULT));

 	tpid = tsk->pid & (PID_MAX_DEFAULT - 1);
+15 -11
kernel/trace/trace_syscalls.c
···
 		if (trace_seq_has_overflowed(s))
 			goto end;

+		if (i)
+			trace_seq_puts(s, ", ");
+
 		/* parameter types */
 		if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
 			trace_seq_printf(s, "%s ", entry->types[i]);

 		/* parameter values */
-		trace_seq_printf(s, "%s: %lx%s", entry->args[i],
-				 trace->args[i],
-				 i == entry->nb_args - 1 ? "" : ", ");
+		if (trace->args[i] < 10)
+			trace_seq_printf(s, "%s: %lu", entry->args[i],
+					 trace->args[i]);
+		else
+			trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
+					 trace->args[i]);
 	}

 	trace_seq_putc(s, ')');
···
 	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
 		return;

-	/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
-	trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
+	trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
 	if (!trace_file)
 		return;
···
 	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
 		return;

-	/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
-	trace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
+	trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]);
 	if (!trace_file)
 		return;
···
 	if (!tr->sys_refcount_enter)
 		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
 	if (!ret) {
-		rcu_assign_pointer(tr->enter_syscall_files[num], file);
+		WRITE_ONCE(tr->enter_syscall_files[num], file);
 		tr->sys_refcount_enter++;
 	}
 	mutex_unlock(&syscall_trace_lock);
···
 		return;
 	mutex_lock(&syscall_trace_lock);
 	tr->sys_refcount_enter--;
-	RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL);
+	WRITE_ONCE(tr->enter_syscall_files[num], NULL);
 	if (!tr->sys_refcount_enter)
 		unregister_trace_sys_enter(ftrace_syscall_enter, tr);
 	mutex_unlock(&syscall_trace_lock);
···
 	if (!tr->sys_refcount_exit)
 		ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
 	if (!ret) {
-		rcu_assign_pointer(tr->exit_syscall_files[num], file);
+		WRITE_ONCE(tr->exit_syscall_files[num], file);
 		tr->sys_refcount_exit++;
 	}
 	mutex_unlock(&syscall_trace_lock);
···
 		return;
 	mutex_lock(&syscall_trace_lock);
 	tr->sys_refcount_exit--;
-	RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL);
+	WRITE_ONCE(tr->exit_syscall_files[num], NULL);
 	if (!tr->sys_refcount_exit)
 		unregister_trace_sys_exit(ftrace_syscall_exit, tr);
 	mutex_unlock(&syscall_trace_lock);
+1 -1
kernel/trace/tracing_map.c
···
 	struct tracing_map_sort_entry *sort_entry, **entries;
 	int i, n_entries, ret;

-	entries = vmalloc(array_size(sizeof(sort_entry), map->max_elts));
+	entries = vmalloc_array(map->max_elts, sizeof(sort_entry));
 	if (!entries)
 		return -ENOMEM;