tracing: Have trace_marker use per-cpu data to read user space

It was reported that __copy_from_user_inatomic() can actually schedule,
which is bad when preemption is disabled. Although there is logic to
check whether in_atomic() is set, that check is a nop when the kernel is
configured with PREEMPT_NONE, so a page fault taken during the copy can
end up scheduling while preemption is disabled.

Link: https://lore.kernel.org/all/20250819105152.2766363-1-luogengkun@huaweicloud.com/

The fix was to change __copy_from_user_inatomic() to
copy_from_user_nofault(). But then it was reported that this caused a
regression on Android. Several Android applications write into the
trace_marker file, but instead of showing the expected data, the trace
now shows:

tracing_mark_write: <faulted>

After reverting the conversion to copy_from_user_nofault(), Android was
able to get the data again.

Writing to the trace_marker file is a way to quickly and efficiently
enter data into the Linux tracing ring buffer. The write path takes no
locks and was designed to be as non-intrusive as possible, which means it
cannot allocate memory and must use pre-allocated buffers.
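
For context, user space simply writes into the trace_marker file. Below is
a minimal illustrative sketch (not part of this change), assuming tracefs
is mounted at /sys/kernel/tracing:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char msg[] = "hello from user space";
        int fd;

        /* Each write() becomes one tracing_mark_write event in the ring buffer */
        fd = open("/sys/kernel/tracing/trace_marker", O_WRONLY);
        if (fd < 0)
                return 1;

        write(fd, msg, strlen(msg));
        close(fd);
        return 0;
}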

A method that is actively being worked on to let faultable system call
tracepoints read user space data is to allocate per-CPU buffers and use
them in the callback. The method uses a technique similar to a seqcount,
roughly like this:

preempt_disable();
cpu = smp_processor_id();
buffer = per_cpu_ptr(&pre_allocated_cpu_buffers, cpu);
do {
        cnt = nr_context_switches_cpu(cpu);
        migrate_disable();
        preempt_enable();
        ret = copy_from_user(buffer, ptr, size);
        preempt_disable();
        migrate_enable();
} while (!ret && cnt != nr_context_switches_cpu(cpu));

if (!ret)
        ring_buffer_write(buffer);
preempt_enable();

It's a little more involved than that, but the above is the basic logic.
The idea is to acquire the current CPU's buffer, disable migration, and
then enable preemption. At that point copy_from_user() can be used
safely. After reading the data from user space, preemption is disabled
again and the per-CPU context switch count is re-read. If it changed,
there was new scheduling on this CPU and the buffer must be assumed to
have been corrupted by another task. If it did not change, the buffer is
still valid, since only tasks running in preemptible context on this CPU
can write to it.

By using this method, where opening the trace_marker file allocates the
per-CPU buffers, trace_marker writes can access user space memory, and
even fault it in, without the write path taking any locks or allocating
any memory of its own.
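
A condensed sketch of that open-time setup, simplified from the change
below (the trace_user_buf_info wrapper, reference counting, and cleanup on
failure are omitted here):

struct trace_user_buf {
        char *buf;
};

static struct trace_user_buf __percpu *tbuf;

static int trace_user_fault_buffer_enable(void)
{
        int cpu;

        tbuf = alloc_percpu(struct trace_user_buf);
        if (!tbuf)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                /* TRACE_MARKER_MAX_SIZE bytes, allocated on each CPU's node */
                per_cpu_ptr(tbuf, cpu)->buf =
                        kmalloc_node(TRACE_MARKER_MAX_SIZE, GFP_KERNEL,
                                     cpu_to_node(cpu));
                if (!per_cpu_ptr(tbuf, cpu)->buf)
                        return -ENOMEM; /* the real code frees what was already allocated */
        }
        return 0;
}

In the actual patch, tracing_mark_open() takes a reference on this setup
and tracing_mark_release() drops it, so the buffers only exist while a
trace_marker file is open.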

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Luo Gengkun <luogengkun@huaweicloud.com>
Cc: Wattson CI <wattson-external@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20251008124510.6dba541a@gandalf.local.home
Fixes: 3d62ab32df065 ("tracing: Fix tracing_marker may trigger page fault during preempt_disable")
Reported-by: Runping Lai <runpinglai@google.com>
Tested-by: Runping Lai <runpinglai@google.com>
Closes: https://lore.kernel.org/linux-trace-kernel/20251007003417.3470979-2-runpinglai@google.com/
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

+220 -48
kernel/trace/trace.c
···
 	return single_release(inode, filp);
 }
 
-static int tracing_mark_open(struct inode *inode, struct file *filp)
-{
-	stream_open(inode, filp);
-	return tracing_open_generic_tr(inode, filp);
-}
-
 static int tracing_release(struct inode *inode, struct file *file)
 {
 	struct trace_array *tr = inode->i_private;
···
 
 #define TRACE_MARKER_MAX_SIZE 4096
 
-static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user *ubuf,
+static ssize_t write_marker_to_buffer(struct trace_array *tr, const char *buf,
 				       size_t cnt, unsigned long ip)
 {
 	struct ring_buffer_event *event;
···
 	int meta_size;
 	ssize_t written;
 	size_t size;
-	int len;
-
-	/* Used in tracing_mark_raw_write() as well */
-#define FAULTED_STR "<faulted>"
-#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */
 
 	meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */
  again:
 	size = cnt + meta_size;
-
-	/* If less than "<faulted>", then make sure we can still add that */
-	if (cnt < FAULTED_SIZE)
-		size += FAULTED_SIZE - cnt;
 
 	buffer = tr->array_buffer.buffer;
 	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
···
 	 * make it smaller and try again.
 	 */
 	if (size > ring_buffer_max_event_size(buffer)) {
-		/* cnt < FAULTED size should never be bigger than max */
-		if (WARN_ON_ONCE(cnt < FAULTED_SIZE))
-			return -EBADF;
 		cnt = ring_buffer_max_event_size(buffer) - meta_size;
 		/* The above should only happen once */
 		if (WARN_ON_ONCE(cnt + meta_size == size))
···
 
 	entry = ring_buffer_event_data(event);
 	entry->ip = ip;
-
-	len = copy_from_user_nofault(&entry->buf, ubuf, cnt);
-	if (len) {
-		memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
-		cnt = FAULTED_SIZE;
-		written = -EFAULT;
-	} else
-		written = cnt;
+	memcpy(&entry->buf, buf, cnt);
+	written = cnt;
 
 	if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) {
 		/* do not add \n before testing triggers, but add \0 */
···
 	return written;
 }
 
+struct trace_user_buf {
+	char *buf;
+};
+
+struct trace_user_buf_info {
+	struct trace_user_buf __percpu *tbuf;
+	int ref;
+};
+
+static DEFINE_MUTEX(trace_user_buffer_mutex);
+static struct trace_user_buf_info *trace_user_buffer;
+
+static void trace_user_fault_buffer_free(struct trace_user_buf_info *tinfo)
+{
+	char *buf;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		buf = per_cpu_ptr(tinfo->tbuf, cpu)->buf;
+		kfree(buf);
+	}
+	free_percpu(tinfo->tbuf);
+	kfree(tinfo);
+}
+
+static int trace_user_fault_buffer_enable(void)
+{
+	struct trace_user_buf_info *tinfo;
+	char *buf;
+	int cpu;
+
+	guard(mutex)(&trace_user_buffer_mutex);
+
+	if (trace_user_buffer) {
+		trace_user_buffer->ref++;
+		return 0;
+	}
+
+	tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+	if (!tinfo)
+		return -ENOMEM;
+
+	tinfo->tbuf = alloc_percpu(struct trace_user_buf);
+	if (!tinfo->tbuf) {
+		kfree(tinfo);
+		return -ENOMEM;
+	}
+
+	tinfo->ref = 1;
+
+	/* Clear each buffer in case of error */
+	for_each_possible_cpu(cpu) {
+		per_cpu_ptr(tinfo->tbuf, cpu)->buf = NULL;
+	}
+
+	for_each_possible_cpu(cpu) {
+		buf = kmalloc_node(TRACE_MARKER_MAX_SIZE, GFP_KERNEL,
+				   cpu_to_node(cpu));
+		if (!buf) {
+			trace_user_fault_buffer_free(tinfo);
+			return -ENOMEM;
+		}
+		per_cpu_ptr(tinfo->tbuf, cpu)->buf = buf;
+	}
+
+	trace_user_buffer = tinfo;
+
+	return 0;
+}
+
+static void trace_user_fault_buffer_disable(void)
+{
+	struct trace_user_buf_info *tinfo;
+
+	guard(mutex)(&trace_user_buffer_mutex);
+
+	tinfo = trace_user_buffer;
+
+	if (WARN_ON_ONCE(!tinfo))
+		return;
+
+	if (--tinfo->ref)
+		return;
+
+	trace_user_fault_buffer_free(tinfo);
+	trace_user_buffer = NULL;
+}
+
+/* Must be called with preemption disabled */
+static char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
+				   const char __user *ptr, size_t size,
+				   size_t *read_size)
+{
+	int cpu = smp_processor_id();
+	char *buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf;
+	unsigned int cnt;
+	int trys = 0;
+	int ret;
+
+	if (size > TRACE_MARKER_MAX_SIZE)
+		size = TRACE_MARKER_MAX_SIZE;
+	*read_size = 0;
+
+	/*
+	 * This acts similar to a seqcount. The per CPU context switches are
+	 * recorded, migration is disabled and preemption is enabled. The
+	 * read of the user space memory is copied into the per CPU buffer.
+	 * Preemption is disabled again, and if the per CPU context switches count
+	 * is still the same, it means the buffer has not been corrupted.
+	 * If the count is different, it is assumed the buffer is corrupted
+	 * and reading must be tried again.
+	 */
+
+	do {
+		/*
+		 * If for some reason, copy_from_user() always causes a context
+		 * switch, this would then cause an infinite loop.
+		 * If this task is preempted by another user space task, it
+		 * will cause this task to try again. But just in case something
+		 * changes where the copying from user space causes another task
+		 * to run, prevent this from going into an infinite loop.
+		 * 100 tries should be plenty.
+		 */
+		if (WARN_ONCE(trys++ > 100, "Error: Too many tries to read user space"))
+			return NULL;
+
+		/* Read the current CPU context switch counter */
+		cnt = nr_context_switches_cpu(cpu);
+
+		/*
+		 * Preemption is going to be enabled, but this task must
+		 * remain on this CPU.
+		 */
+		migrate_disable();
+
+		/*
+		 * Now preemption is being enabled and another task can come in
+		 * and use the same buffer and corrupt our data.
+		 */
+		preempt_enable_notrace();
+
+		ret = __copy_from_user(buffer, ptr, size);
+
+		preempt_disable_notrace();
+		migrate_enable();
+
+		/* if it faulted, no need to test if the buffer was corrupted */
+		if (ret)
+			return NULL;
+
+		/*
+		 * Preemption is disabled again, now check the per CPU context
+		 * switch counter. If it doesn't match, then another user space
+		 * process may have scheduled in and corrupted our buffer. In that
+		 * case the copying must be retried.
+		 */
+	} while (nr_context_switches_cpu(cpu) != cnt);
+
+	*read_size = size;
+	return buffer;
+}
+
 static ssize_t
 tracing_mark_write(struct file *filp, const char __user *ubuf,
 		   size_t cnt, loff_t *fpos)
···
 	struct trace_array *tr = filp->private_data;
 	ssize_t written = -ENODEV;
 	unsigned long ip;
+	size_t size;
+	char *buf;
 
 	if (tracing_disabled)
 		return -EINVAL;
···
 	if (cnt > TRACE_MARKER_MAX_SIZE)
 		cnt = TRACE_MARKER_MAX_SIZE;
 
+	/* Must have preemption disabled while having access to the buffer */
+	guard(preempt_notrace)();
+
+	buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size);
+	if (!buf)
+		return -EFAULT;
+
+	if (cnt > size)
+		cnt = size;
+
 	/* The selftests expect this function to be the IP address */
 	ip = _THIS_IP_;
 
···
 	if (tr == &global_trace) {
 		guard(rcu)();
 		list_for_each_entry_rcu(tr, &marker_copies, marker_list) {
-			written = write_marker_to_buffer(tr, ubuf, cnt, ip);
+			written = write_marker_to_buffer(tr, buf, cnt, ip);
 			if (written < 0)
 				break;
 		}
 	} else {
-		written = write_marker_to_buffer(tr, ubuf, cnt, ip);
+		written = write_marker_to_buffer(tr, buf, cnt, ip);
 	}
 
 	return written;
 }
 
 static ssize_t write_raw_marker_to_buffer(struct trace_array *tr,
-					  const char __user *ubuf, size_t cnt)
+					  const char *buf, size_t cnt)
 {
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer;
 	struct raw_data_entry *entry;
 	ssize_t written;
-	int size;
-	int len;
-
-#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+	size_t size;
 
 	size = sizeof(*entry) + cnt;
-	if (cnt < FAULT_SIZE_ID)
-		size += FAULT_SIZE_ID - cnt;
 
 	buffer = tr->array_buffer.buffer;
 
···
 		return -EBADF;
 
 	entry = ring_buffer_event_data(event);
-
-	len = copy_from_user_nofault(&entry->id, ubuf, cnt);
-	if (len) {
-		entry->id = -1;
-		memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
-		written = -EFAULT;
-	} else
-		written = cnt;
+	memcpy(&entry->id, buf, cnt);
+	written = cnt;
 
 	__buffer_unlock_commit(buffer, event);
 
···
 {
 	struct trace_array *tr = filp->private_data;
 	ssize_t written = -ENODEV;
-
-#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+	size_t size;
+	char *buf;
 
 	if (tracing_disabled)
 		return -EINVAL;
···
 
 	/* The marker must at least have a tag id */
 	if (cnt < sizeof(unsigned int))
+		return -EINVAL;
+
+	/* Must have preemption disabled while having access to the buffer */
+	guard(preempt_notrace)();
+
+	buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size);
+	if (!buf)
+		return -EFAULT;
+
+	/* raw write is all or nothing */
+	if (cnt > size)
 		return -EINVAL;
 
 	/* The global trace_marker_raw can go to multiple instances */
···
 	}
 
 	return written;
+}
+
+static int tracing_mark_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+
+	ret = trace_user_fault_buffer_enable();
+	if (ret < 0)
+		return ret;
+
+	stream_open(inode, filp);
+	ret = tracing_open_generic_tr(inode, filp);
+	if (ret < 0)
+		trace_user_fault_buffer_disable();
+	return ret;
+}
+
+static int tracing_mark_release(struct inode *inode, struct file *file)
+{
+	trace_user_fault_buffer_disable();
+	return tracing_release_generic_tr(inode, file);
 }
 
 static int tracing_clock_show(struct seq_file *m, void *v)
···
 static const struct file_operations tracing_mark_fops = {
 	.open		= tracing_mark_open,
 	.write		= tracing_mark_write,
-	.release	= tracing_release_generic_tr,
+	.release	= tracing_mark_release,
 };
 
 static const struct file_operations tracing_mark_raw_fops = {
 	.open		= tracing_mark_open,
 	.write		= tracing_mark_raw_write,
-	.release	= tracing_release_generic_tr,
+	.release	= tracing_mark_release,
 };
 
 static const struct file_operations trace_clock_fops = {