tracing: Have trace_marker use per-cpu data to read user space

It was reported that using __copy_from_user_inatomic() can actually
schedule, which is bad when preemption is disabled. Even though there is
logic to check whether in_atomic() is set, that check is a nop when the
kernel is configured with PREEMPT_NONE. The copy can page fault, and
servicing the fault may schedule while preemption is disabled.

Link: https://lore.kernel.org/all/20250819105152.2766363-1-luogengkun@huaweicloud.com/

The solution was to change the __copy_from_user_inatomic() to
copy_from_user_nofault(). But then it was reported that this caused a
regression in Android. Several applications in Android write into
trace_marker, but now, instead of showing the expected data, the trace
shows:

tracing_mark_write: <faulted>

After reverting the conversion to copy_from_user_nofault(), Android was
able to get the data again.

Writing to the trace_marker file is a way to efficiently and quickly enter
data into the Linux tracing buffer. It takes no locks and was designed to
be as non-intrusive as possible. This means it cannot allocate memory and
must use pre-allocated data.
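
For reference, a user space writer looks roughly like the following
(a minimal sketch, assuming tracefs is mounted at /sys/kernel/tracing;
older setups may use /sys/kernel/debug/tracing instead):

  #include <fcntl.h>
  #include <unistd.h>

  int main(void)
  {
          const char msg[] = "hello from user space\n";
          /* Each write() becomes one event in the tracing ring buffer */
          int fd = open("/sys/kernel/tracing/trace_marker", O_WRONLY);

          if (fd < 0)
                  return 1;
          write(fd, msg, sizeof(msg) - 1);
          close(fd);
          return 0;
  }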

A method that is actively being worked on to have faultable system call
tracepoints read user space data is to allocate per CPU buffers and use
them in the callback. The method uses a technique similar to a seqcount,
something like this:

  preempt_disable();
  cpu = smp_processor_id();
  buffer = per_cpu_ptr(&pre_allocated_cpu_buffers, cpu);
  do {
          /* Snapshot this CPU's context switch count */
          cnt = nr_context_switches_cpu(cpu);
          migrate_disable();
          preempt_enable();
          ret = copy_from_user(buffer, ptr, size);
          preempt_disable();
          migrate_enable();
          /* Retry if another task may have run and reused the buffer */
  } while (!ret && cnt != nr_context_switches_cpu(cpu));

  if (!ret)
          ring_buffer_write(buffer);
  preempt_enable();

It's a little more involved than that, but the above is the basic logic.
The idea is to acquire the current CPU's buffer, disable migration, and
then enable preemption. At that point, copy_from_user() can be used
safely. After reading the data from user space, preemption is disabled
again and the per CPU context switch count is checked. If any scheduling
happened on this CPU during the copy, it must be assumed that the buffer
was corrupted by another task. If not, the buffer is still valid, since
only tasks running in preemptible context on this CPU can write to it.

By using this method, where opening trace_marker allocates the per CPU
buffers, writes to trace_marker can access user space and even fault it
in, without having to allocate memory or take any locks of their own.

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Luo Gengkun <luogengkun@huaweicloud.com>
Cc: Wattson CI <wattson-external@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20251008124510.6dba541a@gandalf.local.home
Fixes: 3d62ab32df065 ("tracing: Fix tracing_marker may trigger page fault during preempt_disable")
Reported-by: Runping Lai <runpinglai@google.com>
Tested-by: Runping Lai <runpinglai@google.com>
Closes: https://lore.kernel.org/linux-trace-kernel/20251007003417.3470979-2-runpinglai@google.com/
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

+220 -48
kernel/trace/trace.c
···
 	return single_release(inode, filp);
 }

-static int tracing_mark_open(struct inode *inode, struct file *filp)
-{
-	stream_open(inode, filp);
-	return tracing_open_generic_tr(inode, filp);
-}
-
 static int tracing_release(struct inode *inode, struct file *file)
 {
 	struct trace_array *tr = inode->i_private;
···

 #define TRACE_MARKER_MAX_SIZE 4096

-static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user *ubuf,
+static ssize_t write_marker_to_buffer(struct trace_array *tr, const char *buf,
 				      size_t cnt, unsigned long ip)
 {
 	struct ring_buffer_event *event;
···
 	int meta_size;
 	ssize_t written;
 	size_t size;
-	int len;
-
-	/* Used in tracing_mark_raw_write() as well */
-#define FAULTED_STR "<faulted>"
-#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */

 	meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */
  again:
 	size = cnt + meta_size;
-
-	/* If less than "<faulted>", then make sure we can still add that */
-	if (cnt < FAULTED_SIZE)
-		size += FAULTED_SIZE - cnt;

 	buffer = tr->array_buffer.buffer;
 	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
···
 	 * make it smaller and try again.
 	 */
 	if (size > ring_buffer_max_event_size(buffer)) {
-		/* cnt < FAULTED size should never be bigger than max */
-		if (WARN_ON_ONCE(cnt < FAULTED_SIZE))
-			return -EBADF;
 		cnt = ring_buffer_max_event_size(buffer) - meta_size;
 		/* The above should only happen once */
 		if (WARN_ON_ONCE(cnt + meta_size == size))
···

 	entry = ring_buffer_event_data(event);
 	entry->ip = ip;
-
-	len = copy_from_user_nofault(&entry->buf, ubuf, cnt);
-	if (len) {
-		memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
-		cnt = FAULTED_SIZE;
-		written = -EFAULT;
-	} else
-		written = cnt;
+	memcpy(&entry->buf, buf, cnt);
+	written = cnt;

 	if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) {
 		/* do not add \n before testing triggers, but add \0 */
···
 	return written;
 }

+struct trace_user_buf {
+	char *buf;
+};
+
+struct trace_user_buf_info {
+	struct trace_user_buf __percpu *tbuf;
+	int ref;
+};
+
+
+static DEFINE_MUTEX(trace_user_buffer_mutex);
+static struct trace_user_buf_info *trace_user_buffer;
+
+static void trace_user_fault_buffer_free(struct trace_user_buf_info *tinfo)
+{
+	char *buf;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		buf = per_cpu_ptr(tinfo->tbuf, cpu)->buf;
+		kfree(buf);
+	}
+	free_percpu(tinfo->tbuf);
+	kfree(tinfo);
+}
+
+static int trace_user_fault_buffer_enable(void)
+{
+	struct trace_user_buf_info *tinfo;
+	char *buf;
+	int cpu;
+
+	guard(mutex)(&trace_user_buffer_mutex);
+
+	if (trace_user_buffer) {
+		trace_user_buffer->ref++;
+		return 0;
+	}
+
+	tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+	if (!tinfo)
+		return -ENOMEM;
+
+	tinfo->tbuf = alloc_percpu(struct trace_user_buf);
+	if (!tinfo->tbuf) {
+		kfree(tinfo);
+		return -ENOMEM;
+	}
+
+	tinfo->ref = 1;
+
+	/* Clear each buffer in case of error */
+	for_each_possible_cpu(cpu) {
+		per_cpu_ptr(tinfo->tbuf, cpu)->buf = NULL;
+	}
+
+	for_each_possible_cpu(cpu) {
+		buf = kmalloc_node(TRACE_MARKER_MAX_SIZE, GFP_KERNEL,
+				   cpu_to_node(cpu));
+		if (!buf) {
+			trace_user_fault_buffer_free(tinfo);
+			return -ENOMEM;
+		}
+		per_cpu_ptr(tinfo->tbuf, cpu)->buf = buf;
+	}
+
+	trace_user_buffer = tinfo;
+
+	return 0;
+}
+
+static void trace_user_fault_buffer_disable(void)
+{
+	struct trace_user_buf_info *tinfo;
+
+	guard(mutex)(&trace_user_buffer_mutex);
+
+	tinfo = trace_user_buffer;
+
+	if (WARN_ON_ONCE(!tinfo))
+		return;
+
+	if (--tinfo->ref)
+		return;
+
+	trace_user_fault_buffer_free(tinfo);
+	trace_user_buffer = NULL;
+}
+
+/* Must be called with preemption disabled */
+static char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
+				   const char __user *ptr, size_t size,
+				   size_t *read_size)
+{
+	int cpu = smp_processor_id();
+	char *buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf;
+	unsigned int cnt;
+	int trys = 0;
+	int ret;
+
+	if (size > TRACE_MARKER_MAX_SIZE)
+		size = TRACE_MARKER_MAX_SIZE;
+	*read_size = 0;
+
+	/*
+	 * This acts similar to a seqcount. The per CPU context switches are
+	 * recorded, migration is disabled and preemption is enabled. The
+	 * read of the user space memory is copied into the per CPU buffer.
+	 * Preemption is disabled again, and if the per CPU context switches count
+	 * is still the same, it means the buffer has not been corrupted.
+	 * If the count is different, it is assumed the buffer is corrupted
+	 * and reading must be tried again.
+	 */
+
+	do {
+		/*
+		 * If for some reason, copy_from_user() always causes a context
+		 * switch, this would then cause an infinite loop.
+		 * If this task is preempted by another user space task, it
+		 * will cause this task to try again. But just in case something
+		 * changes where the copying from user space causes another task
+		 * to run, prevent this from going into an infinite loop.
+		 * 100 tries should be plenty.
+		 */
+		if (WARN_ONCE(trys++ > 100, "Error: Too many tries to read user space"))
+			return NULL;
+
+		/* Read the current CPU context switch counter */
+		cnt = nr_context_switches_cpu(cpu);
+
+		/*
+		 * Preemption is going to be enabled, but this task must
+		 * remain on this CPU.
+		 */
+		migrate_disable();
+
+		/*
+		 * Now preemption is being enabed and another task can come in
+		 * and use the same buffer and corrupt our data.
+		 */
+		preempt_enable_notrace();
+
+		ret = __copy_from_user(buffer, ptr, size);
+
+		preempt_disable_notrace();
+		migrate_enable();
+
+		/* if it faulted, no need to test if the buffer was corrupted */
+		if (ret)
+			return NULL;
+
+		/*
+		 * Preemption is disabled again, now check the per CPU context
+		 * switch counter. If it doesn't match, then another user space
+		 * process may have schedule in and corrupted our buffer. In that
+		 * case the copying must be retried.
+		 */
+	} while (nr_context_switches_cpu(cpu) != cnt);
+
+	*read_size = size;
+	return buffer;
+}
+
 static ssize_t
 tracing_mark_write(struct file *filp, const char __user *ubuf,
 		   size_t cnt, loff_t *fpos)
···
 	struct trace_array *tr = filp->private_data;
 	ssize_t written = -ENODEV;
 	unsigned long ip;
+	size_t size;
+	char *buf;

 	if (tracing_disabled)
 		return -EINVAL;
···
 	if (cnt > TRACE_MARKER_MAX_SIZE)
 		cnt = TRACE_MARKER_MAX_SIZE;

+	/* Must have preemption disabled while having access to the buffer */
+	guard(preempt_notrace)();
+
+	buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size);
+	if (!buf)
+		return -EFAULT;
+
+	if (cnt > size)
+		cnt = size;
+
 	/* The selftests expect this function to be the IP address */
 	ip = _THIS_IP_;
···
 	if (tr == &global_trace) {
 		guard(rcu)();
 		list_for_each_entry_rcu(tr, &marker_copies, marker_list) {
-			written = write_marker_to_buffer(tr, ubuf, cnt, ip);
+			written = write_marker_to_buffer(tr, buf, cnt, ip);
 			if (written < 0)
 				break;
 		}
 	} else {
-		written = write_marker_to_buffer(tr, ubuf, cnt, ip);
+		written = write_marker_to_buffer(tr, buf, cnt, ip);
 	}

 	return written;
 }

 static ssize_t write_raw_marker_to_buffer(struct trace_array *tr,
-					  const char __user *ubuf, size_t cnt)
+					  const char *buf, size_t cnt)
 {
 	struct ring_buffer_event *event;
 	struct trace_buffer *buffer;
 	struct raw_data_entry *entry;
 	ssize_t written;
-	int size;
-	int len;
-
-#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+	size_t size;

 	size = sizeof(*entry) + cnt;
-	if (cnt < FAULT_SIZE_ID)
-		size += FAULT_SIZE_ID - cnt;

 	buffer = tr->array_buffer.buffer;

···
 		return -EBADF;

 	entry = ring_buffer_event_data(event);
-
-	len = copy_from_user_nofault(&entry->id, ubuf, cnt);
-	if (len) {
-		entry->id = -1;
-		memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
-		written = -EFAULT;
-	} else
-		written = cnt;
+	memcpy(&entry->id, buf, cnt);
+	written = cnt;

 	__buffer_unlock_commit(buffer, event);
···
 {
 	struct trace_array *tr = filp->private_data;
 	ssize_t written = -ENODEV;
-
-#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+	size_t size;
+	char *buf;

 	if (tracing_disabled)
 		return -EINVAL;
···

 	/* The marker must at least have a tag id */
 	if (cnt < sizeof(unsigned int))
+		return -EINVAL;
+
+	/* Must have preemption disabled while having access to the buffer */
+	guard(preempt_notrace)();
+
+	buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, &size);
+	if (!buf)
+		return -EFAULT;
+
+	/* raw write is all or nothing */
+	if (cnt > size)
 		return -EINVAL;

 	/* The global trace_marker_raw can go to multiple instances */
···
 	}

 	return written;
+}
+
+static int tracing_mark_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+
+	ret = trace_user_fault_buffer_enable();
+	if (ret < 0)
+		return ret;
+
+	stream_open(inode, filp);
+	ret = tracing_open_generic_tr(inode, filp);
+	if (ret < 0)
+		trace_user_fault_buffer_disable();
+	return ret;
+}
+
+static int tracing_mark_release(struct inode *inode, struct file *file)
+{
+	trace_user_fault_buffer_disable();
+	return tracing_release_generic_tr(inode, file);
 }

 static int tracing_clock_show(struct seq_file *m, void *v)
···
 static const struct file_operations tracing_mark_fops = {
 	.open		= tracing_mark_open,
 	.write		= tracing_mark_write,
-	.release	= tracing_release_generic_tr,
+	.release	= tracing_mark_release,
 };

 static const struct file_operations tracing_mark_raw_fops = {
 	.open		= tracing_mark_open,
 	.write		= tracing_mark_raw_write,
-	.release	= tracing_release_generic_tr,
+	.release	= tracing_mark_release,
 };

 static const struct file_operations trace_clock_fops = {