// SPDX-License-Identifier: GPL-2.0+
/*
 * Restartable sequences system call
 *
 * Copyright (C) 2015, Google, Inc.,
 * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
 * Copyright (C) 2015-2018, EfficiOS Inc.,
 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

/*
 * Restartable sequences are a lightweight interface that allows
 * user-level code to be executed atomically relative to scheduler
 * preemption and signal delivery. Typically used for implementing
 * per-cpu operations.
 *
 * It allows user-space to perform update operations on per-cpu data
 * without requiring heavy-weight atomic operations.
 *
 * Detailed algorithm of rseq user-space assembly sequences:
 *
 *                     init(rseq_cs)
 *                     cpu = TLS->rseq::cpu_id_start
 *   [1]               TLS->rseq::rseq_cs = rseq_cs
 *   [start_ip]        ----------------------------
 *   [2]               if (cpu != TLS->rseq::cpu_id)
 *                             goto abort_ip;
 *   [3]               <last_instruction_in_cs>
 *   [post_commit_ip]  ----------------------------
 *
 *   The address of jump target abort_ip must be outside the critical
 *   region, i.e.:
 *
 *     [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip]
 *
 *   Steps [2]-[3] (inclusive) need to be a sequence of instructions in
 *   userspace that can handle being interrupted between any of those
 *   instructions, and then resumed to the abort_ip.
 *
 *   1.  Userspace stores the address of the struct rseq_cs assembly
 *       block descriptor into the rseq_cs field of the registered
 *       struct rseq TLS area. This update is performed through a single
 *       store within the inline assembly instruction sequence.
 *       [start_ip]
 *
 *   2.  Userspace tests whether the current cpu_id field matches
 *       the cpu number loaded before start_ip, branching to abort_ip
 *       in case of a mismatch.
 *
 *       If the sequence is preempted or interrupted by a signal
 *       at or after start_ip and before post_commit_ip, then the kernel
 *       clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
 *       ip to abort_ip before returning to user-space, so the preempted
 *       execution resumes at abort_ip.
 *
 *   3.  Userspace critical section final instruction before
 *       post_commit_ip is the commit. The critical section is
 *       self-terminating.
 *       [post_commit_ip]
 *
 *   4.  <success>
 *
 *   On failure at [2], or if interrupted by preempt or signal delivery
 *   between [1] and [3]:
 *
 *       [abort_ip]
 *   F1. <failure>
 */

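/*
 * For illustration only (not kernel code): a minimal user-space sketch of
 * the descriptor which step [1] above points TLS->rseq::rseq_cs at. The
 * field names follow struct rseq_cs in include/uapi/linux/rseq.h;
 * start_ip, post_commit_ip, abort_ip and rs are placeholders for the
 * labels in the diagram and for the registered struct rseq TLS area.
 * Real user space (e.g. librseq) emits the descriptor and the stores from
 * inline assembly so the compiler can neither reorder nor split the
 * sequence.
 *
 *	struct rseq_cs cs = {
 *		.version		= 0,
 *		.flags			= 0,
 *		.start_ip		= (__u64)start_ip,
 *		.post_commit_offset	= (__u64)(post_commit_ip - start_ip),
 *		.abort_ip		= (__u64)abort_ip,
 *	};
 *
 *	// [1] Arm the critical section with a single store:
 *	rs.rseq_cs = (__u64)(uintptr_t)&cs;
 */
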
/* Required to select the proper per_cpu ops for rseq_stats_inc() */
#define RSEQ_BUILD_SLOW_PATH

#include <linux/debugfs.h>
#include <linux/ratelimit.h>
#include <linux/rseq_entry.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/types.h>
#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>

DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

static inline void rseq_control_debug(bool on)
{
	if (on)
		static_branch_enable(&rseq_debug_enabled);
	else
		static_branch_disable(&rseq_debug_enabled);
}

static int __init rseq_setup_debug(char *str)
{
	bool on;

	if (kstrtobool(str, &on))
		return -EINVAL;
	rseq_control_debug(on);
	return 1;
}
__setup("rseq_debug=", rseq_setup_debug);
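
/*
 * Debug mode can therefore be selected at boot time, e.g. with
 * "rseq_debug=1" on the kernel command line. With CONFIG_DEBUG_FS it can
 * also be toggled at runtime through the debugfs file rseq/debug created
 * below (typically /sys/kernel/debug/rseq/debug).
 */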

#ifdef CONFIG_TRACEPOINTS
/*
 * Out of line, so the actual update functions can be in a header to be
 * inlined into the exit to user code.
 */
void __rseq_trace_update(struct task_struct *t)
{
	trace_rseq_update(t);
}

void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip)
{
	trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip);
}
#endif /* CONFIG_TRACEPOINTS */

#ifdef CONFIG_DEBUG_FS
#ifdef CONFIG_RSEQ_STATS
DEFINE_PER_CPU(struct rseq_stats, rseq_stats);

static int rseq_stats_show(struct seq_file *m, void *p)
{
	struct rseq_stats stats = { };
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		stats.exit += data_race(per_cpu(rseq_stats.exit, cpu));
		stats.signal += data_race(per_cpu(rseq_stats.signal, cpu));
		stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu));
		stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu));
		stats.ids += data_race(per_cpu(rseq_stats.ids, cpu));
		stats.cs += data_race(per_cpu(rseq_stats.cs, cpu));
		stats.clear += data_race(per_cpu(rseq_stats.clear, cpu));
		stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu));
	}

	seq_printf(m, "exit: %16lu\n", stats.exit);
	seq_printf(m, "signal: %16lu\n", stats.signal);
	seq_printf(m, "slowp: %16lu\n", stats.slowpath);
	seq_printf(m, "fastp: %16lu\n", stats.fastpath);
	seq_printf(m, "ids: %16lu\n", stats.ids);
	seq_printf(m, "cs: %16lu\n", stats.cs);
	seq_printf(m, "clear: %16lu\n", stats.clear);
	seq_printf(m, "fixup: %16lu\n", stats.fixup);
	return 0;
}

static int rseq_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_stats_show, inode->i_private);
}

static const struct file_operations stat_ops = {
	.open = rseq_stats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int __init rseq_stats_init(struct dentry *root_dir)
{
	debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops);
	return 0;
}
#else
static inline void rseq_stats_init(struct dentry *root_dir) { }
#endif /* CONFIG_RSEQ_STATS */

static int rseq_debug_show(struct seq_file *m, void *p)
{
	bool on = static_branch_unlikely(&rseq_debug_enabled);

	seq_printf(m, "%d\n", on);
	return 0;
}

static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf,
				size_t count, loff_t *ppos)
{
	bool on;

	if (kstrtobool_from_user(ubuf, count, &on))
		return -EINVAL;

	rseq_control_debug(on);
	return count;
}

static int rseq_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, rseq_debug_show, inode->i_private);
}

static const struct file_operations debug_ops = {
	.open = rseq_debug_open,
	.read = seq_read,
	.write = rseq_debug_write,
	.llseek = seq_lseek,
	.release = single_release,
};

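/*
 * With debugfs mounted at its usual location this provides:
 *
 *	/sys/kernel/debug/rseq/debug	- read/write the debug switch above
 *	/sys/kernel/debug/rseq/stats	- aggregated per-CPU statistics
 *					  (only with CONFIG_RSEQ_STATS)
 */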
static int __init rseq_debugfs_init(void)
{
	struct dentry *root_dir = debugfs_create_dir("rseq", NULL);

	debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
	rseq_stats_init(root_dir);
	return 0;
}
__initcall(rseq_debugfs_init);
#endif /* CONFIG_DEBUG_FS */

static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
{
	return rseq_set_ids_get_csaddr(t, ids, node_id, NULL);
}

static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
{
	struct rseq __user *urseq = t->rseq.usrptr;
	u64 csaddr;

	scoped_user_read_access(urseq, efault)
		unsafe_get_user(csaddr, &urseq->rseq_cs, efault);
	if (likely(!csaddr))
		return true;
	return rseq_update_user_cs(t, regs, csaddr);
efault:
	return false;
}

static void rseq_slowpath_update_usr(struct pt_regs *regs)
{
	/*
	 * Preserve rseq state and user_irq state. The generic entry code
	 * clears user_irq on the way out; architectures not using the
	 * generic entry code do not have user_irq at all.
	 */
	const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
	struct task_struct *t = current;
	struct rseq_ids ids;
	u32 node_id;
	bool event;

	if (unlikely(t->flags & PF_EXITING))
		return;

	rseq_stat_inc(rseq_stats.slowpath);

	/*
	 * Read and clear the event pending bit first. If the task
	 * was not preempted or migrated or a signal is on the way,
	 * there is no point in doing any of the heavy lifting here
	 * on production kernels. In that case TIF_NOTIFY_RESUME
	 * was raised by some other functionality.
	 *
	 * This is correct because the read/clear operation is
	 * guarded against scheduler preemption, which makes it CPU
	 * local atomic. If the task is preempted right after
	 * re-enabling preemption then TIF_NOTIFY_RESUME is set
	 * again and this function is invoked another time _before_
	 * the task is able to return to user mode.
	 *
	 * On a debug kernel, invoke the fixup code unconditionally
	 * with the result handed in to allow the detection of
	 * inconsistencies.
	 */
	scoped_guard(irq) {
		event = t->rseq.event.sched_switch;
		t->rseq.event.all &= evt_mask.all;
		ids.cpu_id = task_cpu(t);
		ids.mm_cid = task_mm_cid(t);
	}

	if (!event)
		return;

	node_id = cpu_to_node(ids.cpu_id);

	if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
		/*
		 * Clear the errors just in case this might survive magically, but
		 * leave the rest intact.
		 */
		t->rseq.event.error = 0;
		force_sig(SIGSEGV);
	}
}

void __rseq_handle_slowpath(struct pt_regs *regs)
{
	/*
	 * If invoked from hypervisors before entering the guest via
	 * resume_user_mode_work(), then @regs is a NULL pointer.
	 *
	 * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
	 * it before returning from the ioctl() to user space when
	 * rseq_event.sched_switch is set.
	 *
	 * So it's safe to ignore here instead of pointlessly updating it
	 * in the vcpu_run() loop.
	 */
	if (!regs)
		return;

	rseq_slowpath_update_usr(regs);
}

void __rseq_signal_deliver(int sig, struct pt_regs *regs)
{
	rseq_stat_inc(rseq_stats.signal);
	/*
	 * Don't update IDs, they are handled on exit to user if
	 * necessary. The important thing is to abort a critical section of
	 * the interrupted context as after this point the instruction
	 * pointer in @regs points to the signal handler.
	 */
	if (unlikely(!rseq_handle_cs(current, regs))) {
		/*
		 * Clear the errors just in case this might survive
		 * magically, but leave the rest intact.
		 */
		current->rseq.event.error = 0;
		force_sigsegv(sig);
	}
}

/*
 * Terminate the process if a syscall is issued within a restartable
 * sequence.
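 *
 * A typical trigger in debug mode is user space invoking a system call
 * between start_ip and post_commit_ip while TLS->rseq::rseq_cs is still
 * armed; system calls are not allowed inside rseq critical sections.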
 */
void __rseq_debug_syscall_return(struct pt_regs *regs)
{
	struct task_struct *t = current;
	u64 csaddr;

	if (!t->rseq.event.has_rseq)
		return;
	if (get_user(csaddr, &t->rseq.usrptr->rseq_cs))
		goto fail;
	if (likely(!csaddr))
		return;
	if (unlikely(csaddr >= TASK_SIZE))
		goto fail;
	if (rseq_debug_update_user_cs(t, regs, csaddr))
		return;
fail:
	force_sig(SIGSEGV);
}

#ifdef CONFIG_DEBUG_RSEQ
/* Kept around to keep GENERIC_ENTRY=n architectures supported. */
void rseq_syscall(struct pt_regs *regs)
{
	__rseq_debug_syscall_return(regs);
}
#endif

static bool rseq_reset_ids(void)
{
	struct rseq_ids ids = {
		.cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
		.mm_cid = 0,
	};

	/*
	 * If this fails, terminate the task, because it leaves the kernel
	 * in a stupid state: the exit to user space path would try to fix
	 * up the IDs again.
	 */
	if (rseq_set_ids(current, &ids, 0))
		return true;

	force_sig(SIGSEGV);
	return false;
}

/* The original rseq structure size (including padding) is 32 bytes. */
#define ORIG_RSEQ_SIZE 32

/*
 * sys_rseq - setup restartable sequences for caller thread.
 */
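/*
 * Illustrative user-space registration (a sketch, not kernel code); a
 * libc may already have registered an rseq area for the thread, in which
 * case this returns -EBUSY. The signature value below is arbitrary, but
 * it must match the one used for the abort handler and for
 * unregistration:
 *
 *	#include <linux/rseq.h>
 *	#include <stdio.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static __thread struct rseq rs __attribute__((aligned(32)));
 *
 *	// In the thread that wants to use rseq:
 *	if (syscall(__NR_rseq, &rs, 32, 0, 0x53053053))
 *		perror("rseq");
 */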
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
	if (flags & RSEQ_FLAG_UNREGISTER) {
		if (flags & ~RSEQ_FLAG_UNREGISTER)
			return -EINVAL;
		/* Unregister rseq for current thread. */
		if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
			return -EINVAL;
		if (rseq_len != current->rseq.len)
			return -EINVAL;
		if (current->rseq.sig != sig)
			return -EPERM;
		if (!rseq_reset_ids())
			return -EFAULT;
		rseq_reset(current);
		return 0;
	}

	if (unlikely(flags))
		return -EINVAL;

	if (current->rseq.usrptr) {
		/*
		 * If rseq is already registered, check whether
		 * the provided address differs from the prior
		 * one.
		 */
		if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
			return -EINVAL;
		if (current->rseq.sig != sig)
			return -EPERM;
		/* Already registered. */
		return -EBUSY;
	}

	/*
	 * If there was no rseq previously registered, ensure the provided rseq
	 * is properly aligned, as communicated to user-space through the ELF
	 * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
	 * size, the required alignment is the original struct rseq alignment.
	 *
	 * In order to be valid, rseq_len is either the original rseq size, or
	 * large enough to contain all supported fields, as communicated to
	 * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
	 */
	if (rseq_len < ORIG_RSEQ_SIZE ||
	    (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
	    (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
					    rseq_len < offsetof(struct rseq, end))))
		return -EINVAL;
	if (!access_ok(rseq, rseq_len))
		return -EFAULT;

	scoped_user_write_access(rseq, efault) {
		/*
		 * If the rseq_cs pointer is non-NULL on registration, clear it to
		 * avoid a potential segfault on return to user-space. The proper thing
		 * to do would have been to fail the registration but this would break
		 * older libcs that reuse the rseq area for new threads without
		 * clearing the fields. Don't bother reading it, just reset it.
		 */
		unsafe_put_user(0UL, &rseq->rseq_cs, efault);
		/* Initialize IDs in user space */
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
		unsafe_put_user(0U, &rseq->node_id, efault);
		unsafe_put_user(0U, &rseq->mm_cid, efault);
	}

	/*
	 * Activate the registration by setting the rseq area address, length
	 * and signature in the task struct.
	 */
	current->rseq.usrptr = rseq;
	current->rseq.len = rseq_len;
	current->rseq.sig = sig;

	/*
	 * If rseq was previously inactive, and has just been
	 * registered, ensure the cpu_id_start and cpu_id fields
	 * are updated before returning to user-space.
	 */
	current->rseq.event.has_rseq = true;
	rseq_force_update();
	return 0;

efault:
	return -EFAULT;
}