/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RSEQ_ENTRY_H
#define _LINUX_RSEQ_ENTRY_H

/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
#ifdef CONFIG_RSEQ_STATS
#include <linux/percpu.h>

struct rseq_stats {
	unsigned long exit;
	unsigned long signal;
	unsigned long slowpath;
	unsigned long fastpath;
	unsigned long ids;
	unsigned long cs;
	unsigned long clear;
	unsigned long fixup;
};

DECLARE_PER_CPU(struct rseq_stats, rseq_stats);

/*
 * Slow path has interrupts and preemption enabled, but the fast path
 * runs with interrupts disabled so there is no point in having the
 * preemption checks implied in __this_cpu_inc() for every operation.
 */
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_stat_inc(which) this_cpu_inc((which))
#else
#define rseq_stat_inc(which) raw_cpu_inc((which))
#endif

#else /* CONFIG_RSEQ_STATS */
#define rseq_stat_inc(x) do { } while (0)
#endif /* !CONFIG_RSEQ_STATS */
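
/*
 * Illustrative usage sketch, matching the call sites further down in this
 * header; with CONFIG_RSEQ_STATS=n this compiles away to nothing:
 *
 *	rseq_stat_inc(rseq_stats.exit);
 */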

#ifdef CONFIG_RSEQ
#include <linux/jump_label.h>
#include <linux/rseq.h>
#include <linux/uaccess.h>

#include <linux/tracepoint-defs.h>

#ifdef CONFIG_TRACEPOINTS
DECLARE_TRACEPOINT(rseq_update);
DECLARE_TRACEPOINT(rseq_ip_fixup);
void __rseq_trace_update(struct task_struct *t);
void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip);

static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
{
	if (tracepoint_enabled(rseq_update) && ids)
		__rseq_trace_update(t);
}

static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip)
{
	if (tracepoint_enabled(rseq_ip_fixup))
		__rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
}

#else /* CONFIG_TRACEPOINTS */
static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip) { }
#endif /* !CONFIG_TRACEPOINTS */

DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_inline
#else
#define rseq_inline __always_inline
#endif
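
/*
 * Note (assumption, not spelled out in this header): the slow path
 * translation unit is expected to define RSEQ_BUILD_SLOW_PATH before
 * including this header, so it gets regular out-of-line copies of the
 * helpers below, while the entry code gets __always_inline versions.
 */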

bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
bool rseq_debug_validate_ids(struct task_struct *t);

static __always_inline void rseq_note_user_irq_entry(void)
{
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
		current->rseq.event.user_irq = true;
}

/*
 * Check whether there is a valid critical section and whether the
 * instruction pointer in @regs is inside the critical section.
 *
 * - If the critical section is invalid, terminate the task.
 *
 * - If valid and the instruction pointer is inside, set it to the abort IP.
 *
 * - If valid and the instruction pointer is outside, clear the critical
 *   section address.
 *
 * Returns true if the section was valid and either a fixup or a clear was
 * done, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when an invalid
 * section was found. It is clear when the failure was an unresolved page
 * fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context, disabling page faults would be
 * counterproductive as the page faults could not be fully resolved. As a
 * consequence, unresolved page faults in task context are fatal too.
 */
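
/*
 * For reference, a minimal sketch of the user space descriptor the checks
 * below operate on, as defined in uapi/linux/rseq.h (assumed here, not
 * redefined by this header):
 *
 *	struct rseq_cs {
 *		__u32 version;
 *		__u32 flags;
 *		__u64 start_ip;
 *		__u64 post_commit_offset;
 *		__u64 abort_ip;
 *	};
 *
 * The debug check below reads the first two u32 members (version and
 * flags) as a single u64 "head", which must be zero.
 */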

#ifdef RSEQ_BUILD_SLOW_PATH
/*
 * The debug version is put out of line, but kept here so the code stays
 * together.
 *
 * @csaddr has already been checked by the caller to be in user space
 */
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
			       unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
	unsigned long ip = instruction_pointer(regs);
	u64 __user *uc_head = (u64 __user *) ucs;
	u32 usig, __user *uc_sig;

	scoped_user_rw_access(ucs, efault) {
		/*
		 * Evaluate the user pile and exit if one of the conditions
		 * is not fulfilled.
		 */
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		if (unlikely(start_ip >= tasksize))
			goto die;
		/* If outside, just clear the critical section. */
		if (ip < start_ip)
			goto clear;

		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		cs_end = start_ip + offset;
		/* Check for overflow and wraparound */
		if (unlikely(cs_end >= tasksize || cs_end < start_ip))
			goto die;

		/* If not inside, clear it. */
		if (ip >= cs_end)
			goto clear;

		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
		/* Ensure it's "valid" */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;
		/* Validate that the abort IP is not in the critical section */
		if (unlikely(abort_ip - start_ip < offset))
			goto die;

		/*
		 * Check version and flags for 0. No point in emitting
		 * deprecated warnings before dying. That could be done in
		 * the slow path eventually, but *shrug*.
		 */
		unsafe_get_user(head, uc_head, efault);
		if (unlikely(head))
			goto die;

		/* abort_ip - 4 is >= 0. See abort_ip check above */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
		if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
			/* If not in interrupt from user context, let it die */
			if (unlikely(!t->rseq.event.user_irq))
				goto die;
		}
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

/*
 * On debug kernels validate that user space did not mess with the IDs if
 * the debug branch is enabled.
 */
bool rseq_debug_validate_ids(struct task_struct *t)
{
	struct rseq __user *rseq = t->rseq.usrptr;
	u32 cpu_id, uval, node_id;

	/*
	 * On the first exit after registering the rseq region CPU ID is
	 * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0!
	 */
	node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
		  cpu_to_node(t->rseq.ids.cpu_id) : 0;

	scoped_user_read_access(rseq, efault) {
		unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
		if (cpu_id != t->rseq.ids.cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->cpu_id, efault);
		if (uval != cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->node_id, efault);
		if (uval != node_id)
			goto die;
		unsafe_get_user(uval, &rseq->mm_cid, efault);
		if (uval != t->rseq.ids.mm_cid)
			goto die;
	}
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

#endif /* RSEQ_BUILD_SLOW_PATH */

/*
 * This only ensures that abort_ip is in the user address space and
 * validates that it is preceded by the signature.
 *
 * No other sanity checks are done here; that's what the debug code is for.
 */
static rseq_inline bool
rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	unsigned long ip = instruction_pointer(regs);
	unsigned long tasksize = TASK_SIZE;
	u64 start_ip, abort_ip, offset;
	u32 usig, __user *uc_sig;

	rseq_stat_inc(rseq_stats.cs);

	if (unlikely(csaddr >= tasksize)) {
		t->rseq.event.fatal = true;
		return false;
	}

	if (static_branch_unlikely(&rseq_debug_enabled))
		return rseq_debug_update_user_cs(t, regs, csaddr);

	scoped_user_rw_access(ucs, efault) {
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);

		/*
		 * No sanity checks. If user space screwed it up, it can
		 * keep the pieces. That's what debug code is for.
		 *
		 * If outside, just clear the critical section.
		 */
		if (ip - start_ip >= offset)
			goto clear;

		/*
		 * Two requirements for @abort_ip:
		 * - Must be in user space as x86 IRET would happily return to
		 *   the kernel.
		 * - The four bytes preceding the instruction at @abort_ip must
		 *   contain the signature.
		 *
		 * The latter protects against the following attack vector:
		 *
		 * An attacker with limited write abilities creates a critical
		 * section descriptor, sets the abort IP to a library function
		 * or some other ROP gadget, and stores the address of the
		 * descriptor in TLS::rseq::rseq_cs. An RSEQ abort would then
		 * evade ROP protection.
		 */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;

		/* The address is guaranteed to be >= 0 and < TASK_SIZE */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* Invalidate the critical section */
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		/* Update the instruction pointer */
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

/*
 * Updates CPU ID, Node ID and MM CID and reads the critical section
 * address when @csaddr != NULL. This allows the ID update and the read to
 * be done in the same uaccess region to spare a separate begin/end.
 *
 * As this is either invoked from a C wrapper with @csaddr = NULL or from
 * the fast path code with a valid pointer, a clever compiler should be
 * able to optimize out the read. Spares a duplicate implementation.
 *
 * Returns true if the operation was successful, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when invalid data was
 * found on debug kernels. It is clear when the failure was an unresolved
 * page fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context, disabling page faults would be
 * counterproductive as the page faults could not be fully resolved. As a
 * consequence, unresolved page faults in task context are fatal too.
 */
static rseq_inline
bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
			     u32 node_id, u64 *csaddr)
{
	struct rseq __user *rseq = t->rseq.usrptr;

	if (static_branch_unlikely(&rseq_debug_enabled)) {
		if (!rseq_debug_validate_ids(t))
			return false;
	}

	scoped_user_rw_access(rseq, efault) {
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
		unsafe_put_user(node_id, &rseq->node_id, efault);
		unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
		if (csaddr)
			unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
	}

	/* Cache the new values */
	t->rseq.ids.cpu_cid = ids->cpu_cid;
	rseq_stat_inc(rseq_stats.ids);
	rseq_trace_update(t, ids);
	return true;
efault:
	return false;
}
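
/*
 * Illustrative note (assumption based on uapi/linux/rseq.h, which is not
 * redefined in this header): the user space struct rseq fields updated and
 * read above are the __u32 members cpu_id_start, cpu_id, node_id and
 * mm_cid, plus the __u64 rseq_cs descriptor pointer which is read back
 * when @csaddr is non-NULL.
 */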

/*
 * Update user space with new IDs and conditionally check whether the task
 * is in a critical section.
 */
static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
					struct rseq_ids *ids, u32 node_id)
{
	u64 csaddr;

	if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
		return false;

	/*
	 * On architectures which utilize the generic entry code this
	 * allows skipping the critical section check when the entry was
	 * not from a user space interrupt, unless debug mode is enabled.
	 */
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
		if (!static_branch_unlikely(&rseq_debug_enabled)) {
			if (likely(!t->rseq.event.user_irq))
				return true;
		}
	}
	if (likely(!csaddr))
		return true;
	/* Sigh, this really needs to do work */
	return rseq_update_user_cs(t, regs, csaddr);
}

/*
 * If you want to use this then convert your architecture to the generic
 * entry code. I'm tired of building workarounds for people who can't be
 * bothered to make the maintenance of generic infrastructure less
 * burdensome. Just sucking everything into the architecture code and
 * thereby making others chase the horrible hacks and keep them working is
 * neither acceptable nor sustainable.
 */
#ifdef CONFIG_GENERIC_ENTRY

/*
 * This is inlined into the exit path because:
 *
 * 1) It's a one-time comparison in the fast path when there is no event to
 *    handle
 *
 * 2) The access to the user space rseq memory (TLS) is unlikely to fault
 *    so the straight inline operation is:
 *
 *    - Four 32-bit stores only if CPU ID / MM CID need to be updated
 *    - One 64-bit load to retrieve the critical section address
 *
 * 3) In the unlikely case that the critical section address is != NULL:
 *
 *    - One 64-bit load to retrieve the start IP
 *    - One 64-bit load to retrieve the offset for calculating the end
 *    - One 64-bit load to retrieve the abort IP
 *    - One 32-bit load to retrieve the signature
 *    - One store to clear the critical section address
 *
 * The non-debug case implements only the minimal required checking. It
 * provides protection against a rogue abort IP in kernel space, which
 * would be exploitable at least on x86, and also against a rogue CS
 * descriptor by checking the signature at the abort IP. Any fallout from
 * invalid critical section descriptors is a user space problem. The debug
 * case provides the full set of checks and terminates the task if a
 * condition is not met.
 *
 * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
 * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
 * slow path there will handle the failure.
 */
static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t)
{
	/*
	 * Page faults need to be disabled as this is called with
	 * interrupts disabled
	 */
	guard(pagefault)();
	if (likely(!t->rseq.event.ids_changed)) {
		struct rseq __user *rseq = t->rseq.usrptr;
		/*
		 * If the IDs have not changed, rseq_event::user_irq must be
		 * true. See rseq_sched_switch_event().
		 */
		u64 csaddr;

		if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs)))
			return false;

		if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
			if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
				return false;
		}
		return true;
	}

	struct rseq_ids ids = {
		.cpu_id = task_cpu(t),
		.mm_cid = task_mm_cid(t),
	};
	u32 node_id = cpu_to_node(ids.cpu_id);

	return rseq_update_usr(t, regs, &ids, node_id);
}

static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
{
	struct task_struct *t = current;

	/*
	 * If the task neither went through schedule nor had the flag
	 * enforced by the rseq syscall or execve, then there is nothing to
	 * do here.
	 *
	 * CPU ID and MM CID can only change when going through a context
	 * switch.
	 *
	 * rseq_sched_switch_event() sets the rseq_event::sched_switch bit
	 * only when rseq_event::has_rseq is true. That conditional is
	 * required to avoid setting the TIF bit if RSEQ is not registered
	 * for a task. rseq_event::sched_switch is cleared when RSEQ is
	 * unregistered by a task so it's sufficient to check for the
	 * sched_switch bit alone.
	 *
	 * A sane compiler requires three instructions for the nothing-to-do
	 * case including clearing the events, but your mileage might vary.
	 */
	if (unlikely((t->rseq.event.sched_switch))) {
		rseq_stat_inc(rseq_stats.fastpath);

		if (unlikely(!rseq_exit_user_update(regs, t)))
			return true;
	}
	/* Clear state so next entry starts from a clean slate */
	t->rseq.event.events = 0;
	return false;
}

/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */
#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
static __always_inline bool test_tif_rseq(unsigned long ti_work)
{
	return ti_work & _TIF_RSEQ;
}

static __always_inline void clear_tif_rseq(void)
{
	static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
	clear_thread_flag(TIF_RSEQ);
}
#else
static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
static __always_inline void clear_tif_rseq(void) { }
#endif

static __always_inline bool
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	if (likely(!test_tif_rseq(ti_work)))
		return false;

	if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
		current->rseq.event.slowpath = true;
		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
		return true;
	}

	clear_tif_rseq();
	return false;
}

#else /* CONFIG_GENERIC_ENTRY */
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
#endif /* !CONFIG_GENERIC_ENTRY */

static __always_inline void rseq_syscall_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	/* Needed to remove the store for the !lockdep case */
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		WARN_ON_ONCE(ev->sched_switch);
		ev->events = 0;
	}
}

static __always_inline void rseq_irqentry_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	lockdep_assert_once(!ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing could not clear it.
	 */
	ev->events = 0;
}

/* Required to keep ARM64 working */
static __always_inline void rseq_exit_to_user_mode_legacy(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	if (static_branch_unlikely(&rseq_debug_enabled))
		WARN_ON_ONCE(ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing did not clear it.
	 */
	ev->events = 0;
}

void __rseq_debug_syscall_return(struct pt_regs *regs);

static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs)
{
	if (static_branch_unlikely(&rseq_debug_enabled))
		__rseq_debug_syscall_return(regs);
}
#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
static inline void rseq_syscall_exit_to_user_mode(void) { }
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_exit_to_user_mode_legacy(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
#endif /* !CONFIG_RSEQ */

#endif /* _LINUX_RSEQ_ENTRY_H */