Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rseq: Simplify the event notification

Since commit 0190e4198e47 ("rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_*
flags") the bits in task::rseq_event_mask are meaningless and just extra
work in terms of setting them individually.

Aside from that, the only relevant point where an event has to be raised is
a context switch. Neither the CPU nor the MM CID can change without going
through a context switch.

Collapse them all into a single boolean, which simplifies the code a lot,
and remove the pointless invocations that have been sprinkled all over the
place for no value.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.336978188@linutronix.de

authored by

Thomas Gleixner and committed by
Ingo Molnar
d923739e 067b3b41

+48 -92
+1 -1
fs/exec.c
··· 1775 1775 force_fatal_sig(SIGSEGV); 1776 1776 1777 1777 sched_mm_cid_after_execve(current); 1778 - rseq_set_notify_resume(current); 1778 + rseq_sched_switch_event(current); 1779 1779 current->in_execve = 0; 1780 1780 1781 1781 return retval;
+13 -53
include/linux/rseq.h
··· 3 3 #define _LINUX_RSEQ_H 4 4 5 5 #ifdef CONFIG_RSEQ 6 - 7 - #include <linux/preempt.h> 8 6 #include <linux/sched.h> 9 - 10 - #ifdef CONFIG_MEMBARRIER 11 - # define RSEQ_EVENT_GUARD irq 12 - #else 13 - # define RSEQ_EVENT_GUARD preempt 14 - #endif 15 - 16 - /* 17 - * Map the event mask on the user-space ABI enum rseq_cs_flags 18 - * for direct mask checks. 19 - */ 20 - enum rseq_event_mask_bits { 21 - RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, 22 - RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, 23 - RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, 24 - }; 25 - 26 - enum rseq_event_mask { 27 - RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), 28 - RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), 29 - RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), 30 - }; 31 - 32 - static inline void rseq_set_notify_resume(struct task_struct *t) 33 - { 34 - if (t->rseq) 35 - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); 36 - } 37 7 38 8 void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); 39 9 ··· 13 43 __rseq_handle_notify_resume(NULL, regs); 14 44 } 15 45 16 - static inline void rseq_signal_deliver(struct ksignal *ksig, 17 - struct pt_regs *regs) 46 + static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) 18 47 { 19 48 if (current->rseq) { 20 - scoped_guard(RSEQ_EVENT_GUARD) 21 - __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); 49 + current->rseq_event_pending = true; 22 50 __rseq_handle_notify_resume(ksig, regs); 23 51 } 24 52 } 25 53 26 - /* rseq_preempt() requires preemption to be disabled. */ 27 - static inline void rseq_preempt(struct task_struct *t) 54 + static inline void rseq_sched_switch_event(struct task_struct *t) 28 55 { 29 - __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); 30 - rseq_set_notify_resume(t); 31 - } 32 - 33 - /* rseq_migrate() requires preemption to be disabled. 
*/ 34 - static inline void rseq_migrate(struct task_struct *t) 35 - { 36 - __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); 37 - rseq_set_notify_resume(t); 56 + if (t->rseq) { 57 + t->rseq_event_pending = true; 58 + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); 59 + } 38 60 } 39 61 40 62 static __always_inline void rseq_exit_to_user_mode(void) 41 63 { 42 64 if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { 43 - if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask)) 44 - current->rseq_event_mask = 0; 65 + if (WARN_ON_ONCE(current->rseq && current->rseq_event_pending)) 66 + current->rseq_event_pending = false; 45 67 } 46 68 } 47 69 ··· 47 85 t->rseq = NULL; 48 86 t->rseq_len = 0; 49 87 t->rseq_sig = 0; 50 - t->rseq_event_mask = 0; 88 + t->rseq_event_pending = false; 51 89 } else { 52 90 t->rseq = current->rseq; 53 91 t->rseq_len = current->rseq_len; 54 92 t->rseq_sig = current->rseq_sig; 55 - t->rseq_event_mask = current->rseq_event_mask; 93 + t->rseq_event_pending = current->rseq_event_pending; 56 94 } 57 95 } 58 96 ··· 61 99 t->rseq = NULL; 62 100 t->rseq_len = 0; 63 101 t->rseq_sig = 0; 64 - t->rseq_event_mask = 0; 102 + t->rseq_event_pending = false; 65 103 } 66 104 67 105 #else /* CONFIG_RSEQ */ 68 - static inline void rseq_set_notify_resume(struct task_struct *t) { } 69 106 static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } 70 107 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } 71 - static inline void rseq_preempt(struct task_struct *t) { } 72 - static inline void rseq_migrate(struct task_struct *t) { } 108 + static inline void rseq_sched_switch_event(struct task_struct *t) { } 73 109 static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } 74 110 static inline void rseq_execve(struct task_struct *t) { } 75 111 static inline void rseq_exit_to_user_mode(void) { }
+5 -5
include/linux/sched.h
··· 1407 1407 #endif /* CONFIG_NUMA_BALANCING */ 1408 1408 1409 1409 #ifdef CONFIG_RSEQ 1410 - struct rseq __user *rseq; 1411 - u32 rseq_len; 1412 - u32 rseq_sig; 1410 + struct rseq __user *rseq; 1411 + u32 rseq_len; 1412 + u32 rseq_sig; 1413 1413 /* 1414 - * RmW on rseq_event_mask must be performed atomically 1414 + * RmW on rseq_event_pending must be performed atomically 1415 1415 * with respect to preemption. 1416 1416 */ 1417 - unsigned long rseq_event_mask; 1417 + bool rseq_event_pending; 1418 1418 # ifdef CONFIG_DEBUG_RSEQ 1419 1419 /* 1420 1420 * This is a place holder to save a copy of the rseq fields for
+7 -14
include/uapi/linux/rseq.h
··· 114 114 /* 115 115 * Restartable sequences flags field. 116 116 * 117 - * This field should only be updated by the thread which 118 - * registered this data structure. Read by the kernel. 119 - * Mainly used for single-stepping through rseq critical sections 120 - * with debuggers. 121 - * 122 - * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT 123 - * Inhibit instruction sequence block restart on preemption 124 - * for this thread. 125 - * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL 126 - * Inhibit instruction sequence block restart on signal 127 - * delivery for this thread. 128 - * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE 129 - * Inhibit instruction sequence block restart on migration for 130 - * this thread. 117 + * This field was initially intended to allow event masking for 118 + * single-stepping through rseq critical sections with debuggers. 119 + * The kernel does not support this anymore and the relevant bits 120 + * are checked for being always false: 121 + * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT 122 + * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL 123 + * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE 131 124 */ 132 125 __u32 flags; 133 126
+17 -11
kernel/rseq.c
··· 78 78 #define CREATE_TRACE_POINTS 79 79 #include <trace/events/rseq.h> 80 80 81 + #ifdef CONFIG_MEMBARRIER 82 + # define RSEQ_EVENT_GUARD irq 83 + #else 84 + # define RSEQ_EVENT_GUARD preempt 85 + #endif 86 + 81 87 /* The original rseq structure size (including padding) is 32 bytes. */ 82 88 #define ORIG_RSEQ_SIZE 32 83 89 ··· 436 430 */ 437 431 if (regs) { 438 432 /* 439 - * Read and clear the event mask first. If the task was not 440 - * preempted or migrated or a signal is on the way, there 441 - * is no point in doing any of the heavy lifting here on 442 - * production kernels. In that case TIF_NOTIFY_RESUME was 443 - * raised by some other functionality. 433 + * Read and clear the event pending bit first. If the task 434 + * was not preempted or migrated or a signal is on the way, 435 + * there is no point in doing any of the heavy lifting here 436 + * on production kernels. In that case TIF_NOTIFY_RESUME 437 + * was raised by some other functionality. 444 438 * 445 439 * This is correct because the read/clear operation is 446 440 * guarded against scheduler preemption, which makes it CPU ··· 453 447 * with the result handed in to allow the detection of 454 448 * inconsistencies. 455 449 */ 456 - u32 event_mask; 450 + bool event; 457 451 458 452 scoped_guard(RSEQ_EVENT_GUARD) { 459 - event_mask = t->rseq_event_mask; 460 - t->rseq_event_mask = 0; 453 + event = t->rseq_event_pending; 454 + t->rseq_event_pending = false; 461 455 } 462 456 463 - if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) { 464 - ret = rseq_ip_fixup(regs, !!event_mask); 457 + if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) { 458 + ret = rseq_ip_fixup(regs, event); 465 459 if (unlikely(ret < 0)) 466 460 goto error; 467 461 } ··· 590 584 * registered, ensure the cpu_id_start and cpu_id fields 591 585 * are updated before returning to user-space. 592 586 */ 593 - rseq_set_notify_resume(current); 587 + rseq_sched_switch_event(current); 594 588 595 589 return 0; 596 590 }
+1 -4
kernel/sched/core.c
··· 3329 3329 if (p->sched_class->migrate_task_rq) 3330 3330 p->sched_class->migrate_task_rq(p, new_cpu); 3331 3331 p->se.nr_migrations++; 3332 - rseq_migrate(p); 3333 3332 sched_mm_cid_migrate_from(p); 3334 3333 perf_event_task_migrate(p); 3335 3334 } ··· 4762 4763 p->sched_task_group = tg; 4763 4764 } 4764 4765 #endif 4765 - rseq_migrate(p); 4766 4766 /* 4767 4767 * We're setting the CPU for the first time, we don't migrate, 4768 4768 * so use __set_task_cpu(). ··· 4825 4827 * as we're not fully set-up yet. 4826 4828 */ 4827 4829 p->recent_used_cpu = task_cpu(p); 4828 - rseq_migrate(p); 4829 4830 __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); 4830 4831 rq = __task_rq_lock(p, &rf); 4831 4832 update_rq_clock(rq); ··· 5118 5121 kcov_prepare_switch(prev); 5119 5122 sched_info_switch(rq, prev, next); 5120 5123 perf_event_task_sched_out(prev, next); 5121 - rseq_preempt(prev); 5124 + rseq_sched_switch_event(prev); 5122 5125 fire_sched_out_preempt_notifiers(prev, next); 5123 5126 kmap_local_sched_out(); 5124 5127 prepare_task(next);
+4 -4
kernel/sched/membarrier.c
··· 199 199 * is negligible. 200 200 */ 201 201 smp_mb(); 202 - rseq_preempt(current); 202 + rseq_sched_switch_event(current); 203 203 } 204 204 205 205 static void ipi_sync_rq_state(void *info) ··· 407 407 * membarrier, we will end up with some thread in the mm 408 408 * running without a core sync. 409 409 * 410 - * For RSEQ, don't rseq_preempt() the caller. User code 411 - * is not supposed to issue syscalls at all from inside an 412 - * rseq critical section. 410 + * For RSEQ, don't invoke rseq_sched_switch_event() on the 411 + * caller. User code is not supposed to issue syscalls at 412 + * all from inside an rseq critical section. 413 413 */ 414 414 if (flags != MEMBARRIER_FLAG_SYNC_CORE) { 415 415 preempt_disable();