Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rseq: Switch to fast path processing on exit to user

Now that all bits and pieces are in place, hook the RSEQ handling fast path
function into exit_to_user_mode_prepare() after the TIF work bits have been
handled. In case of a fast path failure, TIF_NOTIFY_RESUME has been raised
and the caller needs to take another turn through the TIF handling slow
path.

This only works for architectures which use the generic entry code.
Architectures which still have their own incomplete hacks are not supported
and won't be.

This results in the following improvements:

Kernel build Before After Reduction

exit to user 80692981 80514451
signal checks: 32581 121 99%
slowpath runs: 1201408 1.49% 198 0.00% 100%
fastpath runs: 675941 0.84% N/A
id updates: 1233989 1.53% 50541 0.06% 96%
cs checks: 1125366 1.39% 0 0.00% 100%
cs cleared: 1125366 100% 0 100%
cs fixup: 0 0% 0

RSEQ selftests Before After Reduction

exit to user: 386281778 387373750
signal checks: 35661203 0 100%
slowpath runs: 140542396 36.38% 100 0.00% 100%
fastpath runs: 9509789 2.51% N/A
id updates: 176203599 45.62% 9087994 2.35% 95%
cs checks: 175587856 45.46% 4728394 1.22% 98%
cs cleared: 172359544 98.16% 1319307 27.90% 99%
cs fixup: 3228312 1.84% 3409087 72.10%

The 'cs cleared' and 'cs fixup' percentages are not relative to the exit to
user invocations, they are relative to the actual 'cs check' invocations.

While some of this could have been avoided in the original code, like the
obvious clearing of CS when it's already clear, the main problem of going
through TIF_NOTIFY_RESUME cannot be solved. In some workloads the RSEQ
notify handler is invoked more than once before going out to user
space. Doing this once when everything has stabilized is the only solution
to avoid this.

The initial attempt to completely decouple it from the TIF work turned out
to be suboptimal for workloads which do a lot of quick and short system
calls. Even if the fast path decision is only 4 instructions (including a
conditional branch), this adds up quickly and becomes measurable when the
rate for actually having to handle rseq is in the low single digit
percentage range of user/kernel transitions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.701201365@linutronix.de

authored by

Thomas Gleixner and committed by
Ingo Molnar
3db6b38d 05b44aef

+41 -22
+2 -5
include/linux/irq-entry-common.h
··· 197 197 */ 198 198 void arch_do_signal_or_restart(struct pt_regs *regs); 199 199 200 - /** 201 - * exit_to_user_mode_loop - do any pending work before leaving to user space 202 - */ 203 - unsigned long exit_to_user_mode_loop(struct pt_regs *regs, 204 - unsigned long ti_work); 200 + /* Handle pending TIF work */ 201 + unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work); 205 202 206 203 /** 207 204 * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+1 -1
include/linux/resume_user_mode.h
··· 59 59 mem_cgroup_handle_over_high(GFP_KERNEL); 60 60 blkcg_maybe_throttle_current(); 61 61 62 - rseq_handle_notify_resume(regs); 62 + rseq_handle_slowpath(regs); 63 63 } 64 64 65 65 #endif /* LINUX_RESUME_USER_MODE_H */
+12 -6
include/linux/rseq.h
··· 7 7 8 8 #include <uapi/linux/rseq.h> 9 9 10 - void __rseq_handle_notify_resume(struct pt_regs *regs); 10 + void __rseq_handle_slowpath(struct pt_regs *regs); 11 11 12 - static inline void rseq_handle_notify_resume(struct pt_regs *regs) 12 + /* Invoked from resume_user_mode_work() */ 13 + static inline void rseq_handle_slowpath(struct pt_regs *regs) 13 14 { 14 - /* '&' is intentional to spare one conditional branch */ 15 - if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) 16 - __rseq_handle_notify_resume(regs); 15 + if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) { 16 + if (current->rseq.event.slowpath) 17 + __rseq_handle_slowpath(regs); 18 + } else { 19 + /* '&' is intentional to spare one conditional branch */ 20 + if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) 21 + __rseq_handle_slowpath(regs); 22 + } 17 23 } 18 24 19 25 void __rseq_signal_deliver(int sig, struct pt_regs *regs); ··· 158 152 } 159 153 160 154 #else /* CONFIG_RSEQ */ 161 - static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } 155 + static inline void rseq_handle_slowpath(struct pt_regs *regs) { } 162 156 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } 163 157 static inline void rseq_sched_switch_event(struct task_struct *t) { } 164 158 static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { }
+1 -1
init/Kconfig
··· 1941 1941 config DEBUG_RSEQ 1942 1942 default n 1943 1943 bool "Enable debugging of rseq() system call" if EXPERT 1944 - depends on RSEQ && DEBUG_KERNEL 1944 + depends on RSEQ && DEBUG_KERNEL && !GENERIC_ENTRY 1945 1945 select RSEQ_DEBUG_DEFAULT_ENABLE 1946 1946 help 1947 1947 Enable extra debugging checks for the rseq system call.
+19 -7
kernel/entry/common.c
··· 11 11 /* Workaround to allow gradual conversion of architecture code */ 12 12 void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } 13 13 14 - /** 15 - * exit_to_user_mode_loop - do any pending work before leaving to user space 16 - * @regs: Pointer to pt_regs on entry stack 17 - * @ti_work: TIF work flags as read by the caller 18 - */ 19 - __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, 20 - unsigned long ti_work) 14 + static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, 15 + unsigned long ti_work) 21 16 { 22 17 /* 23 18 * Before returning to user space ensure that all pending work ··· 55 60 56 61 /* Return the latest work state for arch_exit_to_user_mode() */ 57 62 return ti_work; 63 + } 64 + 65 + /** 66 + * exit_to_user_mode_loop - do any pending work before leaving to user space 67 + * @regs: Pointer to pt_regs on entry stack 68 + * @ti_work: TIF work flags as read by the caller 69 + */ 70 + __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, 71 + unsigned long ti_work) 72 + { 73 + for (;;) { 74 + ti_work = __exit_to_user_mode_loop(regs, ti_work); 75 + 76 + if (likely(!rseq_exit_to_user_mode_restart(regs))) 77 + return ti_work; 78 + ti_work = read_thread_flags(); 79 + } 58 80 } 59 81 60 82 noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
+6 -2
kernel/rseq.c
··· 237 237 238 238 static void rseq_slowpath_update_usr(struct pt_regs *regs) 239 239 { 240 - /* Preserve rseq state and user_irq state for exit to user */ 240 + /* 241 + * Preserve rseq state and user_irq state. The generic entry code 242 + * clears user_irq on the way out, the non-generic entry 243 + * architectures are not having user_irq. 244 + */ 241 245 const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; 242 246 struct task_struct *t = current; 243 247 struct rseq_ids ids; ··· 293 289 } 294 290 } 295 291 296 - void __rseq_handle_notify_resume(struct pt_regs *regs) 292 + void __rseq_handle_slowpath(struct pt_regs *regs) 297 293 { 298 294 /* 299 295 * If invoked from hypervisors before entering the guest via