Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rseq: Switch to TIF_RSEQ if supported

TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is suboptimal especially
with the RSEQ fast path depending on it, but not really handling it.

Define a separate TIF_RSEQ in the generic TIF space and enable the full
separation of fast and slow path for architectures which utilize that.

That avoids the hassle with invocations of resume_user_mode_work() from
hypervisors, which clear TIF_NOTIFY_RESUME. As a result, the re-evaluation
that is therefore required at the end of vcpu_run() becomes a NOOP on
architectures which utilize the generic TIF space and have a separate TIF_RSEQ.

The hypervisor TIF handling does not include the separate TIF_RSEQ as there
is no point in doing so. The guest neither knows nor cares about the VMM
host application's RSEQ state. That state is only relevant when the ioctl()
returns to user space.

The fastpath implementation still utilizes TIF_NOTIFY_RESUME for failure
handling, but that failure handling only runs within exit_to_user_mode_loop(),
so arguably the hypervisor ioctl() code is long done by the time it happens.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.903622031@linutronix.de

authored by

Thomas Gleixner and committed by
Ingo Molnar
32034df6 7a5201ea

+61 -13
+3
include/asm-generic/thread_info_tif.h
··· 45 45 # define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) 46 46 #endif 47 47 48 + #define TIF_RSEQ 11 // Run RSEQ fast path 49 + #define _TIF_RSEQ BIT(TIF_RSEQ) 50 + 48 51 #endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */
+1 -1
include/linux/irq-entry-common.h
··· 30 30 #define EXIT_TO_USER_MODE_WORK \ 31 31 (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ 32 32 _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ 33 - _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ 33 + _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \ 34 34 ARCH_EXIT_TO_USER_MODE_WORK) 35 35 36 36 /**
+15 -7
include/linux/rseq.h
··· 42 42 43 43 static inline void rseq_raise_notify_resume(struct task_struct *t) 44 44 { 45 - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); 45 + set_tsk_thread_flag(t, TIF_RSEQ); 46 46 } 47 47 48 48 /* Invoked from context switch to force evaluation on exit to user */ ··· 114 114 115 115 /* 116 116 * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, 117 - * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in 118 - * that case just to do it eventually again before returning to user space, 119 - * the entry resume_user_mode_work() invocation is ignored as the register 120 - * argument is NULL. 117 + * which clears TIF_NOTIFY_RESUME on architectures that don't use the 118 + * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag. 121 119 * 122 - * After returning from guest mode, they have to invoke this function to 123 - * re-raise TIF_NOTIFY_RESUME if necessary. 120 + * To avoid updating user space RSEQ in that case just to do it eventually 121 + * again before returning to user space, because __rseq_handle_slowpath() 122 + * does nothing when invoked with NULL register state. 123 + * 124 + * After returning from guest mode, before exiting to userspace, hypervisors 125 + * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary. 124 126 */ 125 127 static inline void rseq_virt_userspace_exit(void) 126 128 { 127 129 if (current->rseq.event.sched_switch) 130 + /* 131 + * The generic optimization for deferring RSEQ updates until the next 132 + * exit relies on having a dedicated TIF_RSEQ. 133 + */ 134 + if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) && 135 + current->rseq.event.sched_switch) 128 136 rseq_raise_notify_resume(current); 129 137 } 130 138
+29 -3
include/linux/rseq_entry.h
··· 507 507 return false; 508 508 } 509 509 510 - static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) 510 + /* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */ 511 + #ifdef CONFIG_HAVE_GENERIC_TIF_BITS 512 + static __always_inline bool test_tif_rseq(unsigned long ti_work) 511 513 { 514 + return ti_work & _TIF_RSEQ; 515 + } 516 + 517 + static __always_inline void clear_tif_rseq(void) 518 + { 519 + static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME); 520 + clear_thread_flag(TIF_RSEQ); 521 + } 522 + #else 523 + static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; } 524 + static __always_inline void clear_tif_rseq(void) { } 525 + #endif 526 + 527 + static __always_inline bool 528 + rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) 529 + { 530 + if (likely(!test_tif_rseq(ti_work))) 531 + return false; 532 + 512 533 if (unlikely(__rseq_exit_to_user_mode_restart(regs))) { 513 534 current->rseq.event.slowpath = true; 514 535 set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); 515 536 return true; 516 537 } 538 + 539 + clear_tif_rseq(); 517 540 return false; 518 541 } 519 542 520 543 #else /* CONFIG_GENERIC_ENTRY */ 521 - static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) { return false; } 544 + static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) 545 + { 546 + return false; 547 + } 522 548 #endif /* !CONFIG_GENERIC_ENTRY */ 523 549 524 550 static __always_inline void rseq_syscall_exit_to_user_mode(void) ··· 603 577 } 604 578 #else /* CONFIG_RSEQ */ 605 579 static inline void rseq_note_user_irq_entry(void) { } 606 - static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) 580 + static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) 607 581 { 608 582 return false; 609 583 }
+5
include/linux/thread_info.h
··· 67 67 #define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED 68 68 #endif 69 69 70 + #ifndef TIF_RSEQ 71 + # define TIF_RSEQ TIF_NOTIFY_RESUME 72 + # define _TIF_RSEQ _TIF_NOTIFY_RESUME 73 + #endif 74 + 70 75 #ifdef __KERNEL__ 71 76 72 77 #ifndef arch_set_restart_data
+8 -2
kernel/entry/common.c
··· 11 11 /* Workaround to allow gradual conversion of architecture code */ 12 12 void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } 13 13 14 + #ifdef CONFIG_HAVE_GENERIC_TIF_BITS 15 + #define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ) 16 + #else 17 + #define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK) 18 + #endif 19 + 14 20 static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, 15 21 unsigned long ti_work) 16 22 { ··· 24 18 * Before returning to user space ensure that all pending work 25 19 * items have been completed. 26 20 */ 27 - while (ti_work & EXIT_TO_USER_MODE_WORK) { 21 + while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) { 28 22 29 23 local_irq_enable_exit_to_user(ti_work); 30 24 ··· 74 68 for (;;) { 75 69 ti_work = __exit_to_user_mode_loop(regs, ti_work); 76 70 77 - if (likely(!rseq_exit_to_user_mode_restart(regs))) 71 + if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work))) 78 72 return ti_work; 79 73 ti_work = read_thread_flags(); 80 74 }