···22#ifndef __LINUX_ENTRYCOMMON_H33#define __LINUX_ENTRYCOMMON_H4455+#include <linux/audit.h>56#include <linux/irq-entry-common.h>67#include <linux/livepatch.h>78#include <linux/ptrace.h>···4645 SYSCALL_WORK_SYSCALL_EXIT_TRAP | \4746 ARCH_SYSCALL_WORK_EXIT)48474949-long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work);4848+/**4949+ * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper5050+ *5151+ * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry().5252+ *5353+ * This allows architecture specific ptrace_report_syscall_entry()5454+ * implementations. If not defined by the architecture this falls back to5555+ * to ptrace_report_syscall_entry().5656+ */5757+static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs);5858+5959+#ifndef arch_ptrace_report_syscall_entry6060+static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)6161+{6262+ return ptrace_report_syscall_entry(regs);6363+}6464+#endif6565+6666+bool syscall_user_dispatch(struct pt_regs *regs);6767+long trace_syscall_enter(struct pt_regs *regs, long syscall);6868+void trace_syscall_exit(struct pt_regs *regs, long ret);6969+7070+static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)7171+{7272+ if (unlikely(audit_context())) {7373+ unsigned long args[6];7474+7575+ syscall_get_arguments(current, regs, args);7676+ audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);7777+ }7878+}7979+8080+static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work)8181+{8282+ long syscall, ret = 0;8383+8484+ /*8585+ * Handle Syscall User Dispatch. This must comes first, since8686+ * the ABI here can be something that doesn't make sense for8787+ * other syscall_work features.8888+ */8989+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {9090+ if (syscall_user_dispatch(regs))9191+ return -1L;9292+ }9393+9494+ /*9595+ * User space got a time slice extension granted and relinquishes9696+ * the CPU. The work stops the slice timer to avoid an extra round9797+ * through hrtimer_interrupt().9898+ */9999+ if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)100100+ rseq_syscall_enter_work(syscall_get_nr(current, regs));101101+102102+ /* Handle ptrace */103103+ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {104104+ ret = arch_ptrace_report_syscall_entry(regs);105105+ if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))106106+ return -1L;107107+ }108108+109109+ /* Do seccomp after ptrace, to catch any tracer changes. */110110+ if (work & SYSCALL_WORK_SECCOMP) {111111+ ret = __secure_computing();112112+ if (ret == -1L)113113+ return ret;114114+ }115115+116116+ /* Either of the above might have changed the syscall number */117117+ syscall = syscall_get_nr(current, regs);118118+119119+ if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))120120+ syscall = trace_syscall_enter(regs, syscall);121121+122122+ syscall_enter_audit(regs, syscall);123123+124124+ return ret ? : syscall;125125+}5012651127/**52128 * syscall_enter_from_user_mode_work - Check and handle work before invoking···15375 unsigned long work = READ_ONCE(current_thread_info()->syscall_work);1547615577 if (work & SYSCALL_WORK_ENTER)156156- syscall = syscall_trace_enter(regs, syscall, work);7878+ syscall = syscall_trace_enter(regs, work);1577915880 return syscall;15981}···190112 return ret;191113}192114115115+/*116116+ * If SYSCALL_EMU is set, then the only reason to report is when117117+ * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall118118+ * instruction has been already reported in syscall_enter_from_user_mode().119119+ */120120+static __always_inline bool report_single_step(unsigned long work)121121+{122122+ if (work & SYSCALL_WORK_SYSCALL_EMU)123123+ return false;124124+125125+ return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;126126+}127127+128128+/**129129+ * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()130130+ *131131+ * This allows architecture specific ptrace_report_syscall_exit()132132+ * implementations. If not defined by the architecture this falls back to133133+ * to ptrace_report_syscall_exit().134134+ */135135+static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,136136+ int step);137137+138138+#ifndef arch_ptrace_report_syscall_exit139139+static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,140140+ int step)141141+{142142+ ptrace_report_syscall_exit(regs, step);143143+}144144+#endif145145+193146/**194147 * syscall_exit_work - Handle work before returning to user mode195148 * @regs: Pointer to current pt_regs···228119 *229120 * Do one-time syscall specific work.230121 */231231-void syscall_exit_work(struct pt_regs *regs, unsigned long work);122122+static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work)123123+{124124+ bool step;125125+126126+ /*127127+ * If the syscall was rolled back due to syscall user dispatching,128128+ * then the tracers below are not invoked for the same reason as129129+ * the entry side was not invoked in syscall_trace_enter(): The ABI130130+ * of these syscalls is unknown.131131+ */132132+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {133133+ if (unlikely(current->syscall_dispatch.on_dispatch)) {134134+ current->syscall_dispatch.on_dispatch = false;135135+ return;136136+ }137137+ }138138+139139+ audit_syscall_exit(regs);140140+141141+ if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)142142+ trace_syscall_exit(regs, syscall_get_return_value(current, regs));143143+144144+ step = report_single_step(work);145145+ if (step || work & SYSCALL_WORK_SYSCALL_TRACE)146146+ arch_ptrace_report_syscall_exit(regs, step);147147+}232148233149/**234234- * syscall_exit_to_user_mode_work - Handle work before returning to user mode150150+ * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode235151 * @regs: Pointer to currents pt_regs236152 *237237- * Same as step 1 and 2 of syscall_exit_to_user_mode() but without calling238238- * exit_to_user_mode() to perform the final transition to user mode.153153+ * Step 1 of syscall_exit_to_user_mode() with the same calling convention.239154 *240240- * Calling convention is the same as for syscall_exit_to_user_mode() and it241241- * returns with all work handled and interrupts disabled. The caller must242242- * invoke exit_to_user_mode() before actually switching to user mode to243243- * make the final state transitions. Interrupts must stay disabled between244244- * return from this function and the invocation of exit_to_user_mode().155155+ * The caller must invoke steps 2-3 of syscall_exit_to_user_mode() afterwards.245156 */246157static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)247158{···284155 */285156 if (unlikely(work & SYSCALL_WORK_EXIT))286157 syscall_exit_work(regs, work);287287- local_irq_disable_exit_to_user();288288- syscall_exit_to_user_mode_prepare(regs);289158}290159291160/**292161 * syscall_exit_to_user_mode - Handle work before returning to user mode293162 * @regs: Pointer to currents pt_regs294163 *295295- * Invoked with interrupts enabled and fully valid regs. Returns with all164164+ * Invoked with interrupts enabled and fully valid @regs. Returns with all296165 * work handled, interrupts disabled such that the caller can immediately297166 * switch to user mode. Called from architecture specific syscall and ret298167 * from fork code.···303176 * - ptrace (single stepping)304177 *305178 * 2) Preparatory work179179+ * - Disable interrupts306180 * - Exit to user mode loop (common TIF handling). Invokes307181 * arch_exit_to_user_mode_work() for architecture specific TIF work308182 * - Architecture specific one time work arch_exit_to_user_mode_prepare()···312184 * 3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the313185 * functionality in exit_to_user_mode().314186 *315315- * This is a combination of syscall_exit_to_user_mode_work() (1,2) and316316- * exit_to_user_mode(). This function is preferred unless there is a317317- * compelling architectural reason to use the separate functions.187187+ * This is a combination of syscall_exit_to_user_mode_work() (1), disabling188188+ * interrupts followed by syscall_exit_to_user_mode_prepare() (2) and189189+ * exit_to_user_mode() (3). This function is preferred unless there is a190190+ * compelling architectural reason to invoke the functions separately.318191 */319192static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)320193{321194 instrumentation_begin();322195 syscall_exit_to_user_mode_work(regs);196196+ local_irq_disable_exit_to_user();197197+ syscall_exit_to_user_mode_prepare(regs);323198 instrumentation_end();324199 exit_to_user_mode();325200}
···11// SPDX-License-Identifier: GPL-2.02233-#include <linux/audit.h>43#include <linux/entry-common.h>55-#include "common.h"6475#define CREATE_TRACE_POINTS86#include <trace/events/syscalls.h>971010-static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)1111-{1212- if (unlikely(audit_context())) {1313- unsigned long args[6];88+/* Out of line to prevent tracepoint code duplication */1491515- syscall_get_arguments(current, regs, args);1616- audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);1717- }1010+long trace_syscall_enter(struct pt_regs *regs, long syscall)1111+{1212+ trace_sys_enter(regs, syscall);1313+ /*1414+ * Probes or BPF hooks in the tracepoint may have changed the1515+ * system call number. Reread it.1616+ */1717+ return syscall_get_nr(current, regs);1818}19192020-long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work)2020+void trace_syscall_exit(struct pt_regs *regs, long ret)2121{2222- long ret = 0;2323-2424- /*2525- * Handle Syscall User Dispatch. This must comes first, since2626- * the ABI here can be something that doesn't make sense for2727- * other syscall_work features.2828- */2929- if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {3030- if (syscall_user_dispatch(regs))3131- return -1L;3232- }3333-3434- /*3535- * User space got a time slice extension granted and relinquishes3636- * the CPU. The work stops the slice timer to avoid an extra round3737- * through hrtimer_interrupt().3838- */3939- if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)4040- rseq_syscall_enter_work(syscall);4141-4242- /* Handle ptrace */4343- if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {4444- ret = ptrace_report_syscall_entry(regs);4545- if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))4646- return -1L;4747- }4848-4949- /* Do seccomp after ptrace, to catch any tracer changes. */5050- if (work & SYSCALL_WORK_SECCOMP) {5151- ret = __secure_computing();5252- if (ret == -1L)5353- return ret;5454- }5555-5656- /* Either of the above might have changed the syscall number */5757- syscall = syscall_get_nr(current, regs);5858-5959- if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {6060- trace_sys_enter(regs, syscall);6161- /*6262- * Probes or BPF hooks in the tracepoint may have changed the6363- * system call number as well.6464- */6565- syscall = syscall_get_nr(current, regs);6666- }6767-6868- syscall_enter_audit(regs, syscall);6969-7070- return ret ? : syscall;7171-}7272-7373-/*7474- * If SYSCALL_EMU is set, then the only reason to report is when7575- * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall7676- * instruction has been already reported in syscall_enter_from_user_mode().7777- */7878-static inline bool report_single_step(unsigned long work)7979-{8080- if (work & SYSCALL_WORK_SYSCALL_EMU)8181- return false;8282-8383- return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;8484-}8585-8686-void syscall_exit_work(struct pt_regs *regs, unsigned long work)8787-{8888- bool step;8989-9090- /*9191- * If the syscall was rolled back due to syscall user dispatching,9292- * then the tracers below are not invoked for the same reason as9393- * the entry side was not invoked in syscall_trace_enter(): The ABI9494- * of these syscalls is unknown.9595- */9696- if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {9797- if (unlikely(current->syscall_dispatch.on_dispatch)) {9898- current->syscall_dispatch.on_dispatch = false;9999- return;100100- }101101- }102102-103103- audit_syscall_exit(regs);104104-105105- if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)106106- trace_sys_exit(regs, syscall_get_return_value(current, regs));107107-108108- step = report_single_step(work);109109- if (step || work & SYSCALL_WORK_SYSCALL_TRACE)110110- ptrace_report_syscall_exit(regs, step);2222+ trace_sys_exit(regs, ret);11123}