Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

um: Track userspace children dying in SECCOMP mode

When in seccomp mode, we would hang forever on the futex if a child had
died unexpectedly. In contrast, ptrace mode will notice it and kill the
corresponding thread when it fails to run it.

Fix this issue using a new IRQ that is fired after a SIGCHLD and keeping
an (internal) list of all MMs. In the IRQ handler, find the affected MM
and set its PID to -1 as well as the futex variable to FUTEX_IN_KERN.

This, together with futex returning -EINTR after the signal, is
sufficient to implement race-free detection of a child dying.

Note that this also enables IRQ handling while starting a userspace
process. This should be safe and SECCOMP requires the IRQ in case the
process does not come up properly.

Signed-off-by: Benjamin Berg <benjamin@sipsolutions.net>
Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250602130052.545733-5-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>

authored by

Benjamin Berg and committed by
Johannes Berg
8420e08f b1e1bd2e

+145 -8
+3 -2
arch/um/include/asm/irq.h
··· 13 13 #define TELNETD_IRQ 8 14 14 #define XTERM_IRQ 9 15 15 #define RANDOM_IRQ 10 16 + #define SIGCHLD_IRQ 11 16 17 17 18 #ifdef CONFIG_UML_NET_VECTOR 18 19 19 - #define VECTOR_BASE_IRQ (RANDOM_IRQ + 1) 20 + #define VECTOR_BASE_IRQ (SIGCHLD_IRQ + 1) 20 21 #define VECTOR_IRQ_SPACE 8 21 22 22 23 #define UM_FIRST_DYN_IRQ (VECTOR_IRQ_SPACE + VECTOR_BASE_IRQ) 23 24 24 25 #else 25 26 26 - #define UM_FIRST_DYN_IRQ (RANDOM_IRQ + 1) 27 + #define UM_FIRST_DYN_IRQ (SIGCHLD_IRQ + 1) 27 28 28 29 #endif 29 30
+3
arch/um/include/asm/mmu.h
··· 6 6 #ifndef __ARCH_UM_MMU_H 7 7 #define __ARCH_UM_MMU_H 8 8 9 + #include "linux/types.h" 9 10 #include <mm_id.h> 10 11 11 12 typedef struct mm_context { 12 13 struct mm_id id; 14 + 15 + struct list_head list; 13 16 14 17 /* Address range in need of a TLB sync */ 15 18 unsigned long sync_tlb_range_from;
+2
arch/um/include/shared/irq_user.h
··· 17 17 struct siginfo; 18 18 extern void sigio_handler(int sig, struct siginfo *unused_si, 19 19 struct uml_pt_regs *regs, void *mc); 20 + extern void sigchld_handler(int sig, struct siginfo *unused_si, 21 + struct uml_pt_regs *regs, void *mc); 20 22 void sigio_run_timetravel_handlers(void); 21 23 extern void free_irq_by_fd(int fd); 22 24 extern void deactivate_fd(int fd, int irqnum);
+1
arch/um/include/shared/os.h
··· 197 197 extern void report_enomem(void); 198 198 199 199 /* process.c */ 200 + pid_t os_reap_child(void); 200 201 extern void os_alarm_process(int pid); 201 202 extern void os_kill_process(int pid, int reap_child); 202 203 extern void os_kill_ptraced_process(int pid, int reap_child);
+2
arch/um/include/shared/skas/mm_id.h
··· 14 14 15 15 void __switch_mm(struct mm_id *mm_idp); 16 16 17 + void notify_mm_kill(int pid); 18 + 17 19 #endif
+1
arch/um/include/shared/skas/skas.h
··· 8 8 9 9 #include <sysdep/ptrace.h> 10 10 11 + extern int using_seccomp; 11 12 extern int userspace_pid[]; 12 13 13 14 extern void new_thread_handler(void);
+6
arch/um/kernel/irq.c
··· 690 690 /* Initialize EPOLL Loop */ 691 691 os_setup_epoll(); 692 692 } 693 + 694 + extern void sigchld_handler(int sig, struct siginfo *unused_si, 695 + struct uml_pt_regs *regs, void *mc) 696 + { 697 + do_IRQ(SIGCHLD_IRQ, regs); 698 + }
+77 -5
arch/um/kernel/skas/mmu.c
··· 8 8 #include <linux/sched/signal.h> 9 9 #include <linux/slab.h> 10 10 11 + #include <shared/irq_kern.h> 11 12 #include <asm/pgalloc.h> 12 13 #include <asm/sections.h> 13 14 #include <asm/mmu_context.h> ··· 19 18 20 19 /* Ensure the stub_data struct covers the allocated area */ 21 20 static_assert(sizeof(struct stub_data) == STUB_DATA_PAGES * UM_KERN_PAGE_SIZE); 21 + 22 + spinlock_t mm_list_lock; 23 + struct list_head mm_list; 22 24 23 25 int init_new_context(struct task_struct *task, struct mm_struct *mm) 24 26 { ··· 35 31 36 32 new_id->stack = stack; 37 33 38 - block_signals_trace(); 39 - new_id->pid = start_userspace(stack); 40 - unblock_signals_trace(); 34 + scoped_guard(spinlock_irqsave, &mm_list_lock) { 35 + /* Insert into list, used for lookups when the child dies */ 36 + list_add(&mm->context.list, &mm_list); 37 + } 41 38 39 + new_id->pid = start_userspace(stack); 42 40 if (new_id->pid < 0) { 43 41 ret = new_id->pid; 44 42 goto out_free; ··· 66 60 * zero, resulting in a kill(0), which will result in the 67 61 * whole UML suddenly dying. Also, cover negative and 68 62 * 1 cases, since they shouldn't happen either. 63 + * 64 + * Negative cases happen if the child died unexpectedly. 69 65 */ 70 - if (mmu->id.pid < 2) { 66 + if (mmu->id.pid >= 0 && mmu->id.pid < 2) { 71 67 printk(KERN_ERR "corrupt mm_context - pid = %d\n", 72 68 mmu->id.pid); 73 69 return; 74 70 } 75 - os_kill_ptraced_process(mmu->id.pid, 1); 71 + 72 + if (mmu->id.pid > 0) { 73 + os_kill_ptraced_process(mmu->id.pid, 1); 74 + mmu->id.pid = -1; 75 + } 76 76 77 77 free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES)); 78 + 79 + guard(spinlock_irqsave)(&mm_list_lock); 80 + 81 + list_del(&mm->context.list); 78 82 } 83 + 84 + static irqreturn_t mm_sigchld_irq(int irq, void* dev) 85 + { 86 + struct mm_context *mm_context; 87 + pid_t pid; 88 + 89 + guard(spinlock)(&mm_list_lock); 90 + 91 + while ((pid = os_reap_child()) > 0) { 92 + /* 93 + * A child died, check if we have an MM with the PID. 
This is 94 + * only relevant in SECCOMP mode (as ptrace will fail anyway). 95 + * 96 + * See wait_stub_done_seccomp for more details. 97 + */ 98 + list_for_each_entry(mm_context, &mm_list, list) { 99 + if (mm_context->id.pid == pid) { 100 + struct stub_data *stub_data; 101 + printk("Unexpectedly lost MM child! Affected tasks will segfault."); 102 + 103 + /* Marks the MM as dead */ 104 + mm_context->id.pid = -1; 105 + 106 + /* 107 + * NOTE: If SMP is implemented, a futex_wake 108 + * needs to be added here. 109 + */ 110 + stub_data = (void *)mm_context->id.stack; 111 + stub_data->futex = FUTEX_IN_KERN; 112 + 113 + /* 114 + * NOTE: Currently executing syscalls by 115 + * affected tasks may finish normally. 116 + */ 117 + break; 118 + } 119 + } 120 + } 121 + 122 + return IRQ_HANDLED; 123 + } 124 + 125 + static int __init init_child_tracking(void) 126 + { 127 + int err; 128 + 129 + spin_lock_init(&mm_list_lock); 130 + INIT_LIST_HEAD(&mm_list); 131 + 132 + err = request_irq(SIGCHLD_IRQ, mm_sigchld_irq, 0, "SIGCHLD", NULL); 133 + if (err < 0) 134 + panic("Failed to register SIGCHLD IRQ: %d", err); 135 + 136 + return 0; 137 + } 138 + early_initcall(init_child_tracking)
+31
arch/um/os-Linux/process.c
··· 18 18 #include <init.h> 19 19 #include <longjmp.h> 20 20 #include <os.h> 21 + #include <skas/skas.h> 21 22 22 23 void os_alarm_process(int pid) 23 24 { 25 + if (pid <= 0) 26 + return; 27 + 24 28 kill(pid, SIGALRM); 25 29 } 26 30 27 31 void os_kill_process(int pid, int reap_child) 28 32 { 33 + if (pid <= 0) 34 + return; 35 + 36 + /* Block signals until child is reaped */ 37 + block_signals(); 38 + 29 39 kill(pid, SIGKILL); 30 40 if (reap_child) 31 41 CATCH_EINTR(waitpid(pid, NULL, __WALL)); 42 + 43 + unblock_signals(); 32 44 } 33 45 34 46 /* Kill off a ptraced child by all means available. kill it normally first, ··· 50 38 51 39 void os_kill_ptraced_process(int pid, int reap_child) 52 40 { 41 + if (pid <= 0) 42 + return; 43 + 44 + /* Block signals until child is reaped */ 45 + block_signals(); 46 + 53 47 kill(pid, SIGKILL); 54 48 ptrace(PTRACE_KILL, pid); 55 49 ptrace(PTRACE_CONT, pid); 56 50 if (reap_child) 57 51 CATCH_EINTR(waitpid(pid, NULL, __WALL)); 52 + 53 + unblock_signals(); 54 + } 55 + 56 + pid_t os_reap_child(void) 57 + { 58 + int status; 59 + 60 + /* Try to reap a child */ 61 + return waitpid(-1, &status, WNOHANG); 58 62 } 59 63 60 64 /* Don't use the glibc version, which caches the result in TLS. It misses some ··· 179 151 set_handler(SIGBUS); 180 152 signal(SIGHUP, SIG_IGN); 181 153 set_handler(SIGIO); 154 + /* We (currently) only use the child reaper IRQ in seccomp mode */ 155 + if (using_seccomp) 156 + set_handler(SIGCHLD); 182 157 signal(SIGWINCH, SIG_IGN); 183 158 } 184 159
+18 -1
arch/um/os-Linux/signal.c
··· 29 29 [SIGBUS] = relay_signal, 30 30 [SIGSEGV] = segv_handler, 31 31 [SIGIO] = sigio_handler, 32 + [SIGCHLD] = sigchld_handler, 32 33 }; 33 34 34 35 static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) ··· 45 44 } 46 45 47 46 /* enable signals if sig isn't IRQ signal */ 48 - if ((sig != SIGIO) && (sig != SIGWINCH)) 47 + if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGCHLD)) 49 48 unblock_signals_trace(); 50 49 51 50 (*sig_info[sig])(sig, si, &r, mc); ··· 64 63 65 64 #define SIGALRM_BIT 1 66 65 #define SIGALRM_MASK (1 << SIGALRM_BIT) 66 + 67 + #define SIGCHLD_BIT 2 68 + #define SIGCHLD_MASK (1 << SIGCHLD_BIT) 67 69 68 70 int signals_enabled; 69 71 #if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) ··· 103 99 sigio_run_timetravel_handlers(); 104 100 else 105 101 signals_pending |= SIGIO_MASK; 102 + return; 103 + } 104 + 105 + if (!enabled && (sig == SIGCHLD)) { 106 + signals_pending |= SIGCHLD_MASK; 106 107 return; 107 108 } 108 109 ··· 190 181 191 182 [SIGIO] = sig_handler, 192 183 [SIGWINCH] = sig_handler, 184 + /* SIGCHLD is only actually registered in seccomp mode. */ 185 + [SIGCHLD] = sig_handler, 193 186 [SIGALRM] = timer_alarm_handler, 194 187 195 188 [SIGUSR1] = sigusr1_handler, ··· 319 308 */ 320 309 if (save_pending & SIGIO_MASK) 321 310 sig_handler_common(SIGIO, NULL, NULL); 311 + 312 + if (save_pending & SIGCHLD_MASK) { 313 + struct uml_pt_regs regs = {}; 314 + 315 + sigchld_handler(SIGCHLD, NULL, &regs, NULL); 316 + } 322 317 323 318 /* Do not reenter the handler */ 324 319
+1
arch/um/os-Linux/skas/process.c
··· 309 309 } 310 310 __initcall(init_stub_exe_fd); 311 311 312 + int using_seccomp; 312 313 int userspace_pid[NR_CPUS]; 313 314 314 315 /**