Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

um: Add SECCOMP support detection and initialization

This detects seccomp support, sets the global using_seccomp variable and
initializes the exec registers. The support is only enabled if the
seccomp= kernel parameter is set to either "on" or "auto". With "auto" a
fallback to ptrace mode will happen if initialization fails.

Signed-off-by: Benjamin Berg <benjamin@sipsolutions.net>
Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250602130052.545733-7-benjamin@sipsolutions.net
[extend help with Kconfig text from v2, use exit syscall instead of libc,
remove unneeded mctx_offset assignment, disable on 32-bit for now]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>

authored by

Benjamin Berg and committed by
Johannes Berg
beddc9fb 406d17c6

+195 -4
+2 -2
arch/um/os-Linux/registers.c
··· 14 14 15 15 /* This is set once at boot time and not changed thereafter */ 16 16 17 - static unsigned long exec_regs[MAX_REG_NR]; 18 - static unsigned long *exec_fp_regs; 17 + unsigned long exec_regs[MAX_REG_NR]; 18 + unsigned long *exec_fp_regs; 19 19 20 20 int init_pid_registers(int pid) 21 21 {
+193 -2
arch/um/os-Linux/start_up.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 /* 3 + * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> 3 4 * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) 4 5 */ 5 6 ··· 25 24 #include <kern_util.h> 26 25 #include <mem_user.h> 27 26 #include <ptrace_user.h> 27 + #include <stdbool.h> 28 + #include <stub-data.h> 29 + #include <sys/prctl.h> 30 + #include <linux/seccomp.h> 31 + #include <linux/filter.h> 32 + #include <sysdep/mcontext.h> 33 + #include <sysdep/stub.h> 28 34 #include <registers.h> 29 35 #include <skas.h> 30 36 #include "internal.h" ··· 232 224 check_sysemu(); 233 225 } 234 226 227 + extern unsigned long host_fp_size; 228 + extern unsigned long exec_regs[MAX_REG_NR]; 229 + extern unsigned long *exec_fp_regs; 230 + 231 + __initdata static struct stub_data *seccomp_test_stub_data; 232 + 233 + static void __init sigsys_handler(int sig, siginfo_t *info, void *p) 234 + { 235 + ucontext_t *uc = p; 236 + 237 + /* Stow away the location of the mcontext in the stack */ 238 + seccomp_test_stub_data->mctx_offset = (unsigned long)&uc->uc_mcontext - 239 + (unsigned long)&seccomp_test_stub_data->sigstack[0]; 240 + 241 + /* Prevent libc from clearing memory (mctx_offset in particular) */ 242 + syscall(__NR_exit, 0); 243 + } 244 + 245 + static int __init seccomp_helper(void *data) 246 + { 247 + static struct sock_filter filter[] = { 248 + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, 249 + offsetof(struct seccomp_data, nr)), 250 + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0), 251 + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), 252 + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP), 253 + }; 254 + static struct sock_fprog prog = { 255 + .len = ARRAY_SIZE(filter), 256 + .filter = filter, 257 + }; 258 + struct sigaction sa; 259 + 260 + set_sigstack(seccomp_test_stub_data->sigstack, 261 + sizeof(seccomp_test_stub_data->sigstack)); 262 + 263 + sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; 264 + sa.sa_sigaction = (void *) 
sigsys_handler; 265 + sa.sa_restorer = NULL; 266 + if (sigaction(SIGSYS, &sa, NULL) < 0) 267 + exit(1); 268 + 269 + prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 270 + if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 271 + SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0) 272 + exit(2); 273 + 274 + sleep(0); 275 + 276 + /* Never reached. */ 277 + _exit(3); 278 + } 279 + 280 + static bool __init init_seccomp(void) 281 + { 282 + int pid; 283 + int status; 284 + int n; 285 + unsigned long sp; 286 + 287 + /* doesn't work on 32-bit right now */ 288 + if (!IS_ENABLED(CONFIG_64BIT)) 289 + return false; 290 + 291 + /* 292 + * We check that we can install a seccomp filter and then exit(0) 293 + * from a trapped syscall. 294 + * 295 + * Note that we cannot verify that no seccomp filter already exists 296 + * for a syscall that results in the process/thread to be killed. 297 + */ 298 + 299 + os_info("Checking that seccomp filters can be installed..."); 300 + 301 + seccomp_test_stub_data = mmap(0, sizeof(*seccomp_test_stub_data), 302 + PROT_READ | PROT_WRITE, 303 + MAP_SHARED | MAP_ANON, 0, 0); 304 + 305 + /* Use the syscall data area as stack, we just need something */ 306 + sp = (unsigned long)&seccomp_test_stub_data->syscall_data + 307 + sizeof(seccomp_test_stub_data->syscall_data) - 308 + sizeof(void *); 309 + pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL); 310 + 311 + if (pid < 0) 312 + fatal_perror("check_seccomp : clone failed"); 313 + 314 + CATCH_EINTR(n = waitpid(pid, &status, __WCLONE)); 315 + if (n < 0) 316 + fatal_perror("check_seccomp : waitpid failed"); 317 + 318 + if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { 319 + struct uml_pt_regs *regs; 320 + unsigned long fp_size; 321 + int r; 322 + 323 + /* Fill in the host_fp_size from the mcontext. 
*/ 324 + regs = calloc(1, sizeof(struct uml_pt_regs)); 325 + get_stub_state(regs, seccomp_test_stub_data, &fp_size); 326 + host_fp_size = fp_size; 327 + free(regs); 328 + 329 + /* Repeat with the correct size */ 330 + regs = calloc(1, sizeof(struct uml_pt_regs) + host_fp_size); 331 + r = get_stub_state(regs, seccomp_test_stub_data, NULL); 332 + 333 + /* Store as the default startup registers */ 334 + exec_fp_regs = malloc(host_fp_size); 335 + memcpy(exec_regs, regs->gp, sizeof(exec_regs)); 336 + memcpy(exec_fp_regs, regs->fp, host_fp_size); 337 + 338 + munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data)); 339 + 340 + free(regs); 341 + 342 + if (r) { 343 + os_info("failed to fetch registers: %d\n", r); 344 + return false; 345 + } 346 + 347 + os_info("OK\n"); 348 + return true; 349 + } 350 + 351 + if (WIFEXITED(status) && WEXITSTATUS(status) == 2) 352 + os_info("missing\n"); 353 + else 354 + os_info("error\n"); 355 + 356 + munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data)); 357 + return false; 358 + } 359 + 360 + 235 361 static void __init check_coredump_limit(void) 236 362 { 237 363 struct rlimit lim; ··· 420 278 } 421 279 } 422 280 281 + static int seccomp_config __initdata; 282 + 283 + static int __init uml_seccomp_config(char *line, int *add) 284 + { 285 + *add = 0; 286 + 287 + if (strcmp(line, "off") == 0) 288 + seccomp_config = 0; 289 + else if (strcmp(line, "auto") == 0) 290 + seccomp_config = 1; 291 + else if (strcmp(line, "on") == 0) 292 + seccomp_config = 2; 293 + else 294 + fatal("Invalid seccomp option '%s', expected on/auto/off\n", 295 + line); 296 + 297 + return 0; 298 + } 299 + 300 + __uml_setup("seccomp=", uml_seccomp_config, 301 + "seccomp=<on/auto/off>\n" 302 + " Configure whether or not SECCOMP is used. With SECCOMP, userspace\n" 303 + " processes work collaboratively with the kernel instead of being\n" 304 + " traced using ptrace. All syscalls from the application are caught and\n" 305 + " redirected using a signal. 
This signal handler in turn is permitted to\n" 306 + " do the selected set of syscalls to communicate with the UML kernel and\n" 307 + " do the required memory management.\n" 308 + "\n" 309 + " This method is overall faster than the ptrace based userspace, primarily\n" 310 + " because it reduces the number of context switches for (minor) page faults.\n" 311 + "\n" 312 + " However, the SECCOMP filter is not (yet) restrictive enough to prevent\n" 313 + " userspace from reading and writing all physical memory. Userspace\n" 314 + " processes could also trick the stub into disabling SIGALRM which\n" 315 + " prevents it from being interrupted for scheduling purposes.\n" 316 + "\n" 317 + " This is insecure and should only be used with a trusted userspace\n\n" 318 + ); 423 319 424 320 void __init os_early_checks(void) 425 321 { ··· 466 286 /* Print out the core dump limits early */ 467 287 check_coredump_limit(); 468 288 469 - check_ptrace(); 470 - 471 289 /* Need to check this early because mmapping happens before the 472 290 * kernel is running. 473 291 */ 474 292 check_tmpexec(); 293 + 294 + if (seccomp_config) { 295 + if (init_seccomp()) { 296 + using_seccomp = 1; 297 + return; 298 + } 299 + 300 + if (seccomp_config == 2) 301 + fatal("SECCOMP userspace requested but not functional!\n"); 302 + } 303 + 304 + using_seccomp = 0; 305 + check_ptrace(); 475 306 476 307 pid = start_ptraced_child(); 477 308 if (init_pid_registers(pid))