Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

um: use execveat to create userspace MMs

Using clone will not undo features that have been enabled by libc. An
example of this already happening is rseq, which could cause the kernel
to read/write memory of the userspace process. In the future the
standard library might also use mseal by default to protect itself,
which would also thwart our attempts at unmapping everything.

Solve all this by taking a step back and doing an execve into a tiny
static binary that sets up the minimal environment required for the
stub without using any standard library. That way we have a clean
execution environment that is fully under the control of UML.

Note that this changes things a bit as the FDs are no longer shared
with the kernel. Instead, we explicitly share the FDs for the physical
memory and all existing iomem regions. Doing this is fine, as iomem
regions cannot be added at runtime.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20240919124511.282088-3-benjamin@sipsolutions.net
[use pipe() instead of pipe2(), remove unneeded close() calls]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>

authored by

Benjamin Berg and committed by
Johannes Berg
32e8eaf2 cbb8e65e

+264 -57
+2 -1
arch/um/Makefile
··· 61 61 $(ARCH_INCLUDE) $(MODE_INCLUDE) -Dvmap=kernel_vmap \ 62 62 -Dlongjmp=kernel_longjmp -Dsetjmp=kernel_setjmp \ 63 63 -Din6addr_loopback=kernel_in6addr_loopback \ 64 - -Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr 64 + -Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr \ 65 + -D__close_range=kernel__close_range 65 66 66 67 KBUILD_RUSTFLAGS += -Crelocation-model=pie 67 68
+11
arch/um/include/shared/skas/stub-data.h
··· 12 12 #include <as-layout.h> 13 13 #include <sysdep/tls.h> 14 14 15 + struct stub_init_data { 16 + unsigned long stub_start; 17 + 18 + int stub_code_fd; 19 + unsigned long stub_code_offset; 20 + int stub_data_fd; 21 + unsigned long stub_data_offset; 22 + 23 + unsigned long segv_handler; 24 + }; 25 + 15 26 #define STUB_NEXT_SYSCALL(s) \ 16 27 ((struct stub_syscall *) (((unsigned long) s) + (s)->cmd_len)) 17 28
+2
arch/um/kernel/skas/.gitignore
··· 1 + stub_exe 2 + stub_exe.dbg
+31 -2
arch/um/kernel/skas/Makefile
··· 3 3 # Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) 4 4 # 5 5 6 - obj-y := stub.o mmu.o process.o syscall.o uaccess.o 6 + obj-y := stub.o mmu.o process.o syscall.o uaccess.o \ 7 + stub_exe_embed.o 8 + 9 + # Stub executable 10 + 11 + stub_exe_objs-y := stub_exe.o 12 + 13 + stub_exe_objs := $(foreach F,$(stub_exe_objs-y),$(obj)/$F) 14 + 15 + # Object file containing the ELF executable 16 + $(obj)/stub_exe_embed.o: $(src)/stub_exe_embed.S $(obj)/stub_exe 17 + 18 + $(obj)/stub_exe.dbg: $(stub_exe_objs) FORCE 19 + $(call if_changed,stub_exe) 20 + 21 + $(obj)/stub_exe: OBJCOPYFLAGS := -S 22 + $(obj)/stub_exe: $(obj)/stub_exe.dbg FORCE 23 + $(call if_changed,objcopy) 24 + 25 + quiet_cmd_stub_exe = STUB_EXE $@ 26 + cmd_stub_exe = $(CC) -nostdlib -o $@ \ 27 + $(KBUILD_CFLAGS) $(STUB_EXE_LDFLAGS) \ 28 + $(filter %.o,$^) 29 + 30 + STUB_EXE_LDFLAGS = -n -static 31 + 32 + targets += stub_exe.dbg stub_exe $(stub_exe_objs-y) 33 + 34 + # end 7 35 8 36 # stub.o is in the stub, so it can't be built with profiling 9 37 # GCC hardened also auto-enables -fpic, but we need %ebx so it can't work -> 10 38 # disable it 11 39 12 40 CFLAGS_stub.o := $(CFLAGS_NO_HARDENING) 13 - UNPROFILE_OBJS := stub.o 41 + CFLAGS_stub_exe.o := $(CFLAGS_NO_HARDENING) 42 + UNPROFILE_OBJS := stub.o stub_exe.o 14 43 KCOV_INSTRUMENT := n 15 44 16 45 include $(srctree)/arch/um/scripts/Makefile.rules
+88
arch/um/kernel/skas/stub_exe.c
··· 1 + #include <sys/ptrace.h> 2 + #include <sys/prctl.h> 3 + #include <asm/unistd.h> 4 + #include <sysdep/stub.h> 5 + #include <stub-data.h> 6 + 7 + void _start(void); 8 + 9 + noinline static void real_init(void) 10 + { 11 + struct stub_init_data init_data; 12 + unsigned long res; 13 + struct { 14 + void *ss_sp; 15 + int ss_flags; 16 + size_t ss_size; 17 + } stack = { 18 + .ss_size = STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, 19 + }; 20 + struct { 21 + void *sa_handler_; 22 + unsigned long sa_flags; 23 + void *sa_restorer; 24 + unsigned long long sa_mask; 25 + } sa = { 26 + /* Need to set SA_RESTORER (but the handler never returns) */ 27 + .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000, 28 + /* no need to mask any signals */ 29 + .sa_mask = 0, 30 + }; 31 + 32 + /* set a nice name */ 33 + stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml-userspace"); 34 + 35 + /* read information from STDIN and close it */ 36 + res = stub_syscall3(__NR_read, 0, 37 + (unsigned long)&init_data, sizeof(init_data)); 38 + if (res != sizeof(init_data)) 39 + stub_syscall1(__NR_exit, 10); 40 + 41 + stub_syscall1(__NR_close, 0); 42 + 43 + /* map stub code + data */ 44 + res = stub_syscall6(STUB_MMAP_NR, 45 + init_data.stub_start, UM_KERN_PAGE_SIZE, 46 + PROT_READ | PROT_EXEC, MAP_FIXED | MAP_SHARED, 47 + init_data.stub_code_fd, init_data.stub_code_offset); 48 + if (res != init_data.stub_start) 49 + stub_syscall1(__NR_exit, 11); 50 + 51 + res = stub_syscall6(STUB_MMAP_NR, 52 + init_data.stub_start + UM_KERN_PAGE_SIZE, 53 + STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, 54 + PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, 55 + init_data.stub_data_fd, init_data.stub_data_offset); 56 + if (res != init_data.stub_start + UM_KERN_PAGE_SIZE) 57 + stub_syscall1(__NR_exit, 12); 58 + 59 + /* setup signal stack inside stub data */ 60 + stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE; 61 + stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0); 62 + 63 + /* register SIGSEGV 
handler */ 64 + sa.sa_handler_ = (void *) init_data.segv_handler; 65 + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned long)&sa, 0, 66 + sizeof(sa.sa_mask)); 67 + if (res != 0) 68 + stub_syscall1(__NR_exit, 13); 69 + 70 + stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); 71 + 72 + stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP); 73 + 74 + stub_syscall1(__NR_exit, 14); 75 + 76 + __builtin_unreachable(); 77 + } 78 + 79 + void _start(void) 80 + { 81 + char *alloc; 82 + 83 + /* Make enough space for the stub (including space for alignment) */ 84 + alloc = __builtin_alloca((1 + 2 * STUB_DATA_PAGES - 1) * UM_KERN_PAGE_SIZE); 85 + asm volatile("" : "+r,m"(alloc) : : "memory"); 86 + 87 + real_init(); 88 + }
+11
arch/um/kernel/skas/stub_exe_embed.S
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #include <linux/init.h> 3 + #include <linux/linkage.h> 4 + 5 + __INITDATA 6 + 7 + SYM_DATA_START(stub_exe_start) 8 + .incbin "arch/um/kernel/skas/stub_exe" 9 + SYM_DATA_END_LABEL(stub_exe_start, SYM_L_GLOBAL, stub_exe_end) 10 + 11 + __FINIT
+1 -1
arch/um/os-Linux/mem.c
··· 42 42 } 43 43 44 44 /* Set by make_tempfile() during early boot. */ 45 - static char *tempdir = NULL; 45 + char *tempdir = NULL; 46 46 47 47 /* Check if dir is on tmpfs. Return 0 if yes, -1 if no or error. */ 48 48 static int __init check_tmpfs(const char *dir)
+118 -53
arch/um/os-Linux/skas/process.c
··· 10 10 #include <sched.h> 11 11 #include <errno.h> 12 12 #include <string.h> 13 + #include <fcntl.h> 14 + #include <mem_user.h> 13 15 #include <sys/mman.h> 14 16 #include <sys/wait.h> 17 + #include <sys/stat.h> 15 18 #include <asm/unistd.h> 16 19 #include <as-layout.h> 17 20 #include <init.h> ··· 179 176 180 177 extern char __syscall_stub_start[]; 181 178 182 - /** 183 - * userspace_tramp() - userspace trampoline 184 - * @stack: pointer to the new userspace stack page 185 - * 186 - * The userspace trampoline is used to setup a new userspace process in start_userspace() after it was clone()'ed. 187 - * This function will run on a temporary stack page. 188 - * It ptrace()'es itself, then 189 - * Two pages are mapped into the userspace address space: 190 - * - STUB_CODE (with EXEC), which contains the skas stub code 191 - * - STUB_DATA (with R/W), which contains a data page that is used to transfer certain data between the UML userspace process and the UML kernel. 192 - * Also for the userspace process a SIGSEGV handler is installed to catch pagefaults in the userspace process. 193 - * And last the process stops itself to give control to the UML kernel for this userspace process. 
194 - * 195 - * Return: Always zero, otherwise the current userspace process is ended with non null exit() call 196 - */ 179 + static int stub_exe_fd; 180 + 197 181 static int userspace_tramp(void *stack) 198 182 { 199 - struct sigaction sa; 200 - void *addr; 201 - int fd; 183 + char *const argv[] = { "uml-userspace", NULL }; 184 + int pipe_fds[2]; 202 185 unsigned long long offset; 203 - unsigned long segv_handler = STUB_CODE + 204 - (unsigned long) stub_segv_handler - 205 - (unsigned long) __syscall_stub_start; 186 + struct stub_init_data init_data = { 187 + .stub_start = STUB_START, 188 + .segv_handler = STUB_CODE + 189 + (unsigned long) stub_segv_handler - 190 + (unsigned long) __syscall_stub_start, 191 + }; 192 + struct iomem_region *iomem; 193 + int ret; 206 194 207 - ptrace(PTRACE_TRACEME, 0, 0, 0); 195 + init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start), 196 + &offset); 197 + init_data.stub_code_offset = MMAP_OFFSET(offset); 208 198 209 - signal(SIGTERM, SIG_DFL); 210 - signal(SIGWINCH, SIG_IGN); 199 + init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset); 200 + init_data.stub_data_offset = MMAP_OFFSET(offset); 211 201 212 - fd = phys_mapping(uml_to_phys(__syscall_stub_start), &offset); 213 - addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE, 214 - PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset); 215 - if (addr == MAP_FAILED) { 216 - os_info("mapping mmap stub at 0x%lx failed, errno = %d\n", 217 - STUB_CODE, errno); 218 - exit(1); 202 + /* Set CLOEXEC on all FDs and then unset on all memory related FDs */ 203 + close_range(0, ~0U, CLOSE_RANGE_CLOEXEC); 204 + 205 + fcntl(init_data.stub_data_fd, F_SETFD, 0); 206 + for (iomem = iomem_regions; iomem; iomem = iomem->next) 207 + fcntl(iomem->fd, F_SETFD, 0); 208 + 209 + /* Create a pipe for init_data (no CLOEXEC) and dup2 to STDIN */ 210 + if (pipe(pipe_fds)) 211 + exit(2); 212 + 213 + if (dup2(pipe_fds[0], 0) < 0) 214 + exit(3); 215 + close(pipe_fds[0]); 216 + 217 + /* Write 
init_data and close write side */ 218 + ret = write(pipe_fds[1], &init_data, sizeof(init_data)); 219 + close(pipe_fds[1]); 220 + 221 + if (ret != sizeof(init_data)) 222 + exit(4); 223 + 224 + execveat(stub_exe_fd, "", argv, NULL, AT_EMPTY_PATH); 225 + 226 + exit(5); 227 + } 228 + 229 + extern char stub_exe_start[]; 230 + extern char stub_exe_end[]; 231 + 232 + extern char *tempdir; 233 + 234 + #define STUB_EXE_NAME_TEMPLATE "/uml-userspace-XXXXXX" 235 + 236 + #ifndef MFD_EXEC 237 + #define MFD_EXEC 0x0010U 238 + #endif 239 + 240 + static int __init init_stub_exe_fd(void) 241 + { 242 + size_t written = 0; 243 + char *tmpfile = NULL; 244 + 245 + stub_exe_fd = memfd_create("uml-userspace", 246 + MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING); 247 + 248 + if (stub_exe_fd < 0) { 249 + printk(UM_KERN_INFO "Could not create executable memfd, using temporary file!"); 250 + 251 + tmpfile = malloc(strlen(tempdir) + 252 + strlen(STUB_EXE_NAME_TEMPLATE) + 1); 253 + if (tmpfile == NULL) 254 + panic("Failed to allocate memory for stub binary name"); 255 + 256 + strcpy(tmpfile, tempdir); 257 + strcat(tmpfile, STUB_EXE_NAME_TEMPLATE); 258 + 259 + stub_exe_fd = mkstemp(tmpfile); 260 + if (stub_exe_fd < 0) 261 + panic("Could not create temporary file for stub binary: %d", 262 + -errno); 219 263 } 220 264 221 - fd = phys_mapping(uml_to_phys(stack), &offset); 222 - addr = mmap((void *) STUB_DATA, 223 - STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE, 224 - MAP_FIXED | MAP_SHARED, fd, offset); 225 - if (addr == MAP_FAILED) { 226 - os_info("mapping segfault stack at 0x%lx failed, errno = %d\n", 227 - STUB_DATA, errno); 228 - exit(1); 265 + while (written < stub_exe_end - stub_exe_start) { 266 + ssize_t res = write(stub_exe_fd, stub_exe_start + written, 267 + stub_exe_end - stub_exe_start - written); 268 + if (res < 0) { 269 + if (errno == EINTR) 270 + continue; 271 + 272 + if (tmpfile) 273 + unlink(tmpfile); 274 + panic("Failed write stub binary: %d", -errno); 275 + } 276 + 277 
+ written += res; 229 278 } 230 279 231 - set_sigstack((void *) STUB_DATA, STUB_DATA_PAGES * UM_KERN_PAGE_SIZE); 232 - sigemptyset(&sa.sa_mask); 233 - sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; 234 - sa.sa_sigaction = (void *) segv_handler; 235 - sa.sa_restorer = NULL; 236 - if (sigaction(SIGSEGV, &sa, NULL) < 0) { 237 - os_info("%s - setting SIGSEGV handler failed - errno = %d\n", 238 - __func__, errno); 239 - exit(1); 280 + if (!tmpfile) { 281 + fcntl(stub_exe_fd, F_ADD_SEALS, 282 + F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_SEAL); 283 + } else { 284 + if (fchmod(stub_exe_fd, 00500) < 0) { 285 + unlink(tmpfile); 286 + panic("Could not make stub binary executable: %d", 287 + -errno); 288 + } 289 + 290 + close(stub_exe_fd); 291 + stub_exe_fd = open(tmpfile, O_RDONLY | O_CLOEXEC | O_NOFOLLOW); 292 + if (stub_exe_fd < 0) { 293 + unlink(tmpfile); 294 + panic("Could not reopen stub binary: %d", -errno); 295 + } 296 + 297 + unlink(tmpfile); 298 + free(tmpfile); 240 299 } 241 300 242 - kill(os_getpid(), SIGSTOP); 243 301 return 0; 244 302 } 303 + __initcall(init_stub_exe_fd); 245 304 246 305 int userspace_pid[NR_CPUS]; 247 306 ··· 322 257 { 323 258 void *stack; 324 259 unsigned long sp; 325 - int pid, status, n, flags, err; 260 + int pid, status, n, err; 326 261 327 262 /* setup a temporary stack page */ 328 263 stack = mmap(NULL, UM_KERN_PAGE_SIZE, ··· 338 273 /* set stack pointer to the end of the stack page, so it can grow downwards */ 339 274 sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; 340 275 341 - flags = CLONE_FILES | SIGCHLD; 342 - 343 276 /* clone into new userspace process */ 344 - pid = clone(userspace_tramp, (void *) sp, flags, (void *) stub_stack); 277 + pid = clone(userspace_tramp, (void *) sp, 278 + CLONE_VFORK | CLONE_VM | SIGCHLD, 279 + (void *)stub_stack); 345 280 if (pid < 0) { 346 281 err = -errno; 347 282 printk(UM_KERN_ERR "%s : clone failed, errno = %d\n",