Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'exec-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull execve updates from Eric Biederman:
"During the development of v5.7 I ran into bugs and quality of
implementation issues related to exec that could not be easily fixed
because of the way exec is implemented. So I have been digging into
exec and cleaning up what I can.

This cycle I have been looking at different ideas and different
implementations to see what is possible to improve exec, and cleaning
up the way exec interfaces with in-kernel users. Only cleaning up the
interfaces of exec with the rest of the kernel has managed to stabilize
and make it through review in time for v5.9-rc1, resulting in 2 sets of
changes this cycle.

- Implement kernel_execve

- Make the user mode driver code a better citizen

With kernel_execve the code size got a little larger as the copying of
parameters from userspace and the copying of parameters from the kernel
are now separate. The good news is kernel threads no longer need to play
games with set_fs to use exec, which, when combined with the rest of
Christoph's set_fs changes, should make security bugs with set_fs much
more difficult"

* 'exec-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (23 commits)
exec: Implement kernel_execve
exec: Factor bprm_stack_limits out of prepare_arg_pages
exec: Factor bprm_execve out of do_execve_common
exec: Move bprm_mm_init into alloc_bprm
exec: Move initialization of bprm->filename into alloc_bprm
exec: Factor out alloc_bprm
exec: Remove unnecessary spaces from binfmts.h
umd: Stop using split_argv
umd: Remove exit_umh
bpfilter: Take advantage of the facilities of struct pid
exit: Factor thread_group_exited out of pidfd_poll
umd: Track user space drivers with struct pid
bpfilter: Move bpfilter_umh back into init data
exec: Remove do_execve_file
umh: Stop calling do_execve_file
umd: Transform fork_usermode_blob into fork_usermode_driver
umd: Rename umd_info.cmdline umd_info.driver_name
umd: For clarity rename umh_info umd_info
umh: Separate the user mode driver and the user mode helper support
umh: Remove call_usermodehelper_setup_file.
...

+504 -388
+1 -1
arch/x86/entry/entry_32.S
··· 854 854 CALL_NOSPEC ebx 855 855 /* 856 856 * A kernel thread is allowed to return here after successfully 857 - * calling do_execve(). Exit to userspace to complete the execve() 857 + * calling kernel_execve(). Exit to userspace to complete the execve() 858 858 * syscall. 859 859 */ 860 860 movl $0, PT_EAX(%esp)
+1 -1
arch/x86/entry/entry_64.S
··· 293 293 CALL_NOSPEC rbx 294 294 /* 295 295 * A kernel thread is allowed to return here after successfully 296 - * calling do_execve(). Exit to userspace to complete the execve() 296 + * calling kernel_execve(). Exit to userspace to complete the execve() 297 297 * syscall. 298 298 */ 299 299 movq $0, RAX(%rsp)
+1 -1
arch/x86/kernel/unwind_frame.c
··· 275 275 * This user_mode() check is slightly broader than a PF_KTHREAD 276 276 * check because it also catches the awkward situation where a 277 277 * newly forked kthread transitions into a user task by calling 278 - * do_execve(), which eventually clears PF_KTHREAD. 278 + * kernel_execve(), which eventually clears PF_KTHREAD. 279 279 */ 280 280 if (!user_mode(regs)) 281 281 goto the_end;
+221 -134
fs/exec.c
··· 448 448 return i; 449 449 } 450 450 451 - static int prepare_arg_pages(struct linux_binprm *bprm, 452 - struct user_arg_ptr argv, struct user_arg_ptr envp) 451 + static int count_strings_kernel(const char *const *argv) 452 + { 453 + int i; 454 + 455 + if (!argv) 456 + return 0; 457 + 458 + for (i = 0; argv[i]; ++i) { 459 + if (i >= MAX_ARG_STRINGS) 460 + return -E2BIG; 461 + if (fatal_signal_pending(current)) 462 + return -ERESTARTNOHAND; 463 + cond_resched(); 464 + } 465 + return i; 466 + } 467 + 468 + static int bprm_stack_limits(struct linux_binprm *bprm) 453 469 { 454 470 unsigned long limit, ptr_size; 455 - 456 - bprm->argc = count(argv, MAX_ARG_STRINGS); 457 - if (bprm->argc < 0) 458 - return bprm->argc; 459 - 460 - bprm->envc = count(envp, MAX_ARG_STRINGS); 461 - if (bprm->envc < 0) 462 - return bprm->envc; 463 471 464 472 /* 465 473 * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM ··· 640 632 return 0; 641 633 } 642 634 EXPORT_SYMBOL(copy_string_kernel); 635 + 636 + static int copy_strings_kernel(int argc, const char *const *argv, 637 + struct linux_binprm *bprm) 638 + { 639 + while (argc-- > 0) { 640 + int ret = copy_string_kernel(argv[argc], bprm); 641 + if (ret < 0) 642 + return ret; 643 + if (fatal_signal_pending(current)) 644 + return -ERESTARTNOHAND; 645 + cond_resched(); 646 + } 647 + return 0; 648 + } 643 649 644 650 #ifdef CONFIG_MMU 645 651 ··· 1565 1543 1566 1544 static void free_bprm(struct linux_binprm *bprm) 1567 1545 { 1546 + if (bprm->mm) { 1547 + acct_arg_size(bprm, 0); 1548 + mmput(bprm->mm); 1549 + } 1568 1550 free_arg_pages(bprm); 1569 1551 if (bprm->cred) { 1570 1552 mutex_unlock(&current->signal->cred_guard_mutex); ··· 1583 1557 /* If a binfmt changed the interp, free it. 
*/ 1584 1558 if (bprm->interp != bprm->filename) 1585 1559 kfree(bprm->interp); 1560 + kfree(bprm->fdpath); 1586 1561 kfree(bprm); 1562 + } 1563 + 1564 + static struct linux_binprm *alloc_bprm(int fd, struct filename *filename) 1565 + { 1566 + struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); 1567 + int retval = -ENOMEM; 1568 + if (!bprm) 1569 + goto out; 1570 + 1571 + if (fd == AT_FDCWD || filename->name[0] == '/') { 1572 + bprm->filename = filename->name; 1573 + } else { 1574 + if (filename->name[0] == '\0') 1575 + bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); 1576 + else 1577 + bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", 1578 + fd, filename->name); 1579 + if (!bprm->fdpath) 1580 + goto out_free; 1581 + 1582 + bprm->filename = bprm->fdpath; 1583 + } 1584 + bprm->interp = bprm->filename; 1585 + 1586 + retval = bprm_mm_init(bprm); 1587 + if (retval) 1588 + goto out_free; 1589 + return bprm; 1590 + 1591 + out_free: 1592 + free_bprm(bprm); 1593 + out: 1594 + return ERR_PTR(retval); 1587 1595 } 1588 1596 1589 1597 int bprm_change_interp(const char *interp, struct linux_binprm *bprm) ··· 1878 1818 /* 1879 1819 * sys_execve() executes a new program. 
1880 1820 */ 1881 - static int __do_execve_file(int fd, struct filename *filename, 1882 - struct user_arg_ptr argv, 1883 - struct user_arg_ptr envp, 1884 - int flags, struct file *file) 1821 + static int bprm_execve(struct linux_binprm *bprm, 1822 + int fd, struct filename *filename, int flags) 1885 1823 { 1886 - char *pathbuf = NULL; 1887 - struct linux_binprm *bprm; 1824 + struct file *file; 1888 1825 struct files_struct *displaced; 1826 + int retval; 1827 + 1828 + retval = unshare_files(&displaced); 1829 + if (retval) 1830 + return retval; 1831 + 1832 + retval = prepare_bprm_creds(bprm); 1833 + if (retval) 1834 + goto out_files; 1835 + 1836 + check_unsafe_exec(bprm); 1837 + current->in_execve = 1; 1838 + 1839 + file = do_open_execat(fd, filename, flags); 1840 + retval = PTR_ERR(file); 1841 + if (IS_ERR(file)) 1842 + goto out_unmark; 1843 + 1844 + sched_exec(); 1845 + 1846 + bprm->file = file; 1847 + /* 1848 + * Record that a name derived from an O_CLOEXEC fd will be 1849 + * inaccessible after exec. Relies on having exclusive access to 1850 + * current->files (due to unshare_files above). 1851 + */ 1852 + if (bprm->fdpath && 1853 + close_on_exec(fd, rcu_dereference_raw(current->files->fdt))) 1854 + bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; 1855 + 1856 + /* Set the unchanging part of bprm->cred */ 1857 + retval = security_bprm_creds_for_exec(bprm); 1858 + if (retval) 1859 + goto out; 1860 + 1861 + retval = exec_binprm(bprm); 1862 + if (retval < 0) 1863 + goto out; 1864 + 1865 + /* execve succeeded */ 1866 + current->fs->in_exec = 0; 1867 + current->in_execve = 0; 1868 + rseq_execve(current); 1869 + acct_update_integrals(current); 1870 + task_numa_free(current, false); 1871 + if (displaced) 1872 + put_files_struct(displaced); 1873 + return retval; 1874 + 1875 + out: 1876 + /* 1877 + * If past the point of no return ensure the the code never 1878 + * returns to the userspace process. 
Use an existing fatal 1879 + * signal if present otherwise terminate the process with 1880 + * SIGSEGV. 1881 + */ 1882 + if (bprm->point_of_no_return && !fatal_signal_pending(current)) 1883 + force_sigsegv(SIGSEGV); 1884 + 1885 + out_unmark: 1886 + current->fs->in_exec = 0; 1887 + current->in_execve = 0; 1888 + 1889 + out_files: 1890 + if (displaced) 1891 + reset_files_struct(displaced); 1892 + 1893 + return retval; 1894 + } 1895 + 1896 + static int do_execveat_common(int fd, struct filename *filename, 1897 + struct user_arg_ptr argv, 1898 + struct user_arg_ptr envp, 1899 + int flags) 1900 + { 1901 + struct linux_binprm *bprm; 1889 1902 int retval; 1890 1903 1891 1904 if (IS_ERR(filename)) ··· 1980 1847 * further execve() calls fail. */ 1981 1848 current->flags &= ~PF_NPROC_EXCEEDED; 1982 1849 1983 - retval = unshare_files(&displaced); 1984 - if (retval) 1850 + bprm = alloc_bprm(fd, filename); 1851 + if (IS_ERR(bprm)) { 1852 + retval = PTR_ERR(bprm); 1985 1853 goto out_ret; 1986 - 1987 - retval = -ENOMEM; 1988 - bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); 1989 - if (!bprm) 1990 - goto out_files; 1991 - 1992 - retval = prepare_bprm_creds(bprm); 1993 - if (retval) 1994 - goto out_free; 1995 - 1996 - check_unsafe_exec(bprm); 1997 - current->in_execve = 1; 1998 - 1999 - if (!file) 2000 - file = do_open_execat(fd, filename, flags); 2001 - retval = PTR_ERR(file); 2002 - if (IS_ERR(file)) 2003 - goto out_unmark; 2004 - 2005 - sched_exec(); 2006 - 2007 - bprm->file = file; 2008 - if (!filename) { 2009 - bprm->filename = "none"; 2010 - } else if (fd == AT_FDCWD || filename->name[0] == '/') { 2011 - bprm->filename = filename->name; 2012 - } else { 2013 - if (filename->name[0] == '\0') 2014 - pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); 2015 - else 2016 - pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", 2017 - fd, filename->name); 2018 - if (!pathbuf) { 2019 - retval = -ENOMEM; 2020 - goto out_unmark; 2021 - } 2022 - /* 2023 - * Record that a name derived from an 
O_CLOEXEC fd will be 2024 - * inaccessible after exec. Relies on having exclusive access to 2025 - * current->files (due to unshare_files above). 2026 - */ 2027 - if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt))) 2028 - bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; 2029 - bprm->filename = pathbuf; 2030 1854 } 2031 - bprm->interp = bprm->filename; 2032 1855 2033 - retval = bprm_mm_init(bprm); 2034 - if (retval) 2035 - goto out_unmark; 2036 - 2037 - retval = prepare_arg_pages(bprm, argv, envp); 1856 + retval = count(argv, MAX_ARG_STRINGS); 2038 1857 if (retval < 0) 2039 - goto out; 1858 + goto out_free; 1859 + bprm->argc = retval; 2040 1860 2041 - /* Set the unchanging part of bprm->cred */ 2042 - retval = security_bprm_creds_for_exec(bprm); 2043 - if (retval) 2044 - goto out; 1861 + retval = count(envp, MAX_ARG_STRINGS); 1862 + if (retval < 0) 1863 + goto out_free; 1864 + bprm->envc = retval; 1865 + 1866 + retval = bprm_stack_limits(bprm); 1867 + if (retval < 0) 1868 + goto out_free; 2045 1869 2046 1870 retval = copy_string_kernel(bprm->filename, bprm); 2047 1871 if (retval < 0) 2048 - goto out; 2049 - 1872 + goto out_free; 2050 1873 bprm->exec = bprm->p; 1874 + 2051 1875 retval = copy_strings(bprm->envc, envp, bprm); 2052 1876 if (retval < 0) 2053 - goto out; 1877 + goto out_free; 2054 1878 2055 1879 retval = copy_strings(bprm->argc, argv, bprm); 2056 1880 if (retval < 0) 2057 - goto out; 1881 + goto out_free; 2058 1882 2059 - retval = exec_binprm(bprm); 2060 - if (retval < 0) 2061 - goto out; 2062 - 2063 - /* execve succeeded */ 2064 - current->fs->in_exec = 0; 2065 - current->in_execve = 0; 2066 - rseq_execve(current); 2067 - acct_update_integrals(current); 2068 - task_numa_free(current, false); 2069 - free_bprm(bprm); 2070 - kfree(pathbuf); 2071 - if (filename) 2072 - putname(filename); 2073 - if (displaced) 2074 - put_files_struct(displaced); 2075 - return retval; 2076 - 2077 - out: 2078 - /* 2079 - * If past the point of no return ensure 
the the code never 2080 - * returns to the userspace process. Use an existing fatal 2081 - * signal if present otherwise terminate the process with 2082 - * SIGSEGV. 2083 - */ 2084 - if (bprm->point_of_no_return && !fatal_signal_pending(current)) 2085 - force_sigsegv(SIGSEGV); 2086 - if (bprm->mm) { 2087 - acct_arg_size(bprm, 0); 2088 - mmput(bprm->mm); 2089 - } 2090 - 2091 - out_unmark: 2092 - current->fs->in_exec = 0; 2093 - current->in_execve = 0; 2094 - 1883 + retval = bprm_execve(bprm, fd, filename, flags); 2095 1884 out_free: 2096 1885 free_bprm(bprm); 2097 - kfree(pathbuf); 2098 1886 2099 - out_files: 2100 - if (displaced) 2101 - reset_files_struct(displaced); 2102 1887 out_ret: 2103 - if (filename) 2104 - putname(filename); 1888 + putname(filename); 2105 1889 return retval; 2106 1890 } 2107 1891 2108 - static int do_execveat_common(int fd, struct filename *filename, 2109 - struct user_arg_ptr argv, 2110 - struct user_arg_ptr envp, 2111 - int flags) 1892 + int kernel_execve(const char *kernel_filename, 1893 + const char *const *argv, const char *const *envp) 2112 1894 { 2113 - return __do_execve_file(fd, filename, argv, envp, flags, NULL); 1895 + struct filename *filename; 1896 + struct linux_binprm *bprm; 1897 + int fd = AT_FDCWD; 1898 + int retval; 1899 + 1900 + filename = getname_kernel(kernel_filename); 1901 + if (IS_ERR(filename)) 1902 + return PTR_ERR(filename); 1903 + 1904 + bprm = alloc_bprm(fd, filename); 1905 + if (IS_ERR(bprm)) { 1906 + retval = PTR_ERR(bprm); 1907 + goto out_ret; 1908 + } 1909 + 1910 + retval = count_strings_kernel(argv); 1911 + if (retval < 0) 1912 + goto out_free; 1913 + bprm->argc = retval; 1914 + 1915 + retval = count_strings_kernel(envp); 1916 + if (retval < 0) 1917 + goto out_free; 1918 + bprm->envc = retval; 1919 + 1920 + retval = bprm_stack_limits(bprm); 1921 + if (retval < 0) 1922 + goto out_free; 1923 + 1924 + retval = copy_string_kernel(bprm->filename, bprm); 1925 + if (retval < 0) 1926 + goto out_free; 1927 + 
bprm->exec = bprm->p; 1928 + 1929 + retval = copy_strings_kernel(bprm->envc, envp, bprm); 1930 + if (retval < 0) 1931 + goto out_free; 1932 + 1933 + retval = copy_strings_kernel(bprm->argc, argv, bprm); 1934 + if (retval < 0) 1935 + goto out_free; 1936 + 1937 + retval = bprm_execve(bprm, fd, filename, 0); 1938 + out_free: 1939 + free_bprm(bprm); 1940 + out_ret: 1941 + putname(filename); 1942 + return retval; 2114 1943 } 2115 1944 2116 - int do_execve_file(struct file *file, void *__argv, void *__envp) 2117 - { 2118 - struct user_arg_ptr argv = { .ptr.native = __argv }; 2119 - struct user_arg_ptr envp = { .ptr.native = __envp }; 2120 - 2121 - return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file); 2122 - } 2123 - 2124 - int do_execve(struct filename *filename, 1945 + static int do_execve(struct filename *filename, 2125 1946 const char __user *const __user *__argv, 2126 1947 const char __user *const __user *__envp) 2127 1948 { ··· 2084 1997 return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); 2085 1998 } 2086 1999 2087 - int do_execveat(int fd, struct filename *filename, 2000 + static int do_execveat(int fd, struct filename *filename, 2088 2001 const char __user *const __user *__argv, 2089 2002 const char __user *const __user *__envp, 2090 2003 int flags)
+8 -13
include/linux/binfmts.h
··· 45 45 #ifdef __alpha__ 46 46 unsigned int taso:1; 47 47 #endif 48 - struct file * executable; /* Executable to pass to the interpreter */ 49 - struct file * interpreter; 50 - struct file * file; 48 + struct file *executable; /* Executable to pass to the interpreter */ 49 + struct file *interpreter; 50 + struct file *file; 51 51 struct cred *cred; /* new credentials */ 52 52 int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */ 53 53 unsigned int per_clear; /* bits to clear in current->personality */ 54 54 int argc, envc; 55 - const char * filename; /* Name of binary as seen by procps */ 56 - const char * interp; /* Name of the binary really executed. Most 55 + const char *filename; /* Name of binary as seen by procps */ 56 + const char *interp; /* Name of the binary really executed. Most 57 57 of the time same as filename, but could be 58 58 different for binfmt_{misc,script} */ 59 + const char *fdpath; /* generated filename for execveat */ 59 60 unsigned interp_flags; 60 61 int execfd; /* File descriptor of the executable */ 61 62 unsigned long loader, exec; ··· 135 134 extern void set_binfmt(struct linux_binfmt *new); 136 135 extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t); 137 136 138 - extern int do_execve(struct filename *, 139 - const char __user * const __user *, 140 - const char __user * const __user *); 141 - extern int do_execveat(int, struct filename *, 142 - const char __user * const __user *, 143 - const char __user * const __user *, 144 - int); 145 - int do_execve_file(struct file *file, void *__argv, void *__envp); 137 + int kernel_execve(const char *filename, 138 + const char *const *argv, const char *const *envp); 146 139 147 140 #endif /* _LINUX_BINFMTS_H */
+4 -3
include/linux/bpfilter.h
··· 3 3 #define _LINUX_BPFILTER_H 4 4 5 5 #include <uapi/linux/bpfilter.h> 6 - #include <linux/umh.h> 6 + #include <linux/usermode_driver.h> 7 7 8 8 struct sock; 9 9 int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, 10 10 unsigned int optlen); 11 11 int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, 12 12 int __user *optlen); 13 + void bpfilter_umh_cleanup(struct umd_info *info); 14 + 13 15 struct bpfilter_umh_ops { 14 - struct umh_info info; 16 + struct umd_info info; 15 17 /* since ip_getsockopt() can run in parallel, serialize access to umh */ 16 18 struct mutex lock; 17 19 int (*sockopt)(struct sock *sk, int optname, 18 20 char __user *optval, 19 21 unsigned int optlen, bool is_set); 20 22 int (*start)(void); 21 - bool stop; 22 23 }; 23 24 extern struct bpfilter_umh_ops bpfilter_ops; 24 25 #endif
-9
include/linux/sched.h
··· 1507 1507 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 1508 1508 #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ 1509 1509 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ 1510 - #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ 1511 1510 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ 1512 1511 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ 1513 1512 #define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ ··· 2014 2015 } 2015 2016 2016 2017 #endif 2017 - 2018 - void __exit_umh(struct task_struct *tsk); 2019 - 2020 - static inline void exit_umh(struct task_struct *tsk) 2021 - { 2022 - if (unlikely(tsk->flags & PF_UMH)) 2023 - __exit_umh(tsk); 2024 - } 2025 2018 2026 2019 #ifdef CONFIG_DEBUG_RSEQ 2027 2020
+2
include/linux/sched/signal.h
··· 674 674 #define delay_group_leader(p) \ 675 675 (thread_group_leader(p) && !thread_group_empty(p)) 676 676 677 + extern bool thread_group_exited(struct pid *pid); 678 + 677 679 extern struct sighand_struct *__lock_task_sighand(struct task_struct *task, 678 680 unsigned long *flags); 679 681
-15
include/linux/umh.h
··· 22 22 const char *path; 23 23 char **argv; 24 24 char **envp; 25 - struct file *file; 26 25 int wait; 27 26 int retval; 28 - pid_t pid; 29 27 int (*init)(struct subprocess_info *info, struct cred *new); 30 28 void (*cleanup)(struct subprocess_info *info); 31 29 void *data; ··· 37 39 gfp_t gfp_mask, 38 40 int (*init)(struct subprocess_info *info, struct cred *new), 39 41 void (*cleanup)(struct subprocess_info *), void *data); 40 - 41 - struct subprocess_info *call_usermodehelper_setup_file(struct file *file, 42 - int (*init)(struct subprocess_info *info, struct cred *new), 43 - void (*cleanup)(struct subprocess_info *), void *data); 44 - struct umh_info { 45 - const char *cmdline; 46 - struct file *pipe_to_umh; 47 - struct file *pipe_from_umh; 48 - struct list_head list; 49 - void (*cleanup)(struct umh_info *info); 50 - pid_t pid; 51 - }; 52 - int fork_usermode_blob(void *data, size_t len, struct umh_info *info); 53 42 54 43 extern int 55 44 call_usermodehelper_exec(struct subprocess_info *info, int wait);
+18
include/linux/usermode_driver.h
··· 1 + #ifndef __LINUX_USERMODE_DRIVER_H__ 2 + #define __LINUX_USERMODE_DRIVER_H__ 3 + 4 + #include <linux/umh.h> 5 + #include <linux/path.h> 6 + 7 + struct umd_info { 8 + const char *driver_name; 9 + struct file *pipe_to_umh; 10 + struct file *pipe_from_umh; 11 + struct path wd; 12 + struct pid *tgid; 13 + }; 14 + int umd_load_blob(struct umd_info *info, const void *data, size_t len); 15 + int umd_unload_blob(struct umd_info *info); 16 + int fork_usermode_driver(struct umd_info *info); 17 + 18 + #endif /* __LINUX_USERMODE_DRIVER_H__ */
+1 -3
init/main.c
··· 1331 1331 pr_debug(" with environment:\n"); 1332 1332 for (p = envp_init; *p; p++) 1333 1333 pr_debug(" %s\n", *p); 1334 - return do_execve(getname_kernel(init_filename), 1335 - (const char __user *const __user *)argv_init, 1336 - (const char __user *const __user *)envp_init); 1334 + return kernel_execve(init_filename, argv_init, envp_init); 1337 1335 } 1338 1336 1339 1337 static int try_to_run_init_process(const char *init_filename)
+1
kernel/Makefile
··· 12 12 notifier.o ksysfs.o cred.o reboot.o \ 13 13 async.o range.o smpboot.o ucount.o 14 14 15 + obj-$(CONFIG_BPFILTER) += usermode_driver.o 15 16 obj-$(CONFIG_MODULES) += kmod.o 16 17 obj-$(CONFIG_MULTIUSER) += groups.o 17 18
+24 -1
kernel/exit.c
··· 805 805 exit_task_namespaces(tsk); 806 806 exit_task_work(tsk); 807 807 exit_thread(tsk); 808 - exit_umh(tsk); 809 808 810 809 /* 811 810 * Flush inherited counters to the parent - before the parent ··· 1710 1711 return -EFAULT; 1711 1712 } 1712 1713 #endif 1714 + 1715 + /** 1716 + * thread_group_exited - check that a thread group has exited 1717 + * @pid: tgid of thread group to be checked. 1718 + * 1719 + * Test if the thread group represented by tgid has exited (all 1720 + * threads are zombies, dead or completely gone). 1721 + * 1722 + * Return: true if the thread group has exited. false otherwise. 1723 + */ 1724 + bool thread_group_exited(struct pid *pid) 1725 + { 1726 + struct task_struct *task; 1727 + bool exited; 1728 + 1729 + rcu_read_lock(); 1730 + task = pid_task(pid, PIDTYPE_PID); 1731 + exited = !task || 1732 + (READ_ONCE(task->exit_state) && thread_group_empty(task)); 1733 + rcu_read_unlock(); 1734 + 1735 + return exited; 1736 + } 1737 + EXPORT_SYMBOL(thread_group_exited); 1713 1738 1714 1739 __weak void abort(void) 1715 1740 {
+1 -5
kernel/fork.c
··· 1792 1792 */ 1793 1793 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) 1794 1794 { 1795 - struct task_struct *task; 1796 1795 struct pid *pid = file->private_data; 1797 1796 __poll_t poll_flags = 0; 1798 1797 1799 1798 poll_wait(file, &pid->wait_pidfd, pts); 1800 1799 1801 - rcu_read_lock(); 1802 - task = pid_task(pid, PIDTYPE_PID); 1803 1800 /* 1804 1801 * Inform pollers only when the whole thread group exits. 1805 1802 * If the thread group leader exits before all other threads in the 1806 1803 * group, then poll(2) should block, similar to the wait(2) family. 1807 1804 */ 1808 - if (!task || (task->exit_state && thread_group_empty(task))) 1805 + if (thread_group_exited(pid)) 1809 1806 poll_flags = EPOLLIN | EPOLLRDNORM; 1810 - rcu_read_unlock(); 1811 1807 1812 1808 return poll_flags; 1813 1809 }
+3 -168
kernel/umh.c
··· 26 26 #include <linux/ptrace.h> 27 27 #include <linux/async.h> 28 28 #include <linux/uaccess.h> 29 - #include <linux/shmem_fs.h> 30 - #include <linux/pipe_fs_i.h> 31 29 32 30 #include <trace/events/module.h> 33 31 ··· 36 38 static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; 37 39 static DEFINE_SPINLOCK(umh_sysctl_lock); 38 40 static DECLARE_RWSEM(umhelper_sem); 39 - static LIST_HEAD(umh_list); 40 - static DEFINE_MUTEX(umh_list_lock); 41 41 42 42 static void call_usermodehelper_freeinfo(struct subprocess_info *info) 43 43 { ··· 98 102 99 103 commit_creds(new); 100 104 101 - sub_info->pid = task_pid_nr(current); 102 - if (sub_info->file) { 103 - retval = do_execve_file(sub_info->file, 104 - sub_info->argv, sub_info->envp); 105 - if (!retval) 106 - current->flags |= PF_UMH; 107 - } else 108 - retval = do_execve(getname_kernel(sub_info->path), 109 - (const char __user *const __user *)sub_info->argv, 110 - (const char __user *const __user *)sub_info->envp); 105 + retval = kernel_execve(sub_info->path, 106 + (const char *const *)sub_info->argv, 107 + (const char *const *)sub_info->envp); 111 108 out: 112 109 sub_info->retval = retval; 113 110 /* ··· 394 405 } 395 406 EXPORT_SYMBOL(call_usermodehelper_setup); 396 407 397 - struct subprocess_info *call_usermodehelper_setup_file(struct file *file, 398 - int (*init)(struct subprocess_info *info, struct cred *new), 399 - void (*cleanup)(struct subprocess_info *info), void *data) 400 - { 401 - struct subprocess_info *sub_info; 402 - struct umh_info *info = data; 403 - const char *cmdline = (info->cmdline) ? 
info->cmdline : "usermodehelper"; 404 - 405 - sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL); 406 - if (!sub_info) 407 - return NULL; 408 - 409 - sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL); 410 - if (!sub_info->argv) { 411 - kfree(sub_info); 412 - return NULL; 413 - } 414 - 415 - INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); 416 - sub_info->path = "none"; 417 - sub_info->file = file; 418 - sub_info->init = init; 419 - sub_info->cleanup = cleanup; 420 - sub_info->data = data; 421 - return sub_info; 422 - } 423 - 424 - static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) 425 - { 426 - struct umh_info *umh_info = info->data; 427 - struct file *from_umh[2]; 428 - struct file *to_umh[2]; 429 - int err; 430 - 431 - /* create pipe to send data to umh */ 432 - err = create_pipe_files(to_umh, 0); 433 - if (err) 434 - return err; 435 - err = replace_fd(0, to_umh[0], 0); 436 - fput(to_umh[0]); 437 - if (err < 0) { 438 - fput(to_umh[1]); 439 - return err; 440 - } 441 - 442 - /* create pipe to receive data from umh */ 443 - err = create_pipe_files(from_umh, 0); 444 - if (err) { 445 - fput(to_umh[1]); 446 - replace_fd(0, NULL, 0); 447 - return err; 448 - } 449 - err = replace_fd(1, from_umh[1], 0); 450 - fput(from_umh[1]); 451 - if (err < 0) { 452 - fput(to_umh[1]); 453 - replace_fd(0, NULL, 0); 454 - fput(from_umh[0]); 455 - return err; 456 - } 457 - 458 - umh_info->pipe_to_umh = to_umh[1]; 459 - umh_info->pipe_from_umh = from_umh[0]; 460 - return 0; 461 - } 462 - 463 - static void umh_clean_and_save_pid(struct subprocess_info *info) 464 - { 465 - struct umh_info *umh_info = info->data; 466 - 467 - /* cleanup if umh_pipe_setup() was successful but exec failed */ 468 - if (info->pid && info->retval) { 469 - fput(umh_info->pipe_to_umh); 470 - fput(umh_info->pipe_from_umh); 471 - } 472 - 473 - argv_free(info->argv); 474 - umh_info->pid = info->pid; 475 - } 476 - 477 - /** 478 - * fork_usermode_blob - fork a blob of 
bytes as a usermode process 479 - * @data: a blob of bytes that can be do_execv-ed as a file 480 - * @len: length of the blob 481 - * @info: information about usermode process (shouldn't be NULL) 482 - * 483 - * If info->cmdline is set it will be used as command line for the 484 - * user process, else "usermodehelper" is used. 485 - * 486 - * Returns either negative error or zero which indicates success 487 - * in executing a blob of bytes as a usermode process. In such 488 - * case 'struct umh_info *info' is populated with two pipes 489 - * and a pid of the process. The caller is responsible for health 490 - * check of the user process, killing it via pid, and closing the 491 - * pipes when user process is no longer needed. 492 - */ 493 - int fork_usermode_blob(void *data, size_t len, struct umh_info *info) 494 - { 495 - struct subprocess_info *sub_info; 496 - struct file *file; 497 - ssize_t written; 498 - loff_t pos = 0; 499 - int err; 500 - 501 - file = shmem_kernel_file_setup("", len, 0); 502 - if (IS_ERR(file)) 503 - return PTR_ERR(file); 504 - 505 - written = kernel_write(file, data, len, &pos); 506 - if (written != len) { 507 - err = written; 508 - if (err >= 0) 509 - err = -ENOMEM; 510 - goto out; 511 - } 512 - 513 - err = -ENOMEM; 514 - sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup, 515 - umh_clean_and_save_pid, info); 516 - if (!sub_info) 517 - goto out; 518 - 519 - err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); 520 - if (!err) { 521 - mutex_lock(&umh_list_lock); 522 - list_add(&info->list, &umh_list); 523 - mutex_unlock(&umh_list_lock); 524 - } 525 - out: 526 - fput(file); 527 - return err; 528 - } 529 - EXPORT_SYMBOL_GPL(fork_usermode_blob); 530 - 531 408 /** 532 409 * call_usermodehelper_exec - start a usermode application 533 410 * @sub_info: information about the subprocessa ··· 553 698 } 554 699 555 700 return 0; 556 - } 557 - 558 - void __exit_umh(struct task_struct *tsk) 559 - { 560 - struct umh_info *info; 561 - pid_t 
pid = tsk->pid; 562 - 563 - mutex_lock(&umh_list_lock); 564 - list_for_each_entry(info, &umh_list, list) { 565 - if (info->pid == pid) { 566 - list_del(&info->list); 567 - mutex_unlock(&umh_list_lock); 568 - goto out; 569 - } 570 - } 571 - mutex_unlock(&umh_list_lock); 572 - return; 573 - out: 574 - if (info->cleanup) 575 - info->cleanup(info); 576 701 } 577 702 578 703 struct ctl_table usermodehelper_table[] = {
+182
kernel/usermode_driver.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * umd - User mode driver support 4 + */ 5 + #include <linux/shmem_fs.h> 6 + #include <linux/pipe_fs_i.h> 7 + #include <linux/mount.h> 8 + #include <linux/fs_struct.h> 9 + #include <linux/task_work.h> 10 + #include <linux/usermode_driver.h> 11 + 12 + static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name) 13 + { 14 + struct file_system_type *type; 15 + struct vfsmount *mnt; 16 + struct file *file; 17 + ssize_t written; 18 + loff_t pos = 0; 19 + 20 + type = get_fs_type("tmpfs"); 21 + if (!type) 22 + return ERR_PTR(-ENODEV); 23 + 24 + mnt = kern_mount(type); 25 + put_filesystem(type); 26 + if (IS_ERR(mnt)) 27 + return mnt; 28 + 29 + file = file_open_root(mnt->mnt_root, mnt, name, O_CREAT | O_WRONLY, 0700); 30 + if (IS_ERR(file)) { 31 + mntput(mnt); 32 + return ERR_CAST(file); 33 + } 34 + 35 + written = kernel_write(file, data, len, &pos); 36 + if (written != len) { 37 + int err = written; 38 + if (err >= 0) 39 + err = -ENOMEM; 40 + filp_close(file, NULL); 41 + mntput(mnt); 42 + return ERR_PTR(err); 43 + } 44 + 45 + fput(file); 46 + 47 + /* Flush delayed fput so exec can open the file read-only */ 48 + flush_delayed_fput(); 49 + task_work_run(); 50 + return mnt; 51 + } 52 + 53 + /** 54 + * umd_load_blob - Remember a blob of bytes for fork_usermode_driver 55 + * @info: information about usermode driver 56 + * @data: a blob of bytes that can be executed as a file 57 + * @len: The length of the blob 58 + * 59 + */ 60 + int umd_load_blob(struct umd_info *info, const void *data, size_t len) 61 + { 62 + struct vfsmount *mnt; 63 + 64 + if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt)) 65 + return -EBUSY; 66 + 67 + mnt = blob_to_mnt(data, len, info->driver_name); 68 + if (IS_ERR(mnt)) 69 + return PTR_ERR(mnt); 70 + 71 + info->wd.mnt = mnt; 72 + info->wd.dentry = mnt->mnt_root; 73 + return 0; 74 + } 75 + EXPORT_SYMBOL_GPL(umd_load_blob); 76 + 77 + /** 78 + * umd_unload_blob - Disassociate 
@info from a previously loaded blob 79 + * @info: information about usermode driver 80 + * 81 + */ 82 + int umd_unload_blob(struct umd_info *info) 83 + { 84 + if (WARN_ON_ONCE(!info->wd.mnt || 85 + !info->wd.dentry || 86 + info->wd.mnt->mnt_root != info->wd.dentry)) 87 + return -EINVAL; 88 + 89 + kern_unmount(info->wd.mnt); 90 + info->wd.mnt = NULL; 91 + info->wd.dentry = NULL; 92 + return 0; 93 + } 94 + EXPORT_SYMBOL_GPL(umd_unload_blob); 95 + 96 + static int umd_setup(struct subprocess_info *info, struct cred *new) 97 + { 98 + struct umd_info *umd_info = info->data; 99 + struct file *from_umh[2]; 100 + struct file *to_umh[2]; 101 + int err; 102 + 103 + /* create pipe to send data to umh */ 104 + err = create_pipe_files(to_umh, 0); 105 + if (err) 106 + return err; 107 + err = replace_fd(0, to_umh[0], 0); 108 + fput(to_umh[0]); 109 + if (err < 0) { 110 + fput(to_umh[1]); 111 + return err; 112 + } 113 + 114 + /* create pipe to receive data from umh */ 115 + err = create_pipe_files(from_umh, 0); 116 + if (err) { 117 + fput(to_umh[1]); 118 + replace_fd(0, NULL, 0); 119 + return err; 120 + } 121 + err = replace_fd(1, from_umh[1], 0); 122 + fput(from_umh[1]); 123 + if (err < 0) { 124 + fput(to_umh[1]); 125 + replace_fd(0, NULL, 0); 126 + fput(from_umh[0]); 127 + return err; 128 + } 129 + 130 + set_fs_pwd(current->fs, &umd_info->wd); 131 + umd_info->pipe_to_umh = to_umh[1]; 132 + umd_info->pipe_from_umh = from_umh[0]; 133 + umd_info->tgid = get_pid(task_tgid(current)); 134 + return 0; 135 + } 136 + 137 + static void umd_cleanup(struct subprocess_info *info) 138 + { 139 + struct umd_info *umd_info = info->data; 140 + 141 + /* cleanup if umd_setup() was successful but exec failed */ 142 + if (info->retval) { 143 + fput(umd_info->pipe_to_umh); 144 + fput(umd_info->pipe_from_umh); 145 + put_pid(umd_info->tgid); 146 + umd_info->tgid = NULL; 147 + } 148 + } 149 + 150 + /** 151 + * fork_usermode_driver - fork a usermode driver 152 + * @info: information about usermode driver 
(shouldn't be NULL) 153 + * 154 + * Returns either negative error or zero which indicates success in 155 + * executing a usermode driver. In such case 'struct umd_info *info' 156 + * is populated with two pipes and a tgid of the process. The caller is 157 + * responsible for health check of the user process, killing it via 158 + * tgid, and closing the pipes when user process is no longer needed. 159 + */ 160 + int fork_usermode_driver(struct umd_info *info) 161 + { 162 + struct subprocess_info *sub_info; 163 + const char *argv[] = { info->driver_name, NULL }; 164 + int err; 165 + 166 + if (WARN_ON_ONCE(info->tgid)) 167 + return -EBUSY; 168 + 169 + err = -ENOMEM; 170 + sub_info = call_usermodehelper_setup(info->driver_name, 171 + (char **)argv, NULL, GFP_KERNEL, 172 + umd_setup, umd_cleanup, info); 173 + if (!sub_info) 174 + goto out; 175 + 176 + err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); 177 + out: 178 + return err; 179 + } 180 + EXPORT_SYMBOL_GPL(fork_usermode_driver); 181 + 182 +
+19 -19
net/bpfilter/bpfilter_kern.c
··· 15 15 16 16 static void shutdown_umh(void) 17 17 { 18 - struct task_struct *tsk; 18 + struct umd_info *info = &bpfilter_ops.info; 19 + struct pid *tgid = info->tgid; 19 20 20 - if (bpfilter_ops.stop) 21 - return; 22 - 23 - tsk = get_pid_task(find_vpid(bpfilter_ops.info.pid), PIDTYPE_PID); 24 - if (tsk) { 25 - send_sig(SIGKILL, tsk, 1); 26 - put_task_struct(tsk); 21 + if (tgid) { 22 + kill_pid(tgid, SIGKILL, 1); 23 + wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); 24 + bpfilter_umh_cleanup(info); 27 25 } 28 26 } 29 27 ··· 46 48 req.cmd = optname; 47 49 req.addr = (long __force __user)optval; 48 50 req.len = optlen; 49 - if (!bpfilter_ops.info.pid) 51 + if (!bpfilter_ops.info.tgid) 50 52 goto out; 51 53 n = kernel_write(bpfilter_ops.info.pipe_to_umh, &req, sizeof(req), 52 54 &pos); ··· 75 77 int err; 76 78 77 79 /* fork usermode process */ 78 - err = fork_usermode_blob(&bpfilter_umh_start, 79 - &bpfilter_umh_end - &bpfilter_umh_start, 80 - &bpfilter_ops.info); 80 + err = fork_usermode_driver(&bpfilter_ops.info); 81 81 if (err) 82 82 return err; 83 - bpfilter_ops.stop = false; 84 - pr_info("Loaded bpfilter_umh pid %d\n", bpfilter_ops.info.pid); 83 + pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid)); 85 84 86 85 /* health check that usermode process started correctly */ 87 86 if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) { ··· 93 98 { 94 99 int err; 95 100 101 + err = umd_load_blob(&bpfilter_ops.info, 102 + &bpfilter_umh_start, 103 + &bpfilter_umh_end - &bpfilter_umh_start); 104 + if (err) 105 + return err; 106 + 96 107 mutex_lock(&bpfilter_ops.lock); 97 - if (!bpfilter_ops.stop) { 98 - err = -EFAULT; 99 - goto out; 100 - } 101 108 err = start_umh(); 102 109 if (!err && IS_ENABLED(CONFIG_INET)) { 103 110 bpfilter_ops.sockopt = &__bpfilter_process_sockopt; 104 111 bpfilter_ops.start = &start_umh; 105 112 } 106 - out: 107 113 mutex_unlock(&bpfilter_ops.lock); 114 + if (err) 115 + umd_unload_blob(&bpfilter_ops.info); 108 116 
return err; 109 117 } 110 118 ··· 120 122 bpfilter_ops.sockopt = NULL; 121 123 } 122 124 mutex_unlock(&bpfilter_ops.lock); 125 + 126 + umd_unload_blob(&bpfilter_ops.info); 123 127 } 124 128 module_init(load_umh); 125 129 module_exit(fini_umh);
+1 -1
net/bpfilter/bpfilter_umh_blob.S
··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 - .section .rodata, "a" 2 + .section .init.rodata, "a" 3 3 .global bpfilter_umh_start 4 4 bpfilter_umh_start: 5 5 .incbin "net/bpfilter/bpfilter_umh"
+11 -9
net/ipv4/bpfilter/sockopt.c
··· 12 12 struct bpfilter_umh_ops bpfilter_ops; 13 13 EXPORT_SYMBOL_GPL(bpfilter_ops); 14 14 15 - static void bpfilter_umh_cleanup(struct umh_info *info) 15 + void bpfilter_umh_cleanup(struct umd_info *info) 16 16 { 17 - mutex_lock(&bpfilter_ops.lock); 18 - bpfilter_ops.stop = true; 19 17 fput(info->pipe_to_umh); 20 18 fput(info->pipe_from_umh); 21 - info->pid = 0; 22 - mutex_unlock(&bpfilter_ops.lock); 19 + put_pid(info->tgid); 20 + info->tgid = NULL; 23 21 } 22 + EXPORT_SYMBOL_GPL(bpfilter_umh_cleanup); 24 23 25 24 static int bpfilter_mbox_request(struct sock *sk, int optname, 26 25 char __user *optval, ··· 37 38 goto out; 38 39 } 39 40 } 40 - if (bpfilter_ops.stop) { 41 + if (bpfilter_ops.info.tgid && 42 + thread_group_exited(bpfilter_ops.info.tgid)) 43 + bpfilter_umh_cleanup(&bpfilter_ops.info); 44 + 45 + if (!bpfilter_ops.info.tgid) { 41 46 err = bpfilter_ops.start(); 42 47 if (err) 43 48 goto out; ··· 72 69 static int __init bpfilter_sockopt_init(void) 73 70 { 74 71 mutex_init(&bpfilter_ops.lock); 75 - bpfilter_ops.stop = true; 76 - bpfilter_ops.info.cmdline = "bpfilter_umh"; 77 - bpfilter_ops.info.cleanup = &bpfilter_umh_cleanup; 72 + bpfilter_ops.info.tgid = NULL; 73 + bpfilter_ops.info.driver_name = "bpfilter_umh"; 78 74 79 75 return 0; 80 76 }
+1 -1
security/tomoyo/common.h
··· 425 425 struct tomoyo_obj_info *obj; 426 426 /* 427 427 * For holding parameters specific to execve() request. 428 - * NULL if not dealing do_execve(). 428 + * NULL if not dealing execve(). 429 429 */ 430 430 struct tomoyo_execve *ee; 431 431 struct tomoyo_domain_info *domain;
+2 -2
security/tomoyo/domain.c
··· 767 767 768 768 /* 769 769 * Check for domain transition preference if "file execute" matched. 770 - * If preference is given, make do_execve() fail if domain transition 770 + * If preference is given, make execve() fail if domain transition 771 771 * has failed, for domain transition preference should be used with 772 772 * destination domain defined. 773 773 */ ··· 810 810 snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "<%s>", 811 811 candidate->name); 812 812 /* 813 - * Make do_execve() fail if domain transition across namespaces 813 + * Make execve() fail if domain transition across namespaces 814 814 * has failed. 815 815 */ 816 816 reject_on_transition_failure = true;
+2 -2
security/tomoyo/tomoyo.c
··· 93 93 struct tomoyo_task *s = tomoyo_task(current); 94 94 95 95 /* 96 - * Execute permission is checked against pathname passed to do_execve() 96 + * Execute permission is checked against pathname passed to execve() 97 97 * using current domain. 98 98 */ 99 99 if (!s->old_domain_info) { ··· 307 307 */ 308 308 static int tomoyo_file_open(struct file *f) 309 309 { 310 - /* Don't check read permission here if called from do_execve(). */ 310 + /* Don't check read permission here if called from execve(). */ 311 311 if (current->in_execve) 312 312 return 0; 313 313 return tomoyo_check_open_permission(tomoyo_domain(), &f->f_path,