Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

userns: Add a knob to disable setgroups on a per user namespace basis

- Expose the knob to user space through a proc file /proc/<pid>/setgroups

A value of "deny" means the setgroups system call is disabled in the
current process's user namespace and cannot be enabled in the
future in this user namespace.

A value of "allow" means the setgroups system call is enabled.

- Descendant user namespaces inherit the value of setgroups from
their parents.

- A proc file is used (instead of a sysctl) as sysctls currently do
not allow checking the permissions at open time.

- Writing to the proc file is restricted to before the gid_map
for the user namespace is set.

This ensures that disabling setgroups at a user namespace
level will never remove the ability to call setgroups
from a process that already has that ability.

A process may opt in to the setgroups disable for itself by
creating, entering and configuring a user namespace or by calling
setns on an existing user namespace with setgroups disabled.
Processes without privileges already can not call setgroups so this
is a no-op. Processes with privilege become processes without
privilege when entering a user namespace and as with any other path
to dropping privilege they would not have the ability to call
setgroups. So this remains within the bounds of what is possible
without a knob to disable setgroups permanently in a user namespace.

Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

+146
+53
fs/proc/base.c
··· 2464 2464 .llseek = seq_lseek, 2465 2465 .release = proc_id_map_release, 2466 2466 }; 2467 + 2468 + static int proc_setgroups_open(struct inode *inode, struct file *file) 2469 + { 2470 + struct user_namespace *ns = NULL; 2471 + struct task_struct *task; 2472 + int ret; 2473 + 2474 + ret = -ESRCH; 2475 + task = get_proc_task(inode); 2476 + if (task) { 2477 + rcu_read_lock(); 2478 + ns = get_user_ns(task_cred_xxx(task, user_ns)); 2479 + rcu_read_unlock(); 2480 + put_task_struct(task); 2481 + } 2482 + if (!ns) 2483 + goto err; 2484 + 2485 + if (file->f_mode & FMODE_WRITE) { 2486 + ret = -EACCES; 2487 + if (!ns_capable(ns, CAP_SYS_ADMIN)) 2488 + goto err_put_ns; 2489 + } 2490 + 2491 + ret = single_open(file, &proc_setgroups_show, ns); 2492 + if (ret) 2493 + goto err_put_ns; 2494 + 2495 + return 0; 2496 + err_put_ns: 2497 + put_user_ns(ns); 2498 + err: 2499 + return ret; 2500 + } 2501 + 2502 + static int proc_setgroups_release(struct inode *inode, struct file *file) 2503 + { 2504 + struct seq_file *seq = file->private_data; 2505 + struct user_namespace *ns = seq->private; 2506 + int ret = single_release(inode, file); 2507 + put_user_ns(ns); 2508 + return ret; 2509 + } 2510 + 2511 + static const struct file_operations proc_setgroups_operations = { 2512 + .open = proc_setgroups_open, 2513 + .write = proc_setgroups_write, 2514 + .read = seq_read, 2515 + .llseek = seq_lseek, 2516 + .release = proc_setgroups_release, 2517 + }; 2467 2518 #endif /* CONFIG_USER_NS */ 2468 2519 2469 2520 static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, ··· 2623 2572 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), 2624 2573 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), 2625 2574 REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), 2575 + REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), 2626 2576 #endif 2627 2577 #ifdef CONFIG_CHECKPOINT_RESTORE 2628 2578 REG("timers", S_IRUGO, proc_timers_operations), ··· 2965 2913 
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), 2966 2914 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), 2967 2915 REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), 2916 + REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), 2968 2917 #endif 2969 2918 }; 2970 2919
+7
include/linux/user_namespace.h
··· 17 17 } extent[UID_GID_MAP_MAX_EXTENTS]; 18 18 }; 19 19 20 + #define USERNS_SETGROUPS_ALLOWED 1UL 21 + 22 + #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED 23 + 20 24 struct user_namespace { 21 25 struct uid_gid_map uid_map; 22 26 struct uid_gid_map gid_map; ··· 31 27 kuid_t owner; 32 28 kgid_t group; 33 29 unsigned int proc_inum; 30 + unsigned long flags; 34 31 35 32 /* Register of per-UID persistent keyrings for this namespace */ 36 33 #ifdef CONFIG_PERSISTENT_KEYRINGS ··· 68 63 extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); 69 64 extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); 70 65 extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); 66 + extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); 67 + extern int proc_setgroups_show(struct seq_file *m, void *v); 71 68 extern bool userns_may_setgroups(const struct user_namespace *ns); 72 69 #else 73 70
+1
kernel/user.c
··· 51 51 .owner = GLOBAL_ROOT_UID, 52 52 .group = GLOBAL_ROOT_GID, 53 53 .proc_inum = PROC_USER_INIT_INO, 54 + .flags = USERNS_INIT_FLAGS, 54 55 #ifdef CONFIG_PERSISTENT_KEYRINGS 55 56 .persistent_keyring_register_sem = 56 57 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
+85
kernel/user_namespace.c
··· 100 100 ns->owner = owner; 101 101 ns->group = group; 102 102 103 + /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ 104 + mutex_lock(&userns_state_mutex); 105 + ns->flags = parent_ns->flags; 106 + mutex_unlock(&userns_state_mutex); 107 + 103 108 set_cred_user_ns(new, ns); 104 109 105 110 #ifdef CONFIG_PERSISTENT_KEYRINGS ··· 844 839 return false; 845 840 } 846 841 842 + int proc_setgroups_show(struct seq_file *seq, void *v) 843 + { 844 + struct user_namespace *ns = seq->private; 845 + unsigned long userns_flags = ACCESS_ONCE(ns->flags); 846 + 847 + seq_printf(seq, "%s\n", 848 + (userns_flags & USERNS_SETGROUPS_ALLOWED) ? 849 + "allow" : "deny"); 850 + return 0; 851 + } 852 + 853 + ssize_t proc_setgroups_write(struct file *file, const char __user *buf, 854 + size_t count, loff_t *ppos) 855 + { 856 + struct seq_file *seq = file->private_data; 857 + struct user_namespace *ns = seq->private; 858 + char kbuf[8], *pos; 859 + bool setgroups_allowed; 860 + ssize_t ret; 861 + 862 + /* Only allow a very narrow range of strings to be written */ 863 + ret = -EINVAL; 864 + if ((*ppos != 0) || (count >= sizeof(kbuf))) 865 + goto out; 866 + 867 + /* What was written? */ 868 + ret = -EFAULT; 869 + if (copy_from_user(kbuf, buf, count)) 870 + goto out; 871 + kbuf[count] = '\0'; 872 + pos = kbuf; 873 + 874 + /* What is being requested? */ 875 + ret = -EINVAL; 876 + if (strncmp(pos, "allow", 5) == 0) { 877 + pos += 5; 878 + setgroups_allowed = true; 879 + } 880 + else if (strncmp(pos, "deny", 4) == 0) { 881 + pos += 4; 882 + setgroups_allowed = false; 883 + } 884 + else 885 + goto out; 886 + 887 + /* Verify there is not trailing junk on the line */ 888 + pos = skip_spaces(pos); 889 + if (*pos != '\0') 890 + goto out; 891 + 892 + ret = -EPERM; 893 + mutex_lock(&userns_state_mutex); 894 + if (setgroups_allowed) { 895 + /* Enabling setgroups after setgroups has been disabled 896 + * is not allowed. 
897 + */ 898 + if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) 899 + goto out_unlock; 900 + } else { 901 + /* Permanently disabling setgroups after setgroups has 902 + * been enabled by writing the gid_map is not allowed. 903 + */ 904 + if (ns->gid_map.nr_extents != 0) 905 + goto out_unlock; 906 + ns->flags &= ~USERNS_SETGROUPS_ALLOWED; 907 + } 908 + mutex_unlock(&userns_state_mutex); 909 + 910 + /* Report a successful write */ 911 + *ppos = count; 912 + ret = count; 913 + out: 914 + return ret; 915 + out_unlock: 916 + mutex_unlock(&userns_state_mutex); 917 + goto out; 918 + } 919 + 847 920 bool userns_may_setgroups(const struct user_namespace *ns) 848 921 { 849 922 bool allowed; ··· 931 848 * the user namespace has been established. 932 849 */ 933 850 allowed = ns->gid_map.nr_extents != 0; 851 + /* Is setgroups allowed? */ 852 + allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); 934 853 mutex_unlock(&userns_state_mutex); 935 854 936 855 return allowed;