Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull userns fixes from Eric W Biederman:
"The bulk of the changes are fixing the worst consequences of the user
namespace design oversight in not considering what happens when one
namespace starts off as a clone of another namespace, as happens with
the mount namespace.

The rest of the changes are just plain bug fixes.

Many thanks to Andy Lutomirski for pointing out many of these issues."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
userns: Restrict when proc and sysfs can be mounted
ipc: Restrict mounting the mqueue filesystem
vfs: Carefully propagate mounts across user namespaces
vfs: Add a mount flag to lock read only bind mounts
userns: Don't allow creation if the user is chrooted
yama: Better permission check for ptraceme
pid: Handle the exit of a multi-threaded init.
scm: Require CAP_SYS_ADMIN over the current pidns to spoof pids.

+105 -8
+53 -1
fs/namespace.c
··· 798 798 } 799 799 800 800 mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; 801 + /* Don't allow unprivileged users to change mount flags */ 802 + if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) 803 + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; 804 + 801 805 atomic_inc(&sb->s_active); 802 806 mnt->mnt.mnt_sb = sb; 803 807 mnt->mnt.mnt_root = dget(root); ··· 1717 1713 if (readonly_request == __mnt_is_readonly(mnt)) 1718 1714 return 0; 1719 1715 1716 + if (mnt->mnt_flags & MNT_LOCK_READONLY) 1717 + return -EPERM; 1718 + 1720 1719 if (readonly_request) 1721 1720 error = mnt_make_readonly(real_mount(mnt)); 1722 1721 else ··· 2346 2339 /* First pass: copy the tree topology */ 2347 2340 copy_flags = CL_COPY_ALL | CL_EXPIRE; 2348 2341 if (user_ns != mnt_ns->user_ns) 2349 - copy_flags |= CL_SHARED_TO_SLAVE; 2342 + copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; 2350 2343 new = copy_tree(old, old->mnt.mnt_root, copy_flags); 2351 2344 if (IS_ERR(new)) { 2352 2345 up_write(&namespace_sem); ··· 2737 2730 bool our_mnt(struct vfsmount *mnt) 2738 2731 { 2739 2732 return check_mnt(real_mount(mnt)); 2733 + } 2734 + 2735 + bool current_chrooted(void) 2736 + { 2737 + /* Does the current process have a non-standard root */ 2738 + struct path ns_root; 2739 + struct path fs_root; 2740 + bool chrooted; 2741 + 2742 + /* Find the namespace root */ 2743 + ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt; 2744 + ns_root.dentry = ns_root.mnt->mnt_root; 2745 + path_get(&ns_root); 2746 + while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) 2747 + ; 2748 + 2749 + get_fs_root(current->fs, &fs_root); 2750 + 2751 + chrooted = !path_equal(&fs_root, &ns_root); 2752 + 2753 + path_put(&fs_root); 2754 + path_put(&ns_root); 2755 + 2756 + return chrooted; 2757 + } 2758 + 2759 + void update_mnt_policy(struct user_namespace *userns) 2760 + { 2761 + struct mnt_namespace *ns = current->nsproxy->mnt_ns; 2762 + struct mount *mnt; 2763 + 2764 + 
down_read(&namespace_sem); 2765 + list_for_each_entry(mnt, &ns->list, mnt_list) { 2766 + switch (mnt->mnt.mnt_sb->s_magic) { 2767 + case SYSFS_MAGIC: 2768 + userns->may_mount_sysfs = true; 2769 + break; 2770 + case PROC_SUPER_MAGIC: 2771 + userns->may_mount_proc = true; 2772 + break; 2773 + } 2774 + if (userns->may_mount_sysfs && userns->may_mount_proc) 2775 + break; 2776 + } 2777 + up_read(&namespace_sem); 2740 2778 } 2741 2779 2742 2780 static void *mntns_get(struct task_struct *task)
+6
fs/pnode.c
··· 9 9 #include <linux/mnt_namespace.h> 10 10 #include <linux/mount.h> 11 11 #include <linux/fs.h> 12 + #include <linux/nsproxy.h> 12 13 #include "internal.h" 13 14 #include "pnode.h" 14 15 ··· 221 220 int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, 222 221 struct mount *source_mnt, struct list_head *tree_list) 223 222 { 223 + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; 224 224 struct mount *m, *child; 225 225 int ret = 0; 226 226 struct mount *prev_dest_mnt = dest_mnt; ··· 238 236 continue; 239 237 240 238 source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); 239 + 240 + /* Notice when we are propagating across user namespaces */ 241 + if (m->mnt_ns->user_ns != user_ns) 242 + type |= CL_UNPRIVILEGED; 241 243 242 244 child = copy_tree(source, source->mnt.mnt_root, type); 243 245 if (IS_ERR(child)) {
+1
fs/pnode.h
··· 23 23 #define CL_MAKE_SHARED 0x08 24 24 #define CL_PRIVATE 0x10 25 25 #define CL_SHARED_TO_SLAVE 0x20 26 + #define CL_UNPRIVILEGED 0x40 26 27 27 28 static inline void set_mnt_shared(struct mount *mnt) 28 29 {
+4
fs/proc/root.c
··· 16 16 #include <linux/sched.h> 17 17 #include <linux/module.h> 18 18 #include <linux/bitops.h> 19 + #include <linux/user_namespace.h> 19 20 #include <linux/mount.h> 20 21 #include <linux/pid_namespace.h> 21 22 #include <linux/parser.h> ··· 109 108 } else { 110 109 ns = task_active_pid_ns(current); 111 110 options = data; 111 + 112 + if (!current_user_ns()->may_mount_proc) 113 + return ERR_PTR(-EPERM); 112 114 } 113 115 114 116 sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns);
+4
fs/sysfs/mount.c
··· 19 19 #include <linux/module.h> 20 20 #include <linux/magic.h> 21 21 #include <linux/slab.h> 22 + #include <linux/user_namespace.h> 22 23 23 24 #include "sysfs.h" 24 25 ··· 111 110 enum kobj_ns_type type; 112 111 struct super_block *sb; 113 112 int error; 113 + 114 + if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) 115 + return ERR_PTR(-EPERM); 114 116 115 117 info = kzalloc(sizeof(*info), GFP_KERNEL); 116 118 if (!info)
+2
include/linux/fs_struct.h
··· 50 50 spin_unlock(&fs->lock); 51 51 } 52 52 53 + extern bool current_chrooted(void); 54 + 53 55 #endif /* _LINUX_FS_STRUCT_H */
+2
include/linux/mount.h
··· 47 47 48 48 #define MNT_INTERNAL 0x4000 49 49 50 + #define MNT_LOCK_READONLY 0x400000 51 + 50 52 struct vfsmount { 51 53 struct dentry *mnt_root; /* root of the mounted tree */ 52 54 struct super_block *mnt_sb; /* pointer to superblock */
+4
include/linux/user_namespace.h
··· 26 26 kuid_t owner; 27 27 kgid_t group; 28 28 unsigned int proc_inum; 29 + bool may_mount_sysfs; 30 + bool may_mount_proc; 29 31 }; 30 32 31 33 extern struct user_namespace init_user_ns; ··· 83 81 } 84 82 85 83 #endif 84 + 85 + void update_mnt_policy(struct user_namespace *userns); 86 86 87 87 #endif /* _LINUX_USER_H */
+10 -2
ipc/mqueue.c
··· 330 330 int flags, const char *dev_name, 331 331 void *data) 332 332 { 333 - if (!(flags & MS_KERNMOUNT)) 334 - data = current->nsproxy->ipc_ns; 333 + if (!(flags & MS_KERNMOUNT)) { 334 + struct ipc_namespace *ns = current->nsproxy->ipc_ns; 335 + /* Don't allow mounting unless the caller has CAP_SYS_ADMIN 336 + * over the ipc namespace. 337 + */ 338 + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) 339 + return ERR_PTR(-EPERM); 340 + 341 + data = ns; 342 + } 335 343 return mount_ns(fs_type, flags, data, mqueue_fill_super); 336 344 } 337 345
+2 -1
kernel/pid_namespace.c
··· 181 181 int nr; 182 182 int rc; 183 183 struct task_struct *task, *me = current; 184 + int init_pids = thread_group_leader(me) ? 1 : 2; 184 185 185 186 /* Don't allow any more processes into the pid namespace */ 186 187 disable_pid_allocation(pid_ns); ··· 231 230 */ 232 231 for (;;) { 233 232 set_current_state(TASK_UNINTERRUPTIBLE); 234 - if (pid_ns->nr_hashed == 1) 233 + if (pid_ns->nr_hashed == init_pids) 235 234 break; 236 235 schedule(); 237 236 }
+2
kernel/user.c
··· 51 51 .owner = GLOBAL_ROOT_UID, 52 52 .group = GLOBAL_ROOT_GID, 53 53 .proc_inum = PROC_USER_INIT_INO, 54 + .may_mount_sysfs = true, 55 + .may_mount_proc = true, 54 56 }; 55 57 EXPORT_SYMBOL_GPL(init_user_ns); 56 58
+11
kernel/user_namespace.c
··· 61 61 kgid_t group = new->egid; 62 62 int ret; 63 63 64 + /* 65 + * Verify that we can not violate the policy of which files 66 + * may be accessed that is specified by the root directory, 67 + * by verifing that the root directory is at the root of the 68 + * mount namespace which allows all files to be accessed. 69 + */ 70 + if (current_chrooted()) 71 + return -EPERM; 72 + 64 73 /* The creator needs a mapping in the parent user namespace 65 74 * or else we won't be able to reasonably tell userspace who 66 75 * created a user_namespace. ··· 95 86 ns->group = group; 96 87 97 88 set_cred_user_ns(new, ns); 89 + 90 + update_mnt_policy(ns); 98 91 99 92 return 0; 100 93 }
+3 -1
net/core/scm.c
··· 24 24 #include <linux/interrupt.h> 25 25 #include <linux/netdevice.h> 26 26 #include <linux/security.h> 27 + #include <linux/pid_namespace.h> 27 28 #include <linux/pid.h> 28 29 #include <linux/nsproxy.h> 29 30 #include <linux/slab.h> ··· 53 52 if (!uid_valid(uid) || !gid_valid(gid)) 54 53 return -EINVAL; 55 54 56 - if ((creds->pid == task_tgid_vnr(current) || nsown_capable(CAP_SYS_ADMIN)) && 55 + if ((creds->pid == task_tgid_vnr(current) || 56 + ns_capable(current->nsproxy->pid_ns->user_ns, CAP_SYS_ADMIN)) && 57 57 ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) || 58 58 uid_eq(uid, cred->suid)) || nsown_capable(CAP_SETUID)) && 59 59 ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) ||
+1 -3
security/yama/yama_lsm.c
··· 347 347 /* Only disallow PTRACE_TRACEME on more aggressive settings. */ 348 348 switch (ptrace_scope) { 349 349 case YAMA_SCOPE_CAPABILITY: 350 - rcu_read_lock(); 351 - if (!ns_capable(__task_cred(parent)->user_ns, CAP_SYS_PTRACE)) 350 + if (!has_ns_capability(parent, current_user_ns(), CAP_SYS_PTRACE)) 352 351 rc = -EPERM; 353 - rcu_read_unlock(); 354 352 break; 355 353 case YAMA_SCOPE_NO_ATTACH: 356 354 rc = -EPERM;