Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull user namespace changes from Eric Biederman:
"While small this set of changes is very significant with respect to
containers in general and user namespaces in particular. The user
space interface is now complete.

This set of changes adds support for unprivileged users to create user
namespaces and as a user namespace root to create other namespaces.
The tyranny of supporting suid root preventing unprivileged users from
using cool new kernel features is broken.

This set of changes completes the work on setns, adding support for
the pid, user, mount namespaces.

This set of changes includes a bunch of basic pid namespace
cleanups/simplifications. Of particular significance is the rework of
the pid namespace cleanup so it no longer requires sending out
tendrils into all kinds of unexpected cleanup paths for operation. At
least one case of broken error handling is fixed by this cleanup.

The files under /proc/<pid>/ns/ have been converted from regular files
to magic symlinks which prevents incorrect caching by the VFS,
ensuring the files always refer to the namespace the process is
currently using and ensuring that the ptrace_may_access permission
checks are always applied.

The files under /proc/<pid>/ns/ have been given stable inode numbers
so it is now possible to see if different processes share the same
namespaces.

Through David Miller's net tree are changes to relax many of the
permission checks in the networking stack to allow the user
namespace root to usefully use the networking stack. Similar changes
for the mount namespace and the pid namespace are coming through my
tree.

Two small changes to add user namespace support were committed here and
in David Miller's -net tree so that I could complete the work on the
/proc/<pid>/ns/ files in this tree.

Work remains to make it safe to build user namespaces and 9p, afs,
ceph, cifs, coda, gfs2, ncpfs, nfs, nfsd, ocfs2, and xfs so the
Kconfig guard remains in place preventing user namespaces from
being built when any of those filesystems are enabled.

Future design work remains to allow root users outside of the initial
user namespace to mount more than just /proc and /sys."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (38 commits)
proc: Usable inode numbers for the namespace file descriptors.
proc: Fix the namespace inode permission checks.
proc: Generalize proc inode allocation
userns: Allow unprivilged mounts of proc and sysfs
userns: For /proc/self/{uid,gid}_map derive the lower userns from the struct file
procfs: Print task uids and gids in the userns that opened the proc file
userns: Implement unshare of the user namespace
userns: Implent proc namespace operations
userns: Kill task_user_ns
userns: Make create_new_namespaces take a user_ns parameter
userns: Allow unprivileged use of setns.
userns: Allow unprivileged users to create new namespaces
userns: Allow setting a userns mapping to your current uid.
userns: Allow chown and setgid preservation
userns: Allow unprivileged users to create user namespaces.
userns: Ignore suid and sgid on binaries if the uid or gid can not be mapped
userns: fix return value on mntns_install() failure
vfs: Allow unprivileged manipulation of the mount namespace.
vfs: Only support slave subtrees across different user namespaces
vfs: Add a user namespace reference from struct mnt_namespace
...

+996 -451
+1 -1
arch/powerpc/platforms/cell/spufs/sched.c
··· 1094 1094 LOAD_INT(c), LOAD_FRAC(c), 1095 1095 count_active_contexts(), 1096 1096 atomic_read(&nr_spu_contexts), 1097 - current->nsproxy->pid_ns->last_pid); 1097 + task_active_pid_ns(current)->last_pid); 1098 1098 return 0; 1099 1099 } 1100 1100
+1 -1
arch/um/drivers/mconsole_kern.c
··· 123 123 124 124 void mconsole_proc(struct mc_request *req) 125 125 { 126 - struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt; 126 + struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt; 127 127 char *buf; 128 128 int len; 129 129 struct file *file;
+2 -1
drivers/staging/android/binder.c
··· 35 35 #include <linux/uaccess.h> 36 36 #include <linux/vmalloc.h> 37 37 #include <linux/slab.h> 38 + #include <linux/pid_namespace.h> 38 39 39 40 #include "binder.h" 40 41 #include "binder_trace.h" ··· 2321 2320 if (t->from) { 2322 2321 struct task_struct *sender = t->from->proc->tsk; 2323 2322 tr.sender_pid = task_tgid_nr_ns(sender, 2324 - current->nsproxy->pid_ns); 2323 + task_active_pid_ns(current)); 2325 2324 } else { 2326 2325 tr.sender_pid = 0; 2327 2326 }
+7 -4
fs/attr.c
··· 49 49 /* Make sure a caller can chown. */ 50 50 if ((ia_valid & ATTR_UID) && 51 51 (!uid_eq(current_fsuid(), inode->i_uid) || 52 - !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN)) 52 + !uid_eq(attr->ia_uid, inode->i_uid)) && 53 + !inode_capable(inode, CAP_CHOWN)) 53 54 return -EPERM; 54 55 55 56 /* Make sure caller can chgrp. */ 56 57 if ((ia_valid & ATTR_GID) && 57 58 (!uid_eq(current_fsuid(), inode->i_uid) || 58 59 (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && 59 - !capable(CAP_CHOWN)) 60 + !inode_capable(inode, CAP_CHOWN)) 60 61 return -EPERM; 61 62 62 63 /* Make sure a caller can chmod. */ ··· 66 65 return -EPERM; 67 66 /* Also check the setgid bit! */ 68 67 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : 69 - inode->i_gid) && !capable(CAP_FSETID)) 68 + inode->i_gid) && 69 + !inode_capable(inode, CAP_FSETID)) 70 70 attr->ia_mode &= ~S_ISGID; 71 71 } 72 72 ··· 159 157 if (ia_valid & ATTR_MODE) { 160 158 umode_t mode = attr->ia_mode; 161 159 162 - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) 160 + if (!in_group_p(inode->i_gid) && 161 + !inode_capable(inode, CAP_FSETID)) 163 162 mode &= ~S_ISGID; 164 163 inode->i_mode = mode; 165 164 }
+4 -4
fs/autofs4/autofs_i.h
··· 74 74 unsigned long last_used; 75 75 atomic_t count; 76 76 77 - uid_t uid; 78 - gid_t gid; 77 + kuid_t uid; 78 + kgid_t gid; 79 79 }; 80 80 81 81 #define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ ··· 89 89 struct qstr name; 90 90 u32 dev; 91 91 u64 ino; 92 - uid_t uid; 93 - gid_t gid; 92 + kuid_t uid; 93 + kgid_t gid; 94 94 pid_t pid; 95 95 pid_t tgid; 96 96 /* This is for status reporting upon return */
+2 -2
fs/autofs4/dev-ioctl.c
··· 437 437 err = 0; 438 438 autofs4_expire_wait(path.dentry); 439 439 spin_lock(&sbi->fs_lock); 440 - param->requester.uid = ino->uid; 441 - param->requester.gid = ino->gid; 440 + param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); 441 + param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid); 442 442 spin_unlock(&sbi->fs_lock); 443 443 } 444 444 path_put(&path);
+15 -9
fs/autofs4/inode.c
··· 36 36 37 37 void autofs4_clean_ino(struct autofs_info *ino) 38 38 { 39 - ino->uid = 0; 40 - ino->gid = 0; 39 + ino->uid = GLOBAL_ROOT_UID; 40 + ino->gid = GLOBAL_ROOT_GID; 41 41 ino->last_used = jiffies; 42 42 } 43 43 ··· 79 79 return 0; 80 80 81 81 seq_printf(m, ",fd=%d", sbi->pipefd); 82 - if (root_inode->i_uid != 0) 83 - seq_printf(m, ",uid=%u", root_inode->i_uid); 84 - if (root_inode->i_gid != 0) 85 - seq_printf(m, ",gid=%u", root_inode->i_gid); 82 + if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID)) 83 + seq_printf(m, ",uid=%u", 84 + from_kuid_munged(&init_user_ns, root_inode->i_uid)); 85 + if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) 86 + seq_printf(m, ",gid=%u", 87 + from_kgid_munged(&init_user_ns, root_inode->i_gid)); 86 88 seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); 87 89 seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); 88 90 seq_printf(m, ",minproto=%d", sbi->min_proto); ··· 128 126 {Opt_err, NULL} 129 127 }; 130 128 131 - static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, 129 + static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, 132 130 pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) 133 131 { 134 132 char *p; ··· 161 159 case Opt_uid: 162 160 if (match_int(args, &option)) 163 161 return 1; 164 - *uid = option; 162 + *uid = make_kuid(current_user_ns(), option); 163 + if (!uid_valid(*uid)) 164 + return 1; 165 165 break; 166 166 case Opt_gid: 167 167 if (match_int(args, &option)) 168 168 return 1; 169 - *gid = option; 169 + *gid = make_kgid(current_user_ns(), option); 170 + if (!gid_valid(*gid)) 171 + return 1; 170 172 break; 171 173 case Opt_pgrp: 172 174 if (match_int(args, &option))
+3 -2
fs/autofs4/waitq.c
··· 154 154 case autofs_ptype_expire_direct: 155 155 { 156 156 struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; 157 + struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns; 157 158 158 159 pktsz = sizeof(*packet); 159 160 ··· 164 163 packet->name[wq->name.len] = '\0'; 165 164 packet->dev = wq->dev; 166 165 packet->ino = wq->ino; 167 - packet->uid = wq->uid; 168 - packet->gid = wq->gid; 166 + packet->uid = from_kuid_munged(user_ns, wq->uid); 167 + packet->gid = from_kgid_munged(user_ns, wq->gid); 169 168 packet->pid = wq->pid; 170 169 packet->tgid = wq->tgid; 171 170 break;
+3 -6
fs/exec.c
··· 1266 1266 bprm->cred->egid = current_egid(); 1267 1267 1268 1268 if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && 1269 - !current->no_new_privs) { 1269 + !current->no_new_privs && 1270 + kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) && 1271 + kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) { 1270 1272 /* Set-uid? */ 1271 1273 if (mode & S_ISUID) { 1272 - if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid)) 1273 - return -EPERM; 1274 1274 bprm->per_clear |= PER_CLEAR_ON_SETID; 1275 1275 bprm->cred->euid = inode->i_uid; 1276 - 1277 1276 } 1278 1277 1279 1278 /* Set-gid? */ ··· 1282 1283 * executable. 1283 1284 */ 1284 1285 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { 1285 - if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) 1286 - return -EPERM; 1287 1286 bprm->per_clear |= PER_CLEAR_ON_SETID; 1288 1287 bprm->cred->egid = inode->i_gid; 1289 1288 }
+2 -2
fs/fuse/dev.c
··· 92 92 93 93 static void fuse_req_init_context(struct fuse_req *req) 94 94 { 95 - req->in.h.uid = current_fsuid(); 96 - req->in.h.gid = current_fsgid(); 95 + req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid()); 96 + req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid()); 97 97 req->in.h.pid = current->pid; 98 98 } 99 99
+10 -10
fs/fuse/dir.c
··· 818 818 stat->ino = attr->ino; 819 819 stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); 820 820 stat->nlink = attr->nlink; 821 - stat->uid = attr->uid; 822 - stat->gid = attr->gid; 821 + stat->uid = make_kuid(&init_user_ns, attr->uid); 822 + stat->gid = make_kgid(&init_user_ns, attr->gid); 823 823 stat->rdev = inode->i_rdev; 824 824 stat->atime.tv_sec = attr->atime; 825 825 stat->atime.tv_nsec = attr->atimensec; ··· 1007 1007 rcu_read_lock(); 1008 1008 ret = 0; 1009 1009 cred = __task_cred(task); 1010 - if (cred->euid == fc->user_id && 1011 - cred->suid == fc->user_id && 1012 - cred->uid == fc->user_id && 1013 - cred->egid == fc->group_id && 1014 - cred->sgid == fc->group_id && 1015 - cred->gid == fc->group_id) 1010 + if (uid_eq(cred->euid, fc->user_id) && 1011 + uid_eq(cred->suid, fc->user_id) && 1012 + uid_eq(cred->uid, fc->user_id) && 1013 + gid_eq(cred->egid, fc->group_id) && 1014 + gid_eq(cred->sgid, fc->group_id) && 1015 + gid_eq(cred->gid, fc->group_id)) 1016 1016 ret = 1; 1017 1017 rcu_read_unlock(); 1018 1018 ··· 1306 1306 if (ivalid & ATTR_MODE) 1307 1307 arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; 1308 1308 if (ivalid & ATTR_UID) 1309 - arg->valid |= FATTR_UID, arg->uid = iattr->ia_uid; 1309 + arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid); 1310 1310 if (ivalid & ATTR_GID) 1311 - arg->valid |= FATTR_GID, arg->gid = iattr->ia_gid; 1311 + arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid); 1312 1312 if (ivalid & ATTR_SIZE) 1313 1313 arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; 1314 1314 if (ivalid & ATTR_ATIME) {
+2 -2
fs/fuse/fuse_i.h
··· 333 333 atomic_t count; 334 334 335 335 /** The user id for this mount */ 336 - uid_t user_id; 336 + kuid_t user_id; 337 337 338 338 /** The group id for this mount */ 339 - gid_t group_id; 339 + kgid_t group_id; 340 340 341 341 /** The fuse mount flags for this mount */ 342 342 unsigned flags;
+14 -9
fs/fuse/inode.c
··· 60 60 struct fuse_mount_data { 61 61 int fd; 62 62 unsigned rootmode; 63 - unsigned user_id; 64 - unsigned group_id; 63 + kuid_t user_id; 64 + kgid_t group_id; 65 65 unsigned fd_present:1; 66 66 unsigned rootmode_present:1; 67 67 unsigned user_id_present:1; ··· 164 164 inode->i_ino = fuse_squash_ino(attr->ino); 165 165 inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); 166 166 set_nlink(inode, attr->nlink); 167 - inode->i_uid = attr->uid; 168 - inode->i_gid = attr->gid; 167 + inode->i_uid = make_kuid(&init_user_ns, attr->uid); 168 + inode->i_gid = make_kgid(&init_user_ns, attr->gid); 169 169 inode->i_blocks = attr->blocks; 170 170 inode->i_atime.tv_sec = attr->atime; 171 171 inode->i_atime.tv_nsec = attr->atimensec; ··· 492 492 case OPT_USER_ID: 493 493 if (match_int(&args[0], &value)) 494 494 return 0; 495 - d->user_id = value; 495 + d->user_id = make_kuid(current_user_ns(), value); 496 + if (!uid_valid(d->user_id)) 497 + return 0; 496 498 d->user_id_present = 1; 497 499 break; 498 500 499 501 case OPT_GROUP_ID: 500 502 if (match_int(&args[0], &value)) 501 503 return 0; 502 - d->group_id = value; 504 + d->group_id = make_kgid(current_user_ns(), value); 505 + if (!gid_valid(d->group_id)) 506 + return 0; 503 507 d->group_id_present = 1; 504 508 break; 505 509 ··· 544 540 struct super_block *sb = root->d_sb; 545 541 struct fuse_conn *fc = get_fuse_conn_super(sb); 546 542 547 - seq_printf(m, ",user_id=%u", fc->user_id); 548 - seq_printf(m, ",group_id=%u", fc->group_id); 543 + seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id)); 544 + seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id)); 549 545 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) 550 546 seq_puts(m, ",default_permissions"); 551 547 if (fc->flags & FUSE_ALLOW_OTHER) ··· 993 989 if (!file) 994 990 goto err; 995 991 996 - if (file->f_op != &fuse_dev_operations) 992 + if ((file->f_op != &fuse_dev_operations) || 993 + (file->f_cred->user_ns != 
&init_user_ns)) 997 994 goto err_fput; 998 995 999 996 fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+1 -1
fs/hppfs/hppfs.c
··· 710 710 struct vfsmount *proc_mnt; 711 711 int err = -ENOENT; 712 712 713 - proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt); 713 + proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt); 714 714 if (IS_ERR(proc_mnt)) 715 715 goto out; 716 716
+3
fs/mount.h
··· 4 4 5 5 struct mnt_namespace { 6 6 atomic_t count; 7 + unsigned int proc_inum; 7 8 struct mount * root; 8 9 struct list_head list; 10 + struct user_namespace *user_ns; 11 + u64 seq; /* Sequence number to prevent loops */ 9 12 wait_queue_head_t poll; 10 13 int event; 11 14 };
+175 -36
fs/namespace.c
··· 12 12 #include <linux/export.h> 13 13 #include <linux/capability.h> 14 14 #include <linux/mnt_namespace.h> 15 + #include <linux/user_namespace.h> 15 16 #include <linux/namei.h> 16 17 #include <linux/security.h> 17 18 #include <linux/idr.h> ··· 21 20 #include <linux/fs_struct.h> /* get_fs_root et.al. */ 22 21 #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ 23 22 #include <linux/uaccess.h> 23 + #include <linux/proc_fs.h> 24 24 #include "pnode.h" 25 25 #include "internal.h" 26 26 ··· 786 784 if (!mnt) 787 785 return ERR_PTR(-ENOMEM); 788 786 789 - if (flag & (CL_SLAVE | CL_PRIVATE)) 787 + if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) 790 788 mnt->mnt_group_id = 0; /* not a peer of original */ 791 789 else 792 790 mnt->mnt_group_id = old->mnt_group_id; ··· 807 805 list_add_tail(&mnt->mnt_instance, &sb->s_mounts); 808 806 br_write_unlock(&vfsmount_lock); 809 807 810 - if (flag & CL_SLAVE) { 808 + if ((flag & CL_SLAVE) || 809 + ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { 811 810 list_add(&mnt->mnt_slave, &old->mnt_slave_list); 812 811 mnt->mnt_master = old; 813 812 CLEAR_MNT_SHARED(mnt); ··· 1269 1266 goto dput_and_out; 1270 1267 1271 1268 retval = -EPERM; 1272 - if (!capable(CAP_SYS_ADMIN)) 1269 + if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) 1273 1270 goto dput_and_out; 1274 1271 1275 1272 retval = do_umount(mnt, flags); ··· 1295 1292 1296 1293 static int mount_is_safe(struct path *path) 1297 1294 { 1298 - if (capable(CAP_SYS_ADMIN)) 1295 + if (ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN)) 1299 1296 return 0; 1300 1297 return -EPERM; 1301 1298 #ifdef notyet ··· 1309 1306 return -EPERM; 1310 1307 return 0; 1311 1308 #endif 1309 + } 1310 + 1311 + static bool mnt_ns_loop(struct path *path) 1312 + { 1313 + /* Could bind mounting the mount namespace inode cause a 1314 + * mount namespace loop? 
1315 + */ 1316 + struct inode *inode = path->dentry->d_inode; 1317 + struct proc_inode *ei; 1318 + struct mnt_namespace *mnt_ns; 1319 + 1320 + if (!proc_ns_inode(inode)) 1321 + return false; 1322 + 1323 + ei = PROC_I(inode); 1324 + if (ei->ns_ops != &mntns_operations) 1325 + return false; 1326 + 1327 + mnt_ns = ei->ns; 1328 + return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; 1312 1329 } 1313 1330 1314 1331 struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, ··· 1633 1610 int type; 1634 1611 int err = 0; 1635 1612 1636 - if (!capable(CAP_SYS_ADMIN)) 1613 + if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) 1637 1614 return -EPERM; 1638 1615 1639 1616 if (path->dentry != path->mnt->mnt_root) ··· 1677 1654 err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); 1678 1655 if (err) 1679 1656 return err; 1657 + 1658 + err = -EINVAL; 1659 + if (mnt_ns_loop(&old_path)) 1660 + goto out; 1680 1661 1681 1662 err = lock_mount(path); 1682 1663 if (err) ··· 1797 1770 struct mount *p; 1798 1771 struct mount *old; 1799 1772 int err = 0; 1800 - if (!capable(CAP_SYS_ADMIN)) 1773 + if (!ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN)) 1801 1774 return -EPERM; 1802 1775 if (!old_name || !*old_name) 1803 1776 return -EINVAL; ··· 1884 1857 return ERR_PTR(err); 1885 1858 } 1886 1859 1887 - static struct vfsmount * 1888 - do_kern_mount(const char *fstype, int flags, const char *name, void *data) 1889 - { 1890 - struct file_system_type *type = get_fs_type(fstype); 1891 - struct vfsmount *mnt; 1892 - if (!type) 1893 - return ERR_PTR(-ENODEV); 1894 - mnt = vfs_kern_mount(type, flags, name, data); 1895 - if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && 1896 - !mnt->mnt_sb->s_subtype) 1897 - mnt = fs_set_subtype(mnt, fstype); 1898 - put_filesystem(type); 1899 - return mnt; 1900 - } 1901 - 1902 1860 /* 1903 1861 * add a mount into a namespace's mount tree 1904 1862 */ ··· 1929 1917 * create a new mount for userspace and request it to 
be added into the 1930 1918 * namespace's tree 1931 1919 */ 1932 - static int do_new_mount(struct path *path, const char *type, int flags, 1920 + static int do_new_mount(struct path *path, const char *fstype, int flags, 1933 1921 int mnt_flags, const char *name, void *data) 1934 1922 { 1923 + struct file_system_type *type; 1924 + struct user_namespace *user_ns; 1935 1925 struct vfsmount *mnt; 1936 1926 int err; 1937 1927 1938 - if (!type) 1928 + if (!fstype) 1939 1929 return -EINVAL; 1940 1930 1941 1931 /* we need capabilities... */ 1942 - if (!capable(CAP_SYS_ADMIN)) 1932 + user_ns = real_mount(path->mnt)->mnt_ns->user_ns; 1933 + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) 1943 1934 return -EPERM; 1944 1935 1945 - mnt = do_kern_mount(type, flags, name, data); 1936 + type = get_fs_type(fstype); 1937 + if (!type) 1938 + return -ENODEV; 1939 + 1940 + if (user_ns != &init_user_ns) { 1941 + if (!(type->fs_flags & FS_USERNS_MOUNT)) { 1942 + put_filesystem(type); 1943 + return -EPERM; 1944 + } 1945 + /* Only in special cases allow devices from mounts 1946 + * created outside the initial user namespace. 
1947 + */ 1948 + if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { 1949 + flags |= MS_NODEV; 1950 + mnt_flags |= MNT_NODEV; 1951 + } 1952 + } 1953 + 1954 + mnt = vfs_kern_mount(type, flags, name, data); 1955 + if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && 1956 + !mnt->mnt_sb->s_subtype) 1957 + mnt = fs_set_subtype(mnt, fstype); 1958 + 1959 + put_filesystem(type); 1946 1960 if (IS_ERR(mnt)) 1947 1961 return PTR_ERR(mnt); 1948 1962 ··· 2299 2261 return retval; 2300 2262 } 2301 2263 2302 - static struct mnt_namespace *alloc_mnt_ns(void) 2264 + static void free_mnt_ns(struct mnt_namespace *ns) 2265 + { 2266 + proc_free_inum(ns->proc_inum); 2267 + put_user_ns(ns->user_ns); 2268 + kfree(ns); 2269 + } 2270 + 2271 + /* 2272 + * Assign a sequence number so we can detect when we attempt to bind 2273 + * mount a reference to an older mount namespace into the current 2274 + * mount namespace, preventing reference counting loops. A 64bit 2275 + * number incrementing at 10Ghz will take 12,427 years to wrap which 2276 + * is effectively never, so we can ignore the possibility. 2277 + */ 2278 + static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); 2279 + 2280 + static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) 2303 2281 { 2304 2282 struct mnt_namespace *new_ns; 2283 + int ret; 2305 2284 2306 2285 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); 2307 2286 if (!new_ns) 2308 2287 return ERR_PTR(-ENOMEM); 2288 + ret = proc_alloc_inum(&new_ns->proc_inum); 2289 + if (ret) { 2290 + kfree(new_ns); 2291 + return ERR_PTR(ret); 2292 + } 2293 + new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); 2309 2294 atomic_set(&new_ns->count, 1); 2310 2295 new_ns->root = NULL; 2311 2296 INIT_LIST_HEAD(&new_ns->list); 2312 2297 init_waitqueue_head(&new_ns->poll); 2313 2298 new_ns->event = 0; 2299 + new_ns->user_ns = get_user_ns(user_ns); 2314 2300 return new_ns; 2315 2301 } 2316 2302 ··· 2343 2281 * copied from the namespace of the passed in task structure. 
2344 2282 */ 2345 2283 static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, 2346 - struct fs_struct *fs) 2284 + struct user_namespace *user_ns, struct fs_struct *fs) 2347 2285 { 2348 2286 struct mnt_namespace *new_ns; 2349 2287 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; 2350 2288 struct mount *p, *q; 2351 2289 struct mount *old = mnt_ns->root; 2352 2290 struct mount *new; 2291 + int copy_flags; 2353 2292 2354 - new_ns = alloc_mnt_ns(); 2293 + new_ns = alloc_mnt_ns(user_ns); 2355 2294 if (IS_ERR(new_ns)) 2356 2295 return new_ns; 2357 2296 2358 2297 down_write(&namespace_sem); 2359 2298 /* First pass: copy the tree topology */ 2360 - new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE); 2299 + copy_flags = CL_COPY_ALL | CL_EXPIRE; 2300 + if (user_ns != mnt_ns->user_ns) 2301 + copy_flags |= CL_SHARED_TO_SLAVE; 2302 + new = copy_tree(old, old->mnt.mnt_root, copy_flags); 2361 2303 if (IS_ERR(new)) { 2362 2304 up_write(&namespace_sem); 2363 - kfree(new_ns); 2305 + free_mnt_ns(new_ns); 2364 2306 return ERR_CAST(new); 2365 2307 } 2366 2308 new_ns->root = new; ··· 2405 2339 } 2406 2340 2407 2341 struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, 2408 - struct fs_struct *new_fs) 2342 + struct user_namespace *user_ns, struct fs_struct *new_fs) 2409 2343 { 2410 2344 struct mnt_namespace *new_ns; 2411 2345 ··· 2415 2349 if (!(flags & CLONE_NEWNS)) 2416 2350 return ns; 2417 2351 2418 - new_ns = dup_mnt_ns(ns, new_fs); 2352 + new_ns = dup_mnt_ns(ns, user_ns, new_fs); 2419 2353 2420 2354 put_mnt_ns(ns); 2421 2355 return new_ns; ··· 2427 2361 */ 2428 2362 static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) 2429 2363 { 2430 - struct mnt_namespace *new_ns = alloc_mnt_ns(); 2364 + struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); 2431 2365 if (!IS_ERR(new_ns)) { 2432 2366 struct mount *mnt = real_mount(m); 2433 2367 mnt->mnt_ns = new_ns; ··· 2567 2501 struct mount *new_mnt, *root_mnt; 2568 2502 int 
error; 2569 2503 2570 - if (!capable(CAP_SYS_ADMIN)) 2504 + if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) 2571 2505 return -EPERM; 2572 2506 2573 2507 error = user_path_dir(new_root, &new); ··· 2649 2583 struct vfsmount *mnt; 2650 2584 struct mnt_namespace *ns; 2651 2585 struct path root; 2586 + struct file_system_type *type; 2652 2587 2653 - mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); 2588 + type = get_fs_type("rootfs"); 2589 + if (!type) 2590 + panic("Can't find rootfs type"); 2591 + mnt = vfs_kern_mount(type, 0, "rootfs", NULL); 2592 + put_filesystem(type); 2654 2593 if (IS_ERR(mnt)) 2655 2594 panic("Can't create rootfs"); 2656 2595 ··· 2718 2647 br_write_unlock(&vfsmount_lock); 2719 2648 up_write(&namespace_sem); 2720 2649 release_mounts(&umount_list); 2721 - kfree(ns); 2650 + free_mnt_ns(ns); 2722 2651 } 2723 2652 2724 2653 struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) ··· 2752 2681 { 2753 2682 return check_mnt(real_mount(mnt)); 2754 2683 } 2684 + 2685 + static void *mntns_get(struct task_struct *task) 2686 + { 2687 + struct mnt_namespace *ns = NULL; 2688 + struct nsproxy *nsproxy; 2689 + 2690 + rcu_read_lock(); 2691 + nsproxy = task_nsproxy(task); 2692 + if (nsproxy) { 2693 + ns = nsproxy->mnt_ns; 2694 + get_mnt_ns(ns); 2695 + } 2696 + rcu_read_unlock(); 2697 + 2698 + return ns; 2699 + } 2700 + 2701 + static void mntns_put(void *ns) 2702 + { 2703 + put_mnt_ns(ns); 2704 + } 2705 + 2706 + static int mntns_install(struct nsproxy *nsproxy, void *ns) 2707 + { 2708 + struct fs_struct *fs = current->fs; 2709 + struct mnt_namespace *mnt_ns = ns; 2710 + struct path root; 2711 + 2712 + if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || 2713 + !nsown_capable(CAP_SYS_CHROOT)) 2714 + return -EPERM; 2715 + 2716 + if (fs->users != 1) 2717 + return -EINVAL; 2718 + 2719 + get_mnt_ns(mnt_ns); 2720 + put_mnt_ns(nsproxy->mnt_ns); 2721 + nsproxy->mnt_ns = mnt_ns; 2722 + 2723 + /* Find the root */ 2724 + root.mnt = 
&mnt_ns->root->mnt; 2725 + root.dentry = mnt_ns->root->mnt.mnt_root; 2726 + path_get(&root); 2727 + while(d_mountpoint(root.dentry) && follow_down_one(&root)) 2728 + ; 2729 + 2730 + /* Update the pwd and root */ 2731 + set_fs_pwd(fs, &root); 2732 + set_fs_root(fs, &root); 2733 + 2734 + path_put(&root); 2735 + return 0; 2736 + } 2737 + 2738 + static unsigned int mntns_inum(void *ns) 2739 + { 2740 + struct mnt_namespace *mnt_ns = ns; 2741 + return mnt_ns->proc_inum; 2742 + } 2743 + 2744 + const struct proc_ns_operations mntns_operations = { 2745 + .name = "mnt", 2746 + .type = CLONE_NEWNS, 2747 + .get = mntns_get, 2748 + .put = mntns_put, 2749 + .install = mntns_install, 2750 + .inum = mntns_inum, 2751 + };
+1 -1
fs/open.c
··· 435 435 goto dput_and_out; 436 436 437 437 error = -EPERM; 438 - if (!capable(CAP_SYS_CHROOT)) 438 + if (!nsown_capable(CAP_SYS_CHROOT)) 439 439 goto dput_and_out; 440 440 error = security_path_chroot(&path); 441 441 if (error)
+1
fs/pnode.h
··· 22 22 #define CL_COPY_ALL 0x04 23 23 #define CL_MAKE_SHARED 0x08 24 24 #define CL_PRIVATE 0x10 25 + #define CL_SHARED_TO_SLAVE 0x20 25 26 26 27 static inline void set_mnt_shared(struct mount *mnt) 27 28 {
+1
fs/proc/Makefile
··· 21 21 proc-y += version.o 22 22 proc-y += softirqs.o 23 23 proc-y += namespaces.o 24 + proc-y += self.o 24 25 proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o 25 26 proc-$(CONFIG_NET) += proc_net.o 26 27 proc-$(CONFIG_PROC_KCORE) += kcore.o
+1 -1
fs/proc/array.c
··· 162 162 static inline void task_state(struct seq_file *m, struct pid_namespace *ns, 163 163 struct pid *pid, struct task_struct *p) 164 164 { 165 - struct user_namespace *user_ns = current_user_ns(); 165 + struct user_namespace *user_ns = seq_user_ns(m); 166 166 struct group_info *group_info; 167 167 int g; 168 168 struct fdtable *fdt = NULL;
+3 -166
fs/proc/base.c
··· 2345 2345 }; 2346 2346 #endif 2347 2347 2348 - /* 2349 - * /proc/self: 2350 - */ 2351 - static int proc_self_readlink(struct dentry *dentry, char __user *buffer, 2352 - int buflen) 2353 - { 2354 - struct pid_namespace *ns = dentry->d_sb->s_fs_info; 2355 - pid_t tgid = task_tgid_nr_ns(current, ns); 2356 - char tmp[PROC_NUMBUF]; 2357 - if (!tgid) 2358 - return -ENOENT; 2359 - sprintf(tmp, "%d", tgid); 2360 - return vfs_readlink(dentry,buffer,buflen,tmp); 2361 - } 2362 - 2363 - static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) 2364 - { 2365 - struct pid_namespace *ns = dentry->d_sb->s_fs_info; 2366 - pid_t tgid = task_tgid_nr_ns(current, ns); 2367 - char *name = ERR_PTR(-ENOENT); 2368 - if (tgid) { 2369 - /* 11 for max length of signed int in decimal + NULL term */ 2370 - name = kmalloc(12, GFP_KERNEL); 2371 - if (!name) 2372 - name = ERR_PTR(-ENOMEM); 2373 - else 2374 - sprintf(name, "%d", tgid); 2375 - } 2376 - nd_set_link(nd, name); 2377 - return NULL; 2378 - } 2379 - 2380 - static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, 2381 - void *cookie) 2382 - { 2383 - char *s = nd_get_link(nd); 2384 - if (!IS_ERR(s)) 2385 - kfree(s); 2386 - } 2387 - 2388 - static const struct inode_operations proc_self_inode_operations = { 2389 - .readlink = proc_self_readlink, 2390 - .follow_link = proc_self_follow_link, 2391 - .put_link = proc_self_put_link, 2392 - }; 2393 - 2394 - /* 2395 - * proc base 2396 - * 2397 - * These are the directory entries in the root directory of /proc 2398 - * that properly belong to the /proc filesystem, as they describe 2399 - * describe something that is process related. 
2400 - */ 2401 - static const struct pid_entry proc_base_stuff[] = { 2402 - NOD("self", S_IFLNK|S_IRWXUGO, 2403 - &proc_self_inode_operations, NULL, {}), 2404 - }; 2405 - 2406 - static struct dentry *proc_base_instantiate(struct inode *dir, 2407 - struct dentry *dentry, struct task_struct *task, const void *ptr) 2408 - { 2409 - const struct pid_entry *p = ptr; 2410 - struct inode *inode; 2411 - struct proc_inode *ei; 2412 - struct dentry *error; 2413 - 2414 - /* Allocate the inode */ 2415 - error = ERR_PTR(-ENOMEM); 2416 - inode = new_inode(dir->i_sb); 2417 - if (!inode) 2418 - goto out; 2419 - 2420 - /* Initialize the inode */ 2421 - ei = PROC_I(inode); 2422 - inode->i_ino = get_next_ino(); 2423 - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 2424 - 2425 - /* 2426 - * grab the reference to the task. 2427 - */ 2428 - ei->pid = get_task_pid(task, PIDTYPE_PID); 2429 - if (!ei->pid) 2430 - goto out_iput; 2431 - 2432 - inode->i_mode = p->mode; 2433 - if (S_ISDIR(inode->i_mode)) 2434 - set_nlink(inode, 2); 2435 - if (S_ISLNK(inode->i_mode)) 2436 - inode->i_size = 64; 2437 - if (p->iop) 2438 - inode->i_op = p->iop; 2439 - if (p->fop) 2440 - inode->i_fop = p->fop; 2441 - ei->op = p->op; 2442 - d_add(dentry, inode); 2443 - error = NULL; 2444 - out: 2445 - return error; 2446 - out_iput: 2447 - iput(inode); 2448 - goto out; 2449 - } 2450 - 2451 - static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) 2452 - { 2453 - struct dentry *error; 2454 - struct task_struct *task = get_proc_task(dir); 2455 - const struct pid_entry *p, *last; 2456 - 2457 - error = ERR_PTR(-ENOENT); 2458 - 2459 - if (!task) 2460 - goto out_no_task; 2461 - 2462 - /* Lookup the directory entry */ 2463 - last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1]; 2464 - for (p = proc_base_stuff; p <= last; p++) { 2465 - if (p->len != dentry->d_name.len) 2466 - continue; 2467 - if (!memcmp(dentry->d_name.name, p->name, p->len)) 2468 - break; 2469 - } 2470 - if (p > 
last) 2471 - goto out; 2472 - 2473 - error = proc_base_instantiate(dir, dentry, task, p); 2474 - 2475 - out: 2476 - put_task_struct(task); 2477 - out_no_task: 2478 - return error; 2479 - } 2480 - 2481 - static int proc_base_fill_cache(struct file *filp, void *dirent, 2482 - filldir_t filldir, struct task_struct *task, const struct pid_entry *p) 2483 - { 2484 - return proc_fill_cache(filp, dirent, filldir, p->name, p->len, 2485 - proc_base_instantiate, task, p); 2486 - } 2487 - 2488 2348 #ifdef CONFIG_TASK_IO_ACCOUNTING 2489 2349 static int do_io_accounting(struct task_struct *task, char *buffer, int whole) 2490 2350 { ··· 2699 2839 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, 2700 2840 tgid->numbers[i].nr); 2701 2841 } 2702 - 2703 - upid = &pid->numbers[pid->level]; 2704 - if (upid->nr == 1) 2705 - pid_ns_release_proc(upid->ns); 2706 2842 } 2707 2843 2708 2844 static struct dentry *proc_pid_instantiate(struct inode *dir, ··· 2732 2876 2733 2877 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 2734 2878 { 2735 - struct dentry *result; 2879 + struct dentry *result = NULL; 2736 2880 struct task_struct *task; 2737 2881 unsigned tgid; 2738 2882 struct pid_namespace *ns; 2739 - 2740 - result = proc_base_lookup(dir, dentry); 2741 - if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT) 2742 - goto out; 2743 2883 2744 2884 tgid = name_to_int(dentry); 2745 2885 if (tgid == ~0U) ··· 2799 2947 return iter; 2800 2948 } 2801 2949 2802 - #define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) 2950 + #define TGID_OFFSET (FIRST_PROCESS_ENTRY) 2803 2951 2804 2952 static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 2805 2953 struct tgid_iter iter) ··· 2819 2967 /* for the /proc/ directory itself, after non-process stuff has been done */ 2820 2968 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) 2821 2969 { 2822 - unsigned int nr; 2823 - struct task_struct *reaper; 
2824 2970 struct tgid_iter iter; 2825 2971 struct pid_namespace *ns; 2826 2972 filldir_t __filldir; 2827 2973 2828 2974 if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) 2829 - goto out_no_task; 2830 - nr = filp->f_pos - FIRST_PROCESS_ENTRY; 2831 - 2832 - reaper = get_proc_task(filp->f_path.dentry->d_inode); 2833 - if (!reaper) 2834 - goto out_no_task; 2835 - 2836 - for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) { 2837 - const struct pid_entry *p = &proc_base_stuff[nr]; 2838 - if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0) 2839 - goto out; 2840 - } 2975 + goto out; 2841 2976 2842 2977 ns = filp->f_dentry->d_sb->s_fs_info; 2843 2978 iter.task = NULL; ··· 2845 3006 } 2846 3007 filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; 2847 3008 out: 2848 - put_task_struct(reaper); 2849 - out_no_task: 2850 3009 return 0; 2851 3010 } 2852 3011
+13 -13
fs/proc/generic.c
··· 350 350 * Return an inode number between PROC_DYNAMIC_FIRST and 351 351 * 0xffffffff, or zero on failure. 352 352 */ 353 - static unsigned int get_inode_number(void) 353 + int proc_alloc_inum(unsigned int *inum) 354 354 { 355 355 unsigned int i; 356 356 int error; 357 357 358 358 retry: 359 - if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) 360 - return 0; 359 + if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) 360 + return -ENOMEM; 361 361 362 362 spin_lock(&proc_inum_lock); 363 363 error = ida_get_new(&proc_inum_ida, &i); ··· 365 365 if (error == -EAGAIN) 366 366 goto retry; 367 367 else if (error) 368 - return 0; 368 + return error; 369 369 370 370 if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { 371 371 spin_lock(&proc_inum_lock); 372 372 ida_remove(&proc_inum_ida, i); 373 373 spin_unlock(&proc_inum_lock); 374 - return 0; 374 + return -ENOSPC; 375 375 } 376 - return PROC_DYNAMIC_FIRST + i; 376 + *inum = PROC_DYNAMIC_FIRST + i; 377 + return 0; 377 378 } 378 379 379 - static void release_inode_number(unsigned int inum) 380 + void proc_free_inum(unsigned int inum) 380 381 { 381 382 spin_lock(&proc_inum_lock); 382 383 ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); ··· 555 554 556 555 static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) 557 556 { 558 - unsigned int i; 559 557 struct proc_dir_entry *tmp; 558 + int ret; 560 559 561 - i = get_inode_number(); 562 - if (i == 0) 563 - return -EAGAIN; 564 - dp->low_ino = i; 560 + ret = proc_alloc_inum(&dp->low_ino); 561 + if (ret) 562 + return ret; 565 563 566 564 if (S_ISDIR(dp->mode)) { 567 565 if (dp->proc_iops == NULL) { ··· 764 764 765 765 static void free_proc_entry(struct proc_dir_entry *de) 766 766 { 767 - release_inode_number(de->low_ino); 767 + proc_free_inum(de->low_ino); 768 768 769 769 if (S_ISLNK(de->mode)) 770 770 kfree(de->data);
+4 -2
fs/proc/inode.c
··· 31 31 struct proc_dir_entry *de; 32 32 struct ctl_table_header *head; 33 33 const struct proc_ns_operations *ns_ops; 34 + void *ns; 34 35 35 36 truncate_inode_pages(&inode->i_data, 0); 36 37 clear_inode(inode); ··· 50 49 } 51 50 /* Release any associated namespace */ 52 51 ns_ops = PROC_I(inode)->ns_ops; 53 - if (ns_ops && ns_ops->put) 54 - ns_ops->put(PROC_I(inode)->ns); 52 + ns = PROC_I(inode)->ns; 53 + if (ns_ops && ns) 54 + ns_ops->put(ns); 55 55 } 56 56 57 57 static struct kmem_cache * proc_inode_cachep;
+1
fs/proc/internal.h
··· 15 15 struct mempolicy; 16 16 17 17 extern struct proc_dir_entry proc_root; 18 + extern void proc_self_init(void); 18 19 #ifdef CONFIG_PROC_SYSCTL 19 20 extern int proc_sys_init(void); 20 21 extern void sysctl_head_put(struct ctl_table_header *head);
+164 -21
fs/proc/namespaces.c
··· 11 11 #include <net/net_namespace.h> 12 12 #include <linux/ipc_namespace.h> 13 13 #include <linux/pid_namespace.h> 14 + #include <linux/user_namespace.h> 14 15 #include "internal.h" 15 16 16 17 ··· 25 24 #ifdef CONFIG_IPC_NS 26 25 &ipcns_operations, 27 26 #endif 27 + #ifdef CONFIG_PID_NS 28 + &pidns_operations, 29 + #endif 30 + #ifdef CONFIG_USER_NS 31 + &userns_operations, 32 + #endif 33 + &mntns_operations, 28 34 }; 29 35 30 36 static const struct file_operations ns_file_operations = { 31 37 .llseek = no_llseek, 38 + }; 39 + 40 + static const struct inode_operations ns_inode_operations = { 41 + .setattr = proc_setattr, 42 + }; 43 + 44 + static int ns_delete_dentry(const struct dentry *dentry) 45 + { 46 + /* Don't cache namespace inodes when not in use */ 47 + return 1; 48 + } 49 + 50 + static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) 51 + { 52 + struct inode *inode = dentry->d_inode; 53 + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; 54 + 55 + return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", 56 + ns_ops->name, inode->i_ino); 57 + } 58 + 59 + const struct dentry_operations ns_dentry_operations = 60 + { 61 + .d_delete = ns_delete_dentry, 62 + .d_dname = ns_dname, 63 + }; 64 + 65 + static struct dentry *proc_ns_get_dentry(struct super_block *sb, 66 + struct task_struct *task, const struct proc_ns_operations *ns_ops) 67 + { 68 + struct dentry *dentry, *result; 69 + struct inode *inode; 70 + struct proc_inode *ei; 71 + struct qstr qname = { .name = "", }; 72 + void *ns; 73 + 74 + ns = ns_ops->get(task); 75 + if (!ns) 76 + return ERR_PTR(-ENOENT); 77 + 78 + dentry = d_alloc_pseudo(sb, &qname); 79 + if (!dentry) { 80 + ns_ops->put(ns); 81 + return ERR_PTR(-ENOMEM); 82 + } 83 + 84 + inode = iget_locked(sb, ns_ops->inum(ns)); 85 + if (!inode) { 86 + dput(dentry); 87 + ns_ops->put(ns); 88 + return ERR_PTR(-ENOMEM); 89 + } 90 + 91 + ei = PROC_I(inode); 92 + if (inode->i_state & I_NEW) { 93 + inode->i_mtime = 
inode->i_atime = inode->i_ctime = CURRENT_TIME; 94 + inode->i_op = &ns_inode_operations; 95 + inode->i_mode = S_IFREG | S_IRUGO; 96 + inode->i_fop = &ns_file_operations; 97 + ei->ns_ops = ns_ops; 98 + ei->ns = ns; 99 + unlock_new_inode(inode); 100 + } else { 101 + ns_ops->put(ns); 102 + } 103 + 104 + d_set_d_op(dentry, &ns_dentry_operations); 105 + result = d_instantiate_unique(dentry, inode); 106 + if (result) { 107 + dput(dentry); 108 + dentry = result; 109 + } 110 + 111 + return dentry; 112 + } 113 + 114 + static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) 115 + { 116 + struct inode *inode = dentry->d_inode; 117 + struct super_block *sb = inode->i_sb; 118 + struct proc_inode *ei = PROC_I(inode); 119 + struct task_struct *task; 120 + struct dentry *ns_dentry; 121 + void *error = ERR_PTR(-EACCES); 122 + 123 + task = get_proc_task(inode); 124 + if (!task) 125 + goto out; 126 + 127 + if (!ptrace_may_access(task, PTRACE_MODE_READ)) 128 + goto out_put_task; 129 + 130 + ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); 131 + if (IS_ERR(ns_dentry)) { 132 + error = ERR_CAST(ns_dentry); 133 + goto out_put_task; 134 + } 135 + 136 + dput(nd->path.dentry); 137 + nd->path.dentry = ns_dentry; 138 + error = NULL; 139 + 140 + out_put_task: 141 + put_task_struct(task); 142 + out: 143 + return error; 144 + } 145 + 146 + static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) 147 + { 148 + struct inode *inode = dentry->d_inode; 149 + struct proc_inode *ei = PROC_I(inode); 150 + const struct proc_ns_operations *ns_ops = ei->ns_ops; 151 + struct task_struct *task; 152 + void *ns; 153 + char name[50]; 154 + int len = -EACCES; 155 + 156 + task = get_proc_task(inode); 157 + if (!task) 158 + goto out; 159 + 160 + if (!ptrace_may_access(task, PTRACE_MODE_READ)) 161 + goto out_put_task; 162 + 163 + len = -ENOENT; 164 + ns = ns_ops->get(task); 165 + if (!ns) 166 + goto out_put_task; 167 + 168 + snprintf(name, sizeof(name), 
"%s:[%u]", ns_ops->name, ns_ops->inum(ns)); 169 + len = strlen(name); 170 + 171 + if (len > buflen) 172 + len = buflen; 173 + if (copy_to_user(buffer, name, len)) 174 + len = -EFAULT; 175 + 176 + ns_ops->put(ns); 177 + out_put_task: 178 + put_task_struct(task); 179 + out: 180 + return len; 181 + } 182 + 183 + static const struct inode_operations proc_ns_link_inode_operations = { 184 + .readlink = proc_ns_readlink, 185 + .follow_link = proc_ns_follow_link, 186 + .setattr = proc_setattr, 32 187 }; 33 188 34 189 static struct dentry *proc_ns_instantiate(struct inode *dir, ··· 194 37 struct inode *inode; 195 38 struct proc_inode *ei; 196 39 struct dentry *error = ERR_PTR(-ENOENT); 197 - void *ns; 198 40 199 41 inode = proc_pid_make_inode(dir->i_sb, task); 200 42 if (!inode) 201 43 goto out; 202 44 203 - ns = ns_ops->get(task); 204 - if (!ns) 205 - goto out_iput; 206 - 207 45 ei = PROC_I(inode); 208 - inode->i_mode = S_IFREG|S_IRUSR; 209 - inode->i_fop = &ns_file_operations; 210 - ei->ns_ops = ns_ops; 211 - ei->ns = ns; 46 + inode->i_mode = S_IFLNK|S_IRWXUGO; 47 + inode->i_op = &proc_ns_link_inode_operations; 48 + ei->ns_ops = ns_ops; 212 49 213 50 d_set_d_op(dentry, &pid_dentry_operations); 214 51 d_add(dentry, inode); ··· 211 60 error = NULL; 212 61 out: 213 62 return error; 214 - out_iput: 215 - iput(inode); 216 - goto out; 217 63 } 218 64 219 65 static int proc_ns_fill_cache(struct file *filp, void *dirent, ··· 236 88 ret = -ENOENT; 237 89 if (!task) 238 90 goto out_no_task; 239 - 240 - ret = -EPERM; 241 - if (!ptrace_may_access(task, PTRACE_MODE_READ)) 242 - goto out; 243 91 244 92 ret = 0; 245 93 i = filp->f_pos; ··· 296 152 if (!task) 297 153 goto out_no_task; 298 154 299 - error = ERR_PTR(-EPERM); 300 - if (!ptrace_may_access(task, PTRACE_MODE_READ)) 301 - goto out; 302 - 303 155 last = &ns_entries[ARRAY_SIZE(ns_entries)]; 304 156 for (entry = ns_entries; entry < last; entry++) { 305 157 if (strlen((*entry)->name) != len) ··· 303 163 if 
(!memcmp(dentry->d_name.name, (*entry)->name, len)) 304 164 break; 305 165 } 306 - error = ERR_PTR(-ENOENT); 307 166 if (entry == last) 308 167 goto out; 309 168 ··· 337 198 return ERR_PTR(-EINVAL); 338 199 } 339 200 201 + bool proc_ns_inode(struct inode *inode) 202 + { 203 + return inode->i_fop == &ns_file_operations; 204 + }
+3 -14
fs/proc/root.c
··· 100 100 int err; 101 101 struct super_block *sb; 102 102 struct pid_namespace *ns; 103 - struct proc_inode *ei; 104 103 char *options; 105 104 106 105 if (flags & MS_KERNMOUNT) { 107 106 ns = (struct pid_namespace *)data; 108 107 options = NULL; 109 108 } else { 110 - ns = current->nsproxy->pid_ns; 109 + ns = task_active_pid_ns(current); 111 110 options = data; 112 111 } 113 112 ··· 129 130 sb->s_flags |= MS_ACTIVE; 130 131 } 131 132 132 - ei = PROC_I(sb->s_root->d_inode); 133 - if (!ei->pid) { 134 - rcu_read_lock(); 135 - ei->pid = get_pid(find_pid_ns(1, ns)); 136 - rcu_read_unlock(); 137 - } 138 - 139 133 return dget(sb->s_root); 140 134 } 141 135 ··· 145 153 .name = "proc", 146 154 .mount = proc_mount, 147 155 .kill_sb = proc_kill_sb, 156 + .fs_flags = FS_USERNS_MOUNT, 148 157 }; 149 158 150 159 void __init proc_root_init(void) ··· 156 163 err = register_filesystem(&proc_fs_type); 157 164 if (err) 158 165 return; 159 - err = pid_ns_prepare_proc(&init_pid_ns); 160 - if (err) { 161 - unregister_filesystem(&proc_fs_type); 162 - return; 163 - } 164 166 167 + proc_self_init(); 165 168 proc_symlink("mounts", NULL, "self/mounts"); 166 169 167 170 proc_net_init();
+59
fs/proc/self.c
··· 1 + #include <linux/proc_fs.h> 2 + #include <linux/sched.h> 3 + #include <linux/namei.h> 4 + 5 + /* 6 + * /proc/self: 7 + */ 8 + static int proc_self_readlink(struct dentry *dentry, char __user *buffer, 9 + int buflen) 10 + { 11 + struct pid_namespace *ns = dentry->d_sb->s_fs_info; 12 + pid_t tgid = task_tgid_nr_ns(current, ns); 13 + char tmp[PROC_NUMBUF]; 14 + if (!tgid) 15 + return -ENOENT; 16 + sprintf(tmp, "%d", tgid); 17 + return vfs_readlink(dentry,buffer,buflen,tmp); 18 + } 19 + 20 + static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) 21 + { 22 + struct pid_namespace *ns = dentry->d_sb->s_fs_info; 23 + pid_t tgid = task_tgid_nr_ns(current, ns); 24 + char *name = ERR_PTR(-ENOENT); 25 + if (tgid) { 26 + /* 11 for max length of signed int in decimal + NULL term */ 27 + name = kmalloc(12, GFP_KERNEL); 28 + if (!name) 29 + name = ERR_PTR(-ENOMEM); 30 + else 31 + sprintf(name, "%d", tgid); 32 + } 33 + nd_set_link(nd, name); 34 + return NULL; 35 + } 36 + 37 + static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, 38 + void *cookie) 39 + { 40 + char *s = nd_get_link(nd); 41 + if (!IS_ERR(s)) 42 + kfree(s); 43 + } 44 + 45 + static const struct inode_operations proc_self_inode_operations = { 46 + .readlink = proc_self_readlink, 47 + .follow_link = proc_self_follow_link, 48 + .put_link = proc_self_put_link, 49 + }; 50 + 51 + void __init proc_self_init(void) 52 + { 53 + struct proc_dir_entry *proc_self_symlink; 54 + mode_t mode; 55 + 56 + mode = S_IFLNK | S_IRWXUGO; 57 + proc_self_symlink = proc_create("self", mode, NULL, NULL ); 58 + proc_self_symlink->proc_iops = &proc_self_inode_operations; 59 + }
+1
fs/sysfs/mount.c
··· 149 149 .name = "sysfs", 150 150 .mount = sysfs_mount, 151 151 .kill_sb = sysfs_kill_sb, 152 + .fs_flags = FS_USERNS_MOUNT, 152 153 }; 153 154 154 155 int __init sysfs_init(void)
-2
include/linux/cred.h
··· 344 344 extern struct user_namespace init_user_ns; 345 345 #ifdef CONFIG_USER_NS 346 346 #define current_user_ns() (current_cred_xxx(user_ns)) 347 - #define task_user_ns(task) (task_cred_xxx((task), user_ns)) 348 347 #else 349 348 #define current_user_ns() (&init_user_ns) 350 - #define task_user_ns(task) (&init_user_ns) 351 349 #endif 352 350 353 351
+2
include/linux/fs.h
··· 1810 1810 #define FS_REQUIRES_DEV 1 1811 1811 #define FS_BINARY_MOUNTDATA 2 1812 1812 #define FS_HAS_SUBTYPE 4 1813 + #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ 1814 + #define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ 1813 1815 #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ 1814 1816 #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ 1815 1817 struct dentry *(*mount) (struct file_system_type *, int,
+6 -3
include/linux/ipc_namespace.h
··· 67 67 68 68 /* user_ns which owns the ipc ns */ 69 69 struct user_namespace *user_ns; 70 + 71 + unsigned int proc_inum; 70 72 }; 71 73 72 74 extern struct ipc_namespace init_ipc_ns; ··· 135 133 136 134 #if defined(CONFIG_IPC_NS) 137 135 extern struct ipc_namespace *copy_ipcs(unsigned long flags, 138 - struct task_struct *tsk); 136 + struct user_namespace *user_ns, struct ipc_namespace *ns); 137 + 139 138 static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) 140 139 { 141 140 if (ns) ··· 147 144 extern void put_ipc_ns(struct ipc_namespace *ns); 148 145 #else 149 146 static inline struct ipc_namespace *copy_ipcs(unsigned long flags, 150 - struct task_struct *tsk) 147 + struct user_namespace *user_ns, struct ipc_namespace *ns) 151 148 { 152 149 if (flags & CLONE_NEWIPC) 153 150 return ERR_PTR(-EINVAL); 154 151 155 - return tsk->nsproxy->ipc_ns; 152 + return ns; 156 153 } 157 154 158 155 static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
+2 -1
include/linux/mnt_namespace.h
··· 4 4 5 5 struct mnt_namespace; 6 6 struct fs_struct; 7 + struct user_namespace; 7 8 8 9 extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, 9 - struct fs_struct *); 10 + struct user_namespace *, struct fs_struct *); 10 11 extern void put_mnt_ns(struct mnt_namespace *ns); 11 12 12 13 extern const struct file_operations proc_mounts_operations;
+1 -1
include/linux/nsproxy.h
··· 67 67 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); 68 68 void free_nsproxy(struct nsproxy *ns); 69 69 int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, 70 - struct fs_struct *); 70 + struct cred *, struct fs_struct *); 71 71 int __init nsproxy_cache_init(void); 72 72 73 73 static inline void put_nsproxy(struct nsproxy *ns)
+8 -3
include/linux/pid_namespace.h
··· 21 21 struct kref kref; 22 22 struct pidmap pidmap[PIDMAP_ENTRIES]; 23 23 int last_pid; 24 + int nr_hashed; 24 25 struct task_struct *child_reaper; 25 26 struct kmem_cache *pid_cachep; 26 27 unsigned int level; ··· 32 31 #ifdef CONFIG_BSD_PROCESS_ACCT 33 32 struct bsd_acct_struct *bacct; 34 33 #endif 34 + struct user_namespace *user_ns; 35 + struct work_struct proc_work; 35 36 kgid_t pid_gid; 36 37 int hide_pid; 37 38 int reboot; /* group exit code if this pidns was rebooted */ 39 + unsigned int proc_inum; 38 40 }; 39 41 40 42 extern struct pid_namespace init_pid_ns; ··· 50 46 return ns; 51 47 } 52 48 53 - extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *ns); 49 + extern struct pid_namespace *copy_pid_ns(unsigned long flags, 50 + struct user_namespace *user_ns, struct pid_namespace *ns); 54 51 extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); 55 52 extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); 56 53 extern void put_pid_ns(struct pid_namespace *ns); ··· 64 59 return ns; 65 60 } 66 61 67 - static inline struct pid_namespace * 68 - copy_pid_ns(unsigned long flags, struct pid_namespace *ns) 62 + static inline struct pid_namespace *copy_pid_ns(unsigned long flags, 63 + struct user_namespace *user_ns, struct pid_namespace *ns) 69 64 { 70 65 if (flags & CLONE_NEWPID) 71 66 ns = ERR_PTR(-EINVAL);
+25 -1
include/linux/proc_fs.h
··· 28 28 */ 29 29 30 30 enum { 31 - PROC_ROOT_INO = 1, 31 + PROC_ROOT_INO = 1, 32 + PROC_IPC_INIT_INO = 0xEFFFFFFFU, 33 + PROC_UTS_INIT_INO = 0xEFFFFFFEU, 34 + PROC_USER_INIT_INO = 0xEFFFFFFDU, 35 + PROC_PID_INIT_INO = 0xEFFFFFFCU, 32 36 }; 33 37 34 38 /* ··· 178 174 struct proc_dir_entry *parent); 179 175 180 176 extern struct file *proc_ns_fget(int fd); 177 + extern bool proc_ns_inode(struct inode *inode); 181 178 179 + extern int proc_alloc_inum(unsigned int *pino); 180 + extern void proc_free_inum(unsigned int inum); 182 181 #else 183 182 184 183 #define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; }) ··· 236 229 return ERR_PTR(-EINVAL); 237 230 } 238 231 232 + static inline bool proc_ns_inode(struct inode *inode) 233 + { 234 + return false; 235 + } 236 + 237 + static inline int proc_alloc_inum(unsigned int *inum) 238 + { 239 + *inum = 1; 240 + return 0; 241 + } 242 + static inline void proc_free_inum(unsigned int inum) 243 + { 244 + } 239 245 #endif /* CONFIG_PROC_FS */ 240 246 241 247 #if !defined(CONFIG_PROC_KCORE) ··· 267 247 void *(*get)(struct task_struct *task); 268 248 void (*put)(void *ns); 269 249 int (*install)(struct nsproxy *nsproxy, void *ns); 250 + unsigned int (*inum)(void *ns); 270 251 }; 271 252 extern const struct proc_ns_operations netns_operations; 272 253 extern const struct proc_ns_operations utsns_operations; 273 254 extern const struct proc_ns_operations ipcns_operations; 255 + extern const struct proc_ns_operations pidns_operations; 256 + extern const struct proc_ns_operations userns_operations; 257 + extern const struct proc_ns_operations mntns_operations; 274 258 275 259 union proc_op { 276 260 int (*proc_get_link)(struct dentry *, struct path *);
+10
include/linux/user_namespace.h
··· 25 25 struct user_namespace *parent; 26 26 kuid_t owner; 27 27 kgid_t group; 28 + unsigned int proc_inum; 28 29 }; 29 30 30 31 extern struct user_namespace init_user_ns; ··· 40 39 } 41 40 42 41 extern int create_user_ns(struct cred *new); 42 + extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); 43 43 extern void free_user_ns(struct kref *kref); 44 44 45 45 static inline void put_user_ns(struct user_namespace *ns) ··· 66 64 static inline int create_user_ns(struct cred *new) 67 65 { 68 66 return -EINVAL; 67 + } 68 + 69 + static inline int unshare_userns(unsigned long unshare_flags, 70 + struct cred **new_cred) 71 + { 72 + if (unshare_flags & CLONE_NEWUSER) 73 + return -EINVAL; 74 + return 0; 69 75 } 70 76 71 77 static inline void put_user_ns(struct user_namespace *ns)
+4 -3
include/linux/utsname.h
··· 23 23 struct kref kref; 24 24 struct new_utsname name; 25 25 struct user_namespace *user_ns; 26 + unsigned int proc_inum; 26 27 }; 27 28 extern struct uts_namespace init_uts_ns; 28 29 ··· 34 33 } 35 34 36 35 extern struct uts_namespace *copy_utsname(unsigned long flags, 37 - struct task_struct *tsk); 36 + struct user_namespace *user_ns, struct uts_namespace *old_ns); 38 37 extern void free_uts_ns(struct kref *kref); 39 38 40 39 static inline void put_uts_ns(struct uts_namespace *ns) ··· 51 50 } 52 51 53 52 static inline struct uts_namespace *copy_utsname(unsigned long flags, 54 - struct task_struct *tsk) 53 + struct user_namespace *user_ns, struct uts_namespace *old_ns) 55 54 { 56 55 if (flags & CLONE_NEWUTS) 57 56 return ERR_PTR(-EINVAL); 58 57 59 - return tsk->nsproxy->uts_ns; 58 + return old_ns; 60 59 } 61 60 #endif 62 61
+2
include/net/net_namespace.h
··· 56 56 57 57 struct user_namespace *user_ns; /* Owning user namespace */ 58 58 59 + unsigned int proc_inum; 60 + 59 61 struct proc_dir_entry *proc_net; 60 62 struct proc_dir_entry *proc_net_stat; 61 63
-2
init/Kconfig
··· 1069 1069 # Filesystems 1070 1070 depends on 9P_FS = n 1071 1071 depends on AFS_FS = n 1072 - depends on AUTOFS4_FS = n 1073 1072 depends on CEPH_FS = n 1074 1073 depends on CIFS = n 1075 1074 depends on CODA_FS = n 1076 - depends on FUSE_FS = n 1077 1075 depends on GFS2_FS = n 1078 1076 depends on NCP_FS = n 1079 1077 depends on NFSD = n
-1
init/main.c
··· 812 812 system_state = SYSTEM_RUNNING; 813 813 numa_default_policy(); 814 814 815 - current->signal->flags |= SIGNAL_UNKILLABLE; 816 815 flush_delayed_fput(); 817 816 818 817 if (ramdisk_execute_command) {
+2
init/version.c
··· 12 12 #include <linux/utsname.h> 13 13 #include <generated/utsrelease.h> 14 14 #include <linux/version.h> 15 + #include <linux/proc_fs.h> 15 16 16 17 #ifndef CONFIG_KALLSYMS 17 18 #define version(a) Version_ ## a ··· 35 34 .domainname = UTS_DOMAINNAME, 36 35 }, 37 36 .user_ns = &init_user_ns, 37 + .proc_inum = PROC_UTS_INIT_INO, 38 38 }; 39 39 EXPORT_SYMBOL_GPL(init_uts_ns); 40 40
+2
ipc/msgutil.c
··· 16 16 #include <linux/msg.h> 17 17 #include <linux/ipc_namespace.h> 18 18 #include <linux/utsname.h> 19 + #include <linux/proc_fs.h> 19 20 #include <asm/uaccess.h> 20 21 21 22 #include "util.h" ··· 31 30 struct ipc_namespace init_ipc_ns = { 32 31 .count = ATOMIC_INIT(1), 33 32 .user_ns = &init_user_ns, 33 + .proc_inum = PROC_IPC_INIT_INO, 34 34 }; 35 35 36 36 atomic_t nr_ipc_ns = ATOMIC_INIT(1);
+25 -7
ipc/namespace.c
··· 16 16 17 17 #include "util.h" 18 18 19 - static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk, 19 + static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, 20 20 struct ipc_namespace *old_ns) 21 21 { 22 22 struct ipc_namespace *ns; ··· 26 26 if (ns == NULL) 27 27 return ERR_PTR(-ENOMEM); 28 28 29 + err = proc_alloc_inum(&ns->proc_inum); 30 + if (err) { 31 + kfree(ns); 32 + return ERR_PTR(err); 33 + } 34 + 29 35 atomic_set(&ns->count, 1); 30 36 err = mq_init_ns(ns); 31 37 if (err) { 38 + proc_free_inum(ns->proc_inum); 32 39 kfree(ns); 33 40 return ERR_PTR(err); 34 41 } ··· 53 46 ipcns_notify(IPCNS_CREATED); 54 47 register_ipcns_notifier(ns); 55 48 56 - ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); 49 + ns->user_ns = get_user_ns(user_ns); 57 50 58 51 return ns; 59 52 } 60 53 61 54 struct ipc_namespace *copy_ipcs(unsigned long flags, 62 - struct task_struct *tsk) 55 + struct user_namespace *user_ns, struct ipc_namespace *ns) 63 56 { 64 - struct ipc_namespace *ns = tsk->nsproxy->ipc_ns; 65 - 66 57 if (!(flags & CLONE_NEWIPC)) 67 58 return get_ipc_ns(ns); 68 - return create_ipc_ns(tsk, ns); 59 + return create_ipc_ns(user_ns, ns); 69 60 } 70 61 71 62 /* ··· 118 113 */ 119 114 ipcns_notify(IPCNS_REMOVED); 120 115 put_user_ns(ns->user_ns); 116 + proc_free_inum(ns->proc_inum); 121 117 kfree(ns); 122 118 } 123 119 ··· 167 161 return put_ipc_ns(ns); 168 162 } 169 163 170 - static int ipcns_install(struct nsproxy *nsproxy, void *ns) 164 + static int ipcns_install(struct nsproxy *nsproxy, void *new) 171 165 { 166 + struct ipc_namespace *ns = new; 167 + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) 168 + return -EPERM; 169 + 172 170 /* Ditch state from the old ipc namespace */ 173 171 exit_sem(current); 174 172 put_ipc_ns(nsproxy->ipc_ns); 175 173 nsproxy->ipc_ns = get_ipc_ns(ns); 176 174 return 0; 175 + } 176 + 177 + static unsigned int ipcns_inum(void *vp) 178 + { 179 + struct ipc_namespace *ns = vp; 180 + 181 + return 
ns->proc_inum; 177 182 } 178 183 179 184 const struct proc_ns_operations ipcns_operations = { ··· 193 176 .get = ipcns_get, 194 177 .put = ipcns_put, 195 178 .install = ipcns_install, 179 + .inum = ipcns_inum, 196 180 };
+1 -1
kernel/cgroup.c
··· 3409 3409 { 3410 3410 struct cgroup_pidlist *l; 3411 3411 /* don't need task_nsproxy() if we're looking at ourself */ 3412 - struct pid_namespace *ns = current->nsproxy->pid_ns; 3412 + struct pid_namespace *ns = task_active_pid_ns(current); 3413 3413 3414 3414 /* 3415 3415 * We can't drop the pidlist_mutex before taking the l->mutex in case
+1 -1
kernel/events/core.c
··· 6155 6155 6156 6156 event->parent = parent_event; 6157 6157 6158 - event->ns = get_pid_ns(current->nsproxy->pid_ns); 6158 + event->ns = get_pid_ns(task_active_pid_ns(current)); 6159 6159 event->id = atomic64_inc_return(&perf_event_id); 6160 6160 6161 6161 event->state = PERF_EVENT_STATE_INACTIVE;
-12
kernel/exit.c
··· 72 72 list_del_rcu(&p->tasks); 73 73 list_del_init(&p->sibling); 74 74 __this_cpu_dec(process_counts); 75 - /* 76 - * If we are the last child process in a pid namespace to be 77 - * reaped, notify the reaper sleeping zap_pid_ns_processes(). 78 - */ 79 - if (IS_ENABLED(CONFIG_PID_NS)) { 80 - struct task_struct *parent = p->real_parent; 81 - 82 - if ((task_active_pid_ns(parent)->child_reaper == parent) && 83 - list_empty(&parent->children) && 84 - (parent->flags & PF_EXITING)) 85 - wake_up_process(parent); 86 - } 87 75 } 88 76 list_del_rcu(&p->thread_group); 89 77 }
+48 -21
kernel/fork.c
··· 1044 1044 atomic_set(&sig->live, 1); 1045 1045 atomic_set(&sig->sigcnt, 1); 1046 1046 init_waitqueue_head(&sig->wait_chldexit); 1047 - if (clone_flags & CLONE_NEWPID) 1048 - sig->flags |= SIGNAL_UNKILLABLE; 1049 1047 sig->curr_target = tsk; 1050 1048 init_sigpending(&sig->shared_pending); 1051 1049 INIT_LIST_HEAD(&sig->posix_timers); ··· 1436 1438 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); 1437 1439 1438 1440 if (thread_group_leader(p)) { 1439 - if (is_child_reaper(pid)) 1440 - p->nsproxy->pid_ns->child_reaper = p; 1441 + if (is_child_reaper(pid)) { 1442 + ns_of_pid(pid)->child_reaper = p; 1443 + p->signal->flags |= SIGNAL_UNKILLABLE; 1444 + } 1441 1445 1442 1446 p->signal->leader_pid = pid; 1443 1447 p->signal->tty = tty_kref_get(current->signal->tty); ··· 1473 1473 if (p->io_context) 1474 1474 exit_io_context(p); 1475 1475 bad_fork_cleanup_namespaces: 1476 - if (unlikely(clone_flags & CLONE_NEWPID)) 1477 - pid_ns_release_proc(p->nsproxy->pid_ns); 1478 1476 exit_task_namespaces(p); 1479 1477 bad_fork_cleanup_mm: 1480 1478 if (p->mm) ··· 1552 1554 * Do some preliminary argument and permissions checking before we 1553 1555 * actually start allocating stuff 1554 1556 */ 1555 - if (clone_flags & CLONE_NEWUSER) { 1556 - if (clone_flags & CLONE_THREAD) 1557 + if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { 1558 + if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) 1557 1559 return -EINVAL; 1558 - /* hopefully this check will go away when userns support is 1559 - * complete 1560 - */ 1561 - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || 1562 - !capable(CAP_SETGID)) 1563 - return -EPERM; 1564 1560 } 1565 1561 1566 1562 /* ··· 1716 1724 { 1717 1725 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1718 1726 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1719 - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) 1727 + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| 1728 + CLONE_NEWUSER|CLONE_NEWPID)) 1720 1729 return -EINVAL; 1721 1730 /* 1722 1731 * Not 
implemented, but pretend it works if there is nothing to ··· 1784 1791 { 1785 1792 struct fs_struct *fs, *new_fs = NULL; 1786 1793 struct files_struct *fd, *new_fd = NULL; 1794 + struct cred *new_cred = NULL; 1787 1795 struct nsproxy *new_nsproxy = NULL; 1788 1796 int do_sysvsem = 0; 1789 1797 int err; 1790 1798 1791 - err = check_unshare_flags(unshare_flags); 1792 - if (err) 1793 - goto bad_unshare_out; 1794 - 1799 + /* 1800 + * If unsharing a user namespace must also unshare the thread. 1801 + */ 1802 + if (unshare_flags & CLONE_NEWUSER) 1803 + unshare_flags |= CLONE_THREAD; 1804 + /* 1805 + * If unsharing a pid namespace must also unshare the thread. 1806 + */ 1807 + if (unshare_flags & CLONE_NEWPID) 1808 + unshare_flags |= CLONE_THREAD; 1809 + /* 1810 + * If unsharing a thread from a thread group, must also unshare vm. 1811 + */ 1812 + if (unshare_flags & CLONE_THREAD) 1813 + unshare_flags |= CLONE_VM; 1814 + /* 1815 + * If unsharing vm, must also unshare signal handlers. 1816 + */ 1817 + if (unshare_flags & CLONE_VM) 1818 + unshare_flags |= CLONE_SIGHAND; 1795 1819 /* 1796 1820 * If unsharing namespace, must also unshare filesystem information. 
1797 1821 */ 1798 1822 if (unshare_flags & CLONE_NEWNS) 1799 1823 unshare_flags |= CLONE_FS; 1824 + 1825 + err = check_unshare_flags(unshare_flags); 1826 + if (err) 1827 + goto bad_unshare_out; 1800 1828 /* 1801 1829 * CLONE_NEWIPC must also detach from the undolist: after switching 1802 1830 * to a new ipc namespace, the semaphore arrays from the old ··· 1831 1817 err = unshare_fd(unshare_flags, &new_fd); 1832 1818 if (err) 1833 1819 goto bad_unshare_cleanup_fs; 1834 - err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); 1820 + err = unshare_userns(unshare_flags, &new_cred); 1835 1821 if (err) 1836 1822 goto bad_unshare_cleanup_fd; 1823 + err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1824 + new_cred, new_fs); 1825 + if (err) 1826 + goto bad_unshare_cleanup_cred; 1837 1827 1838 - if (new_fs || new_fd || do_sysvsem || new_nsproxy) { 1828 + if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { 1839 1829 if (do_sysvsem) { 1840 1830 /* 1841 1831 * CLONE_SYSVSEM is equivalent to sys_exit(). ··· 1872 1854 } 1873 1855 1874 1856 task_unlock(current); 1857 + 1858 + if (new_cred) { 1859 + /* Install the new user namespace */ 1860 + commit_creds(new_cred); 1861 + new_cred = NULL; 1862 + } 1875 1863 } 1876 1864 1877 1865 if (new_nsproxy) 1878 1866 put_nsproxy(new_nsproxy); 1879 1867 1868 + bad_unshare_cleanup_cred: 1869 + if (new_cred) 1870 + put_cred(new_cred); 1880 1871 bad_unshare_cleanup_fd: 1881 1872 if (new_fd) 1882 1873 put_files_struct(new_fd);
+19 -17
kernel/nsproxy.c
··· 57 57 * leave it to the caller to do proper locking and attach it to task. 58 58 */ 59 59 static struct nsproxy *create_new_namespaces(unsigned long flags, 60 - struct task_struct *tsk, struct fs_struct *new_fs) 60 + struct task_struct *tsk, struct user_namespace *user_ns, 61 + struct fs_struct *new_fs) 61 62 { 62 63 struct nsproxy *new_nsp; 63 64 int err; ··· 67 66 if (!new_nsp) 68 67 return ERR_PTR(-ENOMEM); 69 68 70 - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); 69 + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); 71 70 if (IS_ERR(new_nsp->mnt_ns)) { 72 71 err = PTR_ERR(new_nsp->mnt_ns); 73 72 goto out_ns; 74 73 } 75 74 76 - new_nsp->uts_ns = copy_utsname(flags, tsk); 75 + new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns); 77 76 if (IS_ERR(new_nsp->uts_ns)) { 78 77 err = PTR_ERR(new_nsp->uts_ns); 79 78 goto out_uts; 80 79 } 81 80 82 - new_nsp->ipc_ns = copy_ipcs(flags, tsk); 81 + new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns); 83 82 if (IS_ERR(new_nsp->ipc_ns)) { 84 83 err = PTR_ERR(new_nsp->ipc_ns); 85 84 goto out_ipc; 86 85 } 87 86 88 - new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); 87 + new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); 89 88 if (IS_ERR(new_nsp->pid_ns)) { 90 89 err = PTR_ERR(new_nsp->pid_ns); 91 90 goto out_pid; 92 91 } 93 92 94 - new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns); 93 + new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); 95 94 if (IS_ERR(new_nsp->net_ns)) { 96 95 err = PTR_ERR(new_nsp->net_ns); 97 96 goto out_net; ··· 123 122 int copy_namespaces(unsigned long flags, struct task_struct *tsk) 124 123 { 125 124 struct nsproxy *old_ns = tsk->nsproxy; 125 + struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); 126 126 struct nsproxy *new_ns; 127 127 int err = 0; 128 128 ··· 136 134 CLONE_NEWPID | CLONE_NEWNET))) 137 135 return 0; 138 136 139 - 
if (!capable(CAP_SYS_ADMIN)) { 137 + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { 140 138 err = -EPERM; 141 139 goto out; 142 140 } ··· 153 151 goto out; 154 152 } 155 153 156 - new_ns = create_new_namespaces(flags, tsk, tsk->fs); 154 + new_ns = create_new_namespaces(flags, tsk, 155 + task_cred_xxx(tsk, user_ns), tsk->fs); 157 156 if (IS_ERR(new_ns)) { 158 157 err = PTR_ERR(new_ns); 159 158 goto out; ··· 186 183 * On success, returns the new nsproxy. 187 184 */ 188 185 int unshare_nsproxy_namespaces(unsigned long unshare_flags, 189 - struct nsproxy **new_nsp, struct fs_struct *new_fs) 186 + struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) 190 187 { 188 + struct user_namespace *user_ns; 191 189 int err = 0; 192 190 193 191 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 194 - CLONE_NEWNET))) 192 + CLONE_NEWNET | CLONE_NEWPID))) 195 193 return 0; 196 194 197 - if (!capable(CAP_SYS_ADMIN)) 195 + user_ns = new_cred ? new_cred->user_ns : current_user_ns(); 196 + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) 198 197 return -EPERM; 199 198 200 - *new_nsp = create_new_namespaces(unshare_flags, current, 201 - new_fs ? new_fs : current->fs); 199 + *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, 200 + new_fs ? new_fs : current->fs); 202 201 if (IS_ERR(*new_nsp)) { 203 202 err = PTR_ERR(*new_nsp); 204 203 goto out; ··· 246 241 struct file *file; 247 242 int err; 248 243 249 - if (!capable(CAP_SYS_ADMIN)) 250 - return -EPERM; 251 - 252 244 file = proc_ns_fget(fd); 253 245 if (IS_ERR(file)) 254 246 return PTR_ERR(file); ··· 256 254 if (nstype && (ops->type != nstype)) 257 255 goto out; 258 256 259 - new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); 257 + new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); 260 258 if (IS_ERR(new_nsproxy)) { 261 259 err = PTR_ERR(new_nsproxy); 262 260 goto out;
+39 -8
kernel/pid.c
··· 36 36 #include <linux/pid_namespace.h> 37 37 #include <linux/init_task.h> 38 38 #include <linux/syscalls.h> 39 + #include <linux/proc_fs.h> 39 40 40 41 #define pid_hashfn(nr, ns) \ 41 42 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) ··· 79 78 .last_pid = 0, 80 79 .level = 0, 81 80 .child_reaper = &init_task, 81 + .user_ns = &init_user_ns, 82 + .proc_inum = PROC_PID_INIT_INO, 82 83 }; 83 84 EXPORT_SYMBOL_GPL(init_pid_ns); 84 85 ··· 272 269 unsigned long flags; 273 270 274 271 spin_lock_irqsave(&pidmap_lock, flags); 275 - for (i = 0; i <= pid->level; i++) 276 - hlist_del_rcu(&pid->numbers[i].pid_chain); 272 + for (i = 0; i <= pid->level; i++) { 273 + struct upid *upid = pid->numbers + i; 274 + struct pid_namespace *ns = upid->ns; 275 + hlist_del_rcu(&upid->pid_chain); 276 + switch(--ns->nr_hashed) { 277 + case 1: 278 + /* When all that is left in the pid namespace 279 + * is the reaper wake up the reaper. The reaper 280 + * may be sleeping in zap_pid_ns_processes(). 281 + */ 282 + wake_up_process(ns->child_reaper); 283 + break; 284 + case 0: 285 + ns->nr_hashed = -1; 286 + schedule_work(&ns->proc_work); 287 + break; 288 + } 289 + } 277 290 spin_unlock_irqrestore(&pidmap_lock, flags); 278 291 279 292 for (i = 0; i <= pid->level; i++) ··· 311 292 goto out; 312 293 313 294 tmp = ns; 295 + pid->level = ns->level; 314 296 for (i = ns->level; i >= 0; i--) { 315 297 nr = alloc_pidmap(tmp); 316 298 if (nr < 0) ··· 322 302 tmp = tmp->parent; 323 303 } 324 304 305 + if (unlikely(is_child_reaper(pid))) { 306 + if (pid_ns_prepare_proc(ns)) 307 + goto out_free; 308 + } 309 + 325 310 get_pid_ns(ns); 326 - pid->level = ns->level; 327 311 atomic_set(&pid->count, 1); 328 312 for (type = 0; type < PIDTYPE_MAX; ++type) 329 313 INIT_HLIST_HEAD(&pid->tasks[type]); 330 314 331 315 upid = pid->numbers + ns->level; 332 316 spin_lock_irq(&pidmap_lock); 333 - for ( ; upid >= pid->numbers; --upid) 317 + if (ns->nr_hashed < 0) 318 + goto out_unlock; 319 + for ( ; upid >= 
pid->numbers; --upid) { 334 320 hlist_add_head_rcu(&upid->pid_chain, 335 321 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 322 + upid->ns->nr_hashed++; 323 + } 336 324 spin_unlock_irq(&pidmap_lock); 337 325 338 326 out: 339 327 return pid; 340 328 329 + out_unlock: 330 + spin_unlock(&pidmap_lock); 341 331 out_free: 342 332 while (++i <= ns->level) 343 333 free_pidmap(pid->numbers + i); ··· 374 344 375 345 struct pid *find_vpid(int nr) 376 346 { 377 - return find_pid_ns(nr, current->nsproxy->pid_ns); 347 + return find_pid_ns(nr, task_active_pid_ns(current)); 378 348 } 379 349 EXPORT_SYMBOL_GPL(find_vpid); 380 350 ··· 458 428 459 429 struct task_struct *find_task_by_vpid(pid_t vnr) 460 430 { 461 - return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); 431 + return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); 462 432 } 463 433 464 434 struct pid *get_task_pid(struct task_struct *task, enum pid_type type) ··· 513 483 514 484 pid_t pid_vnr(struct pid *pid) 515 485 { 516 - return pid_nr_ns(pid, current->nsproxy->pid_ns); 486 + return pid_nr_ns(pid, task_active_pid_ns(current)); 517 487 } 518 488 EXPORT_SYMBOL_GPL(pid_vnr); 519 489 ··· 524 494 525 495 rcu_read_lock(); 526 496 if (!ns) 527 - ns = current->nsproxy->pid_ns; 497 + ns = task_active_pid_ns(current); 528 498 if (likely(pid_alive(task))) { 529 499 if (type != PIDTYPE_PID) 530 500 task = task->group_leader; ··· 599 569 /* Reserve PID 0. We never call free_pidmap(0) */ 600 570 set_bit(0, init_pid_ns.pidmap[0].page); 601 571 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 572 + init_pid_ns.nr_hashed = 1; 602 573 603 574 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 604 575 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+89 -23
kernel/pid_namespace.c
··· 10 10 11 11 #include <linux/pid.h> 12 12 #include <linux/pid_namespace.h> 13 + #include <linux/user_namespace.h> 13 14 #include <linux/syscalls.h> 14 15 #include <linux/err.h> 15 16 #include <linux/acct.h> ··· 72 71 return NULL; 73 72 } 74 73 74 + static void proc_cleanup_work(struct work_struct *work) 75 + { 76 + struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); 77 + pid_ns_release_proc(ns); 78 + } 79 + 75 80 /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ 76 81 #define MAX_PID_NS_LEVEL 32 77 82 78 - static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) 83 + static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, 84 + struct pid_namespace *parent_pid_ns) 79 85 { 80 86 struct pid_namespace *ns; 81 87 unsigned int level = parent_pid_ns->level + 1; ··· 107 99 if (ns->pid_cachep == NULL) 108 100 goto out_free_map; 109 101 102 + err = proc_alloc_inum(&ns->proc_inum); 103 + if (err) 104 + goto out_free_map; 105 + 110 106 kref_init(&ns->kref); 111 107 ns->level = level; 112 108 ns->parent = get_pid_ns(parent_pid_ns); 109 + ns->user_ns = get_user_ns(user_ns); 110 + INIT_WORK(&ns->proc_work, proc_cleanup_work); 113 111 114 112 set_bit(0, ns->pidmap[0].page); 115 113 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); ··· 123 109 for (i = 1; i < PIDMAP_ENTRIES; i++) 124 110 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 125 111 126 - err = pid_ns_prepare_proc(ns); 127 - if (err) 128 - goto out_put_parent_pid_ns; 129 - 130 112 return ns; 131 113 132 - out_put_parent_pid_ns: 133 - put_pid_ns(parent_pid_ns); 134 114 out_free_map: 135 115 kfree(ns->pidmap[0].page); 136 116 out_free: ··· 137 129 { 138 130 int i; 139 131 132 + proc_free_inum(ns->proc_inum); 140 133 for (i = 0; i < PIDMAP_ENTRIES; i++) 141 134 kfree(ns->pidmap[i].page); 135 + put_user_ns(ns->user_ns); 142 136 kmem_cache_free(pid_ns_cachep, ns); 143 137 } 144 138 145 - struct pid_namespace 
*copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 139 + struct pid_namespace *copy_pid_ns(unsigned long flags, 140 + struct user_namespace *user_ns, struct pid_namespace *old_ns) 146 141 { 147 142 if (!(flags & CLONE_NEWPID)) 148 143 return get_pid_ns(old_ns); 149 - if (flags & (CLONE_THREAD|CLONE_PARENT)) 144 + if (task_active_pid_ns(current) != old_ns) 150 145 return ERR_PTR(-EINVAL); 151 - return create_pid_namespace(old_ns); 146 + return create_pid_namespace(user_ns, old_ns); 152 147 } 153 148 154 149 static void free_pid_ns(struct kref *kref) ··· 222 211 223 212 /* 224 213 * sys_wait4() above can't reap the TASK_DEAD children. 225 - * Make sure they all go away, see __unhash_process(). 214 + * Make sure they all go away, see free_pid(). 226 215 */ 227 216 for (;;) { 228 - bool need_wait = false; 229 - 230 - read_lock(&tasklist_lock); 231 - if (!list_empty(&current->children)) { 232 - __set_current_state(TASK_UNINTERRUPTIBLE); 233 - need_wait = true; 234 - } 235 - read_unlock(&tasklist_lock); 236 - 237 - if (!need_wait) 217 + set_current_state(TASK_UNINTERRUPTIBLE); 218 + if (pid_ns->nr_hashed == 1) 238 219 break; 239 220 schedule(); 240 221 } 222 + __set_current_state(TASK_RUNNING); 241 223 242 224 if (pid_ns->reboot) 243 225 current->signal->group_exit_code = pid_ns->reboot; ··· 243 239 static int pid_ns_ctl_handler(struct ctl_table *table, int write, 244 240 void __user *buffer, size_t *lenp, loff_t *ppos) 245 241 { 242 + struct pid_namespace *pid_ns = task_active_pid_ns(current); 246 243 struct ctl_table tmp = *table; 247 244 248 - if (write && !capable(CAP_SYS_ADMIN)) 245 + if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) 249 246 return -EPERM; 250 247 251 248 /* ··· 255 250 * it should synchronize its usage with external means. 
256 251 */ 257 252 258 - tmp.data = &current->nsproxy->pid_ns->last_pid; 253 + tmp.data = &pid_ns->last_pid; 259 254 return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 260 255 } 261 256 ··· 303 298 /* Not reached */ 304 299 return 0; 305 300 } 301 + 302 + static void *pidns_get(struct task_struct *task) 303 + { 304 + struct pid_namespace *ns; 305 + 306 + rcu_read_lock(); 307 + ns = get_pid_ns(task_active_pid_ns(task)); 308 + rcu_read_unlock(); 309 + 310 + return ns; 311 + } 312 + 313 + static void pidns_put(void *ns) 314 + { 315 + put_pid_ns(ns); 316 + } 317 + 318 + static int pidns_install(struct nsproxy *nsproxy, void *ns) 319 + { 320 + struct pid_namespace *active = task_active_pid_ns(current); 321 + struct pid_namespace *ancestor, *new = ns; 322 + 323 + if (!ns_capable(new->user_ns, CAP_SYS_ADMIN)) 324 + return -EPERM; 325 + 326 + /* 327 + * Only allow entering the current active pid namespace 328 + * or a child of the current active pid namespace. 329 + * 330 + * This is required for fork to return a usable pid value and 331 + * this maintains the property that processes and their 332 + * children can not escape their current pid namespace. 333 + */ 334 + if (new->level < active->level) 335 + return -EINVAL; 336 + 337 + ancestor = new; 338 + while (ancestor->level > active->level) 339 + ancestor = ancestor->parent; 340 + if (ancestor != active) 341 + return -EINVAL; 342 + 343 + put_pid_ns(nsproxy->pid_ns); 344 + nsproxy->pid_ns = get_pid_ns(new); 345 + return 0; 346 + } 347 + 348 + static unsigned int pidns_inum(void *ns) 349 + { 350 + struct pid_namespace *pid_ns = ns; 351 + return pid_ns->proc_inum; 352 + } 353 + 354 + const struct proc_ns_operations pidns_operations = { 355 + .name = "pid", 356 + .type = CLONE_NEWPID, 357 + .get = pidns_get, 358 + .put = pidns_put, 359 + .install = pidns_install, 360 + .inum = pidns_inum, 361 + }; 306 362 307 363 static __init int pid_namespaces_init(void) 308 364 {
+8 -2
kernel/ptrace.c
··· 215 215 smp_rmb(); 216 216 if (task->mm) 217 217 dumpable = get_dumpable(task->mm); 218 - if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) 218 + rcu_read_lock(); 219 + if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { 220 + rcu_read_unlock(); 219 221 return -EPERM; 222 + } 223 + rcu_read_unlock(); 220 224 221 225 return security_ptrace_access_check(task, mode); 222 226 } ··· 284 280 285 281 if (seize) 286 282 flags |= PT_SEIZED; 287 - if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) 283 + rcu_read_lock(); 284 + if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) 288 285 flags |= PT_PTRACE_CAP; 286 + rcu_read_unlock(); 289 287 task->ptrace = flags; 290 288 291 289 __ptrace_link(task, current);
+8 -2
kernel/sched/core.c
··· 4097 4097 goto out_free_cpus_allowed; 4098 4098 } 4099 4099 retval = -EPERM; 4100 - if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) 4101 - goto out_unlock; 4100 + if (!check_same_owner(p)) { 4101 + rcu_read_lock(); 4102 + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 4103 + rcu_read_unlock(); 4104 + goto out_unlock; 4105 + } 4106 + rcu_read_unlock(); 4107 + } 4102 4108 4103 4109 retval = security_task_setscheduler(p); 4104 4110 if (retval)
+1 -1
kernel/signal.c
··· 1753 1753 * see comment in do_notify_parent() about the following 4 lines 1754 1754 */ 1755 1755 rcu_read_lock(); 1756 - info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1756 + info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); 1757 1757 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); 1758 1758 rcu_read_unlock(); 1759 1759
+1 -1
kernel/sysctl_binary.c
··· 1344 1344 goto out_putname; 1345 1345 } 1346 1346 1347 - mnt = current->nsproxy->pid_ns->proc_mnt; 1347 + mnt = task_active_pid_ns(current)->proc_mnt; 1348 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags); 1349 1349 result = PTR_ERR(file); 1350 1350 if (IS_ERR(file))
+2
kernel/user.c
··· 16 16 #include <linux/interrupt.h> 17 17 #include <linux/export.h> 18 18 #include <linux/user_namespace.h> 19 + #include <linux/proc_fs.h> 19 20 20 21 /* 21 22 * userns count is 1 for root user, 1 for init_uts_ns, ··· 52 51 }, 53 52 .owner = GLOBAL_ROOT_UID, 54 53 .group = GLOBAL_ROOT_GID, 54 + .proc_inum = PROC_USER_INIT_INO, 55 55 }; 56 56 EXPORT_SYMBOL_GPL(init_user_ns); 57 57
+128 -19
kernel/user_namespace.c
··· 9 9 #include <linux/nsproxy.h> 10 10 #include <linux/slab.h> 11 11 #include <linux/user_namespace.h> 12 + #include <linux/proc_fs.h> 12 13 #include <linux/highuid.h> 13 14 #include <linux/cred.h> 14 15 #include <linux/securebits.h> ··· 27 26 static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 28 27 struct uid_gid_map *map); 29 28 29 + static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) 30 + { 31 + /* Start with the same capabilities as init but useless for doing 32 + * anything as the capabilities are bound to the new user namespace. 33 + */ 34 + cred->securebits = SECUREBITS_DEFAULT; 35 + cred->cap_inheritable = CAP_EMPTY_SET; 36 + cred->cap_permitted = CAP_FULL_SET; 37 + cred->cap_effective = CAP_FULL_SET; 38 + cred->cap_bset = CAP_FULL_SET; 39 + #ifdef CONFIG_KEYS 40 + key_put(cred->request_key_auth); 41 + cred->request_key_auth = NULL; 42 + #endif 43 + /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ 44 + cred->user_ns = user_ns; 45 + } 46 + 30 47 /* 31 48 * Create a new user namespace, deriving the creator from the user in the 32 49 * passed credentials, and replacing that user with the new root user for the ··· 58 39 struct user_namespace *ns, *parent_ns = new->user_ns; 59 40 kuid_t owner = new->euid; 60 41 kgid_t group = new->egid; 42 + int ret; 61 43 62 44 /* The creator needs a mapping in the parent user namespace 63 45 * or else we won't be able to reasonably tell userspace who ··· 72 52 if (!ns) 73 53 return -ENOMEM; 74 54 55 + ret = proc_alloc_inum(&ns->proc_inum); 56 + if (ret) { 57 + kmem_cache_free(user_ns_cachep, ns); 58 + return ret; 59 + } 60 + 75 61 kref_init(&ns->kref); 62 + /* Leave the new->user_ns reference with the new user namespace. */ 76 63 ns->parent = parent_ns; 77 64 ns->owner = owner; 78 65 ns->group = group; 79 66 80 - /* Start with the same capabilities as init but useless for doing 81 - * anything as the capabilities are bound to the new user namespace. 
82 - */ 83 - new->securebits = SECUREBITS_DEFAULT; 84 - new->cap_inheritable = CAP_EMPTY_SET; 85 - new->cap_permitted = CAP_FULL_SET; 86 - new->cap_effective = CAP_FULL_SET; 87 - new->cap_bset = CAP_FULL_SET; 88 - #ifdef CONFIG_KEYS 89 - key_put(new->request_key_auth); 90 - new->request_key_auth = NULL; 91 - #endif 92 - /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ 93 - 94 - /* Leave the new->user_ns reference with the new user namespace. */ 95 - /* Leave the reference to our user_ns with the new cred. */ 96 - new->user_ns = ns; 67 + set_cred_user_ns(new, ns); 97 68 98 69 return 0; 70 + } 71 + 72 + int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) 73 + { 74 + struct cred *cred; 75 + 76 + if (!(unshare_flags & CLONE_NEWUSER)) 77 + return 0; 78 + 79 + cred = prepare_creds(); 80 + if (!cred) 81 + return -ENOMEM; 82 + 83 + *new_cred = cred; 84 + return create_user_ns(cred); 99 85 } 100 86 101 87 void free_user_ns(struct kref *kref) ··· 110 84 container_of(kref, struct user_namespace, kref); 111 85 112 86 parent = ns->parent; 87 + proc_free_inum(ns->proc_inum); 113 88 kmem_cache_free(user_ns_cachep, ns); 114 89 put_user_ns(parent); 115 90 } ··· 399 372 struct user_namespace *lower_ns; 400 373 uid_t lower; 401 374 402 - lower_ns = current_user_ns(); 375 + lower_ns = seq_user_ns(seq); 403 376 if ((lower_ns == ns) && lower_ns->parent) 404 377 lower_ns = lower_ns->parent; 405 378 ··· 420 393 struct user_namespace *lower_ns; 421 394 gid_t lower; 422 395 423 - lower_ns = current_user_ns(); 396 + lower_ns = seq_user_ns(seq); 424 397 if ((lower_ns == ns) && lower_ns->parent) 425 398 lower_ns = lower_ns->parent; 426 399 ··· 696 669 { 697 670 struct seq_file *seq = file->private_data; 698 671 struct user_namespace *ns = seq->private; 672 + struct user_namespace *seq_ns = seq_user_ns(seq); 699 673 700 674 if (!ns->parent) 675 + return -EPERM; 676 + 677 + if ((seq_ns != ns) && (seq_ns != ns->parent)) 701 678 return -EPERM; 702 679 703 
680 return map_write(file, buf, size, ppos, CAP_SETUID, ··· 712 681 { 713 682 struct seq_file *seq = file->private_data; 714 683 struct user_namespace *ns = seq->private; 684 + struct user_namespace *seq_ns = seq_user_ns(seq); 715 685 716 686 if (!ns->parent) 687 + return -EPERM; 688 + 689 + if ((seq_ns != ns) && (seq_ns != ns->parent)) 717 690 return -EPERM; 718 691 719 692 return map_write(file, buf, size, ppos, CAP_SETGID, ··· 744 709 static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 745 710 struct uid_gid_map *new_map) 746 711 { 712 + /* Allow mapping to your own filesystem ids */ 713 + if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { 714 + u32 id = new_map->extent[0].lower_first; 715 + if (cap_setid == CAP_SETUID) { 716 + kuid_t uid = make_kuid(ns->parent, id); 717 + if (uid_eq(uid, current_fsuid())) 718 + return true; 719 + } 720 + else if (cap_setid == CAP_SETGID) { 721 + kgid_t gid = make_kgid(ns->parent, id); 722 + if (gid_eq(gid, current_fsgid())) 723 + return true; 724 + } 725 + } 726 + 747 727 /* Allow anyone to set a mapping that doesn't require privilege */ 748 728 if (!cap_valid(cap_setid)) 749 729 return true; ··· 771 721 772 722 return false; 773 723 } 724 + 725 + static void *userns_get(struct task_struct *task) 726 + { 727 + struct user_namespace *user_ns; 728 + 729 + rcu_read_lock(); 730 + user_ns = get_user_ns(__task_cred(task)->user_ns); 731 + rcu_read_unlock(); 732 + 733 + return user_ns; 734 + } 735 + 736 + static void userns_put(void *ns) 737 + { 738 + put_user_ns(ns); 739 + } 740 + 741 + static int userns_install(struct nsproxy *nsproxy, void *ns) 742 + { 743 + struct user_namespace *user_ns = ns; 744 + struct cred *cred; 745 + 746 + /* Don't allow gaining capabilities by reentering 747 + * the same user namespace. 
748 + */ 749 + if (user_ns == current_user_ns()) 750 + return -EINVAL; 751 + 752 + /* Threaded many not enter a different user namespace */ 753 + if (atomic_read(&current->mm->mm_users) > 1) 754 + return -EINVAL; 755 + 756 + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) 757 + return -EPERM; 758 + 759 + cred = prepare_creds(); 760 + if (!cred) 761 + return -ENOMEM; 762 + 763 + put_user_ns(cred->user_ns); 764 + set_cred_user_ns(cred, get_user_ns(user_ns)); 765 + 766 + return commit_creds(cred); 767 + } 768 + 769 + static unsigned int userns_inum(void *ns) 770 + { 771 + struct user_namespace *user_ns = ns; 772 + return user_ns->proc_inum; 773 + } 774 + 775 + const struct proc_ns_operations userns_operations = { 776 + .name = "user", 777 + .type = CLONE_NEWUSER, 778 + .get = userns_get, 779 + .put = userns_put, 780 + .install = userns_install, 781 + .inum = userns_inum, 782 + }; 774 783 775 784 static __init int user_namespaces_init(void) 776 785 {
+26 -7
kernel/utsname.c
··· 32 32 * @old_ns: namespace to clone 33 33 * Return NULL on error (failure to kmalloc), new ns otherwise 34 34 */ 35 - static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, 35 + static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, 36 36 struct uts_namespace *old_ns) 37 37 { 38 38 struct uts_namespace *ns; 39 + int err; 39 40 40 41 ns = create_uts_ns(); 41 42 if (!ns) 42 43 return ERR_PTR(-ENOMEM); 43 44 45 + err = proc_alloc_inum(&ns->proc_inum); 46 + if (err) { 47 + kfree(ns); 48 + return ERR_PTR(err); 49 + } 50 + 44 51 down_read(&uts_sem); 45 52 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 46 - ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); 53 + ns->user_ns = get_user_ns(user_ns); 47 54 up_read(&uts_sem); 48 55 return ns; 49 56 } ··· 62 55 * versa. 63 56 */ 64 57 struct uts_namespace *copy_utsname(unsigned long flags, 65 - struct task_struct *tsk) 58 + struct user_namespace *user_ns, struct uts_namespace *old_ns) 66 59 { 67 - struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; 68 60 struct uts_namespace *new_ns; 69 61 70 62 BUG_ON(!old_ns); ··· 72 66 if (!(flags & CLONE_NEWUTS)) 73 67 return old_ns; 74 68 75 - new_ns = clone_uts_ns(tsk, old_ns); 69 + new_ns = clone_uts_ns(user_ns, old_ns); 76 70 77 71 put_uts_ns(old_ns); 78 72 return new_ns; ··· 84 78 85 79 ns = container_of(kref, struct uts_namespace, kref); 86 80 put_user_ns(ns->user_ns); 81 + proc_free_inum(ns->proc_inum); 87 82 kfree(ns); 88 83 } 89 84 ··· 109 102 put_uts_ns(ns); 110 103 } 111 104 112 - static int utsns_install(struct nsproxy *nsproxy, void *ns) 105 + static int utsns_install(struct nsproxy *nsproxy, void *new) 113 106 { 107 + struct uts_namespace *ns = new; 108 + 109 + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) 110 + return -EPERM; 111 + 114 112 get_uts_ns(ns); 115 113 put_uts_ns(nsproxy->uts_ns); 116 114 nsproxy->uts_ns = ns; 117 115 return 0; 116 + } 117 + 118 + static unsigned int utsns_inum(void *vp) 119 + { 120 + struct 
uts_namespace *ns = vp; 121 + 122 + return ns->proc_inum; 118 123 } 119 124 120 125 const struct proc_ns_operations utsns_operations = { ··· 135 116 .get = utsns_get, 136 117 .put = utsns_put, 137 118 .install = utsns_install, 119 + .inum = utsns_inum, 138 120 }; 139 -
+30 -1
net/core/net_namespace.c
··· 381 381 } 382 382 EXPORT_SYMBOL_GPL(get_net_ns_by_pid); 383 383 384 + static __net_init int net_ns_net_init(struct net *net) 385 + { 386 + return proc_alloc_inum(&net->proc_inum); 387 + } 388 + 389 + static __net_exit void net_ns_net_exit(struct net *net) 390 + { 391 + proc_free_inum(net->proc_inum); 392 + } 393 + 394 + static struct pernet_operations __net_initdata net_ns_ops = { 395 + .init = net_ns_net_init, 396 + .exit = net_ns_net_exit, 397 + }; 398 + 384 399 static int __init net_ns_init(void) 385 400 { 386 401 struct net_generic *ng; ··· 426 411 rtnl_unlock(); 427 412 428 413 mutex_unlock(&net_mutex); 414 + 415 + register_pernet_subsys(&net_ns_ops); 429 416 430 417 return 0; 431 418 } ··· 647 630 648 631 static int netns_install(struct nsproxy *nsproxy, void *ns) 649 632 { 633 + struct net *net = ns; 634 + 635 + if (!ns_capable(net->user_ns, CAP_SYS_ADMIN)) 636 + return -EPERM; 637 + 650 638 put_net(nsproxy->net_ns); 651 - nsproxy->net_ns = get_net(ns); 639 + nsproxy->net_ns = get_net(net); 652 640 return 0; 641 + } 642 + 643 + static unsigned int netns_inum(void *ns) 644 + { 645 + struct net *net = ns; 646 + return net->proc_inum; 653 647 } 654 648 655 649 const struct proc_ns_operations netns_operations = { ··· 669 641 .get = netns_get, 670 642 .put = netns_put, 671 643 .install = netns_install, 644 + .inum = netns_inum, 672 645 }; 673 646 #endif
+9 -3
security/yama/yama_lsm.c
··· 298 298 /* No additional restrictions. */ 299 299 break; 300 300 case YAMA_SCOPE_RELATIONAL: 301 + rcu_read_lock(); 301 302 if (!task_is_descendant(current, child) && 302 303 !ptracer_exception_found(current, child) && 303 - !ns_capable(task_user_ns(child), CAP_SYS_PTRACE)) 304 + !ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) 304 305 rc = -EPERM; 306 + rcu_read_unlock(); 305 307 break; 306 308 case YAMA_SCOPE_CAPABILITY: 307 - if (!ns_capable(task_user_ns(child), CAP_SYS_PTRACE)) 309 + rcu_read_lock(); 310 + if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) 308 311 rc = -EPERM; 312 + rcu_read_unlock(); 309 313 break; 310 314 case YAMA_SCOPE_NO_ATTACH: 311 315 default: ··· 347 343 /* Only disallow PTRACE_TRACEME on more aggressive settings. */ 348 344 switch (ptrace_scope) { 349 345 case YAMA_SCOPE_CAPABILITY: 350 - if (!ns_capable(task_user_ns(parent), CAP_SYS_PTRACE)) 346 + rcu_read_lock(); 347 + if (!ns_capable(__task_cred(parent)->user_ns, CAP_SYS_PTRACE)) 351 348 rc = -EPERM; 349 + rcu_read_unlock(); 352 350 break; 353 351 case YAMA_SCOPE_NO_ATTACH: 354 352 rc = -EPERM;