Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull user namespace changes from Eric Biederman:
"While small this set of changes is very significant with respect to
containers in general and user namespaces in particular. The user
space interface is now complete.

This set of changes adds support for unprivileged users to create user
namespaces and as a user namespace root to create other namespaces.
The tyranny of supporting suid root preventing unprivileged users from
using cool new kernel features is broken.

This set of changes completes the work on setns, adding support for
the pid, user, mount namespaces.

This set of changes includes a bunch of basic pid namespace
cleanups/simplifications. Of particular significance is the rework of
the pid namespace cleanup so it no longer requires sending out
tendrils into all kinds of unexpected cleanup paths for operation. At
least one case of broken error handling is fixed by this cleanup.

The files under /proc/<pid>/ns/ have been converted from regular files
to magic symlinks which prevents incorrect caching by the VFS,
ensuring the files always refer to the namespace the process is
currently using and ensuring that the ptrace_may_access permission
checks are always applied.

The files under /proc/<pid>/ns/ have been given stable inode numbers
so it is now possible to see if different processes share the same
namespaces.

Through David Miller's net tree are changes to relax many of the
permission checks in the networking stack to allow the user
namespace root to usefully use the networking stack. Similar changes
for the mount namespace and the pid namespace are coming through my
tree.

Two small changes to add user namespace support were committed here and
in David Miller's -net tree so that I could complete the work on the
/proc/<pid>/ns/ files in this tree.

Work remains to make it safe to build user namespaces and 9p, afs,
ceph, cifs, coda, gfs2, ncpfs, nfs, nfsd, ocfs2, and xfs so the
Kconfig guard remains in place preventing user namespaces from
being built when any of those filesystems are enabled.

Future design work remains to allow root users outside of the initial
user namespace to mount more than just /proc and /sys."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (38 commits)
proc: Usable inode numbers for the namespace file descriptors.
proc: Fix the namespace inode permission checks.
proc: Generalize proc inode allocation
userns: Allow unprivilged mounts of proc and sysfs
userns: For /proc/self/{uid,gid}_map derive the lower userns from the struct file
procfs: Print task uids and gids in the userns that opened the proc file
userns: Implement unshare of the user namespace
userns: Implent proc namespace operations
userns: Kill task_user_ns
userns: Make create_new_namespaces take a user_ns parameter
userns: Allow unprivileged use of setns.
userns: Allow unprivileged users to create new namespaces
userns: Allow setting a userns mapping to your current uid.
userns: Allow chown and setgid preservation
userns: Allow unprivileged users to create user namespaces.
userns: Ignore suid and sgid on binaries if the uid or gid can not be mapped
userns: fix return value on mntns_install() failure
vfs: Allow unprivileged manipulation of the mount namespace.
vfs: Only support slave subtrees across different user namespaces
vfs: Add a user namespace reference from struct mnt_namespace
...

+996 -451
+1 -1
arch/powerpc/platforms/cell/spufs/sched.c
··· 1094 1094 LOAD_INT(c), LOAD_FRAC(c), 1095 1095 count_active_contexts(), 1096 1096 atomic_read(&nr_spu_contexts), 1097 - current->nsproxy->pid_ns->last_pid); 1097 + task_active_pid_ns(current)->last_pid); 1098 1098 return 0; 1099 1099 } 1100 1100
+1 -1
arch/um/drivers/mconsole_kern.c
··· 123 123 124 124 void mconsole_proc(struct mc_request *req) 125 125 { 126 - struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt; 126 + struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt; 127 127 char *buf; 128 128 int len; 129 129 struct file *file;
+2 -1
drivers/staging/android/binder.c
··· 35 35 #include <linux/uaccess.h> 36 36 #include <linux/vmalloc.h> 37 37 #include <linux/slab.h> 38 + #include <linux/pid_namespace.h> 38 39 39 40 #include "binder.h" 40 41 #include "binder_trace.h" ··· 2321 2320 if (t->from) { 2322 2321 struct task_struct *sender = t->from->proc->tsk; 2323 2322 tr.sender_pid = task_tgid_nr_ns(sender, 2324 - current->nsproxy->pid_ns); 2323 + task_active_pid_ns(current)); 2325 2324 } else { 2326 2325 tr.sender_pid = 0; 2327 2326 }
+7 -4
fs/attr.c
··· 49 49 /* Make sure a caller can chown. */ 50 50 if ((ia_valid & ATTR_UID) && 51 51 (!uid_eq(current_fsuid(), inode->i_uid) || 52 - !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN)) 52 + !uid_eq(attr->ia_uid, inode->i_uid)) && 53 + !inode_capable(inode, CAP_CHOWN)) 53 54 return -EPERM; 54 55 55 56 /* Make sure caller can chgrp. */ 56 57 if ((ia_valid & ATTR_GID) && 57 58 (!uid_eq(current_fsuid(), inode->i_uid) || 58 59 (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && 59 - !capable(CAP_CHOWN)) 60 + !inode_capable(inode, CAP_CHOWN)) 60 61 return -EPERM; 61 62 62 63 /* Make sure a caller can chmod. */ ··· 66 65 return -EPERM; 67 66 /* Also check the setgid bit! */ 68 67 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : 69 - inode->i_gid) && !capable(CAP_FSETID)) 68 + inode->i_gid) && 69 + !inode_capable(inode, CAP_FSETID)) 70 70 attr->ia_mode &= ~S_ISGID; 71 71 } 72 72 ··· 159 157 if (ia_valid & ATTR_MODE) { 160 158 umode_t mode = attr->ia_mode; 161 159 162 - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) 160 + if (!in_group_p(inode->i_gid) && 161 + !inode_capable(inode, CAP_FSETID)) 163 162 mode &= ~S_ISGID; 164 163 inode->i_mode = mode; 165 164 }
+4 -4
fs/autofs4/autofs_i.h
··· 74 74 unsigned long last_used; 75 75 atomic_t count; 76 76 77 - uid_t uid; 78 - gid_t gid; 77 + kuid_t uid; 78 + kgid_t gid; 79 79 }; 80 80 81 81 #define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ ··· 89 89 struct qstr name; 90 90 u32 dev; 91 91 u64 ino; 92 - uid_t uid; 93 - gid_t gid; 92 + kuid_t uid; 93 + kgid_t gid; 94 94 pid_t pid; 95 95 pid_t tgid; 96 96 /* This is for status reporting upon return */
+2 -2
fs/autofs4/dev-ioctl.c
··· 437 437 err = 0; 438 438 autofs4_expire_wait(path.dentry); 439 439 spin_lock(&sbi->fs_lock); 440 - param->requester.uid = ino->uid; 441 - param->requester.gid = ino->gid; 440 + param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); 441 + param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid); 442 442 spin_unlock(&sbi->fs_lock); 443 443 } 444 444 path_put(&path);
+15 -9
fs/autofs4/inode.c
··· 36 36 37 37 void autofs4_clean_ino(struct autofs_info *ino) 38 38 { 39 - ino->uid = 0; 40 - ino->gid = 0; 39 + ino->uid = GLOBAL_ROOT_UID; 40 + ino->gid = GLOBAL_ROOT_GID; 41 41 ino->last_used = jiffies; 42 42 } 43 43 ··· 79 79 return 0; 80 80 81 81 seq_printf(m, ",fd=%d", sbi->pipefd); 82 - if (root_inode->i_uid != 0) 83 - seq_printf(m, ",uid=%u", root_inode->i_uid); 84 - if (root_inode->i_gid != 0) 85 - seq_printf(m, ",gid=%u", root_inode->i_gid); 82 + if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID)) 83 + seq_printf(m, ",uid=%u", 84 + from_kuid_munged(&init_user_ns, root_inode->i_uid)); 85 + if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) 86 + seq_printf(m, ",gid=%u", 87 + from_kgid_munged(&init_user_ns, root_inode->i_gid)); 86 88 seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); 87 89 seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); 88 90 seq_printf(m, ",minproto=%d", sbi->min_proto); ··· 128 126 {Opt_err, NULL} 129 127 }; 130 128 131 - static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, 129 + static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, 132 130 pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) 133 131 { 134 132 char *p; ··· 161 159 case Opt_uid: 162 160 if (match_int(args, &option)) 163 161 return 1; 164 - *uid = option; 162 + *uid = make_kuid(current_user_ns(), option); 163 + if (!uid_valid(*uid)) 164 + return 1; 165 165 break; 166 166 case Opt_gid: 167 167 if (match_int(args, &option)) 168 168 return 1; 169 - *gid = option; 169 + *gid = make_kgid(current_user_ns(), option); 170 + if (!gid_valid(*gid)) 171 + return 1; 170 172 break; 171 173 case Opt_pgrp: 172 174 if (match_int(args, &option))
+3 -2
fs/autofs4/waitq.c
··· 154 154 case autofs_ptype_expire_direct: 155 155 { 156 156 struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; 157 + struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns; 157 158 158 159 pktsz = sizeof(*packet); 159 160 ··· 164 163 packet->name[wq->name.len] = '\0'; 165 164 packet->dev = wq->dev; 166 165 packet->ino = wq->ino; 167 - packet->uid = wq->uid; 168 - packet->gid = wq->gid; 166 + packet->uid = from_kuid_munged(user_ns, wq->uid); 167 + packet->gid = from_kgid_munged(user_ns, wq->gid); 169 168 packet->pid = wq->pid; 170 169 packet->tgid = wq->tgid; 171 170 break;
+3 -6
fs/exec.c
··· 1266 1266 bprm->cred->egid = current_egid(); 1267 1267 1268 1268 if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && 1269 - !current->no_new_privs) { 1269 + !current->no_new_privs && 1270 + kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) && 1271 + kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) { 1270 1272 /* Set-uid? */ 1271 1273 if (mode & S_ISUID) { 1272 - if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid)) 1273 - return -EPERM; 1274 1274 bprm->per_clear |= PER_CLEAR_ON_SETID; 1275 1275 bprm->cred->euid = inode->i_uid; 1276 - 1277 1276 } 1278 1277 1279 1278 /* Set-gid? */ ··· 1282 1283 * executable. 1283 1284 */ 1284 1285 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { 1285 - if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) 1286 - return -EPERM; 1287 1286 bprm->per_clear |= PER_CLEAR_ON_SETID; 1288 1287 bprm->cred->egid = inode->i_gid; 1289 1288 }
+2 -2
fs/fuse/dev.c
··· 92 92 93 93 static void fuse_req_init_context(struct fuse_req *req) 94 94 { 95 - req->in.h.uid = current_fsuid(); 96 - req->in.h.gid = current_fsgid(); 95 + req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid()); 96 + req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid()); 97 97 req->in.h.pid = current->pid; 98 98 } 99 99
+10 -10
fs/fuse/dir.c
··· 818 818 stat->ino = attr->ino; 819 819 stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); 820 820 stat->nlink = attr->nlink; 821 - stat->uid = attr->uid; 822 - stat->gid = attr->gid; 821 + stat->uid = make_kuid(&init_user_ns, attr->uid); 822 + stat->gid = make_kgid(&init_user_ns, attr->gid); 823 823 stat->rdev = inode->i_rdev; 824 824 stat->atime.tv_sec = attr->atime; 825 825 stat->atime.tv_nsec = attr->atimensec; ··· 1007 1007 rcu_read_lock(); 1008 1008 ret = 0; 1009 1009 cred = __task_cred(task); 1010 - if (cred->euid == fc->user_id && 1011 - cred->suid == fc->user_id && 1012 - cred->uid == fc->user_id && 1013 - cred->egid == fc->group_id && 1014 - cred->sgid == fc->group_id && 1015 - cred->gid == fc->group_id) 1010 + if (uid_eq(cred->euid, fc->user_id) && 1011 + uid_eq(cred->suid, fc->user_id) && 1012 + uid_eq(cred->uid, fc->user_id) && 1013 + gid_eq(cred->egid, fc->group_id) && 1014 + gid_eq(cred->sgid, fc->group_id) && 1015 + gid_eq(cred->gid, fc->group_id)) 1016 1016 ret = 1; 1017 1017 rcu_read_unlock(); 1018 1018 ··· 1306 1306 if (ivalid & ATTR_MODE) 1307 1307 arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; 1308 1308 if (ivalid & ATTR_UID) 1309 - arg->valid |= FATTR_UID, arg->uid = iattr->ia_uid; 1309 + arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid); 1310 1310 if (ivalid & ATTR_GID) 1311 - arg->valid |= FATTR_GID, arg->gid = iattr->ia_gid; 1311 + arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid); 1312 1312 if (ivalid & ATTR_SIZE) 1313 1313 arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; 1314 1314 if (ivalid & ATTR_ATIME) {
+2 -2
fs/fuse/fuse_i.h
··· 333 333 atomic_t count; 334 334 335 335 /** The user id for this mount */ 336 - uid_t user_id; 336 + kuid_t user_id; 337 337 338 338 /** The group id for this mount */ 339 - gid_t group_id; 339 + kgid_t group_id; 340 340 341 341 /** The fuse mount flags for this mount */ 342 342 unsigned flags;
+14 -9
fs/fuse/inode.c
··· 60 60 struct fuse_mount_data { 61 61 int fd; 62 62 unsigned rootmode; 63 - unsigned user_id; 64 - unsigned group_id; 63 + kuid_t user_id; 64 + kgid_t group_id; 65 65 unsigned fd_present:1; 66 66 unsigned rootmode_present:1; 67 67 unsigned user_id_present:1; ··· 164 164 inode->i_ino = fuse_squash_ino(attr->ino); 165 165 inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); 166 166 set_nlink(inode, attr->nlink); 167 - inode->i_uid = attr->uid; 168 - inode->i_gid = attr->gid; 167 + inode->i_uid = make_kuid(&init_user_ns, attr->uid); 168 + inode->i_gid = make_kgid(&init_user_ns, attr->gid); 169 169 inode->i_blocks = attr->blocks; 170 170 inode->i_atime.tv_sec = attr->atime; 171 171 inode->i_atime.tv_nsec = attr->atimensec; ··· 492 492 case OPT_USER_ID: 493 493 if (match_int(&args[0], &value)) 494 494 return 0; 495 - d->user_id = value; 495 + d->user_id = make_kuid(current_user_ns(), value); 496 + if (!uid_valid(d->user_id)) 497 + return 0; 496 498 d->user_id_present = 1; 497 499 break; 498 500 499 501 case OPT_GROUP_ID: 500 502 if (match_int(&args[0], &value)) 501 503 return 0; 502 - d->group_id = value; 504 + d->group_id = make_kgid(current_user_ns(), value); 505 + if (!gid_valid(d->group_id)) 506 + return 0; 503 507 d->group_id_present = 1; 504 508 break; 505 509 ··· 544 540 struct super_block *sb = root->d_sb; 545 541 struct fuse_conn *fc = get_fuse_conn_super(sb); 546 542 547 - seq_printf(m, ",user_id=%u", fc->user_id); 548 - seq_printf(m, ",group_id=%u", fc->group_id); 543 + seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id)); 544 + seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id)); 549 545 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) 550 546 seq_puts(m, ",default_permissions"); 551 547 if (fc->flags & FUSE_ALLOW_OTHER) ··· 993 989 if (!file) 994 990 goto err; 995 991 996 - if (file->f_op != &fuse_dev_operations) 992 + if ((file->f_op != &fuse_dev_operations) || 993 + (file->f_cred->user_ns != 
&init_user_ns)) 997 994 goto err_fput; 998 995 999 996 fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+1 -1
fs/hppfs/hppfs.c
··· 710 710 struct vfsmount *proc_mnt; 711 711 int err = -ENOENT; 712 712 713 - proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt); 713 + proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt); 714 714 if (IS_ERR(proc_mnt)) 715 715 goto out; 716 716
+3
fs/mount.h
··· 4 4 5 5 struct mnt_namespace { 6 6 atomic_t count; 7 + unsigned int proc_inum; 7 8 struct mount * root; 8 9 struct list_head list; 10 + struct user_namespace *user_ns; 11 + u64 seq; /* Sequence number to prevent loops */ 9 12 wait_queue_head_t poll; 10 13 int event; 11 14 };
+175 -36
fs/namespace.c
··· 12 12 #include <linux/export.h> 13 13 #include <linux/capability.h> 14 14 #include <linux/mnt_namespace.h> 15 + #include <linux/user_namespace.h> 15 16 #include <linux/namei.h> 16 17 #include <linux/security.h> 17 18 #include <linux/idr.h> ··· 21 20 #include <linux/fs_struct.h> /* get_fs_root et.al. */ 22 21 #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ 23 22 #include <linux/uaccess.h> 23 + #include <linux/proc_fs.h> 24 24 #include "pnode.h" 25 25 #include "internal.h" 26 26 ··· 786 784 if (!mnt) 787 785 return ERR_PTR(-ENOMEM); 788 786 789 - if (flag & (CL_SLAVE | CL_PRIVATE)) 787 + if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) 790 788 mnt->mnt_group_id = 0; /* not a peer of original */ 791 789 else 792 790 mnt->mnt_group_id = old->mnt_group_id; ··· 807 805 list_add_tail(&mnt->mnt_instance, &sb->s_mounts); 808 806 br_write_unlock(&vfsmount_lock); 809 807 810 - if (flag & CL_SLAVE) { 808 + if ((flag & CL_SLAVE) || 809 + ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { 811 810 list_add(&mnt->mnt_slave, &old->mnt_slave_list); 812 811 mnt->mnt_master = old; 813 812 CLEAR_MNT_SHARED(mnt); ··· 1269 1266 goto dput_and_out; 1270 1267 1271 1268 retval = -EPERM; 1272 - if (!capable(CAP_SYS_ADMIN)) 1269 + if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) 1273 1270 goto dput_and_out; 1274 1271 1275 1272 retval = do_umount(mnt, flags); ··· 1295 1292 1296 1293 static int mount_is_safe(struct path *path) 1297 1294 { 1298 - if (capable(CAP_SYS_ADMIN)) 1295 + if (ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN)) 1299 1296 return 0; 1300 1297 return -EPERM; 1301 1298 #ifdef notyet ··· 1309 1306 return -EPERM; 1310 1307 return 0; 1311 1308 #endif 1309 + } 1310 + 1311 + static bool mnt_ns_loop(struct path *path) 1312 + { 1313 + /* Could bind mounting the mount namespace inode cause a 1314 + * mount namespace loop? 
1315 + */ 1316 + struct inode *inode = path->dentry->d_inode; 1317 + struct proc_inode *ei; 1318 + struct mnt_namespace *mnt_ns; 1319 + 1320 + if (!proc_ns_inode(inode)) 1321 + return false; 1322 + 1323 + ei = PROC_I(inode); 1324 + if (ei->ns_ops != &mntns_operations) 1325 + return false; 1326 + 1327 + mnt_ns = ei->ns; 1328 + return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; 1312 1329 } 1313 1330 1314 1331 struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, ··· 1633 1610 int type; 1634 1611 int err = 0; 1635 1612 1636 - if (!capable(CAP_SYS_ADMIN)) 1613 + if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) 1637 1614 return -EPERM; 1638 1615 1639 1616 if (path->dentry != path->mnt->mnt_root) ··· 1677 1654 err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); 1678 1655 if (err) 1679 1656 return err; 1657 + 1658 + err = -EINVAL; 1659 + if (mnt_ns_loop(&old_path)) 1660 + goto out; 1680 1661 1681 1662 err = lock_mount(path); 1682 1663 if (err) ··· 1797 1770 struct mount *p; 1798 1771 struct mount *old; 1799 1772 int err = 0; 1800 - if (!capable(CAP_SYS_ADMIN)) 1773 + if (!ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN)) 1801 1774 return -EPERM; 1802 1775 if (!old_name || !*old_name) 1803 1776 return -EINVAL; ··· 1884 1857 return ERR_PTR(err); 1885 1858 } 1886 1859 1887 - static struct vfsmount * 1888 - do_kern_mount(const char *fstype, int flags, const char *name, void *data) 1889 - { 1890 - struct file_system_type *type = get_fs_type(fstype); 1891 - struct vfsmount *mnt; 1892 - if (!type) 1893 - return ERR_PTR(-ENODEV); 1894 - mnt = vfs_kern_mount(type, flags, name, data); 1895 - if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && 1896 - !mnt->mnt_sb->s_subtype) 1897 - mnt = fs_set_subtype(mnt, fstype); 1898 - put_filesystem(type); 1899 - return mnt; 1900 - } 1901 - 1902 1860 /* 1903 1861 * add a mount into a namespace's mount tree 1904 1862 */ ··· 1929 1917 * create a new mount for userspace and request it to 
be added into the 1930 1918 * namespace's tree 1931 1919 */ 1932 - static int do_new_mount(struct path *path, const char *type, int flags, 1920 + static int do_new_mount(struct path *path, const char *fstype, int flags, 1933 1921 int mnt_flags, const char *name, void *data) 1934 1922 { 1923 + struct file_system_type *type; 1924 + struct user_namespace *user_ns; 1935 1925 struct vfsmount *mnt; 1936 1926 int err; 1937 1927 1938 - if (!type) 1928 + if (!fstype) 1939 1929 return -EINVAL; 1940 1930 1941 1931 /* we need capabilities... */ 1942 - if (!capable(CAP_SYS_ADMIN)) 1932 + user_ns = real_mount(path->mnt)->mnt_ns->user_ns; 1933 + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) 1943 1934 return -EPERM; 1944 1935 1945 - mnt = do_kern_mount(type, flags, name, data); 1936 + type = get_fs_type(fstype); 1937 + if (!type) 1938 + return -ENODEV; 1939 + 1940 + if (user_ns != &init_user_ns) { 1941 + if (!(type->fs_flags & FS_USERNS_MOUNT)) { 1942 + put_filesystem(type); 1943 + return -EPERM; 1944 + } 1945 + /* Only in special cases allow devices from mounts 1946 + * created outside the initial user namespace. 
1947 + */ 1948 + if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { 1949 + flags |= MS_NODEV; 1950 + mnt_flags |= MNT_NODEV; 1951 + } 1952 + } 1953 + 1954 + mnt = vfs_kern_mount(type, flags, name, data); 1955 + if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && 1956 + !mnt->mnt_sb->s_subtype) 1957 + mnt = fs_set_subtype(mnt, fstype); 1958 + 1959 + put_filesystem(type); 1946 1960 if (IS_ERR(mnt)) 1947 1961 return PTR_ERR(mnt); 1948 1962 ··· 2299 2261 return retval; 2300 2262 } 2301 2263 2302 - static struct mnt_namespace *alloc_mnt_ns(void) 2264 + static void free_mnt_ns(struct mnt_namespace *ns) 2265 + { 2266 + proc_free_inum(ns->proc_inum); 2267 + put_user_ns(ns->user_ns); 2268 + kfree(ns); 2269 + } 2270 + 2271 + /* 2272 + * Assign a sequence number so we can detect when we attempt to bind 2273 + * mount a reference to an older mount namespace into the current 2274 + * mount namespace, preventing reference counting loops. A 64bit 2275 + * number incrementing at 10Ghz will take 12,427 years to wrap which 2276 + * is effectively never, so we can ignore the possibility. 2277 + */ 2278 + static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); 2279 + 2280 + static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) 2303 2281 { 2304 2282 struct mnt_namespace *new_ns; 2283 + int ret; 2305 2284 2306 2285 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); 2307 2286 if (!new_ns) 2308 2287 return ERR_PTR(-ENOMEM); 2288 + ret = proc_alloc_inum(&new_ns->proc_inum); 2289 + if (ret) { 2290 + kfree(new_ns); 2291 + return ERR_PTR(ret); 2292 + } 2293 + new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); 2309 2294 atomic_set(&new_ns->count, 1); 2310 2295 new_ns->root = NULL; 2311 2296 INIT_LIST_HEAD(&new_ns->list); 2312 2297 init_waitqueue_head(&new_ns->poll); 2313 2298 new_ns->event = 0; 2299 + new_ns->user_ns = get_user_ns(user_ns); 2314 2300 return new_ns; 2315 2301 } 2316 2302 ··· 2343 2281 * copied from the namespace of the passed in task structure. 
2344 2282 */ 2345 2283 static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, 2346 - struct fs_struct *fs) 2284 + struct user_namespace *user_ns, struct fs_struct *fs) 2347 2285 { 2348 2286 struct mnt_namespace *new_ns; 2349 2287 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; 2350 2288 struct mount *p, *q; 2351 2289 struct mount *old = mnt_ns->root; 2352 2290 struct mount *new; 2291 + int copy_flags; 2353 2292 2354 - new_ns = alloc_mnt_ns(); 2293 + new_ns = alloc_mnt_ns(user_ns); 2355 2294 if (IS_ERR(new_ns)) 2356 2295 return new_ns; 2357 2296 2358 2297 down_write(&namespace_sem); 2359 2298 /* First pass: copy the tree topology */ 2360 - new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE); 2299 + copy_flags = CL_COPY_ALL | CL_EXPIRE; 2300 + if (user_ns != mnt_ns->user_ns) 2301 + copy_flags |= CL_SHARED_TO_SLAVE; 2302 + new = copy_tree(old, old->mnt.mnt_root, copy_flags); 2361 2303 if (IS_ERR(new)) { 2362 2304 up_write(&namespace_sem); 2363 - kfree(new_ns); 2305 + free_mnt_ns(new_ns); 2364 2306 return ERR_CAST(new); 2365 2307 } 2366 2308 new_ns->root = new; ··· 2405 2339 } 2406 2340 2407 2341 struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, 2408 - struct fs_struct *new_fs) 2342 + struct user_namespace *user_ns, struct fs_struct *new_fs) 2409 2343 { 2410 2344 struct mnt_namespace *new_ns; 2411 2345 ··· 2415 2349 if (!(flags & CLONE_NEWNS)) 2416 2350 return ns; 2417 2351 2418 - new_ns = dup_mnt_ns(ns, new_fs); 2352 + new_ns = dup_mnt_ns(ns, user_ns, new_fs); 2419 2353 2420 2354 put_mnt_ns(ns); 2421 2355 return new_ns; ··· 2427 2361 */ 2428 2362 static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) 2429 2363 { 2430 - struct mnt_namespace *new_ns = alloc_mnt_ns(); 2364 + struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); 2431 2365 if (!IS_ERR(new_ns)) { 2432 2366 struct mount *mnt = real_mount(m); 2433 2367 mnt->mnt_ns = new_ns; ··· 2567 2501 struct mount *new_mnt, *root_mnt; 2568 2502 int 
error; 2569 2503 2570 - if (!capable(CAP_SYS_ADMIN)) 2504 + if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) 2571 2505 return -EPERM; 2572 2506 2573 2507 error = user_path_dir(new_root, &new); ··· 2649 2583 struct vfsmount *mnt; 2650 2584 struct mnt_namespace *ns; 2651 2585 struct path root; 2586 + struct file_system_type *type; 2652 2587 2653 - mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); 2588 + type = get_fs_type("rootfs"); 2589 + if (!type) 2590 + panic("Can't find rootfs type"); 2591 + mnt = vfs_kern_mount(type, 0, "rootfs", NULL); 2592 + put_filesystem(type); 2654 2593 if (IS_ERR(mnt)) 2655 2594 panic("Can't create rootfs"); 2656 2595 ··· 2718 2647 br_write_unlock(&vfsmount_lock); 2719 2648 up_write(&namespace_sem); 2720 2649 release_mounts(&umount_list); 2721 - kfree(ns); 2650 + free_mnt_ns(ns); 2722 2651 } 2723 2652 2724 2653 struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) ··· 2752 2681 { 2753 2682 return check_mnt(real_mount(mnt)); 2754 2683 } 2684 + 2685 + static void *mntns_get(struct task_struct *task) 2686 + { 2687 + struct mnt_namespace *ns = NULL; 2688 + struct nsproxy *nsproxy; 2689 + 2690 + rcu_read_lock(); 2691 + nsproxy = task_nsproxy(task); 2692 + if (nsproxy) { 2693 + ns = nsproxy->mnt_ns; 2694 + get_mnt_ns(ns); 2695 + } 2696 + rcu_read_unlock(); 2697 + 2698 + return ns; 2699 + } 2700 + 2701 + static void mntns_put(void *ns) 2702 + { 2703 + put_mnt_ns(ns); 2704 + } 2705 + 2706 + static int mntns_install(struct nsproxy *nsproxy, void *ns) 2707 + { 2708 + struct fs_struct *fs = current->fs; 2709 + struct mnt_namespace *mnt_ns = ns; 2710 + struct path root; 2711 + 2712 + if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || 2713 + !nsown_capable(CAP_SYS_CHROOT)) 2714 + return -EPERM; 2715 + 2716 + if (fs->users != 1) 2717 + return -EINVAL; 2718 + 2719 + get_mnt_ns(mnt_ns); 2720 + put_mnt_ns(nsproxy->mnt_ns); 2721 + nsproxy->mnt_ns = mnt_ns; 2722 + 2723 + /* Find the root */ 2724 + root.mnt = 
&mnt_ns->root->mnt; 2725 + root.dentry = mnt_ns->root->mnt.mnt_root; 2726 + path_get(&root); 2727 + while(d_mountpoint(root.dentry) && follow_down_one(&root)) 2728 + ; 2729 + 2730 + /* Update the pwd and root */ 2731 + set_fs_pwd(fs, &root); 2732 + set_fs_root(fs, &root); 2733 + 2734 + path_put(&root); 2735 + return 0; 2736 + } 2737 + 2738 + static unsigned int mntns_inum(void *ns) 2739 + { 2740 + struct mnt_namespace *mnt_ns = ns; 2741 + return mnt_ns->proc_inum; 2742 + } 2743 + 2744 + const struct proc_ns_operations mntns_operations = { 2745 + .name = "mnt", 2746 + .type = CLONE_NEWNS, 2747 + .get = mntns_get, 2748 + .put = mntns_put, 2749 + .install = mntns_install, 2750 + .inum = mntns_inum, 2751 + };
+1 -1
fs/open.c
··· 435 435 goto dput_and_out; 436 436 437 437 error = -EPERM; 438 - if (!capable(CAP_SYS_CHROOT)) 438 + if (!nsown_capable(CAP_SYS_CHROOT)) 439 439 goto dput_and_out; 440 440 error = security_path_chroot(&path); 441 441 if (error)
+1
fs/pnode.h
··· 22 22 #define CL_COPY_ALL 0x04 23 23 #define CL_MAKE_SHARED 0x08 24 24 #define CL_PRIVATE 0x10 25 + #define CL_SHARED_TO_SLAVE 0x20 25 26 26 27 static inline void set_mnt_shared(struct mount *mnt) 27 28 {
+1
fs/proc/Makefile
··· 21 21 proc-y += version.o 22 22 proc-y += softirqs.o 23 23 proc-y += namespaces.o 24 + proc-y += self.o 24 25 proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o 25 26 proc-$(CONFIG_NET) += proc_net.o 26 27 proc-$(CONFIG_PROC_KCORE) += kcore.o
+1 -1
fs/proc/array.c
··· 162 162 static inline void task_state(struct seq_file *m, struct pid_namespace *ns, 163 163 struct pid *pid, struct task_struct *p) 164 164 { 165 - struct user_namespace *user_ns = current_user_ns(); 165 + struct user_namespace *user_ns = seq_user_ns(m); 166 166 struct group_info *group_info; 167 167 int g; 168 168 struct fdtable *fdt = NULL;
+3 -166
fs/proc/base.c
··· 2345 2345 }; 2346 2346 #endif 2347 2347 2348 - /* 2349 - * /proc/self: 2350 - */ 2351 - static int proc_self_readlink(struct dentry *dentry, char __user *buffer, 2352 - int buflen) 2353 - { 2354 - struct pid_namespace *ns = dentry->d_sb->s_fs_info; 2355 - pid_t tgid = task_tgid_nr_ns(current, ns); 2356 - char tmp[PROC_NUMBUF]; 2357 - if (!tgid) 2358 - return -ENOENT; 2359 - sprintf(tmp, "%d", tgid); 2360 - return vfs_readlink(dentry,buffer,buflen,tmp); 2361 - } 2362 - 2363 - static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) 2364 - { 2365 - struct pid_namespace *ns = dentry->d_sb->s_fs_info; 2366 - pid_t tgid = task_tgid_nr_ns(current, ns); 2367 - char *name = ERR_PTR(-ENOENT); 2368 - if (tgid) { 2369 - /* 11 for max length of signed int in decimal + NULL term */ 2370 - name = kmalloc(12, GFP_KERNEL); 2371 - if (!name) 2372 - name = ERR_PTR(-ENOMEM); 2373 - else 2374 - sprintf(name, "%d", tgid); 2375 - } 2376 - nd_set_link(nd, name); 2377 - return NULL; 2378 - } 2379 - 2380 - static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, 2381 - void *cookie) 2382 - { 2383 - char *s = nd_get_link(nd); 2384 - if (!IS_ERR(s)) 2385 - kfree(s); 2386 - } 2387 - 2388 - static const struct inode_operations proc_self_inode_operations = { 2389 - .readlink = proc_self_readlink, 2390 - .follow_link = proc_self_follow_link, 2391 - .put_link = proc_self_put_link, 2392 - }; 2393 - 2394 - /* 2395 - * proc base 2396 - * 2397 - * These are the directory entries in the root directory of /proc 2398 - * that properly belong to the /proc filesystem, as they describe 2399 - * describe something that is process related. 
2400 - */ 2401 - static const struct pid_entry proc_base_stuff[] = { 2402 - NOD("self", S_IFLNK|S_IRWXUGO, 2403 - &proc_self_inode_operations, NULL, {}), 2404 - }; 2405 - 2406 - static struct dentry *proc_base_instantiate(struct inode *dir, 2407 - struct dentry *dentry, struct task_struct *task, const void *ptr) 2408 - { 2409 - const struct pid_entry *p = ptr; 2410 - struct inode *inode; 2411 - struct proc_inode *ei; 2412 - struct dentry *error; 2413 - 2414 - /* Allocate the inode */ 2415 - error = ERR_PTR(-ENOMEM); 2416 - inode = new_inode(dir->i_sb); 2417 - if (!inode) 2418 - goto out; 2419 - 2420 - /* Initialize the inode */ 2421 - ei = PROC_I(inode); 2422 - inode->i_ino = get_next_ino(); 2423 - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 2424 - 2425 - /* 2426 - * grab the reference to the task. 2427 - */ 2428 - ei->pid = get_task_pid(task, PIDTYPE_PID); 2429 - if (!ei->pid) 2430 - goto out_iput; 2431 - 2432 - inode->i_mode = p->mode; 2433 - if (S_ISDIR(inode->i_mode)) 2434 - set_nlink(inode, 2); 2435 - if (S_ISLNK(inode->i_mode)) 2436 - inode->i_size = 64; 2437 - if (p->iop) 2438 - inode->i_op = p->iop; 2439 - if (p->fop) 2440 - inode->i_fop = p->fop; 2441 - ei->op = p->op; 2442 - d_add(dentry, inode); 2443 - error = NULL; 2444 - out: 2445 - return error; 2446 - out_iput: 2447 - iput(inode); 2448 - goto out; 2449 - } 2450 - 2451 - static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) 2452 - { 2453 - struct dentry *error; 2454 - struct task_struct *task = get_proc_task(dir); 2455 - const struct pid_entry *p, *last; 2456 - 2457 - error = ERR_PTR(-ENOENT); 2458 - 2459 - if (!task) 2460 - goto out_no_task; 2461 - 2462 - /* Lookup the directory entry */ 2463 - last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1]; 2464 - for (p = proc_base_stuff; p <= last; p++) { 2465 - if (p->len != dentry->d_name.len) 2466 - continue; 2467 - if (!memcmp(dentry->d_name.name, p->name, p->len)) 2468 - break; 2469 - } 2470 - if (p > 
last) 2471 - goto out; 2472 - 2473 - error = proc_base_instantiate(dir, dentry, task, p); 2474 - 2475 - out: 2476 - put_task_struct(task); 2477 - out_no_task: 2478 - return error; 2479 - } 2480 - 2481 - static int proc_base_fill_cache(struct file *filp, void *dirent, 2482 - filldir_t filldir, struct task_struct *task, const struct pid_entry *p) 2483 - { 2484 - return proc_fill_cache(filp, dirent, filldir, p->name, p->len, 2485 - proc_base_instantiate, task, p); 2486 - } 2487 - 2488 2348 #ifdef CONFIG_TASK_IO_ACCOUNTING 2489 2349 static int do_io_accounting(struct task_struct *task, char *buffer, int whole) 2490 2350 { ··· 2699 2839 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, 2700 2840 tgid->numbers[i].nr); 2701 2841 } 2702 - 2703 - upid = &pid->numbers[pid->level]; 2704 - if (upid->nr == 1) 2705 - pid_ns_release_proc(upid->ns); 2706 2842 } 2707 2843 2708 2844 static struct dentry *proc_pid_instantiate(struct inode *dir, ··· 2732 2876 2733 2877 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 2734 2878 { 2735 - struct dentry *result; 2879 + struct dentry *result = NULL; 2736 2880 struct task_struct *task; 2737 2881 unsigned tgid; 2738 2882 struct pid_namespace *ns; 2739 - 2740 - result = proc_base_lookup(dir, dentry); 2741 - if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT) 2742 - goto out; 2743 2883 2744 2884 tgid = name_to_int(dentry); 2745 2885 if (tgid == ~0U) ··· 2799 2947 return iter; 2800 2948 } 2801 2949 2802 - #define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) 2950 + #define TGID_OFFSET (FIRST_PROCESS_ENTRY) 2803 2951 2804 2952 static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 2805 2953 struct tgid_iter iter) ··· 2819 2967 /* for the /proc/ directory itself, after non-process stuff has been done */ 2820 2968 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) 2821 2969 { 2822 - unsigned int nr; 2823 - struct task_struct *reaper; 
2824 2970 struct tgid_iter iter; 2825 2971 struct pid_namespace *ns; 2826 2972 filldir_t __filldir; 2827 2973 2828 2974 if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) 2829 - goto out_no_task; 2830 - nr = filp->f_pos - FIRST_PROCESS_ENTRY; 2831 - 2832 - reaper = get_proc_task(filp->f_path.dentry->d_inode); 2833 - if (!reaper) 2834 - goto out_no_task; 2835 - 2836 - for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) { 2837 - const struct pid_entry *p = &proc_base_stuff[nr]; 2838 - if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0) 2839 - goto out; 2840 - } 2975 + goto out; 2841 2976 2842 2977 ns = filp->f_dentry->d_sb->s_fs_info; 2843 2978 iter.task = NULL; ··· 2845 3006 } 2846 3007 filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; 2847 3008 out: 2848 - put_task_struct(reaper); 2849 - out_no_task: 2850 3009 return 0; 2851 3010 } 2852 3011
+13 -13
fs/proc/generic.c
··· 350 350 * Return an inode number between PROC_DYNAMIC_FIRST and 351 351 * 0xffffffff, or zero on failure. 352 352 */ 353 - static unsigned int get_inode_number(void) 353 + int proc_alloc_inum(unsigned int *inum) 354 354 { 355 355 unsigned int i; 356 356 int error; 357 357 358 358 retry: 359 - if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) 360 - return 0; 359 + if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) 360 + return -ENOMEM; 361 361 362 362 spin_lock(&proc_inum_lock); 363 363 error = ida_get_new(&proc_inum_ida, &i); ··· 365 365 if (error == -EAGAIN) 366 366 goto retry; 367 367 else if (error) 368 - return 0; 368 + return error; 369 369 370 370 if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { 371 371 spin_lock(&proc_inum_lock); 372 372 ida_remove(&proc_inum_ida, i); 373 373 spin_unlock(&proc_inum_lock); 374 - return 0; 374 + return -ENOSPC; 375 375 } 376 - return PROC_DYNAMIC_FIRST + i; 376 + *inum = PROC_DYNAMIC_FIRST + i; 377 + return 0; 377 378 } 378 379 379 - static void release_inode_number(unsigned int inum) 380 + void proc_free_inum(unsigned int inum) 380 381 { 381 382 spin_lock(&proc_inum_lock); 382 383 ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); ··· 555 554 556 555 static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) 557 556 { 558 - unsigned int i; 559 557 struct proc_dir_entry *tmp; 558 + int ret; 560 559 561 - i = get_inode_number(); 562 - if (i == 0) 563 - return -EAGAIN; 564 - dp->low_ino = i; 560 + ret = proc_alloc_inum(&dp->low_ino); 561 + if (ret) 562 + return ret; 565 563 566 564 if (S_ISDIR(dp->mode)) { 567 565 if (dp->proc_iops == NULL) { ··· 764 764 765 765 static void free_proc_entry(struct proc_dir_entry *de) 766 766 { 767 - release_inode_number(de->low_ino); 767 + proc_free_inum(de->low_ino); 768 768 769 769 if (S_ISLNK(de->mode)) 770 770 kfree(de->data);
+4 -2
fs/proc/inode.c
··· 31 31 struct proc_dir_entry *de; 32 32 struct ctl_table_header *head; 33 33 const struct proc_ns_operations *ns_ops; 34 + void *ns; 34 35 35 36 truncate_inode_pages(&inode->i_data, 0); 36 37 clear_inode(inode); ··· 50 49 } 51 50 /* Release any associated namespace */ 52 51 ns_ops = PROC_I(inode)->ns_ops; 53 - if (ns_ops && ns_ops->put) 54 - ns_ops->put(PROC_I(inode)->ns); 52 + ns = PROC_I(inode)->ns; 53 + if (ns_ops && ns) 54 + ns_ops->put(ns); 55 55 } 56 56 57 57 static struct kmem_cache * proc_inode_cachep;
+1
fs/proc/internal.h
··· 15 15 struct mempolicy; 16 16 17 17 extern struct proc_dir_entry proc_root; 18 + extern void proc_self_init(void); 18 19 #ifdef CONFIG_PROC_SYSCTL 19 20 extern int proc_sys_init(void); 20 21 extern void sysctl_head_put(struct ctl_table_header *head);
+164 -21
fs/proc/namespaces.c
··· 11 11 #include <net/net_namespace.h> 12 12 #include <linux/ipc_namespace.h> 13 13 #include <linux/pid_namespace.h> 14 + #include <linux/user_namespace.h> 14 15 #include "internal.h" 15 16 16 17 ··· 25 24 #ifdef CONFIG_IPC_NS 26 25 &ipcns_operations, 27 26 #endif 27 + #ifdef CONFIG_PID_NS 28 + &pidns_operations, 29 + #endif 30 + #ifdef CONFIG_USER_NS 31 + &userns_operations, 32 + #endif 33 + &mntns_operations, 28 34 }; 29 35 30 36 static const struct file_operations ns_file_operations = { 31 37 .llseek = no_llseek, 38 + }; 39 + 40 + static const struct inode_operations ns_inode_operations = { 41 + .setattr = proc_setattr, 42 + }; 43 + 44 + static int ns_delete_dentry(const struct dentry *dentry) 45 + { 46 + /* Don't cache namespace inodes when not in use */ 47 + return 1; 48 + } 49 + 50 + static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) 51 + { 52 + struct inode *inode = dentry->d_inode; 53 + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; 54 + 55 + return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", 56 + ns_ops->name, inode->i_ino); 57 + } 58 + 59 + const struct dentry_operations ns_dentry_operations = 60 + { 61 + .d_delete = ns_delete_dentry, 62 + .d_dname = ns_dname, 63 + }; 64 + 65 + static struct dentry *proc_ns_get_dentry(struct super_block *sb, 66 + struct task_struct *task, const struct proc_ns_operations *ns_ops) 67 + { 68 + struct dentry *dentry, *result; 69 + struct inode *inode; 70 + struct proc_inode *ei; 71 + struct qstr qname = { .name = "", }; 72 + void *ns; 73 + 74 + ns = ns_ops->get(task); 75 + if (!ns) 76 + return ERR_PTR(-ENOENT); 77 + 78 + dentry = d_alloc_pseudo(sb, &qname); 79 + if (!dentry) { 80 + ns_ops->put(ns); 81 + return ERR_PTR(-ENOMEM); 82 + } 83 + 84 + inode = iget_locked(sb, ns_ops->inum(ns)); 85 + if (!inode) { 86 + dput(dentry); 87 + ns_ops->put(ns); 88 + return ERR_PTR(-ENOMEM); 89 + } 90 + 91 + ei = PROC_I(inode); 92 + if (inode->i_state & I_NEW) { 93 + inode->i_mtime = 
inode->i_atime = inode->i_ctime = CURRENT_TIME; 94 + inode->i_op = &ns_inode_operations; 95 + inode->i_mode = S_IFREG | S_IRUGO; 96 + inode->i_fop = &ns_file_operations; 97 + ei->ns_ops = ns_ops; 98 + ei->ns = ns; 99 + unlock_new_inode(inode); 100 + } else { 101 + ns_ops->put(ns); 102 + } 103 + 104 + d_set_d_op(dentry, &ns_dentry_operations); 105 + result = d_instantiate_unique(dentry, inode); 106 + if (result) { 107 + dput(dentry); 108 + dentry = result; 109 + } 110 + 111 + return dentry; 112 + } 113 + 114 + static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) 115 + { 116 + struct inode *inode = dentry->d_inode; 117 + struct super_block *sb = inode->i_sb; 118 + struct proc_inode *ei = PROC_I(inode); 119 + struct task_struct *task; 120 + struct dentry *ns_dentry; 121 + void *error = ERR_PTR(-EACCES); 122 + 123 + task = get_proc_task(inode); 124 + if (!task) 125 + goto out; 126 + 127 + if (!ptrace_may_access(task, PTRACE_MODE_READ)) 128 + goto out_put_task; 129 + 130 + ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); 131 + if (IS_ERR(ns_dentry)) { 132 + error = ERR_CAST(ns_dentry); 133 + goto out_put_task; 134 + } 135 + 136 + dput(nd->path.dentry); 137 + nd->path.dentry = ns_dentry; 138 + error = NULL; 139 + 140 + out_put_task: 141 + put_task_struct(task); 142 + out: 143 + return error; 144 + } 145 + 146 + static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) 147 + { 148 + struct inode *inode = dentry->d_inode; 149 + struct proc_inode *ei = PROC_I(inode); 150 + const struct proc_ns_operations *ns_ops = ei->ns_ops; 151 + struct task_struct *task; 152 + void *ns; 153 + char name[50]; 154 + int len = -EACCES; 155 + 156 + task = get_proc_task(inode); 157 + if (!task) 158 + goto out; 159 + 160 + if (!ptrace_may_access(task, PTRACE_MODE_READ)) 161 + goto out_put_task; 162 + 163 + len = -ENOENT; 164 + ns = ns_ops->get(task); 165 + if (!ns) 166 + goto out_put_task; 167 + 168 + snprintf(name, sizeof(name), 
"%s:[%u]", ns_ops->name, ns_ops->inum(ns)); 169 + len = strlen(name); 170 + 171 + if (len > buflen) 172 + len = buflen; 173 + if (copy_to_user(buffer, name, len)) 174 + len = -EFAULT; 175 + 176 + ns_ops->put(ns); 177 + out_put_task: 178 + put_task_struct(task); 179 + out: 180 + return len; 181 + } 182 + 183 + static const struct inode_operations proc_ns_link_inode_operations = { 184 + .readlink = proc_ns_readlink, 185 + .follow_link = proc_ns_follow_link, 186 + .setattr = proc_setattr, 32 187 }; 33 188 34 189 static struct dentry *proc_ns_instantiate(struct inode *dir, ··· 194 37 struct inode *inode; 195 38 struct proc_inode *ei; 196 39 struct dentry *error = ERR_PTR(-ENOENT); 197 - void *ns; 198 40 199 41 inode = proc_pid_make_inode(dir->i_sb, task); 200 42 if (!inode) 201 43 goto out; 202 44 203 - ns = ns_ops->get(task); 204 - if (!ns) 205 - goto out_iput; 206 - 207 45 ei = PROC_I(inode); 208 - inode->i_mode = S_IFREG|S_IRUSR; 209 - inode->i_fop = &ns_file_operations; 210 - ei->ns_ops = ns_ops; 211 - ei->ns = ns; 46 + inode->i_mode = S_IFLNK|S_IRWXUGO; 47 + inode->i_op = &proc_ns_link_inode_operations; 48 + ei->ns_ops = ns_ops; 212 49 213 50 d_set_d_op(dentry, &pid_dentry_operations); 214 51 d_add(dentry, inode); ··· 211 60 error = NULL; 212 61 out: 213 62 return error; 214 - out_iput: 215 - iput(inode); 216 - goto out; 217 63 } 218 64 219 65 static int proc_ns_fill_cache(struct file *filp, void *dirent, ··· 236 88 ret = -ENOENT; 237 89 if (!task) 238 90 goto out_no_task; 239 - 240 - ret = -EPERM; 241 - if (!ptrace_may_access(task, PTRACE_MODE_READ)) 242 - goto out; 243 91 244 92 ret = 0; 245 93 i = filp->f_pos; ··· 296 152 if (!task) 297 153 goto out_no_task; 298 154 299 - error = ERR_PTR(-EPERM); 300 - if (!ptrace_may_access(task, PTRACE_MODE_READ)) 301 - goto out; 302 - 303 155 last = &ns_entries[ARRAY_SIZE(ns_entries)]; 304 156 for (entry = ns_entries; entry < last; entry++) { 305 157 if (strlen((*entry)->name) != len) ··· 303 163 if 
(!memcmp(dentry->d_name.name, (*entry)->name, len)) 304 164 break; 305 165 } 306 - error = ERR_PTR(-ENOENT); 307 166 if (entry == last) 308 167 goto out; 309 168 ··· 337 198 return ERR_PTR(-EINVAL); 338 199 } 339 200 201 + bool proc_ns_inode(struct inode *inode) 202 + { 203 + return inode->i_fop == &ns_file_operations; 204 + }
+3 -14
fs/proc/root.c
··· 100 100 int err; 101 101 struct super_block *sb; 102 102 struct pid_namespace *ns; 103 - struct proc_inode *ei; 104 103 char *options; 105 104 106 105 if (flags & MS_KERNMOUNT) { 107 106 ns = (struct pid_namespace *)data; 108 107 options = NULL; 109 108 } else { 110 - ns = current->nsproxy->pid_ns; 109 + ns = task_active_pid_ns(current); 111 110 options = data; 112 111 } 113 112 ··· 129 130 sb->s_flags |= MS_ACTIVE; 130 131 } 131 132 132 - ei = PROC_I(sb->s_root->d_inode); 133 - if (!ei->pid) { 134 - rcu_read_lock(); 135 - ei->pid = get_pid(find_pid_ns(1, ns)); 136 - rcu_read_unlock(); 137 - } 138 - 139 133 return dget(sb->s_root); 140 134 } 141 135 ··· 145 153 .name = "proc", 146 154 .mount = proc_mount, 147 155 .kill_sb = proc_kill_sb, 156 + .fs_flags = FS_USERNS_MOUNT, 148 157 }; 149 158 150 159 void __init proc_root_init(void) ··· 156 163 err = register_filesystem(&proc_fs_type); 157 164 if (err) 158 165 return; 159 - err = pid_ns_prepare_proc(&init_pid_ns); 160 - if (err) { 161 - unregister_filesystem(&proc_fs_type); 162 - return; 163 - } 164 166 167 + proc_self_init(); 165 168 proc_symlink("mounts", NULL, "self/mounts"); 166 169 167 170 proc_net_init();
+59
fs/proc/self.c
··· 1 + #include <linux/proc_fs.h> 2 + #include <linux/sched.h> 3 + #include <linux/namei.h> 4 + 5 + /* 6 + * /proc/self: 7 + */ 8 + static int proc_self_readlink(struct dentry *dentry, char __user *buffer, 9 + int buflen) 10 + { 11 + struct pid_namespace *ns = dentry->d_sb->s_fs_info; 12 + pid_t tgid = task_tgid_nr_ns(current, ns); 13 + char tmp[PROC_NUMBUF]; 14 + if (!tgid) 15 + return -ENOENT; 16 + sprintf(tmp, "%d", tgid); 17 + return vfs_readlink(dentry,buffer,buflen,tmp); 18 + } 19 + 20 + static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) 21 + { 22 + struct pid_namespace *ns = dentry->d_sb->s_fs_info; 23 + pid_t tgid = task_tgid_nr_ns(current, ns); 24 + char *name = ERR_PTR(-ENOENT); 25 + if (tgid) { 26 + /* 11 for max length of signed int in decimal + NULL term */ 27 + name = kmalloc(12, GFP_KERNEL); 28 + if (!name) 29 + name = ERR_PTR(-ENOMEM); 30 + else 31 + sprintf(name, "%d", tgid); 32 + } 33 + nd_set_link(nd, name); 34 + return NULL; 35 + } 36 + 37 + static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, 38 + void *cookie) 39 + { 40 + char *s = nd_get_link(nd); 41 + if (!IS_ERR(s)) 42 + kfree(s); 43 + } 44 + 45 + static const struct inode_operations proc_self_inode_operations = { 46 + .readlink = proc_self_readlink, 47 + .follow_link = proc_self_follow_link, 48 + .put_link = proc_self_put_link, 49 + }; 50 + 51 + void __init proc_self_init(void) 52 + { 53 + struct proc_dir_entry *proc_self_symlink; 54 + mode_t mode; 55 + 56 + mode = S_IFLNK | S_IRWXUGO; 57 + proc_self_symlink = proc_create("self", mode, NULL, NULL ); 58 + proc_self_symlink->proc_iops = &proc_self_inode_operations; 59 + }
+1
fs/sysfs/mount.c
··· 149 149 .name = "sysfs", 150 150 .mount = sysfs_mount, 151 151 .kill_sb = sysfs_kill_sb, 152 + .fs_flags = FS_USERNS_MOUNT, 152 153 }; 153 154 154 155 int __init sysfs_init(void)
-2
include/linux/cred.h
··· 344 344 extern struct user_namespace init_user_ns; 345 345 #ifdef CONFIG_USER_NS 346 346 #define current_user_ns() (current_cred_xxx(user_ns)) 347 - #define task_user_ns(task) (task_cred_xxx((task), user_ns)) 348 347 #else 349 348 #define current_user_ns() (&init_user_ns) 350 - #define task_user_ns(task) (&init_user_ns) 351 349 #endif 352 350 353 351
+2
include/linux/fs.h
··· 1810 1810 #define FS_REQUIRES_DEV 1 1811 1811 #define FS_BINARY_MOUNTDATA 2 1812 1812 #define FS_HAS_SUBTYPE 4 1813 + #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ 1814 + #define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ 1813 1815 #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ 1814 1816 #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ 1815 1817 struct dentry *(*mount) (struct file_system_type *, int,
+6 -3
include/linux/ipc_namespace.h
··· 67 67 68 68 /* user_ns which owns the ipc ns */ 69 69 struct user_namespace *user_ns; 70 + 71 + unsigned int proc_inum; 70 72 }; 71 73 72 74 extern struct ipc_namespace init_ipc_ns; ··· 135 133 136 134 #if defined(CONFIG_IPC_NS) 137 135 extern struct ipc_namespace *copy_ipcs(unsigned long flags, 138 - struct task_struct *tsk); 136 + struct user_namespace *user_ns, struct ipc_namespace *ns); 137 + 139 138 static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) 140 139 { 141 140 if (ns) ··· 147 144 extern void put_ipc_ns(struct ipc_namespace *ns); 148 145 #else 149 146 static inline struct ipc_namespace *copy_ipcs(unsigned long flags, 150 - struct task_struct *tsk) 147 + struct user_namespace *user_ns, struct ipc_namespace *ns) 151 148 { 152 149 if (flags & CLONE_NEWIPC) 153 150 return ERR_PTR(-EINVAL); 154 151 155 - return tsk->nsproxy->ipc_ns; 152 + return ns; 156 153 } 157 154 158 155 static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
+2 -1
include/linux/mnt_namespace.h
··· 4 4 5 5 struct mnt_namespace; 6 6 struct fs_struct; 7 + struct user_namespace; 7 8 8 9 extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, 9 - struct fs_struct *); 10 + struct user_namespace *, struct fs_struct *); 10 11 extern void put_mnt_ns(struct mnt_namespace *ns); 11 12 12 13 extern const struct file_operations proc_mounts_operations;
+1 -1
include/linux/nsproxy.h
··· 67 67 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); 68 68 void free_nsproxy(struct nsproxy *ns); 69 69 int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, 70 - struct fs_struct *); 70 + struct cred *, struct fs_struct *); 71 71 int __init nsproxy_cache_init(void); 72 72 73 73 static inline void put_nsproxy(struct nsproxy *ns)
+8 -3
include/linux/pid_namespace.h
··· 21 21 struct kref kref; 22 22 struct pidmap pidmap[PIDMAP_ENTRIES]; 23 23 int last_pid; 24 + int nr_hashed; 24 25 struct task_struct *child_reaper; 25 26 struct kmem_cache *pid_cachep; 26 27 unsigned int level; ··· 32 31 #ifdef CONFIG_BSD_PROCESS_ACCT 33 32 struct bsd_acct_struct *bacct; 34 33 #endif 34 + struct user_namespace *user_ns; 35 + struct work_struct proc_work; 35 36 kgid_t pid_gid; 36 37 int hide_pid; 37 38 int reboot; /* group exit code if this pidns was rebooted */ 39 + unsigned int proc_inum; 38 40 }; 39 41 40 42 extern struct pid_namespace init_pid_ns; ··· 50 46 return ns; 51 47 } 52 48 53 - extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *ns); 49 + extern struct pid_namespace *copy_pid_ns(unsigned long flags, 50 + struct user_namespace *user_ns, struct pid_namespace *ns); 54 51 extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); 55 52 extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); 56 53 extern void put_pid_ns(struct pid_namespace *ns); ··· 64 59 return ns; 65 60 } 66 61 67 - static inline struct pid_namespace * 68 - copy_pid_ns(unsigned long flags, struct pid_namespace *ns) 62 + static inline struct pid_namespace *copy_pid_ns(unsigned long flags, 63 + struct user_namespace *user_ns, struct pid_namespace *ns) 69 64 { 70 65 if (flags & CLONE_NEWPID) 71 66 ns = ERR_PTR(-EINVAL);
+25 -1
include/linux/proc_fs.h
··· 28 28 */ 29 29 30 30 enum { 31 - PROC_ROOT_INO = 1, 31 + PROC_ROOT_INO = 1, 32 + PROC_IPC_INIT_INO = 0xEFFFFFFFU, 33 + PROC_UTS_INIT_INO = 0xEFFFFFFEU, 34 + PROC_USER_INIT_INO = 0xEFFFFFFDU, 35 + PROC_PID_INIT_INO = 0xEFFFFFFCU, 32 36 }; 33 37 34 38 /* ··· 178 174 struct proc_dir_entry *parent); 179 175 180 176 extern struct file *proc_ns_fget(int fd); 177 + extern bool proc_ns_inode(struct inode *inode); 181 178 179 + extern int proc_alloc_inum(unsigned int *pino); 180 + extern void proc_free_inum(unsigned int inum); 182 181 #else 183 182 184 183 #define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; }) ··· 236 229 return ERR_PTR(-EINVAL); 237 230 } 238 231 232 + static inline bool proc_ns_inode(struct inode *inode) 233 + { 234 + return false; 235 + } 236 + 237 + static inline int proc_alloc_inum(unsigned int *inum) 238 + { 239 + *inum = 1; 240 + return 0; 241 + } 242 + static inline void proc_free_inum(unsigned int inum) 243 + { 244 + } 239 245 #endif /* CONFIG_PROC_FS */ 240 246 241 247 #if !defined(CONFIG_PROC_KCORE) ··· 267 247 void *(*get)(struct task_struct *task); 268 248 void (*put)(void *ns); 269 249 int (*install)(struct nsproxy *nsproxy, void *ns); 250 + unsigned int (*inum)(void *ns); 270 251 }; 271 252 extern const struct proc_ns_operations netns_operations; 272 253 extern const struct proc_ns_operations utsns_operations; 273 254 extern const struct proc_ns_operations ipcns_operations; 255 + extern const struct proc_ns_operations pidns_operations; 256 + extern const struct proc_ns_operations userns_operations; 257 + extern const struct proc_ns_operations mntns_operations; 274 258 275 259 union proc_op { 276 260 int (*proc_get_link)(struct dentry *, struct path *);
+10
include/linux/user_namespace.h
··· 25 25 struct user_namespace *parent; 26 26 kuid_t owner; 27 27 kgid_t group; 28 + unsigned int proc_inum; 28 29 }; 29 30 30 31 extern struct user_namespace init_user_ns; ··· 40 39 } 41 40 42 41 extern int create_user_ns(struct cred *new); 42 + extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); 43 43 extern void free_user_ns(struct kref *kref); 44 44 45 45 static inline void put_user_ns(struct user_namespace *ns) ··· 66 64 static inline int create_user_ns(struct cred *new) 67 65 { 68 66 return -EINVAL; 67 + } 68 + 69 + static inline int unshare_userns(unsigned long unshare_flags, 70 + struct cred **new_cred) 71 + { 72 + if (unshare_flags & CLONE_NEWUSER) 73 + return -EINVAL; 74 + return 0; 69 75 } 70 76 71 77 static inline void put_user_ns(struct user_namespace *ns)
+4 -3
include/linux/utsname.h
··· 23 23 struct kref kref; 24 24 struct new_utsname name; 25 25 struct user_namespace *user_ns; 26 + unsigned int proc_inum; 26 27 }; 27 28 extern struct uts_namespace init_uts_ns; 28 29 ··· 34 33 } 35 34 36 35 extern struct uts_namespace *copy_utsname(unsigned long flags, 37 - struct task_struct *tsk); 36 + struct user_namespace *user_ns, struct uts_namespace *old_ns); 38 37 extern void free_uts_ns(struct kref *kref); 39 38 40 39 static inline void put_uts_ns(struct uts_namespace *ns) ··· 51 50 } 52 51 53 52 static inline struct uts_namespace *copy_utsname(unsigned long flags, 54 - struct task_struct *tsk) 53 + struct user_namespace *user_ns, struct uts_namespace *old_ns) 55 54 { 56 55 if (flags & CLONE_NEWUTS) 57 56 return ERR_PTR(-EINVAL); 58 57 59 - return tsk->nsproxy->uts_ns; 58 + return old_ns; 60 59 } 61 60 #endif 62 61
+2
include/net/net_namespace.h
··· 56 56 57 57 struct user_namespace *user_ns; /* Owning user namespace */ 58 58 59 + unsigned int proc_inum; 60 + 59 61 struct proc_dir_entry *proc_net; 60 62 struct proc_dir_entry *proc_net_stat; 61 63
-2
init/Kconfig
··· 1069 1069 # Filesystems 1070 1070 depends on 9P_FS = n 1071 1071 depends on AFS_FS = n 1072 - depends on AUTOFS4_FS = n 1073 1072 depends on CEPH_FS = n 1074 1073 depends on CIFS = n 1075 1074 depends on CODA_FS = n 1076 - depends on FUSE_FS = n 1077 1075 depends on GFS2_FS = n 1078 1076 depends on NCP_FS = n 1079 1077 depends on NFSD = n
-1
init/main.c
··· 812 812 system_state = SYSTEM_RUNNING; 813 813 numa_default_policy(); 814 814 815 - current->signal->flags |= SIGNAL_UNKILLABLE; 816 815 flush_delayed_fput(); 817 816 818 817 if (ramdisk_execute_command) {
+2
init/version.c
··· 12 12 #include <linux/utsname.h> 13 13 #include <generated/utsrelease.h> 14 14 #include <linux/version.h> 15 + #include <linux/proc_fs.h> 15 16 16 17 #ifndef CONFIG_KALLSYMS 17 18 #define version(a) Version_ ## a ··· 35 34 .domainname = UTS_DOMAINNAME, 36 35 }, 37 36 .user_ns = &init_user_ns, 37 + .proc_inum = PROC_UTS_INIT_INO, 38 38 }; 39 39 EXPORT_SYMBOL_GPL(init_uts_ns); 40 40
+2
ipc/msgutil.c
··· 16 16 #include <linux/msg.h> 17 17 #include <linux/ipc_namespace.h> 18 18 #include <linux/utsname.h> 19 + #include <linux/proc_fs.h> 19 20 #include <asm/uaccess.h> 20 21 21 22 #include "util.h" ··· 31 30 struct ipc_namespace init_ipc_ns = { 32 31 .count = ATOMIC_INIT(1), 33 32 .user_ns = &init_user_ns, 33 + .proc_inum = PROC_IPC_INIT_INO, 34 34 }; 35 35 36 36 atomic_t nr_ipc_ns = ATOMIC_INIT(1);
+25 -7
ipc/namespace.c
··· 16 16 17 17 #include "util.h" 18 18 19 - static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk, 19 + static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, 20 20 struct ipc_namespace *old_ns) 21 21 { 22 22 struct ipc_namespace *ns; ··· 26 26 if (ns == NULL) 27 27 return ERR_PTR(-ENOMEM); 28 28 29 + err = proc_alloc_inum(&ns->proc_inum); 30 + if (err) { 31 + kfree(ns); 32 + return ERR_PTR(err); 33 + } 34 + 29 35 atomic_set(&ns->count, 1); 30 36 err = mq_init_ns(ns); 31 37 if (err) { 38 + proc_free_inum(ns->proc_inum); 32 39 kfree(ns); 33 40 return ERR_PTR(err); 34 41 } ··· 53 46 ipcns_notify(IPCNS_CREATED); 54 47 register_ipcns_notifier(ns); 55 48 56 - ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); 49 + ns->user_ns = get_user_ns(user_ns); 57 50 58 51 return ns; 59 52 } 60 53 61 54 struct ipc_namespace *copy_ipcs(unsigned long flags, 62 - struct task_struct *tsk) 55 + struct user_namespace *user_ns, struct ipc_namespace *ns) 63 56 { 64 - struct ipc_namespace *ns = tsk->nsproxy->ipc_ns; 65 - 66 57 if (!(flags & CLONE_NEWIPC)) 67 58 return get_ipc_ns(ns); 68 - return create_ipc_ns(tsk, ns); 59 + return create_ipc_ns(user_ns, ns); 69 60 } 70 61 71 62 /* ··· 118 113 */ 119 114 ipcns_notify(IPCNS_REMOVED); 120 115 put_user_ns(ns->user_ns); 116 + proc_free_inum(ns->proc_inum); 121 117 kfree(ns); 122 118 } 123 119 ··· 167 161 return put_ipc_ns(ns); 168 162 } 169 163 170 - static int ipcns_install(struct nsproxy *nsproxy, void *ns) 164 + static int ipcns_install(struct nsproxy *nsproxy, void *new) 171 165 { 166 + struct ipc_namespace *ns = new; 167 + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) 168 + return -EPERM; 169 + 172 170 /* Ditch state from the old ipc namespace */ 173 171 exit_sem(current); 174 172 put_ipc_ns(nsproxy->ipc_ns); 175 173 nsproxy->ipc_ns = get_ipc_ns(ns); 176 174 return 0; 175 + } 176 + 177 + static unsigned int ipcns_inum(void *vp) 178 + { 179 + struct ipc_namespace *ns = vp; 180 + 181 + return 
ns->proc_inum; 177 182 } 178 183 179 184 const struct proc_ns_operations ipcns_operations = { ··· 193 176 .get = ipcns_get, 194 177 .put = ipcns_put, 195 178 .install = ipcns_install, 179 + .inum = ipcns_inum, 196 180 };
+1 -1
kernel/cgroup.c
··· 3409 3409 { 3410 3410 struct cgroup_pidlist *l; 3411 3411 /* don't need task_nsproxy() if we're looking at ourself */ 3412 - struct pid_namespace *ns = current->nsproxy->pid_ns; 3412 + struct pid_namespace *ns = task_active_pid_ns(current); 3413 3413 3414 3414 /* 3415 3415 * We can't drop the pidlist_mutex before taking the l->mutex in case
+1 -1
kernel/events/core.c
··· 6155 6155 6156 6156 event->parent = parent_event; 6157 6157 6158 - event->ns = get_pid_ns(current->nsproxy->pid_ns); 6158 + event->ns = get_pid_ns(task_active_pid_ns(current)); 6159 6159 event->id = atomic64_inc_return(&perf_event_id); 6160 6160 6161 6161 event->state = PERF_EVENT_STATE_INACTIVE;
-12
kernel/exit.c
··· 72 72 list_del_rcu(&p->tasks); 73 73 list_del_init(&p->sibling); 74 74 __this_cpu_dec(process_counts); 75 - /* 76 - * If we are the last child process in a pid namespace to be 77 - * reaped, notify the reaper sleeping zap_pid_ns_processes(). 78 - */ 79 - if (IS_ENABLED(CONFIG_PID_NS)) { 80 - struct task_struct *parent = p->real_parent; 81 - 82 - if ((task_active_pid_ns(parent)->child_reaper == parent) && 83 - list_empty(&parent->children) && 84 - (parent->flags & PF_EXITING)) 85 - wake_up_process(parent); 86 - } 87 75 } 88 76 list_del_rcu(&p->thread_group); 89 77 }
+48 -21
kernel/fork.c
··· 1044 1044 atomic_set(&sig->live, 1); 1045 1045 atomic_set(&sig->sigcnt, 1); 1046 1046 init_waitqueue_head(&sig->wait_chldexit); 1047 - if (clone_flags & CLONE_NEWPID) 1048 - sig->flags |= SIGNAL_UNKILLABLE; 1049 1047 sig->curr_target = tsk; 1050 1048 init_sigpending(&sig->shared_pending); 1051 1049 INIT_LIST_HEAD(&sig->posix_timers); ··· 1436 1438 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); 1437 1439 1438 1440 if (thread_group_leader(p)) { 1439 - if (is_child_reaper(pid)) 1440 - p->nsproxy->pid_ns->child_reaper = p; 1441 + if (is_child_reaper(pid)) { 1442 + ns_of_pid(pid)->child_reaper = p; 1443 + p->signal->flags |= SIGNAL_UNKILLABLE; 1444 + } 1441 1445 1442 1446 p->signal->leader_pid = pid; 1443 1447 p->signal->tty = tty_kref_get(current->signal->tty); ··· 1473 1473 if (p->io_context) 1474 1474 exit_io_context(p); 1475 1475 bad_fork_cleanup_namespaces: 1476 - if (unlikely(clone_flags & CLONE_NEWPID)) 1477 - pid_ns_release_proc(p->nsproxy->pid_ns); 1478 1476 exit_task_namespaces(p); 1479 1477 bad_fork_cleanup_mm: 1480 1478 if (p->mm) ··· 1552 1554 * Do some preliminary argument and permissions checking before we 1553 1555 * actually start allocating stuff 1554 1556 */ 1555 - if (clone_flags & CLONE_NEWUSER) { 1556 - if (clone_flags & CLONE_THREAD) 1557 + if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { 1558 + if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) 1557 1559 return -EINVAL; 1558 - /* hopefully this check will go away when userns support is 1559 - * complete 1560 - */ 1561 - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || 1562 - !capable(CAP_SETGID)) 1563 - return -EPERM; 1564 1560 } 1565 1561 1566 1562 /* ··· 1716 1724 { 1717 1725 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1718 1726 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1719 - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) 1727 + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| 1728 + CLONE_NEWUSER|CLONE_NEWPID)) 1720 1729 return -EINVAL; 1721 1730 /* 1722 1731 * Not 
implemented, but pretend it works if there is nothing to ··· 1784 1791 { 1785 1792 struct fs_struct *fs, *new_fs = NULL; 1786 1793 struct files_struct *fd, *new_fd = NULL; 1794 + struct cred *new_cred = NULL; 1787 1795 struct nsproxy *new_nsproxy = NULL; 1788 1796 int do_sysvsem = 0; 1789 1797 int err; 1790 1798 1791 - err = check_unshare_flags(unshare_flags); 1792 - if (err) 1793 - goto bad_unshare_out; 1794 - 1799 + /* 1800 + * If unsharing a user namespace must also unshare the thread. 1801 + */ 1802 + if (unshare_flags & CLONE_NEWUSER) 1803 + unshare_flags |= CLONE_THREAD; 1804 + /* 1805 + * If unsharing a pid namespace must also unshare the thread. 1806 + */ 1807 + if (unshare_flags & CLONE_NEWPID) 1808 + unshare_flags |= CLONE_THREAD; 1809 + /* 1810 + * If unsharing a thread from a thread group, must also unshare vm. 1811 + */ 1812 + if (unshare_flags & CLONE_THREAD) 1813 + unshare_flags |= CLONE_VM; 1814 + /* 1815 + * If unsharing vm, must also unshare signal handlers. 1816 + */ 1817 + if (unshare_flags & CLONE_VM) 1818 + unshare_flags |= CLONE_SIGHAND; 1795 1819 /* 1796 1820 * If unsharing namespace, must also unshare filesystem information. 
1797 1821 */ 1798 1822 if (unshare_flags & CLONE_NEWNS) 1799 1823 unshare_flags |= CLONE_FS; 1824 + 1825 + err = check_unshare_flags(unshare_flags); 1826 + if (err) 1827 + goto bad_unshare_out; 1800 1828 /* 1801 1829 * CLONE_NEWIPC must also detach from the undolist: after switching 1802 1830 * to a new ipc namespace, the semaphore arrays from the old ··· 1831 1817 err = unshare_fd(unshare_flags, &new_fd); 1832 1818 if (err) 1833 1819 goto bad_unshare_cleanup_fs; 1834 - err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); 1820 + err = unshare_userns(unshare_flags, &new_cred); 1835 1821 if (err) 1836 1822 goto bad_unshare_cleanup_fd; 1823 + err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1824 + new_cred, new_fs); 1825 + if (err) 1826 + goto bad_unshare_cleanup_cred; 1837 1827 1838 - if (new_fs || new_fd || do_sysvsem || new_nsproxy) { 1828 + if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { 1839 1829 if (do_sysvsem) { 1840 1830 /* 1841 1831 * CLONE_SYSVSEM is equivalent to sys_exit(). ··· 1872 1854 } 1873 1855 1874 1856 task_unlock(current); 1857 + 1858 + if (new_cred) { 1859 + /* Install the new user namespace */ 1860 + commit_creds(new_cred); 1861 + new_cred = NULL; 1862 + } 1875 1863 } 1876 1864 1877 1865 if (new_nsproxy) 1878 1866 put_nsproxy(new_nsproxy); 1879 1867 1868 + bad_unshare_cleanup_cred: 1869 + if (new_cred) 1870 + put_cred(new_cred); 1880 1871 bad_unshare_cleanup_fd: 1881 1872 if (new_fd) 1882 1873 put_files_struct(new_fd);
+19 -17
kernel/nsproxy.c
··· 57 57 * leave it to the caller to do proper locking and attach it to task. 58 58 */ 59 59 static struct nsproxy *create_new_namespaces(unsigned long flags, 60 - struct task_struct *tsk, struct fs_struct *new_fs) 60 + struct task_struct *tsk, struct user_namespace *user_ns, 61 + struct fs_struct *new_fs) 61 62 { 62 63 struct nsproxy *new_nsp; 63 64 int err; ··· 67 66 if (!new_nsp) 68 67 return ERR_PTR(-ENOMEM); 69 68 70 - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); 69 + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); 71 70 if (IS_ERR(new_nsp->mnt_ns)) { 72 71 err = PTR_ERR(new_nsp->mnt_ns); 73 72 goto out_ns; 74 73 } 75 74 76 - new_nsp->uts_ns = copy_utsname(flags, tsk); 75 + new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns); 77 76 if (IS_ERR(new_nsp->uts_ns)) { 78 77 err = PTR_ERR(new_nsp->uts_ns); 79 78 goto out_uts; 80 79 } 81 80 82 - new_nsp->ipc_ns = copy_ipcs(flags, tsk); 81 + new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns); 83 82 if (IS_ERR(new_nsp->ipc_ns)) { 84 83 err = PTR_ERR(new_nsp->ipc_ns); 85 84 goto out_ipc; 86 85 } 87 86 88 - new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); 87 + new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); 89 88 if (IS_ERR(new_nsp->pid_ns)) { 90 89 err = PTR_ERR(new_nsp->pid_ns); 91 90 goto out_pid; 92 91 } 93 92 94 - new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns); 93 + new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); 95 94 if (IS_ERR(new_nsp->net_ns)) { 96 95 err = PTR_ERR(new_nsp->net_ns); 97 96 goto out_net; ··· 123 122 int copy_namespaces(unsigned long flags, struct task_struct *tsk) 124 123 { 125 124 struct nsproxy *old_ns = tsk->nsproxy; 125 + struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); 126 126 struct nsproxy *new_ns; 127 127 int err = 0; 128 128 ··· 136 134 CLONE_NEWPID | CLONE_NEWNET))) 137 135 return 0; 138 136 139 - 
if (!capable(CAP_SYS_ADMIN)) { 137 + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { 140 138 err = -EPERM; 141 139 goto out; 142 140 } ··· 153 151 goto out; 154 152 } 155 153 156 - new_ns = create_new_namespaces(flags, tsk, tsk->fs); 154 + new_ns = create_new_namespaces(flags, tsk, 155 + task_cred_xxx(tsk, user_ns), tsk->fs); 157 156 if (IS_ERR(new_ns)) { 158 157 err = PTR_ERR(new_ns); 159 158 goto out; ··· 186 183 * On success, returns the new nsproxy. 187 184 */ 188 185 int unshare_nsproxy_namespaces(unsigned long unshare_flags, 189 - struct nsproxy **new_nsp, struct fs_struct *new_fs) 186 + struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) 190 187 { 188 + struct user_namespace *user_ns; 191 189 int err = 0; 192 190 193 191 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 194 - CLONE_NEWNET))) 192 + CLONE_NEWNET | CLONE_NEWPID))) 195 193 return 0; 196 194 197 - if (!capable(CAP_SYS_ADMIN)) 195 + user_ns = new_cred ? new_cred->user_ns : current_user_ns(); 196 + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) 198 197 return -EPERM; 199 198 200 - *new_nsp = create_new_namespaces(unshare_flags, current, 201 - new_fs ? new_fs : current->fs); 199 + *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, 200 + new_fs ? new_fs : current->fs); 202 201 if (IS_ERR(*new_nsp)) { 203 202 err = PTR_ERR(*new_nsp); 204 203 goto out; ··· 246 241 struct file *file; 247 242 int err; 248 243 249 - if (!capable(CAP_SYS_ADMIN)) 250 - return -EPERM; 251 - 252 244 file = proc_ns_fget(fd); 253 245 if (IS_ERR(file)) 254 246 return PTR_ERR(file); ··· 256 254 if (nstype && (ops->type != nstype)) 257 255 goto out; 258 256 259 - new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); 257 + new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); 260 258 if (IS_ERR(new_nsproxy)) { 261 259 err = PTR_ERR(new_nsproxy); 262 260 goto out;
+39 -8
kernel/pid.c
··· 36 36 #include <linux/pid_namespace.h> 37 37 #include <linux/init_task.h> 38 38 #include <linux/syscalls.h> 39 + #include <linux/proc_fs.h> 39 40 40 41 #define pid_hashfn(nr, ns) \ 41 42 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) ··· 79 78 .last_pid = 0, 80 79 .level = 0, 81 80 .child_reaper = &init_task, 81 + .user_ns = &init_user_ns, 82 + .proc_inum = PROC_PID_INIT_INO, 82 83 }; 83 84 EXPORT_SYMBOL_GPL(init_pid_ns); 84 85 ··· 272 269 unsigned long flags; 273 270 274 271 spin_lock_irqsave(&pidmap_lock, flags); 275 - for (i = 0; i <= pid->level; i++) 276 - hlist_del_rcu(&pid->numbers[i].pid_chain); 272 + for (i = 0; i <= pid->level; i++) { 273 + struct upid *upid = pid->numbers + i; 274 + struct pid_namespace *ns = upid->ns; 275 + hlist_del_rcu(&upid->pid_chain); 276 + switch(--ns->nr_hashed) { 277 + case 1: 278 + /* When all that is left in the pid namespace 279 + * is the reaper wake up the reaper. The reaper 280 + * may be sleeping in zap_pid_ns_processes(). 281 + */ 282 + wake_up_process(ns->child_reaper); 283 + break; 284 + case 0: 285 + ns->nr_hashed = -1; 286 + schedule_work(&ns->proc_work); 287 + break; 288 + } 289 + } 277 290 spin_unlock_irqrestore(&pidmap_lock, flags); 278 291 279 292 for (i = 0; i <= pid->level; i++) ··· 311 292 goto out; 312 293 313 294 tmp = ns; 295 + pid->level = ns->level; 314 296 for (i = ns->level; i >= 0; i--) { 315 297 nr = alloc_pidmap(tmp); 316 298 if (nr < 0) ··· 322 302 tmp = tmp->parent; 323 303 } 324 304 305 + if (unlikely(is_child_reaper(pid))) { 306 + if (pid_ns_prepare_proc(ns)) 307 + goto out_free; 308 + } 309 + 325 310 get_pid_ns(ns); 326 - pid->level = ns->level; 327 311 atomic_set(&pid->count, 1); 328 312 for (type = 0; type < PIDTYPE_MAX; ++type) 329 313 INIT_HLIST_HEAD(&pid->tasks[type]); 330 314 331 315 upid = pid->numbers + ns->level; 332 316 spin_lock_irq(&pidmap_lock); 333 - for ( ; upid >= pid->numbers; --upid) 317 + if (ns->nr_hashed < 0) 318 + goto out_unlock; 319 + for ( ; upid >= 
pid->numbers; --upid) { 334 320 hlist_add_head_rcu(&upid->pid_chain, 335 321 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 322 + upid->ns->nr_hashed++; 323 + } 336 324 spin_unlock_irq(&pidmap_lock); 337 325 338 326 out: 339 327 return pid; 340 328 329 + out_unlock: 330 + spin_unlock(&pidmap_lock); 341 331 out_free: 342 332 while (++i <= ns->level) 343 333 free_pidmap(pid->numbers + i); ··· 374 344 375 345 struct pid *find_vpid(int nr) 376 346 { 377 - return find_pid_ns(nr, current->nsproxy->pid_ns); 347 + return find_pid_ns(nr, task_active_pid_ns(current)); 378 348 } 379 349 EXPORT_SYMBOL_GPL(find_vpid); 380 350 ··· 458 428 459 429 struct task_struct *find_task_by_vpid(pid_t vnr) 460 430 { 461 - return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); 431 + return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); 462 432 } 463 433 464 434 struct pid *get_task_pid(struct task_struct *task, enum pid_type type) ··· 513 483 514 484 pid_t pid_vnr(struct pid *pid) 515 485 { 516 - return pid_nr_ns(pid, current->nsproxy->pid_ns); 486 + return pid_nr_ns(pid, task_active_pid_ns(current)); 517 487 } 518 488 EXPORT_SYMBOL_GPL(pid_vnr); 519 489 ··· 524 494 525 495 rcu_read_lock(); 526 496 if (!ns) 527 - ns = current->nsproxy->pid_ns; 497 + ns = task_active_pid_ns(current); 528 498 if (likely(pid_alive(task))) { 529 499 if (type != PIDTYPE_PID) 530 500 task = task->group_leader; ··· 599 569 /* Reserve PID 0. We never call free_pidmap(0) */ 600 570 set_bit(0, init_pid_ns.pidmap[0].page); 601 571 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 572 + init_pid_ns.nr_hashed = 1; 602 573 603 574 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 604 575 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+89 -23
kernel/pid_namespace.c
··· 10 10 11 11 #include <linux/pid.h> 12 12 #include <linux/pid_namespace.h> 13 + #include <linux/user_namespace.h> 13 14 #include <linux/syscalls.h> 14 15 #include <linux/err.h> 15 16 #include <linux/acct.h> ··· 72 71 return NULL; 73 72 } 74 73 74 + static void proc_cleanup_work(struct work_struct *work) 75 + { 76 + struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); 77 + pid_ns_release_proc(ns); 78 + } 79 + 75 80 /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ 76 81 #define MAX_PID_NS_LEVEL 32 77 82 78 - static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) 83 + static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, 84 + struct pid_namespace *parent_pid_ns) 79 85 { 80 86 struct pid_namespace *ns; 81 87 unsigned int level = parent_pid_ns->level + 1; ··· 107 99 if (ns->pid_cachep == NULL) 108 100 goto out_free_map; 109 101 102 + err = proc_alloc_inum(&ns->proc_inum); 103 + if (err) 104 + goto out_free_map; 105 + 110 106 kref_init(&ns->kref); 111 107 ns->level = level; 112 108 ns->parent = get_pid_ns(parent_pid_ns); 109 + ns->user_ns = get_user_ns(user_ns); 110 + INIT_WORK(&ns->proc_work, proc_cleanup_work); 113 111 114 112 set_bit(0, ns->pidmap[0].page); 115 113 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); ··· 123 109 for (i = 1; i < PIDMAP_ENTRIES; i++) 124 110 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 125 111 126 - err = pid_ns_prepare_proc(ns); 127 - if (err) 128 - goto out_put_parent_pid_ns; 129 - 130 112 return ns; 131 113 132 - out_put_parent_pid_ns: 133 - put_pid_ns(parent_pid_ns); 134 114 out_free_map: 135 115 kfree(ns->pidmap[0].page); 136 116 out_free: ··· 137 129 { 138 130 int i; 139 131 132 + proc_free_inum(ns->proc_inum); 140 133 for (i = 0; i < PIDMAP_ENTRIES; i++) 141 134 kfree(ns->pidmap[i].page); 135 + put_user_ns(ns->user_ns); 142 136 kmem_cache_free(pid_ns_cachep, ns); 143 137 } 144 138 145 - struct pid_namespace 
*copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 139 + struct pid_namespace *copy_pid_ns(unsigned long flags, 140 + struct user_namespace *user_ns, struct pid_namespace *old_ns) 146 141 { 147 142 if (!(flags & CLONE_NEWPID)) 148 143 return get_pid_ns(old_ns); 149 - if (flags & (CLONE_THREAD|CLONE_PARENT)) 144 + if (task_active_pid_ns(current) != old_ns) 150 145 return ERR_PTR(-EINVAL); 151 - return create_pid_namespace(old_ns); 146 + return create_pid_namespace(user_ns, old_ns); 152 147 } 153 148 154 149 static void free_pid_ns(struct kref *kref) ··· 222 211 223 212 /* 224 213 * sys_wait4() above can't reap the TASK_DEAD children. 225 - * Make sure they all go away, see __unhash_process(). 214 + * Make sure they all go away, see free_pid(). 226 215 */ 227 216 for (;;) { 228 - bool need_wait = false; 229 - 230 - read_lock(&tasklist_lock); 231 - if (!list_empty(&current->children)) { 232 - __set_current_state(TASK_UNINTERRUPTIBLE); 233 - need_wait = true; 234 - } 235 - read_unlock(&tasklist_lock); 236 - 237 - if (!need_wait) 217 + set_current_state(TASK_UNINTERRUPTIBLE); 218 + if (pid_ns->nr_hashed == 1) 238 219 break; 239 220 schedule(); 240 221 } 222 + __set_current_state(TASK_RUNNING); 241 223 242 224 if (pid_ns->reboot) 243 225 current->signal->group_exit_code = pid_ns->reboot; ··· 243 239 static int pid_ns_ctl_handler(struct ctl_table *table, int write, 244 240 void __user *buffer, size_t *lenp, loff_t *ppos) 245 241 { 242 + struct pid_namespace *pid_ns = task_active_pid_ns(current); 246 243 struct ctl_table tmp = *table; 247 244 248 - if (write && !capable(CAP_SYS_ADMIN)) 245 + if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) 249 246 return -EPERM; 250 247 251 248 /* ··· 255 250 * it should synchronize its usage with external means. 
256 251 */ 257 252 258 - tmp.data = &current->nsproxy->pid_ns->last_pid; 253 + tmp.data = &pid_ns->last_pid; 259 254 return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 260 255 } 261 256 ··· 303 298 /* Not reached */ 304 299 return 0; 305 300 } 301 + 302 + static void *pidns_get(struct task_struct *task) 303 + { 304 + struct pid_namespace *ns; 305 + 306 + rcu_read_lock(); 307 + ns = get_pid_ns(task_active_pid_ns(task)); 308 + rcu_read_unlock(); 309 + 310 + return ns; 311 + } 312 + 313 + static void pidns_put(void *ns) 314 + { 315 + put_pid_ns(ns); 316 + } 317 + 318 + static int pidns_install(struct nsproxy *nsproxy, void *ns) 319 + { 320 + struct pid_namespace *active = task_active_pid_ns(current); 321 + struct pid_namespace *ancestor, *new = ns; 322 + 323 + if (!ns_capable(new->user_ns, CAP_SYS_ADMIN)) 324 + return -EPERM; 325 + 326 + /* 327 + * Only allow entering the current active pid namespace 328 + * or a child of the current active pid namespace. 329 + * 330 + * This is required for fork to return a usable pid value and 331 + * this maintains the property that processes and their 332 + * children can not escape their current pid namespace. 333 + */ 334 + if (new->level < active->level) 335 + return -EINVAL; 336 + 337 + ancestor = new; 338 + while (ancestor->level > active->level) 339 + ancestor = ancestor->parent; 340 + if (ancestor != active) 341 + return -EINVAL; 342 + 343 + put_pid_ns(nsproxy->pid_ns); 344 + nsproxy->pid_ns = get_pid_ns(new); 345 + return 0; 346 + } 347 + 348 + static unsigned int pidns_inum(void *ns) 349 + { 350 + struct pid_namespace *pid_ns = ns; 351 + return pid_ns->proc_inum; 352 + } 353 + 354 + const struct proc_ns_operations pidns_operations = { 355 + .name = "pid", 356 + .type = CLONE_NEWPID, 357 + .get = pidns_get, 358 + .put = pidns_put, 359 + .install = pidns_install, 360 + .inum = pidns_inum, 361 + }; 306 362 307 363 static __init int pid_namespaces_init(void) 308 364 {
+8 -2
kernel/ptrace.c
··· 215 215 smp_rmb(); 216 216 if (task->mm) 217 217 dumpable = get_dumpable(task->mm); 218 - if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) 218 + rcu_read_lock(); 219 + if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { 220 + rcu_read_unlock(); 219 221 return -EPERM; 222 + } 223 + rcu_read_unlock(); 220 224 221 225 return security_ptrace_access_check(task, mode); 222 226 } ··· 284 280 285 281 if (seize) 286 282 flags |= PT_SEIZED; 287 - if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) 283 + rcu_read_lock(); 284 + if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) 288 285 flags |= PT_PTRACE_CAP; 286 + rcu_read_unlock(); 289 287 task->ptrace = flags; 290 288 291 289 __ptrace_link(task, current);
+8 -2
kernel/sched/core.c
··· 4097 4097 goto out_free_cpus_allowed; 4098 4098 } 4099 4099 retval = -EPERM; 4100 - if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) 4101 - goto out_unlock; 4100 + if (!check_same_owner(p)) { 4101 + rcu_read_lock(); 4102 + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 4103 + rcu_read_unlock(); 4104 + goto out_unlock; 4105 + } 4106 + rcu_read_unlock(); 4107 + } 4102 4108 4103 4109 retval = security_task_setscheduler(p); 4104 4110 if (retval)
+1 -1
kernel/signal.c
··· 1753 1753 * see comment in do_notify_parent() about the following 4 lines 1754 1754 */ 1755 1755 rcu_read_lock(); 1756 - info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1756 + info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); 1757 1757 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); 1758 1758 rcu_read_unlock(); 1759 1759
+1 -1
kernel/sysctl_binary.c
··· 1344 1344 goto out_putname; 1345 1345 } 1346 1346 1347 - mnt = current->nsproxy->pid_ns->proc_mnt; 1347 + mnt = task_active_pid_ns(current)->proc_mnt; 1348 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags); 1349 1349 result = PTR_ERR(file); 1350 1350 if (IS_ERR(file))
+2
kernel/user.c
··· 16 16 #include <linux/interrupt.h> 17 17 #include <linux/export.h> 18 18 #include <linux/user_namespace.h> 19 + #include <linux/proc_fs.h> 19 20 20 21 /* 21 22 * userns count is 1 for root user, 1 for init_uts_ns, ··· 52 51 }, 53 52 .owner = GLOBAL_ROOT_UID, 54 53 .group = GLOBAL_ROOT_GID, 54 + .proc_inum = PROC_USER_INIT_INO, 55 55 }; 56 56 EXPORT_SYMBOL_GPL(init_user_ns); 57 57
+128 -19
kernel/user_namespace.c
··· 9 9 #include <linux/nsproxy.h> 10 10 #include <linux/slab.h> 11 11 #include <linux/user_namespace.h> 12 + #include <linux/proc_fs.h> 12 13 #include <linux/highuid.h> 13 14 #include <linux/cred.h> 14 15 #include <linux/securebits.h> ··· 27 26 static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 28 27 struct uid_gid_map *map); 29 28 29 + static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) 30 + { 31 + /* Start with the same capabilities as init but useless for doing 32 + * anything as the capabilities are bound to the new user namespace. 33 + */ 34 + cred->securebits = SECUREBITS_DEFAULT; 35 + cred->cap_inheritable = CAP_EMPTY_SET; 36 + cred->cap_permitted = CAP_FULL_SET; 37 + cred->cap_effective = CAP_FULL_SET; 38 + cred->cap_bset = CAP_FULL_SET; 39 + #ifdef CONFIG_KEYS 40 + key_put(cred->request_key_auth); 41 + cred->request_key_auth = NULL; 42 + #endif 43 + /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ 44 + cred->user_ns = user_ns; 45 + } 46 + 30 47 /* 31 48 * Create a new user namespace, deriving the creator from the user in the 32 49 * passed credentials, and replacing that user with the new root user for the ··· 58 39 struct user_namespace *ns, *parent_ns = new->user_ns; 59 40 kuid_t owner = new->euid; 60 41 kgid_t group = new->egid; 42 + int ret; 61 43 62 44 /* The creator needs a mapping in the parent user namespace 63 45 * or else we won't be able to reasonably tell userspace who ··· 72 52 if (!ns) 73 53 return -ENOMEM; 74 54 55 + ret = proc_alloc_inum(&ns->proc_inum); 56 + if (ret) { 57 + kmem_cache_free(user_ns_cachep, ns); 58 + return ret; 59 + } 60 + 75 61 kref_init(&ns->kref); 62 + /* Leave the new->user_ns reference with the new user namespace. */ 76 63 ns->parent = parent_ns; 77 64 ns->owner = owner; 78 65 ns->group = group; 79 66 80 - /* Start with the same capabilities as init but useless for doing 81 - * anything as the capabilities are bound to the new user namespace. 
82 - */ 83 - new->securebits = SECUREBITS_DEFAULT; 84 - new->cap_inheritable = CAP_EMPTY_SET; 85 - new->cap_permitted = CAP_FULL_SET; 86 - new->cap_effective = CAP_FULL_SET; 87 - new->cap_bset = CAP_FULL_SET; 88 - #ifdef CONFIG_KEYS 89 - key_put(new->request_key_auth); 90 - new->request_key_auth = NULL; 91 - #endif 92 - /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ 93 - 94 - /* Leave the new->user_ns reference with the new user namespace. */ 95 - /* Leave the reference to our user_ns with the new cred. */ 96 - new->user_ns = ns; 67 + set_cred_user_ns(new, ns); 97 68 98 69 return 0; 70 + } 71 + 72 + int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) 73 + { 74 + struct cred *cred; 75 + 76 + if (!(unshare_flags & CLONE_NEWUSER)) 77 + return 0; 78 + 79 + cred = prepare_creds(); 80 + if (!cred) 81 + return -ENOMEM; 82 + 83 + *new_cred = cred; 84 + return create_user_ns(cred); 99 85 } 100 86 101 87 void free_user_ns(struct kref *kref) ··· 110 84 container_of(kref, struct user_namespace, kref); 111 85 112 86 parent = ns->parent; 87 + proc_free_inum(ns->proc_inum); 113 88 kmem_cache_free(user_ns_cachep, ns); 114 89 put_user_ns(parent); 115 90 } ··· 399 372 struct user_namespace *lower_ns; 400 373 uid_t lower; 401 374 402 - lower_ns = current_user_ns(); 375 + lower_ns = seq_user_ns(seq); 403 376 if ((lower_ns == ns) && lower_ns->parent) 404 377 lower_ns = lower_ns->parent; 405 378 ··· 420 393 struct user_namespace *lower_ns; 421 394 gid_t lower; 422 395 423 - lower_ns = current_user_ns(); 396 + lower_ns = seq_user_ns(seq); 424 397 if ((lower_ns == ns) && lower_ns->parent) 425 398 lower_ns = lower_ns->parent; 426 399 ··· 696 669 { 697 670 struct seq_file *seq = file->private_data; 698 671 struct user_namespace *ns = seq->private; 672 + struct user_namespace *seq_ns = seq_user_ns(seq); 699 673 700 674 if (!ns->parent) 675 + return -EPERM; 676 + 677 + if ((seq_ns != ns) && (seq_ns != ns->parent)) 701 678 return -EPERM; 702 679 703 
680 return map_write(file, buf, size, ppos, CAP_SETUID, ··· 712 681 { 713 682 struct seq_file *seq = file->private_data; 714 683 struct user_namespace *ns = seq->private; 684 + struct user_namespace *seq_ns = seq_user_ns(seq); 715 685 716 686 if (!ns->parent) 687 + return -EPERM; 688 + 689 + if ((seq_ns != ns) && (seq_ns != ns->parent)) 717 690 return -EPERM; 718 691 719 692 return map_write(file, buf, size, ppos, CAP_SETGID, ··· 744 709 static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 745 710 struct uid_gid_map *new_map) 746 711 { 712 + /* Allow mapping to your own filesystem ids */ 713 + if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { 714 + u32 id = new_map->extent[0].lower_first; 715 + if (cap_setid == CAP_SETUID) { 716 + kuid_t uid = make_kuid(ns->parent, id); 717 + if (uid_eq(uid, current_fsuid())) 718 + return true; 719 + } 720 + else if (cap_setid == CAP_SETGID) { 721 + kgid_t gid = make_kgid(ns->parent, id); 722 + if (gid_eq(gid, current_fsgid())) 723 + return true; 724 + } 725 + } 726 + 747 727 /* Allow anyone to set a mapping that doesn't require privilege */ 748 728 if (!cap_valid(cap_setid)) 749 729 return true; ··· 771 721 772 722 return false; 773 723 } 724 + 725 + static void *userns_get(struct task_struct *task) 726 + { 727 + struct user_namespace *user_ns; 728 + 729 + rcu_read_lock(); 730 + user_ns = get_user_ns(__task_cred(task)->user_ns); 731 + rcu_read_unlock(); 732 + 733 + return user_ns; 734 + } 735 + 736 + static void userns_put(void *ns) 737 + { 738 + put_user_ns(ns); 739 + } 740 + 741 + static int userns_install(struct nsproxy *nsproxy, void *ns) 742 + { 743 + struct user_namespace *user_ns = ns; 744 + struct cred *cred; 745 + 746 + /* Don't allow gaining capabilities by reentering 747 + * the same user namespace. 
748 + */ 749 + if (user_ns == current_user_ns()) 750 + return -EINVAL; 751 + 752 + /* Threaded many not enter a different user namespace */ 753 + if (atomic_read(&current->mm->mm_users) > 1) 754 + return -EINVAL; 755 + 756 + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) 757 + return -EPERM; 758 + 759 + cred = prepare_creds(); 760 + if (!cred) 761 + return -ENOMEM; 762 + 763 + put_user_ns(cred->user_ns); 764 + set_cred_user_ns(cred, get_user_ns(user_ns)); 765 + 766 + return commit_creds(cred); 767 + } 768 + 769 + static unsigned int userns_inum(void *ns) 770 + { 771 + struct user_namespace *user_ns = ns; 772 + return user_ns->proc_inum; 773 + } 774 + 775 + const struct proc_ns_operations userns_operations = { 776 + .name = "user", 777 + .type = CLONE_NEWUSER, 778 + .get = userns_get, 779 + .put = userns_put, 780 + .install = userns_install, 781 + .inum = userns_inum, 782 + }; 774 783 775 784 static __init int user_namespaces_init(void) 776 785 {
+26 -7
kernel/utsname.c
··· 32 32 * @old_ns: namespace to clone 33 33 * Return NULL on error (failure to kmalloc), new ns otherwise 34 34 */ 35 - static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, 35 + static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, 36 36 struct uts_namespace *old_ns) 37 37 { 38 38 struct uts_namespace *ns; 39 + int err; 39 40 40 41 ns = create_uts_ns(); 41 42 if (!ns) 42 43 return ERR_PTR(-ENOMEM); 43 44 45 + err = proc_alloc_inum(&ns->proc_inum); 46 + if (err) { 47 + kfree(ns); 48 + return ERR_PTR(err); 49 + } 50 + 44 51 down_read(&uts_sem); 45 52 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 46 - ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); 53 + ns->user_ns = get_user_ns(user_ns); 47 54 up_read(&uts_sem); 48 55 return ns; 49 56 } ··· 62 55 * versa. 63 56 */ 64 57 struct uts_namespace *copy_utsname(unsigned long flags, 65 - struct task_struct *tsk) 58 + struct user_namespace *user_ns, struct uts_namespace *old_ns) 66 59 { 67 - struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; 68 60 struct uts_namespace *new_ns; 69 61 70 62 BUG_ON(!old_ns); ··· 72 66 if (!(flags & CLONE_NEWUTS)) 73 67 return old_ns; 74 68 75 - new_ns = clone_uts_ns(tsk, old_ns); 69 + new_ns = clone_uts_ns(user_ns, old_ns); 76 70 77 71 put_uts_ns(old_ns); 78 72 return new_ns; ··· 84 78 85 79 ns = container_of(kref, struct uts_namespace, kref); 86 80 put_user_ns(ns->user_ns); 81 + proc_free_inum(ns->proc_inum); 87 82 kfree(ns); 88 83 } 89 84 ··· 109 102 put_uts_ns(ns); 110 103 } 111 104 112 - static int utsns_install(struct nsproxy *nsproxy, void *ns) 105 + static int utsns_install(struct nsproxy *nsproxy, void *new) 113 106 { 107 + struct uts_namespace *ns = new; 108 + 109 + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) 110 + return -EPERM; 111 + 114 112 get_uts_ns(ns); 115 113 put_uts_ns(nsproxy->uts_ns); 116 114 nsproxy->uts_ns = ns; 117 115 return 0; 116 + } 117 + 118 + static unsigned int utsns_inum(void *vp) 119 + { 120 + struct 
uts_namespace *ns = vp; 121 + 122 + return ns->proc_inum; 118 123 } 119 124 120 125 const struct proc_ns_operations utsns_operations = { ··· 135 116 .get = utsns_get, 136 117 .put = utsns_put, 137 118 .install = utsns_install, 119 + .inum = utsns_inum, 138 120 }; 139 -
+30 -1
net/core/net_namespace.c
··· 381 381 } 382 382 EXPORT_SYMBOL_GPL(get_net_ns_by_pid); 383 383 384 + static __net_init int net_ns_net_init(struct net *net) 385 + { 386 + return proc_alloc_inum(&net->proc_inum); 387 + } 388 + 389 + static __net_exit void net_ns_net_exit(struct net *net) 390 + { 391 + proc_free_inum(net->proc_inum); 392 + } 393 + 394 + static struct pernet_operations __net_initdata net_ns_ops = { 395 + .init = net_ns_net_init, 396 + .exit = net_ns_net_exit, 397 + }; 398 + 384 399 static int __init net_ns_init(void) 385 400 { 386 401 struct net_generic *ng; ··· 426 411 rtnl_unlock(); 427 412 428 413 mutex_unlock(&net_mutex); 414 + 415 + register_pernet_subsys(&net_ns_ops); 429 416 430 417 return 0; 431 418 } ··· 647 630 648 631 static int netns_install(struct nsproxy *nsproxy, void *ns) 649 632 { 633 + struct net *net = ns; 634 + 635 + if (!ns_capable(net->user_ns, CAP_SYS_ADMIN)) 636 + return -EPERM; 637 + 650 638 put_net(nsproxy->net_ns); 651 - nsproxy->net_ns = get_net(ns); 639 + nsproxy->net_ns = get_net(net); 652 640 return 0; 641 + } 642 + 643 + static unsigned int netns_inum(void *ns) 644 + { 645 + struct net *net = ns; 646 + return net->proc_inum; 653 647 } 654 648 655 649 const struct proc_ns_operations netns_operations = { ··· 669 641 .get = netns_get, 670 642 .put = netns_put, 671 643 .install = netns_install, 644 + .inum = netns_inum, 672 645 }; 673 646 #endif
+9 -3
security/yama/yama_lsm.c
··· 298 298 /* No additional restrictions. */ 299 299 break; 300 300 case YAMA_SCOPE_RELATIONAL: 301 + rcu_read_lock(); 301 302 if (!task_is_descendant(current, child) && 302 303 !ptracer_exception_found(current, child) && 303 - !ns_capable(task_user_ns(child), CAP_SYS_PTRACE)) 304 + !ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) 304 305 rc = -EPERM; 306 + rcu_read_unlock(); 305 307 break; 306 308 case YAMA_SCOPE_CAPABILITY: 307 - if (!ns_capable(task_user_ns(child), CAP_SYS_PTRACE)) 309 + rcu_read_lock(); 310 + if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) 308 311 rc = -EPERM; 312 + rcu_read_unlock(); 309 313 break; 310 314 case YAMA_SCOPE_NO_ATTACH: 311 315 default: ··· 347 343 /* Only disallow PTRACE_TRACEME on more aggressive settings. */ 348 344 switch (ptrace_scope) { 349 345 case YAMA_SCOPE_CAPABILITY: 350 - if (!ns_capable(task_user_ns(parent), CAP_SYS_PTRACE)) 346 + rcu_read_lock(); 347 + if (!ns_capable(__task_cred(parent)->user_ns, CAP_SYS_PTRACE)) 351 348 rc = -EPERM; 349 + rcu_read_unlock(); 352 350 break; 353 351 case YAMA_SCOPE_NO_ATTACH: 354 352 rc = -EPERM;