Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull namespace updates from Eric Biederman:
"This is a bunch of small changes built against 3.16-rc6. The most
significant change for users is the first patch which makes setns
dramatically faster by removing unneeded rcu handling.

The next chunk of changes are so that "mount -o remount,.." will not
allow the user namespace root to drop flags on a mount set by the
system wide root. Aka this forces read-only mounts to stay read-only,
no-dev mounts to stay no-dev, no-suid mounts to stay no-suid, no-exec
mounts to stay no-exec, and it prevents unprivileged users from messing
with a mount's atime settings. I have included my test case as the
last patch in this series so people performing backports can verify
this change works correctly.

The next change fixes a bug in NFS that was discovered while auditing
nsproxy users for the first optimization. Today you can oops the
kernel by reading /proc/fs/nfsfs/{servers,volumes} if you are clever
with pid namespaces. I rebased and fixed the build of the
!CONFIG_NFS_FS case yesterday when a build bot caught my typo. Given
that no one to my knowledge bases anything on my tree, fixing the typo
in place seems more responsible than requiring a typo-fix to be
backported as well.

The last change is a small semantic cleanup introducing
/proc/thread-self and pointing /proc/mounts and /proc/net at it. This
prevents several kinds of problematic corner cases. It is a
user-visible change so it has a minute chance of causing regressions
so the changes to /proc/mounts and /proc/net are individual one-line
commits that can be trivially reverted. Unfortunately I lost and
could not find the email of the original reporter so he is not
credited. From at least one perspective this change to /proc/net is a
regression fix to allow pthread /proc/net uses that were broken by
the introduction of the network namespace."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
proc: Point /proc/mounts at /proc/thread-self/mounts instead of /proc/self/mounts
proc: Point /proc/net at /proc/thread-self/net instead of /proc/self/net
proc: Implement /proc/thread-self to point at the directory of the current thread
proc: Have net show up under /proc/<tgid>/task/<tid>
NFS: Fix /proc/fs/nfsfs/servers and /proc/fs/nfsfs/volumes
mnt: Add tests for unprivileged remount cases that have found to be faulty
mnt: Change the default remount atime from relatime to the existing value
mnt: Correct permission checks in do_remount
mnt: Move the test for MNT_LOCK_READONLY from change_mount_flags into do_remount
mnt: Only change user settable mount flags in remount
namespaces: Use task_lock and not rcu to protect nsproxy

+537 -97
+55 -10
fs/namespace.c
··· 890 890 891 891 mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); 892 892 /* Don't allow unprivileged users to change mount flags */ 893 - if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) 894 - mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; 893 + if (flag & CL_UNPRIVILEGED) { 894 + mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; 895 + 896 + if (mnt->mnt.mnt_flags & MNT_READONLY) 897 + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; 898 + 899 + if (mnt->mnt.mnt_flags & MNT_NODEV) 900 + mnt->mnt.mnt_flags |= MNT_LOCK_NODEV; 901 + 902 + if (mnt->mnt.mnt_flags & MNT_NOSUID) 903 + mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID; 904 + 905 + if (mnt->mnt.mnt_flags & MNT_NOEXEC) 906 + mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC; 907 + } 895 908 896 909 /* Don't allow unprivileged users to reveal what is under a mount */ 897 910 if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) ··· 1909 1896 if (readonly_request == __mnt_is_readonly(mnt)) 1910 1897 return 0; 1911 1898 1912 - if (mnt->mnt_flags & MNT_LOCK_READONLY) 1913 - return -EPERM; 1914 - 1915 1899 if (readonly_request) 1916 1900 error = mnt_make_readonly(real_mount(mnt)); 1917 1901 else ··· 1934 1924 if (path->dentry != path->mnt->mnt_root) 1935 1925 return -EINVAL; 1936 1926 1927 + /* Don't allow changing of locked mnt flags. 1928 + * 1929 + * No locks need to be held here while testing the various 1930 + * MNT_LOCK flags because those flags can never be cleared 1931 + * once they are set. 
1932 + */ 1933 + if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) && 1934 + !(mnt_flags & MNT_READONLY)) { 1935 + return -EPERM; 1936 + } 1937 + if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && 1938 + !(mnt_flags & MNT_NODEV)) { 1939 + return -EPERM; 1940 + } 1941 + if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && 1942 + !(mnt_flags & MNT_NOSUID)) { 1943 + return -EPERM; 1944 + } 1945 + if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) && 1946 + !(mnt_flags & MNT_NOEXEC)) { 1947 + return -EPERM; 1948 + } 1949 + if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) && 1950 + ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) { 1951 + return -EPERM; 1952 + } 1953 + 1937 1954 err = security_sb_remount(sb, data); 1938 1955 if (err) 1939 1956 return err; ··· 1974 1937 err = do_remount_sb(sb, flags, data, 0); 1975 1938 if (!err) { 1976 1939 lock_mount_hash(); 1977 - mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK; 1940 + mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; 1978 1941 mnt->mnt.mnt_flags = mnt_flags; 1979 1942 touch_mnt_namespace(mnt->mnt_ns); 1980 1943 unlock_mount_hash(); ··· 2159 2122 */ 2160 2123 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { 2161 2124 flags |= MS_NODEV; 2162 - mnt_flags |= MNT_NODEV; 2125 + mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; 2163 2126 } 2164 2127 } 2165 2128 ··· 2472 2435 mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); 2473 2436 if (flags & MS_RDONLY) 2474 2437 mnt_flags |= MNT_READONLY; 2438 + 2439 + /* The default atime for remount is preservation */ 2440 + if ((flags & MS_REMOUNT) && 2441 + ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | 2442 + MS_STRICTATIME)) == 0)) { 2443 + mnt_flags &= ~MNT_ATIME_MASK; 2444 + mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK; 2445 + } 2475 2446 2476 2447 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | 2477 2448 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | ··· 3017 2972 struct mnt_namespace *ns = NULL; 3018 2973 struct nsproxy *nsproxy; 3019 2974 3020 
- rcu_read_lock(); 3021 - nsproxy = task_nsproxy(task); 2975 + task_lock(task); 2976 + nsproxy = task->nsproxy; 3022 2977 if (nsproxy) { 3023 2978 ns = nsproxy->mnt_ns; 3024 2979 get_mnt_ns(ns); 3025 2980 } 3026 - rcu_read_unlock(); 2981 + task_unlock(task); 3027 2982 3028 2983 return ns; 3029 2984 }
+55 -40
fs/nfs/client.c
··· 1205 1205 .open = nfs_server_list_open, 1206 1206 .read = seq_read, 1207 1207 .llseek = seq_lseek, 1208 - .release = seq_release, 1208 + .release = seq_release_net, 1209 1209 .owner = THIS_MODULE, 1210 1210 }; 1211 1211 ··· 1226 1226 .open = nfs_volume_list_open, 1227 1227 .read = seq_read, 1228 1228 .llseek = seq_lseek, 1229 - .release = seq_release, 1229 + .release = seq_release_net, 1230 1230 .owner = THIS_MODULE, 1231 1231 }; 1232 1232 ··· 1236 1236 */ 1237 1237 static int nfs_server_list_open(struct inode *inode, struct file *file) 1238 1238 { 1239 - struct seq_file *m; 1240 - int ret; 1241 - struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info; 1242 - struct net *net = pid_ns->child_reaper->nsproxy->net_ns; 1243 - 1244 - ret = seq_open(file, &nfs_server_list_ops); 1245 - if (ret < 0) 1246 - return ret; 1247 - 1248 - m = file->private_data; 1249 - m->private = net; 1250 - 1251 - return 0; 1239 + return seq_open_net(inode, file, &nfs_server_list_ops, 1240 + sizeof(struct seq_net_private)); 1252 1241 } 1253 1242 1254 1243 /* ··· 1245 1256 */ 1246 1257 static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) 1247 1258 { 1248 - struct nfs_net *nn = net_generic(m->private, nfs_net_id); 1259 + struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); 1249 1260 1250 1261 /* lock the list against modification */ 1251 1262 spin_lock(&nn->nfs_client_lock); ··· 1257 1268 */ 1258 1269 static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) 1259 1270 { 1260 - struct nfs_net *nn = net_generic(p->private, nfs_net_id); 1271 + struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); 1261 1272 1262 1273 return seq_list_next(v, &nn->nfs_client_list, pos); 1263 1274 } ··· 1267 1278 */ 1268 1279 static void nfs_server_list_stop(struct seq_file *p, void *v) 1269 1280 { 1270 - struct nfs_net *nn = net_generic(p->private, nfs_net_id); 1281 + struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); 1271 1282 1272 1283 
spin_unlock(&nn->nfs_client_lock); 1273 1284 } ··· 1278 1289 static int nfs_server_list_show(struct seq_file *m, void *v) 1279 1290 { 1280 1291 struct nfs_client *clp; 1281 - struct nfs_net *nn = net_generic(m->private, nfs_net_id); 1292 + struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); 1282 1293 1283 1294 /* display header on line 1 */ 1284 1295 if (v == &nn->nfs_client_list) { ··· 1310 1321 */ 1311 1322 static int nfs_volume_list_open(struct inode *inode, struct file *file) 1312 1323 { 1313 - struct seq_file *m; 1314 - int ret; 1315 - struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info; 1316 - struct net *net = pid_ns->child_reaper->nsproxy->net_ns; 1317 - 1318 - ret = seq_open(file, &nfs_volume_list_ops); 1319 - if (ret < 0) 1320 - return ret; 1321 - 1322 - m = file->private_data; 1323 - m->private = net; 1324 - 1325 - return 0; 1324 + return seq_open_net(inode, file, &nfs_server_list_ops, 1325 + sizeof(struct seq_net_private)); 1326 1326 } 1327 1327 1328 1328 /* ··· 1319 1341 */ 1320 1342 static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) 1321 1343 { 1322 - struct nfs_net *nn = net_generic(m->private, nfs_net_id); 1344 + struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); 1323 1345 1324 1346 /* lock the list against modification */ 1325 1347 spin_lock(&nn->nfs_client_lock); ··· 1331 1353 */ 1332 1354 static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) 1333 1355 { 1334 - struct nfs_net *nn = net_generic(p->private, nfs_net_id); 1356 + struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); 1335 1357 1336 1358 return seq_list_next(v, &nn->nfs_volume_list, pos); 1337 1359 } ··· 1341 1363 */ 1342 1364 static void nfs_volume_list_stop(struct seq_file *p, void *v) 1343 1365 { 1344 - struct nfs_net *nn = net_generic(p->private, nfs_net_id); 1366 + struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); 1345 1367 1346 1368 spin_unlock(&nn->nfs_client_lock); 1347 1369 } ··· 1354 
1376 struct nfs_server *server; 1355 1377 struct nfs_client *clp; 1356 1378 char dev[8], fsid[17]; 1357 - struct nfs_net *nn = net_generic(m->private, nfs_net_id); 1379 + struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); 1358 1380 1359 1381 /* display header on line 1 */ 1360 1382 if (v == &nn->nfs_volume_list) { ··· 1385 1407 return 0; 1386 1408 } 1387 1409 1410 + int nfs_fs_proc_net_init(struct net *net) 1411 + { 1412 + struct nfs_net *nn = net_generic(net, nfs_net_id); 1413 + struct proc_dir_entry *p; 1414 + 1415 + nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net); 1416 + if (!nn->proc_nfsfs) 1417 + goto error_0; 1418 + 1419 + /* a file of servers with which we're dealing */ 1420 + p = proc_create("servers", S_IFREG|S_IRUGO, 1421 + nn->proc_nfsfs, &nfs_server_list_fops); 1422 + if (!p) 1423 + goto error_1; 1424 + 1425 + /* a file of volumes that we have mounted */ 1426 + p = proc_create("volumes", S_IFREG|S_IRUGO, 1427 + nn->proc_nfsfs, &nfs_volume_list_fops); 1428 + if (!p) 1429 + goto error_2; 1430 + return 0; 1431 + 1432 + error_2: 1433 + remove_proc_entry("servers", nn->proc_nfsfs); 1434 + error_1: 1435 + remove_proc_entry("fs/nfsfs", NULL); 1436 + error_0: 1437 + return -ENOMEM; 1438 + } 1439 + 1440 + void nfs_fs_proc_net_exit(struct net *net) 1441 + { 1442 + struct nfs_net *nn = net_generic(net, nfs_net_id); 1443 + 1444 + remove_proc_entry("volumes", nn->proc_nfsfs); 1445 + remove_proc_entry("servers", nn->proc_nfsfs); 1446 + remove_proc_entry("fs/nfsfs", NULL); 1447 + } 1448 + 1388 1449 /* 1389 1450 * initialise the /proc/fs/nfsfs/ directory 1390 1451 */ ··· 1436 1419 goto error_0; 1437 1420 1438 1421 /* a file of servers with which we're dealing */ 1439 - p = proc_create("servers", S_IFREG|S_IRUGO, 1440 - proc_fs_nfs, &nfs_server_list_fops); 1422 + p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers"); 1441 1423 if (!p) 1442 1424 goto error_1; 1443 1425 1444 1426 /* a file of volumes that we have mounted */ 1445 - p 
= proc_create("volumes", S_IFREG|S_IRUGO, 1446 - proc_fs_nfs, &nfs_volume_list_fops); 1427 + p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes"); 1447 1428 if (!p) 1448 1429 goto error_2; 1449 1430 return 0;
+2 -1
fs/nfs/inode.c
··· 1840 1840 static int nfs_net_init(struct net *net) 1841 1841 { 1842 1842 nfs_clients_init(net); 1843 - return 0; 1843 + return nfs_fs_proc_net_init(net); 1844 1844 } 1845 1845 1846 1846 static void nfs_net_exit(struct net *net) 1847 1847 { 1848 + nfs_fs_proc_net_exit(net); 1848 1849 nfs_cleanup_cb_ident_idr(net); 1849 1850 } 1850 1851
+9
fs/nfs/internal.h
··· 195 195 #ifdef CONFIG_PROC_FS 196 196 extern int __init nfs_fs_proc_init(void); 197 197 extern void nfs_fs_proc_exit(void); 198 + extern int nfs_fs_proc_net_init(struct net *net); 199 + extern void nfs_fs_proc_net_exit(struct net *net); 198 200 #else 201 + static inline int nfs_fs_proc_net_init(struct net *net) 202 + { 203 + return 0; 204 + } 205 + static inline void nfs_fs_proc_net_exit(struct net *net) 206 + { 207 + } 199 208 static inline int nfs_fs_proc_init(void) 200 209 { 201 210 return 0;
+3
fs/nfs/netns.h
··· 29 29 #endif 30 30 spinlock_t nfs_client_lock; 31 31 struct timespec boot_time; 32 + #ifdef CONFIG_PROC_FS 33 + struct proc_dir_entry *proc_nfsfs; 34 + #endif 32 35 }; 33 36 34 37 extern int nfs_net_id;
+1
fs/proc/Makefile
··· 23 23 proc-y += softirqs.o 24 24 proc-y += namespaces.o 25 25 proc-y += self.o 26 + proc-y += thread_self.o 26 27 proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o 27 28 proc-$(CONFIG_NET) += proc_net.o 28 29 proc-$(CONFIG_PROC_KCORE) += kcore.o
+13 -5
fs/proc/base.c
··· 2814 2814 return iter; 2815 2815 } 2816 2816 2817 - #define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) 2817 + #define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2) 2818 2818 2819 2819 /* for the /proc/ directory itself, after non-process stuff has been done */ 2820 2820 int proc_pid_readdir(struct file *file, struct dir_context *ctx) ··· 2826 2826 if (pos >= PID_MAX_LIMIT + TGID_OFFSET) 2827 2827 return 0; 2828 2828 2829 - if (pos == TGID_OFFSET - 1) { 2829 + if (pos == TGID_OFFSET - 2) { 2830 2830 struct inode *inode = ns->proc_self->d_inode; 2831 2831 if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) 2832 2832 return 0; 2833 - iter.tgid = 0; 2834 - } else { 2835 - iter.tgid = pos - TGID_OFFSET; 2833 + ctx->pos = pos = pos + 1; 2836 2834 } 2835 + if (pos == TGID_OFFSET - 1) { 2836 + struct inode *inode = ns->proc_thread_self->d_inode; 2837 + if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) 2838 + return 0; 2839 + ctx->pos = pos = pos + 1; 2840 + } 2841 + iter.tgid = pos - TGID_OFFSET; 2837 2842 iter.task = NULL; 2838 2843 for (iter = next_tgid(ns, iter); 2839 2844 iter.task; ··· 2867 2862 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 2868 2863 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 2869 2864 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), 2865 + #ifdef CONFIG_NET 2866 + DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), 2867 + #endif 2870 2868 REG("environ", S_IRUSR, proc_environ_operations), 2871 2869 ONE("auxv", S_IRUSR, proc_pid_auxv), 2872 2870 ONE("status", S_IRUGO, proc_pid_status),
+6 -1
fs/proc/inode.c
··· 442 442 int proc_fill_super(struct super_block *s) 443 443 { 444 444 struct inode *root_inode; 445 + int ret; 445 446 446 447 s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; 447 448 s->s_blocksize = 1024; ··· 464 463 return -ENOMEM; 465 464 } 466 465 467 - return proc_setup_self(s); 466 + ret = proc_setup_self(s); 467 + if (ret) { 468 + return ret; 469 + } 470 + return proc_setup_thread_self(s); 468 471 }
+6
fs/proc/internal.h
··· 231 231 extern int proc_setup_self(struct super_block *); 232 232 233 233 /* 234 + * proc_thread_self.c 235 + */ 236 + extern int proc_setup_thread_self(struct super_block *); 237 + extern void proc_thread_self_init(void); 238 + 239 + /* 234 240 * proc_sysctl.c 235 241 */ 236 242 #ifdef CONFIG_PROC_SYSCTL
+4 -2
fs/proc/proc_net.c
··· 113 113 rcu_read_lock(); 114 114 task = pid_task(proc_pid(dir), PIDTYPE_PID); 115 115 if (task != NULL) { 116 - ns = task_nsproxy(task); 116 + task_lock(task); 117 + ns = task->nsproxy; 117 118 if (ns != NULL) 118 119 net = get_net(ns->net_ns); 120 + task_unlock(task); 119 121 } 120 122 rcu_read_unlock(); 121 123 ··· 226 224 227 225 int __init proc_net_init(void) 228 226 { 229 - proc_symlink("net", NULL, "self/net"); 227 + proc_symlink("net", NULL, "thread-self/net"); 230 228 231 229 return register_pernet_subsys(&proc_net_ns_ops); 232 230 }
+4 -1
fs/proc/root.c
··· 149 149 ns = (struct pid_namespace *)sb->s_fs_info; 150 150 if (ns->proc_self) 151 151 dput(ns->proc_self); 152 + if (ns->proc_thread_self) 153 + dput(ns->proc_thread_self); 152 154 kill_anon_super(sb); 153 155 put_pid_ns(ns); 154 156 } ··· 172 170 return; 173 171 174 172 proc_self_init(); 175 - proc_symlink("mounts", NULL, "self/mounts"); 173 + proc_thread_self_init(); 174 + proc_symlink("mounts", NULL, "thread-self/mounts"); 176 175 177 176 proc_net_init(); 178 177
+85
fs/proc/thread_self.c
··· 1 + #include <linux/sched.h> 2 + #include <linux/namei.h> 3 + #include <linux/slab.h> 4 + #include <linux/pid_namespace.h> 5 + #include "internal.h" 6 + 7 + /* 8 + * /proc/thread_self: 9 + */ 10 + static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer, 11 + int buflen) 12 + { 13 + struct pid_namespace *ns = dentry->d_sb->s_fs_info; 14 + pid_t tgid = task_tgid_nr_ns(current, ns); 15 + pid_t pid = task_pid_nr_ns(current, ns); 16 + char tmp[PROC_NUMBUF + 6 + PROC_NUMBUF]; 17 + if (!pid) 18 + return -ENOENT; 19 + sprintf(tmp, "%d/task/%d", tgid, pid); 20 + return readlink_copy(buffer, buflen, tmp); 21 + } 22 + 23 + static void *proc_thread_self_follow_link(struct dentry *dentry, struct nameidata *nd) 24 + { 25 + struct pid_namespace *ns = dentry->d_sb->s_fs_info; 26 + pid_t tgid = task_tgid_nr_ns(current, ns); 27 + pid_t pid = task_pid_nr_ns(current, ns); 28 + char *name = ERR_PTR(-ENOENT); 29 + if (pid) { 30 + name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL); 31 + if (!name) 32 + name = ERR_PTR(-ENOMEM); 33 + else 34 + sprintf(name, "%d/task/%d", tgid, pid); 35 + } 36 + nd_set_link(nd, name); 37 + return NULL; 38 + } 39 + 40 + static const struct inode_operations proc_thread_self_inode_operations = { 41 + .readlink = proc_thread_self_readlink, 42 + .follow_link = proc_thread_self_follow_link, 43 + .put_link = kfree_put_link, 44 + }; 45 + 46 + static unsigned thread_self_inum; 47 + 48 + int proc_setup_thread_self(struct super_block *s) 49 + { 50 + struct inode *root_inode = s->s_root->d_inode; 51 + struct pid_namespace *ns = s->s_fs_info; 52 + struct dentry *thread_self; 53 + 54 + mutex_lock(&root_inode->i_mutex); 55 + thread_self = d_alloc_name(s->s_root, "thread-self"); 56 + if (thread_self) { 57 + struct inode *inode = new_inode_pseudo(s); 58 + if (inode) { 59 + inode->i_ino = thread_self_inum; 60 + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 61 + inode->i_mode = S_IFLNK | S_IRWXUGO; 62 + inode->i_uid = 
GLOBAL_ROOT_UID; 63 + inode->i_gid = GLOBAL_ROOT_GID; 64 + inode->i_op = &proc_thread_self_inode_operations; 65 + d_add(thread_self, inode); 66 + } else { 67 + dput(thread_self); 68 + thread_self = ERR_PTR(-ENOMEM); 69 + } 70 + } else { 71 + thread_self = ERR_PTR(-ENOMEM); 72 + } 73 + mutex_unlock(&root_inode->i_mutex); 74 + if (IS_ERR(thread_self)) { 75 + pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); 76 + return PTR_ERR(thread_self); 77 + } 78 + ns->proc_thread_self = thread_self; 79 + return 0; 80 + } 81 + 82 + void __init proc_thread_self_init(void) 83 + { 84 + proc_alloc_inum(&thread_self_inum); 85 + }
+3 -5
fs/proc_namespace.c
··· 232 232 if (!task) 233 233 goto err; 234 234 235 - rcu_read_lock(); 236 - nsp = task_nsproxy(task); 235 + task_lock(task); 236 + nsp = task->nsproxy; 237 237 if (!nsp || !nsp->mnt_ns) { 238 - rcu_read_unlock(); 238 + task_unlock(task); 239 239 put_task_struct(task); 240 240 goto err; 241 241 } 242 242 ns = nsp->mnt_ns; 243 243 get_mnt_ns(ns); 244 - rcu_read_unlock(); 245 - task_lock(task); 246 244 if (!task->fs) { 247 245 task_unlock(task); 248 246 put_task_struct(task);
+8 -1
include/linux/mount.h
··· 42 42 * flag, consider how it interacts with shared mounts. 43 43 */ 44 44 #define MNT_SHARED_MASK (MNT_UNBINDABLE) 45 - #define MNT_PROPAGATION_MASK (MNT_SHARED | MNT_UNBINDABLE) 45 + #define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \ 46 + | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \ 47 + | MNT_READONLY) 48 + #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) 46 49 47 50 #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ 48 51 MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) 49 52 50 53 #define MNT_INTERNAL 0x4000 51 54 55 + #define MNT_LOCK_ATIME 0x040000 56 + #define MNT_LOCK_NOEXEC 0x080000 57 + #define MNT_LOCK_NOSUID 0x100000 58 + #define MNT_LOCK_NODEV 0x200000 52 59 #define MNT_LOCK_READONLY 0x400000 53 60 #define MNT_LOCKED 0x800000 54 61 #define MNT_DOOMED 0x1000000
+6 -10
include/linux/nsproxy.h
··· 40 40 * the namespaces access rules are: 41 41 * 42 42 * 1. only current task is allowed to change tsk->nsproxy pointer or 43 - * any pointer on the nsproxy itself 43 + * any pointer on the nsproxy itself. Current must hold the task_lock 44 + * when changing tsk->nsproxy. 44 45 * 45 46 * 2. when accessing (i.e. reading) current task's namespaces - no 46 47 * precautions should be taken - just dereference the pointers 47 48 * 48 49 * 3. the access to other task namespaces is performed like this 49 - * rcu_read_lock(); 50 - * nsproxy = task_nsproxy(tsk); 50 + * task_lock(task); 51 + * nsproxy = task->nsproxy; 51 52 * if (nsproxy != NULL) { 52 53 * / * 53 54 * * work with the namespaces here 54 55 * * e.g. get the reference on one of them 55 56 * * / 56 57 * } / * 57 - * * NULL task_nsproxy() means that this task is 58 + * * NULL task->nsproxy means that this task is 58 59 * * almost dead (zombie) 59 60 * * / 60 - * rcu_read_unlock(); 61 + * task_unlock(task); 61 62 * 62 63 */ 63 - 64 - static inline struct nsproxy *task_nsproxy(struct task_struct *tsk) 65 - { 66 - return rcu_dereference(tsk->nsproxy); 67 - } 68 64 69 65 int copy_namespaces(unsigned long flags, struct task_struct *tsk); 70 66 void exit_task_namespaces(struct task_struct *tsk);
+1
include/linux/pid_namespace.h
··· 33 33 #ifdef CONFIG_PROC_FS 34 34 struct vfsmount *proc_mnt; 35 35 struct dentry *proc_self; 36 + struct dentry *proc_thread_self; 36 37 #endif 37 38 #ifdef CONFIG_BSD_PROCESS_ACCT 38 39 struct bsd_acct_struct *bacct;
+3 -3
ipc/namespace.c
··· 154 154 struct ipc_namespace *ns = NULL; 155 155 struct nsproxy *nsproxy; 156 156 157 - rcu_read_lock(); 158 - nsproxy = task_nsproxy(task); 157 + task_lock(task); 158 + nsproxy = task->nsproxy; 159 159 if (nsproxy) 160 160 ns = get_ipc_ns(nsproxy->ipc_ns); 161 - rcu_read_unlock(); 161 + task_unlock(task); 162 162 163 163 return ns; 164 164 }
+4 -11
kernel/nsproxy.c
··· 204 204 205 205 might_sleep(); 206 206 207 + task_lock(p); 207 208 ns = p->nsproxy; 209 + p->nsproxy = new; 210 + task_unlock(p); 208 211 209 - rcu_assign_pointer(p->nsproxy, new); 210 - 211 - if (ns && atomic_dec_and_test(&ns->count)) { 212 - /* 213 - * wait for others to get what they want from this nsproxy. 214 - * 215 - * cannot release this nsproxy via the call_rcu() since 216 - * put_mnt_ns() will want to sleep 217 - */ 218 - synchronize_rcu(); 212 + if (ns && atomic_dec_and_test(&ns->count)) 219 213 free_nsproxy(ns); 220 - } 221 214 } 222 215 223 216 void exit_task_namespaces(struct task_struct *p)
+3 -3
kernel/utsname.c
··· 93 93 struct uts_namespace *ns = NULL; 94 94 struct nsproxy *nsproxy; 95 95 96 - rcu_read_lock(); 97 - nsproxy = task_nsproxy(task); 96 + task_lock(task); 97 + nsproxy = task->nsproxy; 98 98 if (nsproxy) { 99 99 ns = nsproxy->uts_ns; 100 100 get_uts_ns(ns); 101 101 } 102 - rcu_read_unlock(); 102 + task_unlock(task); 103 103 104 104 return ns; 105 105 }
+6 -4
net/core/net_namespace.c
··· 373 373 tsk = find_task_by_vpid(pid); 374 374 if (tsk) { 375 375 struct nsproxy *nsproxy; 376 - nsproxy = task_nsproxy(tsk); 376 + task_lock(tsk); 377 + nsproxy = tsk->nsproxy; 377 378 if (nsproxy) 378 379 net = get_net(nsproxy->net_ns); 380 + task_unlock(tsk); 379 381 } 380 382 rcu_read_unlock(); 381 383 return net; ··· 634 632 struct net *net = NULL; 635 633 struct nsproxy *nsproxy; 636 634 637 - rcu_read_lock(); 638 - nsproxy = task_nsproxy(task); 635 + task_lock(task); 636 + nsproxy = task->nsproxy; 639 637 if (nsproxy) 640 638 net = get_net(nsproxy->net_ns); 641 - rcu_read_unlock(); 639 + task_unlock(task); 642 640 643 641 return net; 644 642 }
+1
tools/testing/selftests/Makefile
··· 5 5 TARGETS += memfd 6 6 TARGETS += memory-hotplug 7 7 TARGETS += mqueue 8 + TARGETS += mount 8 9 TARGETS += net 9 10 TARGETS += ptrace 10 11 TARGETS += timers
+17
tools/testing/selftests/mount/Makefile
··· 1 + # Makefile for mount selftests. 2 + 3 + all: unprivileged-remount-test 4 + 5 + unprivileged-remount-test: unprivileged-remount-test.c 6 + gcc -Wall -O2 unprivileged-remount-test.c -o unprivileged-remount-test 7 + 8 + # Allow specific tests to be selected. 9 + test_unprivileged_remount: unprivileged-remount-test 10 + @if [ -f /proc/self/uid_map ] ; then ./unprivileged-remount-test ; fi 11 + 12 + run_tests: all test_unprivileged_remount 13 + 14 + clean: 15 + rm -f unprivileged-remount-test 16 + 17 + .PHONY: all test_unprivileged_remount
+242
tools/testing/selftests/mount/unprivileged-remount-test.c
··· 1 + #define _GNU_SOURCE 2 + #include <sched.h> 3 + #include <stdio.h> 4 + #include <errno.h> 5 + #include <string.h> 6 + #include <sys/types.h> 7 + #include <sys/mount.h> 8 + #include <sys/wait.h> 9 + #include <stdlib.h> 10 + #include <unistd.h> 11 + #include <fcntl.h> 12 + #include <grp.h> 13 + #include <stdbool.h> 14 + #include <stdarg.h> 15 + 16 + #ifndef CLONE_NEWNS 17 + # define CLONE_NEWNS 0x00020000 18 + #endif 19 + #ifndef CLONE_NEWUTS 20 + # define CLONE_NEWUTS 0x04000000 21 + #endif 22 + #ifndef CLONE_NEWIPC 23 + # define CLONE_NEWIPC 0x08000000 24 + #endif 25 + #ifndef CLONE_NEWNET 26 + # define CLONE_NEWNET 0x40000000 27 + #endif 28 + #ifndef CLONE_NEWUSER 29 + # define CLONE_NEWUSER 0x10000000 30 + #endif 31 + #ifndef CLONE_NEWPID 32 + # define CLONE_NEWPID 0x20000000 33 + #endif 34 + 35 + #ifndef MS_RELATIME 36 + #define MS_RELATIME (1 << 21) 37 + #endif 38 + #ifndef MS_STRICTATIME 39 + #define MS_STRICTATIME (1 << 24) 40 + #endif 41 + 42 + static void die(char *fmt, ...) 43 + { 44 + va_list ap; 45 + va_start(ap, fmt); 46 + vfprintf(stderr, fmt, ap); 47 + va_end(ap); 48 + exit(EXIT_FAILURE); 49 + } 50 + 51 + static void write_file(char *filename, char *fmt, ...) 
52 + { 53 + char buf[4096]; 54 + int fd; 55 + ssize_t written; 56 + int buf_len; 57 + va_list ap; 58 + 59 + va_start(ap, fmt); 60 + buf_len = vsnprintf(buf, sizeof(buf), fmt, ap); 61 + va_end(ap); 62 + if (buf_len < 0) { 63 + die("vsnprintf failed: %s\n", 64 + strerror(errno)); 65 + } 66 + if (buf_len >= sizeof(buf)) { 67 + die("vsnprintf output truncated\n"); 68 + } 69 + 70 + fd = open(filename, O_WRONLY); 71 + if (fd < 0) { 72 + die("open of %s failed: %s\n", 73 + filename, strerror(errno)); 74 + } 75 + written = write(fd, buf, buf_len); 76 + if (written != buf_len) { 77 + if (written >= 0) { 78 + die("short write to %s\n", filename); 79 + } else { 80 + die("write to %s failed: %s\n", 81 + filename, strerror(errno)); 82 + } 83 + } 84 + if (close(fd) != 0) { 85 + die("close of %s failed: %s\n", 86 + filename, strerror(errno)); 87 + } 88 + } 89 + 90 + static void create_and_enter_userns(void) 91 + { 92 + uid_t uid; 93 + gid_t gid; 94 + 95 + uid = getuid(); 96 + gid = getgid(); 97 + 98 + if (unshare(CLONE_NEWUSER) !=0) { 99 + die("unshare(CLONE_NEWUSER) failed: %s\n", 100 + strerror(errno)); 101 + } 102 + 103 + write_file("/proc/self/uid_map", "0 %d 1", uid); 104 + write_file("/proc/self/gid_map", "0 %d 1", gid); 105 + 106 + if (setgroups(0, NULL) != 0) { 107 + die("setgroups failed: %s\n", 108 + strerror(errno)); 109 + } 110 + if (setgid(0) != 0) { 111 + die ("setgid(0) failed %s\n", 112 + strerror(errno)); 113 + } 114 + if (setuid(0) != 0) { 115 + die("setuid(0) failed %s\n", 116 + strerror(errno)); 117 + } 118 + } 119 + 120 + static 121 + bool test_unpriv_remount(int mount_flags, int remount_flags, int invalid_flags) 122 + { 123 + pid_t child; 124 + 125 + child = fork(); 126 + if (child == -1) { 127 + die("fork failed: %s\n", 128 + strerror(errno)); 129 + } 130 + if (child != 0) { /* parent */ 131 + pid_t pid; 132 + int status; 133 + pid = waitpid(child, &status, 0); 134 + if (pid == -1) { 135 + die("waitpid failed: %s\n", 136 + strerror(errno)); 137 + } 138 + if 
(pid != child) { 139 + die("waited for %d got %d\n", 140 + child, pid); 141 + } 142 + if (!WIFEXITED(status)) { 143 + die("child did not terminate cleanly\n"); 144 + } 145 + return WEXITSTATUS(status) == EXIT_SUCCESS ? true : false; 146 + } 147 + 148 + create_and_enter_userns(); 149 + if (unshare(CLONE_NEWNS) != 0) { 150 + die("unshare(CLONE_NEWNS) failed: %s\n", 151 + strerror(errno)); 152 + } 153 + 154 + if (mount("testing", "/tmp", "ramfs", mount_flags, NULL) != 0) { 155 + die("mount of /tmp failed: %s\n", 156 + strerror(errno)); 157 + } 158 + 159 + create_and_enter_userns(); 160 + 161 + if (unshare(CLONE_NEWNS) != 0) { 162 + die("unshare(CLONE_NEWNS) failed: %s\n", 163 + strerror(errno)); 164 + } 165 + 166 + if (mount("/tmp", "/tmp", "none", 167 + MS_REMOUNT | MS_BIND | remount_flags, NULL) != 0) { 168 + /* system("cat /proc/self/mounts"); */ 169 + die("remount of /tmp failed: %s\n", 170 + strerror(errno)); 171 + } 172 + 173 + if (mount("/tmp", "/tmp", "none", 174 + MS_REMOUNT | MS_BIND | invalid_flags, NULL) == 0) { 175 + /* system("cat /proc/self/mounts"); */ 176 + die("remount of /tmp with invalid flags " 177 + "succeeded unexpectedly\n"); 178 + } 179 + exit(EXIT_SUCCESS); 180 + } 181 + 182 + static bool test_unpriv_remount_simple(int mount_flags) 183 + { 184 + return test_unpriv_remount(mount_flags, mount_flags, 0); 185 + } 186 + 187 + static bool test_unpriv_remount_atime(int mount_flags, int invalid_flags) 188 + { 189 + return test_unpriv_remount(mount_flags, mount_flags, invalid_flags); 190 + } 191 + 192 + int main(int argc, char **argv) 193 + { 194 + if (!test_unpriv_remount_simple(MS_RDONLY|MS_NODEV)) { 195 + die("MS_RDONLY malfunctions\n"); 196 + } 197 + if (!test_unpriv_remount_simple(MS_NODEV)) { 198 + die("MS_NODEV malfunctions\n"); 199 + } 200 + if (!test_unpriv_remount_simple(MS_NOSUID|MS_NODEV)) { 201 + die("MS_NOSUID malfunctions\n"); 202 + } 203 + if (!test_unpriv_remount_simple(MS_NOEXEC|MS_NODEV)) { 204 + die("MS_NOEXEC malfunctions\n"); 205 
+ } 206 + if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODEV, 207 + MS_NOATIME|MS_NODEV)) 208 + { 209 + die("MS_RELATIME malfunctions\n"); 210 + } 211 + if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODEV, 212 + MS_NOATIME|MS_NODEV)) 213 + { 214 + die("MS_STRICTATIME malfunctions\n"); 215 + } 216 + if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODEV, 217 + MS_STRICTATIME|MS_NODEV)) 218 + { 219 + die("MS_RELATIME malfunctions\n"); 220 + } 221 + if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODIRATIME|MS_NODEV, 222 + MS_NOATIME|MS_NODEV)) 223 + { 224 + die("MS_RELATIME malfunctions\n"); 225 + } 226 + if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODIRATIME|MS_NODEV, 227 + MS_NOATIME|MS_NODEV)) 228 + { 229 + die("MS_RELATIME malfunctions\n"); 230 + } 231 + if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODIRATIME|MS_NODEV, 232 + MS_STRICTATIME|MS_NODEV)) 233 + { 234 + die("MS_RELATIME malfunctions\n"); 235 + } 236 + if (!test_unpriv_remount(MS_STRICTATIME|MS_NODEV, MS_NODEV, 237 + MS_NOATIME|MS_NODEV)) 238 + { 239 + die("Default atime malfunctions\n"); 240 + } 241 + return EXIT_SUCCESS; 242 + }