Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'threads-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux

Pull thread updates from Christian Brauner:
"This contains the changes to add the missing support for attaching to
time namespaces via pidfds.

Last cycle setns() was changed to support attaching to multiple
namespaces atomically. This requires all namespaces to have a point of
no return where they can't fail anymore.

Specifically, <namespace-type>_install() is allowed to perform
permission checks and install the namespace into the new struct nsset
that it has been given but it is not allowed to make visible changes
to the affected task. Once <namespace-type>_install() returns,
anything that the given namespace type additionally requires to be
set up ideally needs to be done in a function that can't fail or, if
it does fail, the failure must be non-fatal.

For time namespaces the relevant functions that fell into this
category were timens_set_vvar_page() and vdso_join_timens(). The
latter could still fail although it didn't need to. This function is
only implemented for x86 in current mainline. As
discussed on-list (cf. [1]), in order to make setns() properly support
time namespaces when attaching to multiple namespaces at once, we
changed vdso_join_timens() to always succeed. To that end,
vdso_join_timens() now takes mmap_read_lock() instead of
mmap_write_lock_killable().

Please note that arm64 is about to grow vdso support for time namespaces
(possibly this merge window). We've synced on this change and arm64
also uses mmap_read_lock(), i.e. makes vdso_join_timens() a function
that can't fail. Once the changes here and the arm64 changes have
landed, vdso_join_timens() should be turned into a void function so
it's obvious to callers and implementers on other architectures that
the expectation is that it can't fail.

We didn't do this right away because it would've introduced
unnecessary merge conflicts between the two trees for no major gain.

As always, tests are included."

[1]: https://lore.kernel.org/lkml/20200611110221.pgd3r5qkjrjmfqa2@wittgenstein

* tag 'threads-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
tests: add CLONE_NEWTIME setns tests
nsproxy: support CLONE_NEWTIME with setns()
timens: add timens_commit() helper
timens: make vdso_join_timens() always succeed

+115 -19
+2 -3
arch/x86/entry/vdso/vma.c
··· 144 144 struct mm_struct *mm = task->mm; 145 145 struct vm_area_struct *vma; 146 146 147 - if (mmap_write_lock_killable(mm)) 148 - return -EINTR; 147 + mmap_read_lock(mm); 149 148 150 149 for (vma = mm->mmap; vma; vma = vma->vm_next) { 151 150 unsigned long size = vma->vm_end - vma->vm_start; ··· 153 154 zap_page_range(vma, vma->vm_start, size); 154 155 } 155 156 156 - mmap_write_unlock(mm); 157 + mmap_read_unlock(mm); 157 158 return 0; 158 159 } 159 160 #else
+6
include/linux/time_namespace.h
··· 33 33 #ifdef CONFIG_TIME_NS 34 34 extern int vdso_join_timens(struct task_struct *task, 35 35 struct time_namespace *ns); 36 + extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); 36 37 37 38 static inline struct time_namespace *get_time_ns(struct time_namespace *ns) 38 39 { ··· 95 94 struct time_namespace *ns) 96 95 { 97 96 return 0; 97 + } 98 + 99 + static inline void timens_commit(struct task_struct *tsk, 100 + struct time_namespace *ns) 101 + { 98 102 } 99 103 100 104 static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
+19 -2
kernel/nsproxy.c
··· 262 262 static int check_setns_flags(unsigned long flags) 263 263 { 264 264 if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 265 - CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID | 266 - CLONE_NEWCGROUP))) 265 + CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER | 266 + CLONE_NEWPID | CLONE_NEWCGROUP))) 267 267 return -EINVAL; 268 268 269 269 #ifndef CONFIG_USER_NS ··· 288 288 #endif 289 289 #ifndef CONFIG_NET_NS 290 290 if (flags & CLONE_NEWNET) 291 + return -EINVAL; 292 + #endif 293 + #ifndef CONFIG_TIME_NS 294 + if (flags & CLONE_NEWTIME) 291 295 return -EINVAL; 292 296 #endif 293 297 ··· 468 464 } 469 465 #endif 470 466 467 + #ifdef CONFIG_TIME_NS 468 + if (flags & CLONE_NEWTIME) { 469 + ret = validate_ns(nsset, &nsp->time_ns->ns); 470 + if (ret) 471 + goto out; 472 + } 473 + #endif 474 + 471 475 out: 472 476 if (pid_ns) 473 477 put_pid_ns(pid_ns); ··· 517 505 #ifdef CONFIG_IPC_NS 518 506 if (flags & CLONE_NEWIPC) 519 507 exit_sem(me); 508 + #endif 509 + 510 + #ifdef CONFIG_TIME_NS 511 + if (flags & CLONE_NEWTIME) 512 + timens_commit(me, nsset->nsproxy->time_ns); 520 513 #endif 521 514 522 515 /* transfer ownership */
+8 -14
kernel/time/namespace.c
··· 280 280 put_time_ns(to_time_ns(ns)); 281 281 } 282 282 283 + void timens_commit(struct task_struct *tsk, struct time_namespace *ns) 284 + { 285 + timens_set_vvar_page(tsk, ns); 286 + vdso_join_timens(tsk, ns); 287 + } 288 + 283 289 static int timens_install(struct nsset *nsset, struct ns_common *new) 284 290 { 285 291 struct nsproxy *nsproxy = nsset->nsproxy; 286 292 struct time_namespace *ns = to_time_ns(new); 287 - int err; 288 293 289 294 if (!current_is_single_threaded()) 290 295 return -EUSERS; ··· 297 292 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || 298 293 !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) 299 294 return -EPERM; 300 - 301 - timens_set_vvar_page(current, ns); 302 - 303 - err = vdso_join_timens(current, ns); 304 - if (err) 305 - return err; 306 295 307 296 get_time_ns(ns); 308 297 put_time_ns(nsproxy->time_ns); ··· 312 313 { 313 314 struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; 314 315 struct time_namespace *ns = to_time_ns(nsc); 315 - int err; 316 316 317 317 /* create_new_namespaces() already incremented the ref counter */ 318 318 if (nsproxy->time_ns == nsproxy->time_ns_for_children) 319 319 return 0; 320 320 321 - timens_set_vvar_page(tsk, ns); 322 - 323 - err = vdso_join_timens(tsk, ns); 324 - if (err) 325 - return err; 326 - 327 321 get_time_ns(ns); 328 322 put_time_ns(nsproxy->time_ns); 329 323 nsproxy->time_ns = ns; 324 + 325 + timens_commit(tsk, ns); 330 326 331 327 return 0; 332 328 }
+4
tools/testing/selftests/pidfd/pidfd.h
··· 22 22 #define P_PIDFD 3 23 23 #endif 24 24 25 + #ifndef CLONE_NEWTIME 26 + #define CLONE_NEWTIME 0x00000080 27 + #endif 28 + 25 29 #ifndef CLONE_PIDFD 26 30 #define CLONE_PIDFD 0x00001000 27 31 #endif
+76
tools/testing/selftests/pidfd/pidfd_setns_test.c
··· 32 32 PIDFD_NS_NET, 33 33 PIDFD_NS_CGROUP, 34 34 PIDFD_NS_PIDCLD, 35 + PIDFD_NS_TIME, 35 36 PIDFD_NS_MAX 36 37 }; 37 38 ··· 48 47 [PIDFD_NS_NET] = { "net", CLONE_NEWNET, }, 49 48 [PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, }, 50 49 [PIDFD_NS_PIDCLD] = { "pid_for_children", 0, }, 50 + [PIDFD_NS_TIME] = { "time", CLONE_NEWTIME, }, 51 51 }; 52 52 53 53 FIXTURE(current_nsset) ··· 85 83 return sys_clone3(&args, sizeof(struct clone_args)); 86 84 } 87 85 86 + static bool switch_timens(void) 87 + { 88 + int fd, ret; 89 + 90 + if (unshare(CLONE_NEWTIME)) 91 + return false; 92 + 93 + fd = open("/proc/self/ns/time_for_children", O_RDONLY | O_CLOEXEC); 94 + if (fd < 0) 95 + return false; 96 + 97 + ret = setns(fd, CLONE_NEWTIME); 98 + close(fd); 99 + return ret == 0; 100 + } 101 + 102 + static ssize_t read_nointr(int fd, void *buf, size_t count) 103 + { 104 + ssize_t ret; 105 + 106 + do { 107 + ret = read(fd, buf, count); 108 + } while (ret < 0 && errno == EINTR); 109 + 110 + return ret; 111 + } 112 + 113 + static ssize_t write_nointr(int fd, const void *buf, size_t count) 114 + { 115 + ssize_t ret; 116 + 117 + do { 118 + ret = write(fd, buf, count); 119 + } while (ret < 0 && errno == EINTR); 120 + 121 + return ret; 122 + } 123 + 88 124 FIXTURE_SETUP(current_nsset) 89 125 { 90 126 int i, proc_fd, ret; 127 + int ipc_sockets[2]; 128 + char c; 91 129 92 130 for (i = 0; i < PIDFD_NS_MAX; i++) { 93 131 self->nsfds[i] = -EBADF; ··· 172 130 TH_LOG("%m - Failed to open pidfd for process %d", self->pid); 173 131 } 174 132 133 + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); 134 + EXPECT_EQ(ret, 0); 135 + 175 136 /* Create tasks that will be stopped. 
*/ 176 137 self->child_pid1 = create_child(&self->child_pidfd1, 177 138 CLONE_NEWUSER | CLONE_NEWNS | ··· 184 139 EXPECT_GE(self->child_pid1, 0); 185 140 186 141 if (self->child_pid1 == 0) { 142 + close(ipc_sockets[0]); 143 + 144 + if (!switch_timens()) 145 + _exit(EXIT_FAILURE); 146 + 147 + if (write_nointr(ipc_sockets[1], "1", 1) < 0) 148 + _exit(EXIT_FAILURE); 149 + 150 + close(ipc_sockets[1]); 151 + 187 152 pause(); 188 153 _exit(EXIT_SUCCESS); 189 154 } 155 + 156 + close(ipc_sockets[1]); 157 + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); 158 + close(ipc_sockets[0]); 159 + 160 + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); 161 + EXPECT_EQ(ret, 0); 190 162 191 163 self->child_pid2 = create_child(&self->child_pidfd2, 192 164 CLONE_NEWUSER | CLONE_NEWNS | ··· 213 151 EXPECT_GE(self->child_pid2, 0); 214 152 215 153 if (self->child_pid2 == 0) { 154 + close(ipc_sockets[0]); 155 + 156 + if (!switch_timens()) 157 + _exit(EXIT_FAILURE); 158 + 159 + if (write_nointr(ipc_sockets[1], "1", 1) < 0) 160 + _exit(EXIT_FAILURE); 161 + 162 + close(ipc_sockets[1]); 163 + 216 164 pause(); 217 165 _exit(EXIT_SUCCESS); 218 166 } 167 + 168 + close(ipc_sockets[1]); 169 + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); 170 + close(ipc_sockets[0]); 219 171 220 172 for (i = 0; i < PIDFD_NS_MAX; i++) { 221 173 char p[100];