Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

fork, vhost: Use CLONE_THREAD to fix freezer/ps regression

When switching from kthreads to vhost_tasks two bugs were added:
1. The vhost worker tasks now show up as processes, so scripts doing
ps or ps a would incorrectly detect the vhost task as another
process. 2. kthreads disabled freezing by setting PF_NOFREEZE, but
vhost tasks didn't disable it or add support for freezing.

To fix both bugs, this switches the vhost task to be thread in the
process that does the VHOST_SET_OWNER ioctl, and has vhost_worker call
get_signal to support SIGKILL/SIGSTOP and freeze signals. Note that
SIGKILL/STOP support is required because CLONE_THREAD requires
CLONE_SIGHAND which requires those 2 signals to be supported.

This is a modified version of the patch written by Mike Christie
<michael.christie@oracle.com> which was a modified version of patch
originally written by Linus.

Much of what depended upon PF_IO_WORKER now depends on PF_USER_WORKER.
Including ignoring signals, setting up the register state, and having
get_signal return instead of calling do_group_exit.

Tidied up the vhost_task abstraction so that the definition of
vhost_task only needs to be visible inside of vhost_task.c. Making
it easier to review the code and tell what needs to be done where.
As part of this the main loop has been moved from vhost_worker into
vhost_task_fn. vhost_worker now returns true if work was done.

The main loop has been updated to call get_signal which handles
SIGSTOP, freezing, and collects the message that tells the thread to
exit as part of process exit. This collection clears
__fatal_signal_pending. This collection is not guaranteed to
clear signal_pending() so clear that explicitly so the schedule()
sleeps.

For now the vhost thread continues to exist and run work until the
last file descriptor is closed and the release function is called as
part of freeing struct file. To avoid hangs in the coredump
rendezvous and when killing threads in a multi-threaded exec, the
coredump code and de_thread have been modified to ignore vhost threads.

Removing the special case for exec appears to require teaching
vhost_dev_flush how to directly complete transactions in case
the vhost thread is no longer running.

Removing the special case for coredump rendezvous requires either the
above fix needed for exec or moving the coredump rendezvous into
get_signal.

Fixes: 6e890c5d5021 ("vhost: use vhost_tasks for worker threads")
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Co-developed-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Mike Christie and committed by
Linus Torvalds
f9010dbd 1874a42a

+89 -77
+1 -1
arch/x86/include/asm/fpu/sched.h
··· 39 39 static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) 40 40 { 41 41 if (cpu_feature_enabled(X86_FEATURE_FPU) && 42 - !(current->flags & (PF_KTHREAD | PF_IO_WORKER))) { 42 + !(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { 43 43 save_fpregs_to_fpstate(old_fpu); 44 44 /* 45 45 * The save operation preserved register state, so the
+1 -1
arch/x86/kernel/fpu/context.h
··· 57 57 struct fpu *fpu = &current->thread.fpu; 58 58 int cpu = smp_processor_id(); 59 59 60 - if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_IO_WORKER))) 60 + if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) 61 61 return; 62 62 63 63 if (!fpregs_state_valid(fpu, cpu)) {
+1 -1
arch/x86/kernel/fpu/core.c
··· 426 426 427 427 this_cpu_write(in_kernel_fpu, true); 428 428 429 - if (!(current->flags & (PF_KTHREAD | PF_IO_WORKER)) && 429 + if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) && 430 430 !test_thread_flag(TIF_NEED_FPU_LOAD)) { 431 431 set_thread_flag(TIF_NEED_FPU_LOAD); 432 432 save_fpregs_to_fpstate(&current->thread.fpu);
+5 -17
drivers/vhost/vhost.c
··· 256 256 * test_and_set_bit() implies a memory barrier. 257 257 */ 258 258 llist_add(&work->node, &dev->worker->work_list); 259 - wake_up_process(dev->worker->vtsk->task); 259 + vhost_task_wake(dev->worker->vtsk); 260 260 } 261 261 } 262 262 EXPORT_SYMBOL_GPL(vhost_work_queue); ··· 333 333 __vhost_vq_meta_reset(vq); 334 334 } 335 335 336 - static int vhost_worker(void *data) 336 + static bool vhost_worker(void *data) 337 337 { 338 338 struct vhost_worker *worker = data; 339 339 struct vhost_work *work, *work_next; 340 340 struct llist_node *node; 341 341 342 - for (;;) { 343 - /* mb paired w/ kthread_stop */ 344 - set_current_state(TASK_INTERRUPTIBLE); 345 - 346 - if (vhost_task_should_stop(worker->vtsk)) { 347 - __set_current_state(TASK_RUNNING); 348 - break; 349 - } 350 - 351 - node = llist_del_all(&worker->work_list); 352 - if (!node) 353 - schedule(); 354 - 342 + node = llist_del_all(&worker->work_list); 343 + if (node) { 355 344 node = llist_reverse_order(node); 356 345 /* make sure flag is seen after deletion */ 357 346 smp_wmb(); 358 347 llist_for_each_entry_safe(work, work_next, node, node) { 359 348 clear_bit(VHOST_WORK_QUEUED, &work->flags); 360 - __set_current_state(TASK_RUNNING); 361 349 kcov_remote_start_common(worker->kcov_handle); 362 350 work->fn(work); 363 351 kcov_remote_stop(); ··· 353 365 } 354 366 } 355 367 356 - return 0; 368 + return !!node; 357 369 } 358 370 359 371 static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
+3 -1
fs/coredump.c
··· 371 371 if (t != current && !(t->flags & PF_POSTCOREDUMP)) { 372 372 sigaddset(&t->pending.signal, SIGKILL); 373 373 signal_wake_up(t, 1); 374 - nr++; 374 + /* The vhost_worker does not particpate in coredumps */ 375 + if ((t->flags & (PF_USER_WORKER | PF_IO_WORKER)) != PF_USER_WORKER) 376 + nr++; 375 377 } 376 378 } 377 379
-1
include/linux/sched/task.h
··· 29 29 u32 io_thread:1; 30 30 u32 user_worker:1; 31 31 u32 no_files:1; 32 - u32 ignore_signals:1; 33 32 unsigned long stack; 34 33 unsigned long stack_size; 35 34 unsigned long tls;
+3 -12
include/linux/sched/vhost_task.h
··· 2 2 #ifndef _LINUX_VHOST_TASK_H 3 3 #define _LINUX_VHOST_TASK_H 4 4 5 - #include <linux/completion.h> 6 5 7 - struct task_struct; 6 + struct vhost_task; 8 7 9 - struct vhost_task { 10 - int (*fn)(void *data); 11 - void *data; 12 - struct completion exited; 13 - unsigned long flags; 14 - struct task_struct *task; 15 - }; 16 - 17 - struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg, 8 + struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg, 18 9 const char *name); 19 10 void vhost_task_start(struct vhost_task *vtsk); 20 11 void vhost_task_stop(struct vhost_task *vtsk); 21 - bool vhost_task_should_stop(struct vhost_task *vtsk); 12 + void vhost_task_wake(struct vhost_task *vtsk); 22 13 23 14 #endif
+4 -1
kernel/exit.c
··· 411 411 tsk->flags |= PF_POSTCOREDUMP; 412 412 core_state = tsk->signal->core_state; 413 413 spin_unlock_irq(&tsk->sighand->siglock); 414 - if (core_state) { 414 + 415 + /* The vhost_worker does not particpate in coredumps */ 416 + if (core_state && 417 + ((tsk->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)) { 415 418 struct core_thread self; 416 419 417 420 self.task = current;
+5 -8
kernel/fork.c
··· 2336 2336 p->flags &= ~PF_KTHREAD; 2337 2337 if (args->kthread) 2338 2338 p->flags |= PF_KTHREAD; 2339 - if (args->user_worker) 2340 - p->flags |= PF_USER_WORKER; 2341 - if (args->io_thread) { 2339 + if (args->user_worker) { 2342 2340 /* 2343 - * Mark us an IO worker, and block any signal that isn't 2341 + * Mark us a user worker, and block any signal that isn't 2344 2342 * fatal or STOP 2345 2343 */ 2346 - p->flags |= PF_IO_WORKER; 2344 + p->flags |= PF_USER_WORKER; 2347 2345 siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); 2348 2346 } 2347 + if (args->io_thread) 2348 + p->flags |= PF_IO_WORKER; 2349 2349 2350 2350 if (args->name) 2351 2351 strscpy_pad(p->comm, args->name, sizeof(p->comm)); ··· 2516 2516 retval = copy_thread(p, args); 2517 2517 if (retval) 2518 2518 goto bad_fork_cleanup_io; 2519 - 2520 - if (args->ignore_signals) 2521 - ignore_signals(p); 2522 2519 2523 2520 stackleak_task_init(p); 2524 2521
+5 -3
kernel/signal.c
··· 1368 1368 1369 1369 while_each_thread(p, t) { 1370 1370 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); 1371 - count++; 1371 + /* Don't require de_thread to wait for the vhost_worker */ 1372 + if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER) 1373 + count++; 1372 1374 1373 1375 /* Don't bother with already dead threads */ 1374 1376 if (t->exit_state) ··· 2863 2861 } 2864 2862 2865 2863 /* 2866 - * PF_IO_WORKER threads will catch and exit on fatal signals 2864 + * PF_USER_WORKER threads will catch and exit on fatal signals 2867 2865 * themselves. They have cleanup that must be performed, so 2868 2866 * we cannot call do_exit() on their behalf. 2869 2867 */ 2870 - if (current->flags & PF_IO_WORKER) 2868 + if (current->flags & PF_USER_WORKER) 2871 2869 goto out; 2872 2870 2873 2871 /*
+61 -31
kernel/vhost_task.c
··· 12 12 VHOST_TASK_FLAGS_STOP, 13 13 }; 14 14 15 + struct vhost_task { 16 + bool (*fn)(void *data); 17 + void *data; 18 + struct completion exited; 19 + unsigned long flags; 20 + struct task_struct *task; 21 + }; 22 + 15 23 static int vhost_task_fn(void *data) 16 24 { 17 25 struct vhost_task *vtsk = data; 18 - int ret; 26 + bool dead = false; 19 27 20 - ret = vtsk->fn(vtsk->data); 28 + for (;;) { 29 + bool did_work; 30 + 31 + /* mb paired w/ vhost_task_stop */ 32 + if (test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)) 33 + break; 34 + 35 + if (!dead && signal_pending(current)) { 36 + struct ksignal ksig; 37 + /* 38 + * Calling get_signal will block in SIGSTOP, 39 + * or clear fatal_signal_pending, but remember 40 + * what was set. 41 + * 42 + * This thread won't actually exit until all 43 + * of the file descriptors are closed, and 44 + * the release function is called. 45 + */ 46 + dead = get_signal(&ksig); 47 + if (dead) 48 + clear_thread_flag(TIF_SIGPENDING); 49 + } 50 + 51 + did_work = vtsk->fn(vtsk->data); 52 + if (!did_work) { 53 + set_current_state(TASK_INTERRUPTIBLE); 54 + schedule(); 55 + } 56 + } 57 + 21 58 complete(&vtsk->exited); 22 - do_exit(ret); 59 + do_exit(0); 23 60 } 61 + 62 + /** 63 + * vhost_task_wake - wakeup the vhost_task 64 + * @vtsk: vhost_task to wake 65 + * 66 + * wake up the vhost_task worker thread 67 + */ 68 + void vhost_task_wake(struct vhost_task *vtsk) 69 + { 70 + wake_up_process(vtsk->task); 71 + } 72 + EXPORT_SYMBOL_GPL(vhost_task_wake); 24 73 25 74 /** 26 75 * vhost_task_stop - stop a vhost_task 27 76 * @vtsk: vhost_task to stop 28 77 * 29 - * Callers must call vhost_task_should_stop and return from their worker 30 - * function when it returns true; 78 + * vhost_task_fn ensures the worker thread exits after 79 + * VHOST_TASK_FLAGS_SOP becomes true. 
31 80 */ 32 81 void vhost_task_stop(struct vhost_task *vtsk) 33 82 { 34 - pid_t pid = vtsk->task->pid; 35 - 36 83 set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); 37 - wake_up_process(vtsk->task); 84 + vhost_task_wake(vtsk); 38 85 /* 39 86 * Make sure vhost_task_fn is no longer accessing the vhost_task before 40 - * freeing it below. If userspace crashed or exited without closing, 41 - * then the vhost_task->task could already be marked dead so 42 - * kernel_wait will return early. 87 + * freeing it below. 43 88 */ 44 89 wait_for_completion(&vtsk->exited); 45 - /* 46 - * If we are just closing/removing a device and the parent process is 47 - * not exiting then reap the task. 48 - */ 49 - kernel_wait4(pid, NULL, __WCLONE, NULL); 50 90 kfree(vtsk); 51 91 } 52 92 EXPORT_SYMBOL_GPL(vhost_task_stop); 53 93 54 94 /** 55 - * vhost_task_should_stop - should the vhost task return from the work function 56 - * @vtsk: vhost_task to stop 57 - */ 58 - bool vhost_task_should_stop(struct vhost_task *vtsk) 59 - { 60 - return test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); 61 - } 62 - EXPORT_SYMBOL_GPL(vhost_task_should_stop); 63 - 64 - /** 65 - * vhost_task_create - create a copy of a process to be used by the kernel 66 - * @fn: thread stack 95 + * vhost_task_create - create a copy of a task to be used by the kernel 96 + * @fn: vhost worker function 67 97 * @arg: data to be passed to fn 68 98 * @name: the thread's name 69 99 * ··· 101 71 * failure. The returned task is inactive, and the caller must fire it up 102 72 * through vhost_task_start(). 
103 73 */ 104 - struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg, 74 + struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg, 105 75 const char *name) 106 76 { 107 77 struct kernel_clone_args args = { 108 - .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM, 78 + .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM | 79 + CLONE_THREAD | CLONE_SIGHAND, 109 80 .exit_signal = 0, 110 81 .fn = vhost_task_fn, 111 82 .name = name, 112 83 .user_worker = 1, 113 84 .no_files = 1, 114 - .ignore_signals = 1, 115 85 }; 116 86 struct vhost_task *vtsk; 117 87 struct task_struct *tsk;