Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

vhost_task: Handle SIGKILL by flushing work and exiting

Instead of lingering until the device is closed, this has us handle
SIGKILL by:

1. marking the worker as killed so we no longer try to use it with
new virtqueues and new flush operations.
2. clearing the virtqueue to worker mapping so no new works are queued.
3. running all the exiting works.

Suggested-by: Edward Adam Davis <eadavis@qq.com>
Reported-and-tested-by: syzbot+98edc2df894917b3431f@syzkaller.appspotmail.com
Message-Id: <tencent_546DA49414E876EEBECF2C78D26D242EE50A@qq.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Message-Id: <20240316004707.45557-9-michael.christie@oracle.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

authored by

Mike Christie and committed by
Michael S. Tsirkin
db5247d9 ba704ff4

+88 -24
+50 -4
drivers/vhost/vhost.c
··· 273 273 { 274 274 struct vhost_flush_struct flush; 275 275 276 - if (!worker->attachment_cnt) 276 + if (!worker->attachment_cnt || worker->killed) 277 277 return; 278 278 279 279 init_completion(&flush.wait_event); ··· 388 388 __vhost_vq_meta_reset(vq); 389 389 } 390 390 391 - static bool vhost_worker(void *data) 391 + static bool vhost_run_work_list(void *data) 392 392 { 393 393 struct vhost_worker *worker = data; 394 394 struct vhost_work *work, *work_next; ··· 411 411 } 412 412 413 413 return !!node; 414 + } 415 + 416 + static void vhost_worker_killed(void *data) 417 + { 418 + struct vhost_worker *worker = data; 419 + struct vhost_dev *dev = worker->dev; 420 + struct vhost_virtqueue *vq; 421 + int i, attach_cnt = 0; 422 + 423 + mutex_lock(&worker->mutex); 424 + worker->killed = true; 425 + 426 + for (i = 0; i < dev->nvqs; i++) { 427 + vq = dev->vqs[i]; 428 + 429 + mutex_lock(&vq->mutex); 430 + if (worker == 431 + rcu_dereference_check(vq->worker, 432 + lockdep_is_held(&vq->mutex))) { 433 + rcu_assign_pointer(vq->worker, NULL); 434 + attach_cnt++; 435 + } 436 + mutex_unlock(&vq->mutex); 437 + } 438 + 439 + worker->attachment_cnt -= attach_cnt; 440 + if (attach_cnt) 441 + synchronize_rcu(); 442 + /* 443 + * Finish vhost_worker_flush calls and any other works that snuck in 444 + * before the synchronize_rcu. 
445 + */ 446 + vhost_run_work_list(worker); 447 + mutex_unlock(&worker->mutex); 414 448 } 415 449 416 450 static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq) ··· 661 627 if (!worker) 662 628 return NULL; 663 629 630 + worker->dev = dev; 664 631 snprintf(name, sizeof(name), "vhost-%d", current->pid); 665 632 666 - vtsk = vhost_task_create(vhost_worker, worker, name); 633 + vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed, 634 + worker, name); 667 635 if (!vtsk) 668 636 goto free_worker; 669 637 ··· 697 661 struct vhost_worker *old_worker; 698 662 699 663 mutex_lock(&worker->mutex); 664 + if (worker->killed) { 665 + mutex_unlock(&worker->mutex); 666 + return; 667 + } 668 + 700 669 mutex_lock(&vq->mutex); 701 670 702 671 old_worker = rcu_dereference_check(vq->worker, ··· 722 681 * device wide flushes which doesn't use RCU for execution. 723 682 */ 724 683 mutex_lock(&old_worker->mutex); 684 + if (old_worker->killed) { 685 + mutex_unlock(&old_worker->mutex); 686 + return; 687 + } 688 + 725 689 /* 726 690 * We don't want to call synchronize_rcu for every vq during setup 727 691 * because it will slow down VM startup. If we haven't done ··· 804 758 return -ENODEV; 805 759 806 760 mutex_lock(&worker->mutex); 807 - if (worker->attachment_cnt) { 761 + if (worker->attachment_cnt || worker->killed) { 808 762 mutex_unlock(&worker->mutex); 809 763 return -EBUSY; 810 764 }
+2
drivers/vhost/vhost.h
··· 28 28 29 29 struct vhost_worker { 30 30 struct vhost_task *vtsk; 31 + struct vhost_dev *dev; 31 32 /* Used to serialize device wide flushing with worker swapping. */ 32 33 struct mutex mutex; 33 34 struct llist_head work_list; 34 35 u64 kcov_handle; 35 36 u32 id; 36 37 int attachment_cnt; 38 + bool killed; 37 39 }; 38 40 39 41 /* Poll a file (eventfd or socket) */
+2 -1
include/linux/sched/vhost_task.h
··· 4 4 5 5 struct vhost_task; 6 6 7 - struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg, 7 + struct vhost_task *vhost_task_create(bool (*fn)(void *), 8 + void (*handle_kill)(void *), void *arg, 8 9 const char *name); 9 10 void vhost_task_start(struct vhost_task *vtsk); 10 11 void vhost_task_stop(struct vhost_task *vtsk);
+34 -19
kernel/vhost_task.c
··· 10 10 11 11 enum vhost_task_flags { 12 12 VHOST_TASK_FLAGS_STOP, 13 + VHOST_TASK_FLAGS_KILLED, 13 14 }; 14 15 15 16 struct vhost_task { 16 17 bool (*fn)(void *data); 18 + void (*handle_sigkill)(void *data); 17 19 void *data; 18 20 struct completion exited; 19 21 unsigned long flags; 20 22 struct task_struct *task; 23 + /* serialize SIGKILL and vhost_task_stop calls */ 24 + struct mutex exit_mutex; 21 25 }; 22 26 23 27 static int vhost_task_fn(void *data) 24 28 { 25 29 struct vhost_task *vtsk = data; 26 - bool dead = false; 27 30 28 31 for (;;) { 29 32 bool did_work; 30 33 31 - if (!dead && signal_pending(current)) { 34 + if (signal_pending(current)) { 32 35 struct ksignal ksig; 33 - /* 34 - * Calling get_signal will block in SIGSTOP, 35 - * or clear fatal_signal_pending, but remember 36 - * what was set. 37 - * 38 - * This thread won't actually exit until all 39 - * of the file descriptors are closed, and 40 - * the release function is called. 41 - */ 42 - dead = get_signal(&ksig); 43 - if (dead) 44 - clear_thread_flag(TIF_SIGPENDING); 36 + 37 + if (get_signal(&ksig)) 38 + break; 45 39 } 46 40 47 41 /* mb paired w/ vhost_task_stop */ ··· 51 57 schedule(); 52 58 } 53 59 60 + mutex_lock(&vtsk->exit_mutex); 61 + /* 62 + * If a vhost_task_stop and SIGKILL race, we can ignore the SIGKILL. 63 + * When the vhost layer has called vhost_task_stop it's already stopped 64 + * new work and flushed. 65 + */ 66 + if (!test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)) { 67 + set_bit(VHOST_TASK_FLAGS_KILLED, &vtsk->flags); 68 + vtsk->handle_sigkill(vtsk->data); 69 + } 70 + mutex_unlock(&vtsk->exit_mutex); 54 71 complete(&vtsk->exited); 72 + 55 73 do_exit(0); 56 74 } 57 75 ··· 84 78 * @vtsk: vhost_task to stop 85 79 * 86 80 * vhost_task_fn ensures the worker thread exits after 87 - * VHOST_TASK_FLAGS_SOP becomes true. 81 + * VHOST_TASK_FLAGS_STOP becomes true. 
88 82 */ 89 83 void vhost_task_stop(struct vhost_task *vtsk) 90 84 { 91 - set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); 92 - vhost_task_wake(vtsk); 85 + mutex_lock(&vtsk->exit_mutex); 86 + if (!test_bit(VHOST_TASK_FLAGS_KILLED, &vtsk->flags)) { 87 + set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); 88 + vhost_task_wake(vtsk); 89 + } 90 + mutex_unlock(&vtsk->exit_mutex); 91 + 93 92 /* 94 93 * Make sure vhost_task_fn is no longer accessing the vhost_task before 95 94 * freeing it below. ··· 107 96 /** 108 97 * vhost_task_create - create a copy of a task to be used by the kernel 109 98 * @fn: vhost worker function 110 - * @arg: data to be passed to fn 99 + * @handle_sigkill: vhost function to handle when we are killed 100 + * @arg: data to be passed to fn and handled_kill 111 101 * @name: the thread's name 112 102 * 113 103 * This returns a specialized task for use by the vhost layer or NULL on 114 104 * failure. The returned task is inactive, and the caller must fire it up 115 105 * through vhost_task_start(). 116 106 */ 117 - struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg, 107 + struct vhost_task *vhost_task_create(bool (*fn)(void *), 108 + void (*handle_sigkill)(void *), void *arg, 118 109 const char *name) 119 110 { 120 111 struct kernel_clone_args args = { ··· 135 122 if (!vtsk) 136 123 return NULL; 137 124 init_completion(&vtsk->exited); 125 + mutex_init(&vtsk->exit_mutex); 138 126 vtsk->data = arg; 139 127 vtsk->fn = fn; 128 + vtsk->handle_sigkill = handle_sigkill; 140 129 141 130 args.fn_arg = vtsk; 142 131