1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (C) 2009 Red Hat, Inc.
3 * Copyright (C) 2006 Rusty Russell IBM Corporation
4 *
5 * Author: Michael S. Tsirkin <mst@redhat.com>
6 *
7 * Inspiration, some code, and most witty comments come from
8 * Documentation/virtual/lguest/lguest.c, by Rusty Russell
9 *
10 * Generic code for virtio server in host kernel.
11 */
12
13#include <linux/eventfd.h>
14#include <linux/vhost.h>
15#include <linux/uio.h>
16#include <linux/mm.h>
17#include <linux/miscdevice.h>
18#include <linux/mutex.h>
19#include <linux/poll.h>
20#include <linux/file.h>
21#include <linux/highmem.h>
22#include <linux/slab.h>
23#include <linux/vmalloc.h>
24#include <linux/kthread.h>
25#include <linux/cgroup.h>
26#include <linux/module.h>
27#include <linux/sort.h>
28#include <linux/sched/mm.h>
29#include <linux/sched/signal.h>
30#include <linux/interval_tree_generic.h>
31#include <linux/nospec.h>
32#include <linux/kcov.h>
33
34#include "vhost.h"
35
36static ushort max_mem_regions = 64;
37module_param(max_mem_regions, ushort, 0444);
38MODULE_PARM_DESC(max_mem_regions,
39 "Maximum number of memory regions in memory map. (default: 64)");
40static int max_iotlb_entries = 2048;
41module_param(max_iotlb_entries, int, 0444);
42MODULE_PARM_DESC(max_iotlb_entries,
43 "Maximum number of iotlb entries. (default: 2048)");
44
45enum {
46 VHOST_MEMORY_F_LOG = 0x1,
47};
48
49#define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
50#define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
51
52#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
53static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
54{
55 vq->user_be = !virtio_legacy_is_little_endian();
56}
57
58static void vhost_enable_cross_endian_big(struct vhost_virtqueue *vq)
59{
60 vq->user_be = true;
61}
62
63static void vhost_enable_cross_endian_little(struct vhost_virtqueue *vq)
64{
65 vq->user_be = false;
66}
67
68static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
69{
70 struct vhost_vring_state s;
71
72 if (vq->private_data)
73 return -EBUSY;
74
75 if (copy_from_user(&s, argp, sizeof(s)))
76 return -EFAULT;
77
78 if (s.num != VHOST_VRING_LITTLE_ENDIAN &&
79 s.num != VHOST_VRING_BIG_ENDIAN)
80 return -EINVAL;
81
82 if (s.num == VHOST_VRING_BIG_ENDIAN)
83 vhost_enable_cross_endian_big(vq);
84 else
85 vhost_enable_cross_endian_little(vq);
86
87 return 0;
88}
89
90static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
91 int __user *argp)
92{
93 struct vhost_vring_state s = {
94 .index = idx,
95 .num = vq->user_be
96 };
97
98 if (copy_to_user(argp, &s, sizeof(s)))
99 return -EFAULT;
100
101 return 0;
102}
103
104static void vhost_init_is_le(struct vhost_virtqueue *vq)
105{
106 /* Note for legacy virtio: user_be is initialized at reset time
107 * according to the host endianness. If userspace does not set an
108 * explicit endianness, the default behavior is native endian, as
109 * expected by legacy virtio.
110 */
111 vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be;
112}
113#else
114static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
115{
116}
117
118static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
119{
120 return -ENOIOCTLCMD;
121}
122
123static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
124 int __user *argp)
125{
126 return -ENOIOCTLCMD;
127}
128
129static void vhost_init_is_le(struct vhost_virtqueue *vq)
130{
131 vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1)
132 || virtio_legacy_is_little_endian();
133}
134#endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */
135
136static void vhost_reset_is_le(struct vhost_virtqueue *vq)
137{
138 vhost_init_is_le(vq);
139}
140
141struct vhost_flush_struct {
142 struct vhost_work work;
143 struct completion wait_event;
144};
145
146static void vhost_flush_work(struct vhost_work *work)
147{
148 struct vhost_flush_struct *s;
149
150 s = container_of(work, struct vhost_flush_struct, work);
151 complete(&s->wait_event);
152}
153
154static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
155 poll_table *pt)
156{
157 struct vhost_poll *poll;
158
159 poll = container_of(pt, struct vhost_poll, table);
160 poll->wqh = wqh;
161 add_wait_queue(wqh, &poll->wait);
162}
163
164static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync,
165 void *key)
166{
167 struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
168 struct vhost_work *work = &poll->work;
169
170 if (!(key_to_poll(key) & poll->mask))
171 return 0;
172
173 if (!poll->dev->use_worker)
174 work->fn(work);
175 else
176 vhost_poll_queue(poll);
177
178 return 0;
179}
180
181void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
182{
183 clear_bit(VHOST_WORK_QUEUED, &work->flags);
184 work->fn = fn;
185}
186EXPORT_SYMBOL_GPL(vhost_work_init);
187
188/* Init poll structure */
189void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
190 __poll_t mask, struct vhost_dev *dev)
191{
192 init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
193 init_poll_funcptr(&poll->table, vhost_poll_func);
194 poll->mask = mask;
195 poll->dev = dev;
196 poll->wqh = NULL;
197
198 vhost_work_init(&poll->work, fn);
199}
200EXPORT_SYMBOL_GPL(vhost_poll_init);
201
202/* Start polling a file. We add ourselves to the file's wait queue. The caller must
203 * keep a reference to the file until after vhost_poll_stop is called. */
204int vhost_poll_start(struct vhost_poll *poll, struct file *file)
205{
206 __poll_t mask;
207
208 if (poll->wqh)
209 return 0;
210
211 mask = vfs_poll(file, &poll->table);
212 if (mask)
213 vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
214 if (mask & EPOLLERR) {
215 vhost_poll_stop(poll);
216 return -EINVAL;
217 }
218
219 return 0;
220}
221EXPORT_SYMBOL_GPL(vhost_poll_start);
222
223/* Stop polling a file. After this function returns, it becomes safe to drop the
224 * file reference. You must also flush afterwards. */
225void vhost_poll_stop(struct vhost_poll *poll)
226{
227 if (poll->wqh) {
228 remove_wait_queue(poll->wqh, &poll->wait);
229 poll->wqh = NULL;
230 }
231}
232EXPORT_SYMBOL_GPL(vhost_poll_stop);
233
234void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
235{
236 struct vhost_flush_struct flush;
237
238 if (dev->worker) {
239 init_completion(&flush.wait_event);
240 vhost_work_init(&flush.work, vhost_flush_work);
241
242 vhost_work_queue(dev, &flush.work);
243 wait_for_completion(&flush.wait_event);
244 }
245}
246EXPORT_SYMBOL_GPL(vhost_work_flush);
247
248/* Flush any work that has been scheduled. When calling this, don't hold any
249 * locks that are also used by the callback. */
250void vhost_poll_flush(struct vhost_poll *poll)
251{
252 vhost_work_flush(poll->dev, &poll->work);
253}
254EXPORT_SYMBOL_GPL(vhost_poll_flush);
255
256void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
257{
258 if (!dev->worker)
259 return;
260
261 if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
262 /* We can only add the work to the list after we're
263 * sure it was not in the list.
264 * test_and_set_bit() implies a memory barrier.
265 */
266 llist_add(&work->node, &dev->work_list);
267 wake_up_process(dev->worker);
268 }
269}
270EXPORT_SYMBOL_GPL(vhost_work_queue);
271
272/* A lockless hint for busy polling code to exit the loop */
273bool vhost_has_work(struct vhost_dev *dev)
274{
275 return !llist_empty(&dev->work_list);
276}
277EXPORT_SYMBOL_GPL(vhost_has_work);
278
279void vhost_poll_queue(struct vhost_poll *poll)
280{
281 vhost_work_queue(poll->dev, &poll->work);
282}
283EXPORT_SYMBOL_GPL(vhost_poll_queue);
284
285static void __vhost_vq_meta_reset(struct vhost_virtqueue *vq)
286{
287 int j;
288
289 for (j = 0; j < VHOST_NUM_ADDRS; j++)
290 vq->meta_iotlb[j] = NULL;
291}
292
293static void vhost_vq_meta_reset(struct vhost_dev *d)
294{
295 int i;
296
297 for (i = 0; i < d->nvqs; ++i)
298 __vhost_vq_meta_reset(d->vqs[i]);
299}
300
301static void vhost_vring_call_reset(struct vhost_vring_call *call_ctx)
302{
303 call_ctx->ctx = NULL;
304 memset(&call_ctx->producer, 0x0, sizeof(struct irq_bypass_producer));
305}
306
307static void vhost_vq_reset(struct vhost_dev *dev,
308 struct vhost_virtqueue *vq)
309{
310 vq->num = 1;
311 vq->desc = NULL;
312 vq->avail = NULL;
313 vq->used = NULL;
314 vq->last_avail_idx = 0;
315 vq->avail_idx = 0;
316 vq->last_used_idx = 0;
317 vq->signalled_used = 0;
318 vq->signalled_used_valid = false;
319 vq->used_flags = 0;
320 vq->log_used = false;
321 vq->log_addr = -1ull;
322 vq->private_data = NULL;
323 vq->acked_features = 0;
324 vq->acked_backend_features = 0;
325 vq->log_base = NULL;
326 vq->error_ctx = NULL;
327 vq->kick = NULL;
328 vq->log_ctx = NULL;
329 vhost_reset_is_le(vq);
330 vhost_disable_cross_endian(vq);
331 vq->busyloop_timeout = 0;
332 vq->umem = NULL;
333 vq->iotlb = NULL;
334 vhost_vring_call_reset(&vq->call_ctx);
335 __vhost_vq_meta_reset(vq);
336}
337
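/* Worker kthread: drains dev->work_list and runs each queued vhost_work's
 * callback in submission order, sleeping when the list is empty and exiting
 * when kthread_stop() is called.
 */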
338static int vhost_worker(void *data)
339{
340 struct vhost_dev *dev = data;
341 struct vhost_work *work, *work_next;
342 struct llist_node *node;
343
344 kthread_use_mm(dev->mm);
345
346 for (;;) {
347 /* mb paired w/ kthread_stop */
348 set_current_state(TASK_INTERRUPTIBLE);
349
350 if (kthread_should_stop()) {
351 __set_current_state(TASK_RUNNING);
352 break;
353 }
354
355 node = llist_del_all(&dev->work_list);
356 if (!node)
357 schedule();
358
359 node = llist_reverse_order(node);
360 /* make sure flag is seen after deletion */
361 smp_wmb();
362 llist_for_each_entry_safe(work, work_next, node, node) {
363 clear_bit(VHOST_WORK_QUEUED, &work->flags);
364 __set_current_state(TASK_RUNNING);
365 kcov_remote_start_common(dev->kcov_handle);
366 work->fn(work);
367 kcov_remote_stop();
368 if (need_resched())
369 schedule();
370 }
371 }
372 kthread_unuse_mm(dev->mm);
373 return 0;
374}
375
376static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
377{
378 kfree(vq->indirect);
379 vq->indirect = NULL;
380 kfree(vq->log);
381 vq->log = NULL;
382 kfree(vq->heads);
383 vq->heads = NULL;
384}
385
386/* Helper to allocate iovec buffers for all vqs. */
387static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
388{
389 struct vhost_virtqueue *vq;
390 int i;
391
392 for (i = 0; i < dev->nvqs; ++i) {
393 vq = dev->vqs[i];
394 vq->indirect = kmalloc_array(UIO_MAXIOV,
395 sizeof(*vq->indirect),
396 GFP_KERNEL);
397 vq->log = kmalloc_array(dev->iov_limit, sizeof(*vq->log),
398 GFP_KERNEL);
399 vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads),
400 GFP_KERNEL);
401 if (!vq->indirect || !vq->log || !vq->heads)
402 goto err_nomem;
403 }
404 return 0;
405
406err_nomem:
407 for (; i >= 0; --i)
408 vhost_vq_free_iovecs(dev->vqs[i]);
409 return -ENOMEM;
410}
411
412static void vhost_dev_free_iovecs(struct vhost_dev *dev)
413{
414 int i;
415
416 for (i = 0; i < dev->nvqs; ++i)
417 vhost_vq_free_iovecs(dev->vqs[i]);
418}
419
420bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
421 int pkts, int total_len)
422{
423 struct vhost_dev *dev = vq->dev;
424
425 if ((dev->byte_weight && total_len >= dev->byte_weight) ||
426 pkts >= dev->weight) {
427 vhost_poll_queue(&vq->poll);
428 return true;
429 }
430
431 return false;
432}
433EXPORT_SYMBOL_GPL(vhost_exceeds_weight);
434
435static size_t vhost_get_avail_size(struct vhost_virtqueue *vq,
436 unsigned int num)
437{
438 size_t event __maybe_unused =
439 vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
440
441 return sizeof(*vq->avail) +
442 sizeof(*vq->avail->ring) * num + event;
443}
444
445static size_t vhost_get_used_size(struct vhost_virtqueue *vq,
446 unsigned int num)
447{
448 size_t event __maybe_unused =
449 vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
450
451 return sizeof(*vq->used) +
452 sizeof(*vq->used->ring) * num + event;
453}
454
455static size_t vhost_get_desc_size(struct vhost_virtqueue *vq,
456 unsigned int num)
457{
458 return sizeof(*vq->desc) * num;
459}
460
461void vhost_dev_init(struct vhost_dev *dev,
462 struct vhost_virtqueue **vqs, int nvqs,
463 int iov_limit, int weight, int byte_weight,
464 bool use_worker,
465 int (*msg_handler)(struct vhost_dev *dev,
466 struct vhost_iotlb_msg *msg))
467{
468 struct vhost_virtqueue *vq;
469 int i;
470
471 dev->vqs = vqs;
472 dev->nvqs = nvqs;
473 mutex_init(&dev->mutex);
474 dev->log_ctx = NULL;
475 dev->umem = NULL;
476 dev->iotlb = NULL;
477 dev->mm = NULL;
478 dev->worker = NULL;
479 dev->iov_limit = iov_limit;
480 dev->weight = weight;
481 dev->byte_weight = byte_weight;
482 dev->use_worker = use_worker;
483 dev->msg_handler = msg_handler;
484 init_llist_head(&dev->work_list);
485 init_waitqueue_head(&dev->wait);
486 INIT_LIST_HEAD(&dev->read_list);
487 INIT_LIST_HEAD(&dev->pending_list);
488 spin_lock_init(&dev->iotlb_lock);
489
491 for (i = 0; i < dev->nvqs; ++i) {
492 vq = dev->vqs[i];
493 vq->log = NULL;
494 vq->indirect = NULL;
495 vq->heads = NULL;
496 vq->dev = dev;
497 mutex_init(&vq->mutex);
498 vhost_vq_reset(dev, vq);
499 if (vq->handle_kick)
500 vhost_poll_init(&vq->poll, vq->handle_kick,
501 EPOLLIN, dev);
502 }
503}
504EXPORT_SYMBOL_GPL(vhost_dev_init);
505
506/* Caller should have device mutex */
507long vhost_dev_check_owner(struct vhost_dev *dev)
508{
509 /* Are you the owner? If not, I don't think you mean to do that */
510 return dev->mm == current->mm ? 0 : -EPERM;
511}
512EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
513
514struct vhost_attach_cgroups_struct {
515 struct vhost_work work;
516 struct task_struct *owner;
517 int ret;
518};
519
520static void vhost_attach_cgroups_work(struct vhost_work *work)
521{
522 struct vhost_attach_cgroups_struct *s;
523
524 s = container_of(work, struct vhost_attach_cgroups_struct, work);
525 s->ret = cgroup_attach_task_all(s->owner, current);
526}
527
528static int vhost_attach_cgroups(struct vhost_dev *dev)
529{
530 struct vhost_attach_cgroups_struct attach;
531
532 attach.owner = current;
533 vhost_work_init(&attach.work, vhost_attach_cgroups_work);
534 vhost_work_queue(dev, &attach.work);
535 vhost_work_flush(dev, &attach.work);
536 return attach.ret;
537}
538
539/* Caller should have device mutex */
540bool vhost_dev_has_owner(struct vhost_dev *dev)
541{
542 return dev->mm;
543}
544EXPORT_SYMBOL_GPL(vhost_dev_has_owner);
545
546static void vhost_attach_mm(struct vhost_dev *dev)
547{
548 /* No owner, become one */
549 if (dev->use_worker) {
550 dev->mm = get_task_mm(current);
551 } else {
552 /* vDPA device does not use worker thread, so there's
553 * no need to hold the address space for mm. This helps
554 * to avoid deadlock in the case of mmap() which may
555 * hold the refcnt of the file and depends on the release
556 * method to remove the vma.
557 */
558 dev->mm = current->mm;
559 mmgrab(dev->mm);
560 }
561}
562
563static void vhost_detach_mm(struct vhost_dev *dev)
564{
565 if (!dev->mm)
566 return;
567
568 if (dev->use_worker)
569 mmput(dev->mm);
570 else
571 mmdrop(dev->mm);
572
573 dev->mm = NULL;
574}
575
576/* Caller should have device mutex */
577long vhost_dev_set_owner(struct vhost_dev *dev)
578{
579 struct task_struct *worker;
580 int err;
581
582 /* Is there an owner already? */
583 if (vhost_dev_has_owner(dev)) {
584 err = -EBUSY;
585 goto err_mm;
586 }
587
588 vhost_attach_mm(dev);
589
590 dev->kcov_handle = kcov_common_handle();
591 if (dev->use_worker) {
592 worker = kthread_create(vhost_worker, dev,
593 "vhost-%d", current->pid);
594 if (IS_ERR(worker)) {
595 err = PTR_ERR(worker);
596 goto err_worker;
597 }
598
599 dev->worker = worker;
600 wake_up_process(worker); /* avoid contributing to loadavg */
601
602 err = vhost_attach_cgroups(dev);
603 if (err)
604 goto err_cgroup;
605 }
606
607 err = vhost_dev_alloc_iovecs(dev);
608 if (err)
609 goto err_cgroup;
610
611 return 0;
612err_cgroup:
613 if (dev->worker) {
614 kthread_stop(dev->worker);
615 dev->worker = NULL;
616 }
617err_worker:
618 vhost_detach_mm(dev);
619 dev->kcov_handle = 0;
620err_mm:
621 return err;
622}
623EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
624
625static struct vhost_iotlb *iotlb_alloc(void)
626{
627 return vhost_iotlb_alloc(max_iotlb_entries,
628 VHOST_IOTLB_FLAG_RETIRE);
629}
630
631struct vhost_iotlb *vhost_dev_reset_owner_prepare(void)
632{
633 return iotlb_alloc();
634}
635EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);
636
637/* Caller should have device mutex */
638void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
639{
640 int i;
641
642 vhost_dev_cleanup(dev);
643
644 dev->umem = umem;
645 /* We don't need VQ locks below since vhost_dev_cleanup makes sure
646 * VQs aren't running.
647 */
648 for (i = 0; i < dev->nvqs; ++i)
649 dev->vqs[i]->umem = umem;
650}
651EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);
652
653void vhost_dev_stop(struct vhost_dev *dev)
654{
655 int i;
656
657 for (i = 0; i < dev->nvqs; ++i) {
658 if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) {
659 vhost_poll_stop(&dev->vqs[i]->poll);
660 vhost_poll_flush(&dev->vqs[i]->poll);
661 }
662 }
663}
664EXPORT_SYMBOL_GPL(vhost_dev_stop);
665
666static void vhost_clear_msg(struct vhost_dev *dev)
667{
668 struct vhost_msg_node *node, *n;
669
670 spin_lock(&dev->iotlb_lock);
671
672 list_for_each_entry_safe(node, n, &dev->read_list, node) {
673 list_del(&node->node);
674 kfree(node);
675 }
676
677 list_for_each_entry_safe(node, n, &dev->pending_list, node) {
678 list_del(&node->node);
679 kfree(node);
680 }
681
682 spin_unlock(&dev->iotlb_lock);
683}
684
685void vhost_dev_cleanup(struct vhost_dev *dev)
686{
687 int i;
688
689 for (i = 0; i < dev->nvqs; ++i) {
690 if (dev->vqs[i]->error_ctx)
691 eventfd_ctx_put(dev->vqs[i]->error_ctx);
692 if (dev->vqs[i]->kick)
693 fput(dev->vqs[i]->kick);
694 if (dev->vqs[i]->call_ctx.ctx)
695 eventfd_ctx_put(dev->vqs[i]->call_ctx.ctx);
696 vhost_vq_reset(dev, dev->vqs[i]);
697 }
698 vhost_dev_free_iovecs(dev);
699 if (dev->log_ctx)
700 eventfd_ctx_put(dev->log_ctx);
701 dev->log_ctx = NULL;
702 /* No one will access memory at this point */
703 vhost_iotlb_free(dev->umem);
704 dev->umem = NULL;
705 vhost_iotlb_free(dev->iotlb);
706 dev->iotlb = NULL;
707 vhost_clear_msg(dev);
708 wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
709 WARN_ON(!llist_empty(&dev->work_list));
710 if (dev->worker) {
711 kthread_stop(dev->worker);
712 dev->worker = NULL;
713 dev->kcov_handle = 0;
714 }
715 vhost_detach_mm(dev);
716}
717EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
718
719static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
720{
721 u64 a = addr / VHOST_PAGE_SIZE / 8;
722
723 /* Make sure 64 bit math will not overflow. */
724 if (a > ULONG_MAX - (unsigned long)log_base ||
725 a + (unsigned long)log_base > ULONG_MAX)
726 return false;
727
728 return access_ok(log_base + a,
729 (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);
730}
731
732static bool vhost_overflow(u64 uaddr, u64 size)
733{
734 /* Make sure 64 bit math will not overflow. */
735 return uaddr > ULONG_MAX || size > ULONG_MAX || uaddr > ULONG_MAX - size;
736}
737
738/* Caller should have vq mutex and device mutex. */
739static bool vq_memory_access_ok(void __user *log_base, struct vhost_iotlb *umem,
740 int log_all)
741{
742 struct vhost_iotlb_map *map;
743
744 if (!umem)
745 return false;
746
747 list_for_each_entry(map, &umem->list, link) {
748 unsigned long a = map->addr;
749
750 if (vhost_overflow(map->addr, map->size))
751 return false;
752
753
754 if (!access_ok((void __user *)a, map->size))
755 return false;
756 else if (log_all && !log_access_ok(log_base,
757 map->start,
758 map->size))
759 return false;
760 }
761 return true;
762}
763
764static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq,
765 u64 addr, unsigned int size,
766 int type)
767{
768 const struct vhost_iotlb_map *map = vq->meta_iotlb[type];
769
770 if (!map)
771 return NULL;
772
773 return (void __user *)(uintptr_t)(map->addr + addr - map->start);
774}
775
776/* Can we switch to this memory table? */
777/* Caller should have device mutex but not vq mutex */
778static bool memory_access_ok(struct vhost_dev *d, struct vhost_iotlb *umem,
779 int log_all)
780{
781 int i;
782
783 for (i = 0; i < d->nvqs; ++i) {
784 bool ok;
785 bool log;
786
787 mutex_lock(&d->vqs[i]->mutex);
788 log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL);
789 /* If ring is inactive, will check when it's enabled. */
790 if (d->vqs[i]->private_data)
791 ok = vq_memory_access_ok(d->vqs[i]->log_base,
792 umem, log);
793 else
794 ok = true;
795 mutex_unlock(&d->vqs[i]->mutex);
796 if (!ok)
797 return false;
798 }
799 return true;
800}
801
802static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
803 struct iovec iov[], int iov_size, int access);
804
805static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to,
806 const void *from, unsigned size)
807{
808 int ret;
809
810 if (!vq->iotlb)
811 return __copy_to_user(to, from, size);
812 else {
813 /* This function should be called after iotlb
814 * prefetch, which means we're sure that all vq
815 * memory can be accessed through the iotlb, so -EAGAIN
816 * should not happen in this case.
817 */
818 struct iov_iter t;
819 void __user *uaddr = vhost_vq_meta_fetch(vq,
820 (u64)(uintptr_t)to, size,
821 VHOST_ADDR_USED);
822
823 if (uaddr)
824 return __copy_to_user(uaddr, from, size);
825
826 ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov,
827 ARRAY_SIZE(vq->iotlb_iov),
828 VHOST_ACCESS_WO);
829 if (ret < 0)
830 goto out;
831 iov_iter_init(&t, WRITE, vq->iotlb_iov, ret, size);
832 ret = copy_to_iter(from, size, &t);
833 if (ret == size)
834 ret = 0;
835 }
836out:
837 return ret;
838}
839
840static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
841 void __user *from, unsigned size)
842{
843 int ret;
844
845 if (!vq->iotlb)
846 return __copy_from_user(to, from, size);
847 else {
848 /* This function should be called after iotlb
849 * prefetch, which means we're sure that the vq
850 * can be accessed through the iotlb, so -EAGAIN
851 * should not happen in this case.
852 */
853 void __user *uaddr = vhost_vq_meta_fetch(vq,
854 (u64)(uintptr_t)from, size,
855 VHOST_ADDR_DESC);
856 struct iov_iter f;
857
858 if (uaddr)
859 return __copy_from_user(to, uaddr, size);
860
861 ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov,
862 ARRAY_SIZE(vq->iotlb_iov),
863 VHOST_ACCESS_RO);
864 if (ret < 0) {
865 vq_err(vq, "IOTLB translation failure: uaddr "
866 "%p size 0x%llx\n", from,
867 (unsigned long long) size);
868 goto out;
869 }
870 iov_iter_init(&f, READ, vq->iotlb_iov, ret, size);
871 ret = copy_from_iter(to, size, &f);
872 if (ret == size)
873 ret = 0;
874 }
875
876out:
877 return ret;
878}
879
880static void __user *__vhost_get_user_slow(struct vhost_virtqueue *vq,
881 void __user *addr, unsigned int size,
882 int type)
883{
884 int ret;
885
886 ret = translate_desc(vq, (u64)(uintptr_t)addr, size, vq->iotlb_iov,
887 ARRAY_SIZE(vq->iotlb_iov),
888 VHOST_ACCESS_RO);
889 if (ret < 0) {
890 vq_err(vq, "IOTLB translation failure: uaddr "
891 "%p size 0x%llx\n", addr,
892 (unsigned long long) size);
893 return NULL;
894 }
895
896 if (ret != 1 || vq->iotlb_iov[0].iov_len != size) {
897 vq_err(vq, "Non atomic userspace memory access: uaddr "
898 "%p size 0x%llx\n", addr,
899 (unsigned long long) size);
900 return NULL;
901 }
902
903 return vq->iotlb_iov[0].iov_base;
904}
905
906/* This function should be called after iotlb
907 * prefetch, which means we're sure that the vq
908 * can be accessed through the iotlb, so -EAGAIN
909 * should not happen in this case.
910 */
911static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
912 void __user *addr, unsigned int size,
913 int type)
914{
915 void __user *uaddr = vhost_vq_meta_fetch(vq,
916 (u64)(uintptr_t)addr, size, type);
917 if (uaddr)
918 return uaddr;
919
920 return __vhost_get_user_slow(vq, addr, size, type);
921}
922
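/* Store x at a used-ring address, either directly with __put_user() or, when
 * an IOTLB is in use, through the cached VHOST_ADDR_USED translation.
 */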
923#define vhost_put_user(vq, x, ptr) \
924({ \
925 int ret; \
926 if (!vq->iotlb) { \
927 ret = __put_user(x, ptr); \
928 } else { \
929 __typeof__(ptr) to = \
930 (__typeof__(ptr)) __vhost_get_user(vq, ptr, \
931 sizeof(*ptr), VHOST_ADDR_USED); \
932 if (to != NULL) \
933 ret = __put_user(x, to); \
934 else \
935 ret = -EFAULT; \
936 } \
937 ret; \
938})
939
940static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
941{
942 return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
943 vhost_avail_event(vq));
944}
945
946static inline int vhost_put_used(struct vhost_virtqueue *vq,
947 struct vring_used_elem *head, int idx,
948 int count)
949{
950 return vhost_copy_to_user(vq, vq->used->ring + idx, head,
951 count * sizeof(*head));
952}
953
954static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
956{
957 return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
958 &vq->used->flags);
959}
960
961static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
963{
964 return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
965 &vq->used->idx);
966}
967
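/* Read *ptr into x, either directly with __get_user() or, when an IOTLB is in
 * use, through the cached translation for the given address type.
 */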
968#define vhost_get_user(vq, x, ptr, type) \
969({ \
970 int ret; \
971 if (!vq->iotlb) { \
972 ret = __get_user(x, ptr); \
973 } else { \
974 __typeof__(ptr) from = \
975 (__typeof__(ptr)) __vhost_get_user(vq, ptr, \
976 sizeof(*ptr), \
977 type); \
978 if (from != NULL) \
979 ret = __get_user(x, from); \
980 else \
981 ret = -EFAULT; \
982 } \
983 ret; \
984})
985
986#define vhost_get_avail(vq, x, ptr) \
987 vhost_get_user(vq, x, ptr, VHOST_ADDR_AVAIL)
988
989#define vhost_get_used(vq, x, ptr) \
990 vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
991
992static void vhost_dev_lock_vqs(struct vhost_dev *d)
993{
994 int i = 0;
995 for (i = 0; i < d->nvqs; ++i)
996 mutex_lock_nested(&d->vqs[i]->mutex, i);
997}
998
999static void vhost_dev_unlock_vqs(struct vhost_dev *d)
1000{
1001 int i = 0;
1002 for (i = 0; i < d->nvqs; ++i)
1003 mutex_unlock(&d->vqs[i]->mutex);
1004}
1005
1006static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
1007 __virtio16 *idx)
1008{
1009 return vhost_get_avail(vq, *idx, &vq->avail->idx);
1010}
1011
1012static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
1013 __virtio16 *head, int idx)
1014{
1015 return vhost_get_avail(vq, *head,
1016 &vq->avail->ring[idx & (vq->num - 1)]);
1017}
1018
1019static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
1020 __virtio16 *flags)
1021{
1022 return vhost_get_avail(vq, *flags, &vq->avail->flags);
1023}
1024
1025static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
1026 __virtio16 *event)
1027{
1028 return vhost_get_avail(vq, *event, vhost_used_event(vq));
1029}
1030
1031static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
1032 __virtio16 *idx)
1033{
1034 return vhost_get_used(vq, *idx, &vq->used->idx);
1035}
1036
1037static inline int vhost_get_desc(struct vhost_virtqueue *vq,
1038 struct vring_desc *desc, int idx)
1039{
1040 return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
1041}
1042
1043static void vhost_iotlb_notify_vq(struct vhost_dev *d,
1044 struct vhost_iotlb_msg *msg)
1045{
1046 struct vhost_msg_node *node, *n;
1047
1048 spin_lock(&d->iotlb_lock);
1049
1050 list_for_each_entry_safe(node, n, &d->pending_list, node) {
1051 struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb;
1052 if (msg->iova <= vq_msg->iova &&
1053 msg->iova + msg->size - 1 >= vq_msg->iova &&
1054 vq_msg->type == VHOST_IOTLB_MISS) {
1055 vhost_poll_queue(&node->vq->poll);
1056 list_del(&node->node);
1057 kfree(node);
1058 }
1059 }
1060
1061 spin_unlock(&d->iotlb_lock);
1062}
1063
1064static bool umem_access_ok(u64 uaddr, u64 size, int access)
1065{
1066 unsigned long a = uaddr;
1067
1068 /* Make sure 64 bit math will not overflow. */
1069 if (vhost_overflow(uaddr, size))
1070 return false;
1071
1072 if ((access & VHOST_ACCESS_RO) &&
1073 !access_ok((void __user *)a, size))
1074 return false;
1075 if ((access & VHOST_ACCESS_WO) &&
1076 !access_ok((void __user *)a, size))
1077 return false;
1078 return true;
1079}
1080
1081static int vhost_process_iotlb_msg(struct vhost_dev *dev,
1082 struct vhost_iotlb_msg *msg)
1083{
1084 int ret = 0;
1085
1086 mutex_lock(&dev->mutex);
1087 vhost_dev_lock_vqs(dev);
1088 switch (msg->type) {
1089 case VHOST_IOTLB_UPDATE:
1090 if (!dev->iotlb) {
1091 ret = -EFAULT;
1092 break;
1093 }
1094 if (!umem_access_ok(msg->uaddr, msg->size, msg->perm)) {
1095 ret = -EFAULT;
1096 break;
1097 }
1098 vhost_vq_meta_reset(dev);
1099 if (vhost_iotlb_add_range(dev->iotlb, msg->iova,
1100 msg->iova + msg->size - 1,
1101 msg->uaddr, msg->perm)) {
1102 ret = -ENOMEM;
1103 break;
1104 }
1105 vhost_iotlb_notify_vq(dev, msg);
1106 break;
1107 case VHOST_IOTLB_INVALIDATE:
1108 if (!dev->iotlb) {
1109 ret = -EFAULT;
1110 break;
1111 }
1112 vhost_vq_meta_reset(dev);
1113 vhost_iotlb_del_range(dev->iotlb, msg->iova,
1114 msg->iova + msg->size - 1);
1115 break;
1116 default:
1117 ret = -EINVAL;
1118 break;
1119 }
1120
1121 vhost_dev_unlock_vqs(dev);
1122 mutex_unlock(&dev->mutex);
1123
1124 return ret;
1125}
1126ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
1127 struct iov_iter *from)
1128{
1129 struct vhost_iotlb_msg msg;
1130 size_t offset;
1131 int type, ret;
1132
1133 ret = copy_from_iter(&type, sizeof(type), from);
1134 if (ret != sizeof(type)) {
1135 ret = -EINVAL;
1136 goto done;
1137 }
1138
1139 switch (type) {
1140 case VHOST_IOTLB_MSG:
1141 /* There may be a hole after the type field for the V1
1142 * message format, so skip it here.
1143 */
1144 offset = offsetof(struct vhost_msg, iotlb) - sizeof(int);
1145 break;
1146 case VHOST_IOTLB_MSG_V2:
1147 offset = sizeof(__u32);
1148 break;
1149 default:
1150 ret = -EINVAL;
1151 goto done;
1152 }
1153
1154 iov_iter_advance(from, offset);
1155 ret = copy_from_iter(&msg, sizeof(msg), from);
1156 if (ret != sizeof(msg)) {
1157 ret = -EINVAL;
1158 goto done;
1159 }
1160
1161 if (dev->msg_handler)
1162 ret = dev->msg_handler(dev, &msg);
1163 else
1164 ret = vhost_process_iotlb_msg(dev, &msg);
1165 if (ret) {
1166 ret = -EFAULT;
1167 goto done;
1168 }
1169
1170 ret = (type == VHOST_IOTLB_MSG) ? sizeof(struct vhost_msg) :
1171 sizeof(struct vhost_msg_v2);
1172done:
1173 return ret;
1174}
1175EXPORT_SYMBOL(vhost_chr_write_iter);
1176
1177__poll_t vhost_chr_poll(struct file *file, struct vhost_dev *dev,
1178 poll_table *wait)
1179{
1180 __poll_t mask = 0;
1181
1182 poll_wait(file, &dev->wait, wait);
1183
1184 if (!list_empty(&dev->read_list))
1185 mask |= EPOLLIN | EPOLLRDNORM;
1186
1187 return mask;
1188}
1189EXPORT_SYMBOL(vhost_chr_poll);
1190
1191ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
1192 int noblock)
1193{
1194 DEFINE_WAIT(wait);
1195 struct vhost_msg_node *node;
1196 ssize_t ret = 0;
1197 unsigned size = sizeof(struct vhost_msg);
1198
1199 if (iov_iter_count(to) < size)
1200 return 0;
1201
1202 while (1) {
1203 if (!noblock)
1204 prepare_to_wait(&dev->wait, &wait,
1205 TASK_INTERRUPTIBLE);
1206
1207 node = vhost_dequeue_msg(dev, &dev->read_list);
1208 if (node)
1209 break;
1210 if (noblock) {
1211 ret = -EAGAIN;
1212 break;
1213 }
1214 if (signal_pending(current)) {
1215 ret = -ERESTARTSYS;
1216 break;
1217 }
1218 if (!dev->iotlb) {
1219 ret = -EBADFD;
1220 break;
1221 }
1222
1223 schedule();
1224 }
1225
1226 if (!noblock)
1227 finish_wait(&dev->wait, &wait);
1228
1229 if (node) {
1230 struct vhost_iotlb_msg *msg;
1231 void *start = &node->msg;
1232
1233 switch (node->msg.type) {
1234 case VHOST_IOTLB_MSG:
1235 size = sizeof(node->msg);
1236 msg = &node->msg.iotlb;
1237 break;
1238 case VHOST_IOTLB_MSG_V2:
1239 size = sizeof(node->msg_v2);
1240 msg = &node->msg_v2.iotlb;
1241 break;
1242 default:
1243 BUG();
1244 break;
1245 }
1246
1247 ret = copy_to_iter(start, size, to);
1248 if (ret != size || msg->type != VHOST_IOTLB_MISS) {
1249 kfree(node);
1250 return ret;
1251 }
1252 vhost_enqueue_msg(dev, &dev->pending_list, node);
1253 }
1254
1255 return ret;
1256}
1257EXPORT_SYMBOL_GPL(vhost_chr_read_iter);
1258
1259static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
1260{
1261 struct vhost_dev *dev = vq->dev;
1262 struct vhost_msg_node *node;
1263 struct vhost_iotlb_msg *msg;
1264 bool v2 = vhost_backend_has_feature(vq, VHOST_BACKEND_F_IOTLB_MSG_V2);
1265
1266 node = vhost_new_msg(vq, v2 ? VHOST_IOTLB_MSG_V2 : VHOST_IOTLB_MSG);
1267 if (!node)
1268 return -ENOMEM;
1269
1270 if (v2) {
1271 node->msg_v2.type = VHOST_IOTLB_MSG_V2;
1272 msg = &node->msg_v2.iotlb;
1273 } else {
1274 msg = &node->msg.iotlb;
1275 }
1276
1277 msg->type = VHOST_IOTLB_MISS;
1278 msg->iova = iova;
1279 msg->perm = access;
1280
1281 vhost_enqueue_msg(dev, &dev->read_list, node);
1282
1283 return 0;
1284}
1285
1286static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
1287 vring_desc_t __user *desc,
1288 vring_avail_t __user *avail,
1289 vring_used_t __user *used)
1291{
1292 /* If an IOTLB device is present, the vring addresses are
1293 * GIOVAs. Access validation occurs at prefetch time. */
1294 if (vq->iotlb)
1295 return true;
1296
1297 return access_ok(desc, vhost_get_desc_size(vq, num)) &&
1298 access_ok(avail, vhost_get_avail_size(vq, num)) &&
1299 access_ok(used, vhost_get_used_size(vq, num));
1300}
1301
1302static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
1303 const struct vhost_iotlb_map *map,
1304 int type)
1305{
1306 int access = (type == VHOST_ADDR_USED) ?
1307 VHOST_ACCESS_WO : VHOST_ACCESS_RO;
1308
1309 if (likely(map->perm & access))
1310 vq->meta_iotlb[type] = map;
1311}
1312
1313static bool iotlb_access_ok(struct vhost_virtqueue *vq,
1314 int access, u64 addr, u64 len, int type)
1315{
1316 const struct vhost_iotlb_map *map;
1317 struct vhost_iotlb *umem = vq->iotlb;
1318 u64 s = 0, size, orig_addr = addr, last = addr + len - 1;
1319
1320 if (vhost_vq_meta_fetch(vq, addr, len, type))
1321 return true;
1322
1323 while (len > s) {
1324 map = vhost_iotlb_itree_first(umem, addr, last);
1325 if (map == NULL || map->start > addr) {
1326 vhost_iotlb_miss(vq, addr, access);
1327 return false;
1328 } else if (!(map->perm & access)) {
1329 /* Report the possible access violation by
1330 * requesting another translation from userspace.
1331 */
1332 return false;
1333 }
1334
1335 size = map->size - addr + map->start;
1336
1337 if (orig_addr == addr && size >= len)
1338 vhost_vq_meta_update(vq, map, type);
1339
1340 s += size;
1341 addr += size;
1342 }
1343
1344 return true;
1345}
1346
1347int vq_meta_prefetch(struct vhost_virtqueue *vq)
1348{
1349 unsigned int num = vq->num;
1350
1351 if (!vq->iotlb)
1352 return 1;
1353
1354 return iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->desc,
1355 vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
1356 iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->avail,
1357 vhost_get_avail_size(vq, num),
1358 VHOST_ADDR_AVAIL) &&
1359 iotlb_access_ok(vq, VHOST_MAP_WO, (u64)(uintptr_t)vq->used,
1360 vhost_get_used_size(vq, num), VHOST_ADDR_USED);
1361}
1362EXPORT_SYMBOL_GPL(vq_meta_prefetch);
1363
1364/* Can we log writes? */
1365/* Caller should have device mutex but not vq mutex */
1366bool vhost_log_access_ok(struct vhost_dev *dev)
1367{
1368 return memory_access_ok(dev, dev->umem, 1);
1369}
1370EXPORT_SYMBOL_GPL(vhost_log_access_ok);
1371
1372static bool vq_log_used_access_ok(struct vhost_virtqueue *vq,
1373 void __user *log_base,
1374 bool log_used,
1375 u64 log_addr)
1376{
1377 /* If an IOTLB device is present, log_addr is a GIOVA that
1378 * will never be logged by log_used(). */
1379 if (vq->iotlb)
1380 return true;
1381
1382 return !log_used || log_access_ok(log_base, log_addr,
1383 vhost_get_used_size(vq, vq->num));
1384}
1385
1386/* Verify access for write logging. */
1387/* Caller should have vq mutex and device mutex */
1388static bool vq_log_access_ok(struct vhost_virtqueue *vq,
1389 void __user *log_base)
1390{
1391 return vq_memory_access_ok(log_base, vq->umem,
1392 vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
1393 vq_log_used_access_ok(vq, log_base, vq->log_used, vq->log_addr);
1394}
1395
1396/* Can we start vq? */
1397/* Caller should have vq mutex and device mutex */
1398bool vhost_vq_access_ok(struct vhost_virtqueue *vq)
1399{
1400 if (!vq_log_access_ok(vq, vq->log_base))
1401 return false;
1402
1403 return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
1404}
1405EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
1406
1407static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
1408{
1409 struct vhost_memory mem, *newmem;
1410 struct vhost_memory_region *region;
1411 struct vhost_iotlb *newumem, *oldumem;
1412 unsigned long size = offsetof(struct vhost_memory, regions);
1413 int i;
1414
1415 if (copy_from_user(&mem, m, size))
1416 return -EFAULT;
1417 if (mem.padding)
1418 return -EOPNOTSUPP;
1419 if (mem.nregions > max_mem_regions)
1420 return -E2BIG;
1421 newmem = kvzalloc(struct_size(newmem, regions, mem.nregions),
1422 GFP_KERNEL);
1423 if (!newmem)
1424 return -ENOMEM;
1425
1426 memcpy(newmem, &mem, size);
1427 if (copy_from_user(newmem->regions, m->regions,
1428 flex_array_size(newmem, regions, mem.nregions))) {
1429 kvfree(newmem);
1430 return -EFAULT;
1431 }
1432
1433 newumem = iotlb_alloc();
1434 if (!newumem) {
1435 kvfree(newmem);
1436 return -ENOMEM;
1437 }
1438
1439 for (region = newmem->regions;
1440 region < newmem->regions + mem.nregions;
1441 region++) {
1442 if (vhost_iotlb_add_range(newumem,
1443 region->guest_phys_addr,
1444 region->guest_phys_addr +
1445 region->memory_size - 1,
1446 region->userspace_addr,
1447 VHOST_MAP_RW))
1448 goto err;
1449 }
1450
1451 if (!memory_access_ok(d, newumem, 0))
1452 goto err;
1453
1454 oldumem = d->umem;
1455 d->umem = newumem;
1456
1457 /* All memory accesses are done under some VQ mutex. */
1458 for (i = 0; i < d->nvqs; ++i) {
1459 mutex_lock(&d->vqs[i]->mutex);
1460 d->vqs[i]->umem = newumem;
1461 mutex_unlock(&d->vqs[i]->mutex);
1462 }
1463
1464 kvfree(newmem);
1465 vhost_iotlb_free(oldumem);
1466 return 0;
1467
1468err:
1469 vhost_iotlb_free(newumem);
1470 kvfree(newmem);
1471 return -EFAULT;
1472}
1473
1474static long vhost_vring_set_num(struct vhost_dev *d,
1475 struct vhost_virtqueue *vq,
1476 void __user *argp)
1477{
1478 struct vhost_vring_state s;
1479
1480 /* Resizing ring with an active backend?
1481 * You don't want to do that. */
1482 if (vq->private_data)
1483 return -EBUSY;
1484
1485 if (copy_from_user(&s, argp, sizeof s))
1486 return -EFAULT;
1487
1488 if (!s.num || s.num > 0xffff || (s.num & (s.num - 1)))
1489 return -EINVAL;
1490 vq->num = s.num;
1491
1492 return 0;
1493}
1494
1495static long vhost_vring_set_addr(struct vhost_dev *d,
1496 struct vhost_virtqueue *vq,
1497 void __user *argp)
1498{
1499 struct vhost_vring_addr a;
1500
1501 if (copy_from_user(&a, argp, sizeof a))
1502 return -EFAULT;
1503 if (a.flags & ~(0x1 << VHOST_VRING_F_LOG))
1504 return -EOPNOTSUPP;
1505
1506 /* For 32bit, verify that the top 32bits of the user
1507 data are set to zero. */
1508 if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr ||
1509 (u64)(unsigned long)a.used_user_addr != a.used_user_addr ||
1510 (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr)
1511 return -EFAULT;
1512
1513 /* Make sure it's safe to cast pointers to vring types. */
1514 BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE);
1515 BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE);
1516 if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) ||
1517 (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) ||
1518 (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1)))
1519 return -EINVAL;
1520
1521 /* We only verify access here if a backend is configured.
1522 * If it is not, we don't, as the size might not have been set up yet.
1523 * We will verify when the backend is configured. */
1524 if (vq->private_data) {
1525 if (!vq_access_ok(vq, vq->num,
1526 (void __user *)(unsigned long)a.desc_user_addr,
1527 (void __user *)(unsigned long)a.avail_user_addr,
1528 (void __user *)(unsigned long)a.used_user_addr))
1529 return -EINVAL;
1530
1531 /* Also validate log access for used ring if enabled. */
1532 if (!vq_log_used_access_ok(vq, vq->log_base,
1533 a.flags & (0x1 << VHOST_VRING_F_LOG),
1534 a.log_guest_addr))
1535 return -EINVAL;
1536 }
1537
1538 vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
1539 vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
1540 vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
1541 vq->log_addr = a.log_guest_addr;
1542 vq->used = (void __user *)(unsigned long)a.used_user_addr;
1543
1544 return 0;
1545}
1546
1547static long vhost_vring_set_num_addr(struct vhost_dev *d,
1548 struct vhost_virtqueue *vq,
1549 unsigned int ioctl,
1550 void __user *argp)
1551{
1552 long r;
1553
1554 mutex_lock(&vq->mutex);
1555
1556 switch (ioctl) {
1557 case VHOST_SET_VRING_NUM:
1558 r = vhost_vring_set_num(d, vq, argp);
1559 break;
1560 case VHOST_SET_VRING_ADDR:
1561 r = vhost_vring_set_addr(d, vq, argp);
1562 break;
1563 default:
1564 BUG();
1565 }
1566
1567 mutex_unlock(&vq->mutex);
1568
1569 return r;
1570}
1571long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
1572{
1573 struct file *eventfp, *filep = NULL;
1574 bool pollstart = false, pollstop = false;
1575 struct eventfd_ctx *ctx = NULL;
1576 u32 __user *idxp = argp;
1577 struct vhost_virtqueue *vq;
1578 struct vhost_vring_state s;
1579 struct vhost_vring_file f;
1580 u32 idx;
1581 long r;
1582
1583 r = get_user(idx, idxp);
1584 if (r < 0)
1585 return r;
1586 if (idx >= d->nvqs)
1587 return -ENOBUFS;
1588
1589 idx = array_index_nospec(idx, d->nvqs);
1590 vq = d->vqs[idx];
1591
1592 if (ioctl == VHOST_SET_VRING_NUM ||
1593 ioctl == VHOST_SET_VRING_ADDR) {
1594 return vhost_vring_set_num_addr(d, vq, ioctl, argp);
1595 }
1596
1597 mutex_lock(&vq->mutex);
1598
1599 switch (ioctl) {
1600 case VHOST_SET_VRING_BASE:
1601 /* Moving base with an active backend?
1602 * You don't want to do that. */
1603 if (vq->private_data) {
1604 r = -EBUSY;
1605 break;
1606 }
1607 if (copy_from_user(&s, argp, sizeof s)) {
1608 r = -EFAULT;
1609 break;
1610 }
1611 if (s.num > 0xffff) {
1612 r = -EINVAL;
1613 break;
1614 }
1615 vq->last_avail_idx = s.num;
1616 /* Forget the cached index value. */
1617 vq->avail_idx = vq->last_avail_idx;
1618 break;
1619 case VHOST_GET_VRING_BASE:
1620 s.index = idx;
1621 s.num = vq->last_avail_idx;
1622 if (copy_to_user(argp, &s, sizeof s))
1623 r = -EFAULT;
1624 break;
1625 case VHOST_SET_VRING_KICK:
1626 if (copy_from_user(&f, argp, sizeof f)) {
1627 r = -EFAULT;
1628 break;
1629 }
1630 eventfp = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_fget(f.fd);
1631 if (IS_ERR(eventfp)) {
1632 r = PTR_ERR(eventfp);
1633 break;
1634 }
1635 if (eventfp != vq->kick) {
1636 pollstop = (filep = vq->kick) != NULL;
1637 pollstart = (vq->kick = eventfp) != NULL;
1638 } else
1639 filep = eventfp;
1640 break;
1641 case VHOST_SET_VRING_CALL:
1642 if (copy_from_user(&f, argp, sizeof f)) {
1643 r = -EFAULT;
1644 break;
1645 }
1646 ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
1647 if (IS_ERR(ctx)) {
1648 r = PTR_ERR(ctx);
1649 break;
1650 }
1651
1652 swap(ctx, vq->call_ctx.ctx);
1653 break;
1654 case VHOST_SET_VRING_ERR:
1655 if (copy_from_user(&f, argp, sizeof f)) {
1656 r = -EFAULT;
1657 break;
1658 }
1659 ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
1660 if (IS_ERR(ctx)) {
1661 r = PTR_ERR(ctx);
1662 break;
1663 }
1664 swap(ctx, vq->error_ctx);
1665 break;
1666 case VHOST_SET_VRING_ENDIAN:
1667 r = vhost_set_vring_endian(vq, argp);
1668 break;
1669 case VHOST_GET_VRING_ENDIAN:
1670 r = vhost_get_vring_endian(vq, idx, argp);
1671 break;
1672 case VHOST_SET_VRING_BUSYLOOP_TIMEOUT:
1673 if (copy_from_user(&s, argp, sizeof(s))) {
1674 r = -EFAULT;
1675 break;
1676 }
1677 vq->busyloop_timeout = s.num;
1678 break;
1679 case VHOST_GET_VRING_BUSYLOOP_TIMEOUT:
1680 s.index = idx;
1681 s.num = vq->busyloop_timeout;
1682 if (copy_to_user(argp, &s, sizeof(s)))
1683 r = -EFAULT;
1684 break;
1685 default:
1686 r = -ENOIOCTLCMD;
1687 }
1688
1689 if (pollstop && vq->handle_kick)
1690 vhost_poll_stop(&vq->poll);
1691
1692 if (!IS_ERR_OR_NULL(ctx))
1693 eventfd_ctx_put(ctx);
1694 if (filep)
1695 fput(filep);
1696
1697 if (pollstart && vq->handle_kick)
1698 r = vhost_poll_start(&vq->poll, vq->kick);
1699
1700 mutex_unlock(&vq->mutex);
1701
1702 if (pollstop && vq->handle_kick)
1703 vhost_poll_flush(&vq->poll);
1704 return r;
1705}
1706EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
1707
1708int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled)
1709{
1710 struct vhost_iotlb *niotlb, *oiotlb;
1711 int i;
1712
1713 niotlb = iotlb_alloc();
1714 if (!niotlb)
1715 return -ENOMEM;
1716
1717 oiotlb = d->iotlb;
1718 d->iotlb = niotlb;
1719
1720 for (i = 0; i < d->nvqs; ++i) {
1721 struct vhost_virtqueue *vq = d->vqs[i];
1722
1723 mutex_lock(&vq->mutex);
1724 vq->iotlb = niotlb;
1725 __vhost_vq_meta_reset(vq);
1726 mutex_unlock(&vq->mutex);
1727 }
1728
1729 vhost_iotlb_free(oiotlb);
1730
1731 return 0;
1732}
1733EXPORT_SYMBOL_GPL(vhost_init_device_iotlb);
1734
1735/* Caller must have device mutex */
1736long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
1737{
1738 struct eventfd_ctx *ctx;
1739 u64 p;
1740 long r;
1741 int i, fd;
1742
1743 /* If you are not the owner, you can become one */
1744 if (ioctl == VHOST_SET_OWNER) {
1745 r = vhost_dev_set_owner(d);
1746 goto done;
1747 }
1748
1749 /* You must be the owner to do anything else */
1750 r = vhost_dev_check_owner(d);
1751 if (r)
1752 goto done;
1753
1754 switch (ioctl) {
1755 case VHOST_SET_MEM_TABLE:
1756 r = vhost_set_memory(d, argp);
1757 break;
1758 case VHOST_SET_LOG_BASE:
1759 if (copy_from_user(&p, argp, sizeof p)) {
1760 r = -EFAULT;
1761 break;
1762 }
1763 if ((u64)(unsigned long)p != p) {
1764 r = -EFAULT;
1765 break;
1766 }
1767 for (i = 0; i < d->nvqs; ++i) {
1768 struct vhost_virtqueue *vq;
1769 void __user *base = (void __user *)(unsigned long)p;
1770 vq = d->vqs[i];
1771 mutex_lock(&vq->mutex);
1772 /* If ring is inactive, will check when it's enabled. */
1773 if (vq->private_data && !vq_log_access_ok(vq, base))
1774 r = -EFAULT;
1775 else
1776 vq->log_base = base;
1777 mutex_unlock(&vq->mutex);
1778 }
1779 break;
1780 case VHOST_SET_LOG_FD:
1781 r = get_user(fd, (int __user *)argp);
1782 if (r < 0)
1783 break;
1784 ctx = fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(fd);
1785 if (IS_ERR(ctx)) {
1786 r = PTR_ERR(ctx);
1787 break;
1788 }
1789 swap(ctx, d->log_ctx);
1790 for (i = 0; i < d->nvqs; ++i) {
1791 mutex_lock(&d->vqs[i]->mutex);
1792 d->vqs[i]->log_ctx = d->log_ctx;
1793 mutex_unlock(&d->vqs[i]->mutex);
1794 }
1795 if (ctx)
1796 eventfd_ctx_put(ctx);
1797 break;
1798 default:
1799 r = -ENOIOCTLCMD;
1800 break;
1801 }
1802done:
1803 return r;
1804}
1805EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
1806
1807/* TODO: This is really inefficient. We need something like get_user()
1808 * (instruction directly accesses the data, with an exception table entry
1809 * returning -EFAULT). See Documentation/x86/exception-tables.rst.
1810 */
1811static int set_bit_to_user(int nr, void __user *addr)
1812{
1813 unsigned long log = (unsigned long)addr;
1814 struct page *page;
1815 void *base;
1816 int bit = nr + (log % PAGE_SIZE) * 8;
1817 int r;
1818
1819 r = pin_user_pages_fast(log, 1, FOLL_WRITE, &page);
1820 if (r < 0)
1821 return r;
1822 BUG_ON(r != 1);
1823 base = kmap_atomic(page);
1824 set_bit(bit, base);
1825 kunmap_atomic(base);
1826 unpin_user_pages_dirty_lock(&page, 1, true);
1827 return 0;
1828}
1829
1830static int log_write(void __user *log_base,
1831 u64 write_address, u64 write_length)
1832{
1833 u64 write_page = write_address / VHOST_PAGE_SIZE;
1834 int r;
1835
1836 if (!write_length)
1837 return 0;
1838 write_length += write_address % VHOST_PAGE_SIZE;
1839 for (;;) {
1840 u64 base = (u64)(unsigned long)log_base;
1841 u64 log = base + write_page / 8;
1842 int bit = write_page % 8;
1843 if ((u64)(unsigned long)log != log)
1844 return -EFAULT;
1845 r = set_bit_to_user(bit, (void __user *)(unsigned long)log);
1846 if (r < 0)
1847 return r;
1848 if (write_length <= VHOST_PAGE_SIZE)
1849 break;
1850 write_length -= VHOST_PAGE_SIZE;
1851 write_page += 1;
1852 }
1853 return r;
1854}
1855
1856static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
1857{
1858 struct vhost_iotlb *umem = vq->umem;
1859 struct vhost_iotlb_map *u;
1860 u64 start, end, l, min;
1861 int r;
1862 bool hit = false;
1863
1864 while (len) {
1865 min = len;
1866 /* More than one GPA can be mapped into a single HVA. So
1867 * iterate over all possible umems here to be safe.
1868 */
1869 list_for_each_entry(u, &umem->list, link) {
1870 if (u->addr > hva - 1 + len ||
1871 u->addr - 1 + u->size < hva)
1872 continue;
1873 start = max(u->addr, hva);
1874 end = min(u->addr - 1 + u->size, hva - 1 + len);
1875 l = end - start + 1;
1876 r = log_write(vq->log_base,
1877 u->start + start - u->addr,
1878 l);
1879 if (r < 0)
1880 return r;
1881 hit = true;
1882 min = min(l, min);
1883 }
1884
1885 if (!hit)
1886 return -EFAULT;
1887
1888 len -= min;
1889 hva += min;
1890 }
1891
1892 return 0;
1893}
1894
1895static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len)
1896{
1897 struct iovec *iov = vq->log_iov;
1898 int i, ret;
1899
1900 if (!vq->iotlb)
1901 return log_write(vq->log_base, vq->log_addr + used_offset, len);
1902
1903 ret = translate_desc(vq, (uintptr_t)vq->used + used_offset,
1904 len, iov, 64, VHOST_ACCESS_WO);
1905 if (ret < 0)
1906 return ret;
1907
1908 for (i = 0; i < ret; i++) {
1909 ret = log_write_hva(vq, (uintptr_t)iov[i].iov_base,
1910 iov[i].iov_len);
1911 if (ret)
1912 return ret;
1913 }
1914
1915 return 0;
1916}
1917
1918int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
1919 unsigned int log_num, u64 len, struct iovec *iov, int count)
1920{
1921 int i, r;
1922
1923 /* Make sure data written is seen before log. */
1924 smp_wmb();
1925
1926 if (vq->iotlb) {
1927 for (i = 0; i < count; i++) {
1928 r = log_write_hva(vq, (uintptr_t)iov[i].iov_base,
1929 iov[i].iov_len);
1930 if (r < 0)
1931 return r;
1932 }
1933 return 0;
1934 }
1935
1936 for (i = 0; i < log_num; ++i) {
1937 u64 l = min(log[i].len, len);
1938 r = log_write(vq->log_base, log[i].addr, l);
1939 if (r < 0)
1940 return r;
1941 len -= l;
1942 if (!len) {
1943 if (vq->log_ctx)
1944 eventfd_signal(vq->log_ctx, 1);
1945 return 0;
1946 }
1947 }
1948 /* Length written exceeds what we have stored. This is a bug. */
1949 BUG();
1950 return 0;
1951}
1952EXPORT_SYMBOL_GPL(vhost_log_write);
1953
1954static int vhost_update_used_flags(struct vhost_virtqueue *vq)
1955{
1956 void __user *used;
1957 if (vhost_put_used_flags(vq))
1958 return -EFAULT;
1959 if (unlikely(vq->log_used)) {
1960 /* Make sure the flag is seen before log. */
1961 smp_wmb();
1962 /* Log used flag write. */
1963 used = &vq->used->flags;
1964 log_used(vq, (used - (void __user *)vq->used),
1965 sizeof vq->used->flags);
1966 if (vq->log_ctx)
1967 eventfd_signal(vq->log_ctx, 1);
1968 }
1969 return 0;
1970}
1971
1972static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
1973{
1974 if (vhost_put_avail_event(vq))
1975 return -EFAULT;
1976 if (unlikely(vq->log_used)) {
1977 void __user *used;
1978 /* Make sure the event is seen before log. */
1979 smp_wmb();
1980 /* Log avail event write */
1981 used = vhost_avail_event(vq);
1982 log_used(vq, (used - (void __user *)vq->used),
1983 sizeof *vhost_avail_event(vq));
1984 if (vq->log_ctx)
1985 eventfd_signal(vq->log_ctx, 1);
1986 }
1987 return 0;
1988}
1989
1990int vhost_vq_init_access(struct vhost_virtqueue *vq)
1991{
1992 __virtio16 last_used_idx;
1993 int r;
1994 bool is_le = vq->is_le;
1995
1996 if (!vq->private_data)
1997 return 0;
1998
1999 vhost_init_is_le(vq);
2000
2001 r = vhost_update_used_flags(vq);
2002 if (r)
2003 goto err;
2004 vq->signalled_used_valid = false;
2005 if (!vq->iotlb &&
2006 !access_ok(&vq->used->idx, sizeof vq->used->idx)) {
2007 r = -EFAULT;
2008 goto err;
2009 }
2010 r = vhost_get_used_idx(vq, &last_used_idx);
2011 if (r) {
2012 vq_err(vq, "Can't access used idx at %p\n",
2013 &vq->used->idx);
2014 goto err;
2015 }
2016 vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);
2017 return 0;
2018
2019err:
2020 vq->is_le = is_le;
2021 return r;
2022}
2023EXPORT_SYMBOL_GPL(vhost_vq_init_access);
2024
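/* Translate a guest address range into host-userspace iovecs using the IOTLB
 * (if present) or the memory table. Returns the number of iovecs filled, or a
 * negative error; -EAGAIN also queues an IOTLB miss request to userspace.
 */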
2025static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
2026 struct iovec iov[], int iov_size, int access)
2027{
2028 const struct vhost_iotlb_map *map;
2029 struct vhost_dev *dev = vq->dev;
2030 struct vhost_iotlb *umem = dev->iotlb ? dev->iotlb : dev->umem;
2031 struct iovec *_iov;
2032 u64 s = 0;
2033 int ret = 0;
2034
2035 while ((u64)len > s) {
2036 u64 size;
2037 if (unlikely(ret >= iov_size)) {
2038 ret = -ENOBUFS;
2039 break;
2040 }
2041
2042 map = vhost_iotlb_itree_first(umem, addr, addr + len - 1);
2043 if (map == NULL || map->start > addr) {
2044 if (umem != dev->iotlb) {
2045 ret = -EFAULT;
2046 break;
2047 }
2048 ret = -EAGAIN;
2049 break;
2050 } else if (!(map->perm & access)) {
2051 ret = -EPERM;
2052 break;
2053 }
2054
2055 _iov = iov + ret;
2056 size = map->size - addr + map->start;
2057 _iov->iov_len = min((u64)len - s, size);
2058 _iov->iov_base = (void __user *)(unsigned long)
2059 (map->addr + addr - map->start);
2060 s += size;
2061 addr += size;
2062 ++ret;
2063 }
2064
2065 if (ret == -EAGAIN)
2066 vhost_iotlb_miss(vq, addr, access);
2067 return ret;
2068}
2069
2070/* Each buffer in the virtqueues is actually a chain of descriptors. This
2071 * function returns the next descriptor in the chain,
2072 * or -1U if we're at the end. */
2073static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
2074{
2075 unsigned int next;
2076
2077 /* If this descriptor says it doesn't chain, we're done. */
2078 if (!(desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT)))
2079 return -1U;
2080
2081 /* Check they're not leading us off the end of the descriptor table. */
2082 next = vhost16_to_cpu(vq, READ_ONCE(desc->next));
2083 return next;
2084}
2085
2086static int get_indirect(struct vhost_virtqueue *vq,
2087 struct iovec iov[], unsigned int iov_size,
2088 unsigned int *out_num, unsigned int *in_num,
2089 struct vhost_log *log, unsigned int *log_num,
2090 struct vring_desc *indirect)
2091{
2092 struct vring_desc desc;
2093 unsigned int i = 0, count, found = 0;
2094 u32 len = vhost32_to_cpu(vq, indirect->len);
2095 struct iov_iter from;
2096 int ret, access;
2097
2098 /* Sanity check */
2099 if (unlikely(len % sizeof desc)) {
2100 vq_err(vq, "Invalid length in indirect descriptor: "
2101 "len 0x%llx not multiple of 0x%zx\n",
2102 (unsigned long long)len,
2103 sizeof desc);
2104 return -EINVAL;
2105 }
2106
2107 ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
2108 UIO_MAXIOV, VHOST_ACCESS_RO);
2109 if (unlikely(ret < 0)) {
2110 if (ret != -EAGAIN)
2111 vq_err(vq, "Translation failure %d in indirect.\n", ret);
2112 return ret;
2113 }
2114 iov_iter_init(&from, READ, vq->indirect, ret, len);
2115 count = len / sizeof desc;
2116 /* Buffers are chained via a 16 bit next field, so
2117 * we can have at most 2^16 of these. */
2118 if (unlikely(count > USHRT_MAX + 1)) {
2119 vq_err(vq, "Indirect buffer length too big: %d\n",
2120 indirect->len);
2121 return -E2BIG;
2122 }
2123
2124 do {
2125 unsigned iov_count = *in_num + *out_num;
2126 if (unlikely(++found > count)) {
2127 vq_err(vq, "Loop detected: last one at %u "
2128 "indirect size %u\n",
2129 i, count);
2130 return -EINVAL;
2131 }
2132 if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
2133 vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
2134 i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
2135 return -EINVAL;
2136 }
2137 if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
2138 vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
2139 i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
2140 return -EINVAL;
2141 }
2142
2143 if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
2144 access = VHOST_ACCESS_WO;
2145 else
2146 access = VHOST_ACCESS_RO;
2147
2148 ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
2149 vhost32_to_cpu(vq, desc.len), iov + iov_count,
2150 iov_size - iov_count, access);
2151 if (unlikely(ret < 0)) {
2152 if (ret != -EAGAIN)
2153 vq_err(vq, "Translation failure %d indirect idx %d\n",
2154 ret, i);
2155 return ret;
2156 }
2157 /* If this is an input descriptor, increment that count. */
2158 if (access == VHOST_ACCESS_WO) {
2159 *in_num += ret;
2160 if (unlikely(log && ret)) {
2161 log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
2162 log[*log_num].len = vhost32_to_cpu(vq, desc.len);
2163 ++*log_num;
2164 }
2165 } else {
2166 /* If it's an output descriptor, they're all supposed
2167 * to come before any input descriptors. */
2168 if (unlikely(*in_num)) {
2169 vq_err(vq, "Indirect descriptor "
2170 "has out after in: idx %d\n", i);
2171 return -EINVAL;
2172 }
2173 *out_num += ret;
2174 }
2175 } while ((i = next_desc(vq, &desc)) != -1);
2176 return 0;
2177}
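
/*
 * Worked example for the sanity checks in get_indirect() (illustrative
 * numbers): sizeof(struct vring_desc) is 16 bytes, so an indirect table
 * with len == 64 holds 64 / 16 == 4 descriptors. A len that is not a
 * multiple of 16 fails with -EINVAL, and a table describing more than
 * USHRT_MAX + 1 (65536) descriptors fails with -E2BIG, since a 16-bit
 * next field could not address the extra entries anyway.
 */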
2178
2179/* This looks in the virtqueue for the first available buffer, and converts
2180 * it to an iovec for convenient access. Since descriptors consist of some
2181 * number of output then some number of input descriptors, it's actually two
2182 * iovecs, but we pack them into one and note how many of each there were.
2183 *
2184 * This function returns the descriptor number found, or vq->num (which is
2185 * never a valid descriptor number) if none was found. A negative code is
2186 * returned on error. */
2187int vhost_get_vq_desc(struct vhost_virtqueue *vq,
2188 struct iovec iov[], unsigned int iov_size,
2189 unsigned int *out_num, unsigned int *in_num,
2190 struct vhost_log *log, unsigned int *log_num)
2191{
2192 struct vring_desc desc;
2193 unsigned int i, head, found = 0;
2194 u16 last_avail_idx;
2195 __virtio16 avail_idx;
2196 __virtio16 ring_head;
2197 int ret, access;
2198
2199 /* Check it isn't doing very strange things with descriptor numbers. */
2200 last_avail_idx = vq->last_avail_idx;
2201
2202 if (vq->avail_idx == vq->last_avail_idx) {
2203 if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
2204 vq_err(vq, "Failed to access avail idx at %p\n",
2205 &vq->avail->idx);
2206 return -EFAULT;
2207 }
2208 vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
2209
2210 if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
2211 vq_err(vq, "Guest moved used index from %u to %u",
2212 last_avail_idx, vq->avail_idx);
2213 return -EFAULT;
2214 }
2215
2216 /* If there's nothing new since last we looked, return
2217 * invalid.
2218 */
2219 if (vq->avail_idx == last_avail_idx)
2220 return vq->num;
2221
2222 /* Only get avail ring entries after they have been
2223 * exposed by guest.
2224 */
2225 smp_rmb();
2226 }
2227
2228 /* Grab the next descriptor number they're advertising, and increment
2229 * the index we've seen. */
2230 if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
2231 vq_err(vq, "Failed to read head: idx %d address %p\n",
2232 last_avail_idx,
2233 &vq->avail->ring[last_avail_idx % vq->num]);
2234 return -EFAULT;
2235 }
2236
2237 head = vhost16_to_cpu(vq, ring_head);
2238
2239 /* If their number is silly, that's an error. */
2240 if (unlikely(head >= vq->num)) {
2241 vq_err(vq, "Guest says index %u > %u is available",
2242 head, vq->num);
2243 return -EINVAL;
2244 }
2245
2246 /* When we start there are neither input nor output descriptors. */
2247 *out_num = *in_num = 0;
2248 if (unlikely(log))
2249 *log_num = 0;
2250
2251 i = head;
2252 do {
2253 unsigned iov_count = *in_num + *out_num;
2254 if (unlikely(i >= vq->num)) {
2255 vq_err(vq, "Desc index is %u > %u, head = %u",
2256 i, vq->num, head);
2257 return -EINVAL;
2258 }
2259 if (unlikely(++found > vq->num)) {
2260 vq_err(vq, "Loop detected: last one at %u "
2261 "vq size %u head %u\n",
2262 i, vq->num, head);
2263 return -EINVAL;
2264 }
2265 ret = vhost_get_desc(vq, &desc, i);
2266 if (unlikely(ret)) {
2267 vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
2268 i, vq->desc + i);
2269 return -EFAULT;
2270 }
2271 if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
2272 ret = get_indirect(vq, iov, iov_size,
2273 out_num, in_num,
2274 log, log_num, &desc);
2275 if (unlikely(ret < 0)) {
2276 if (ret != -EAGAIN)
2277 vq_err(vq, "Failure detected "
2278 "in indirect descriptor at idx %d\n", i);
2279 return ret;
2280 }
2281 continue;
2282 }
2283
2284 if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
2285 access = VHOST_ACCESS_WO;
2286 else
2287 access = VHOST_ACCESS_RO;
2288 ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
2289 vhost32_to_cpu(vq, desc.len), iov + iov_count,
2290 iov_size - iov_count, access);
2291 if (unlikely(ret < 0)) {
2292 if (ret != -EAGAIN)
2293 vq_err(vq, "Translation failure %d descriptor idx %d\n",
2294 ret, i);
2295 return ret;
2296 }
2297 if (access == VHOST_ACCESS_WO) {
2298 /* If this is an input descriptor,
2299 * increment that count. */
2300 *in_num += ret;
2301 if (unlikely(log && ret)) {
2302 log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
2303 log[*log_num].len = vhost32_to_cpu(vq, desc.len);
2304 ++*log_num;
2305 }
2306 } else {
2307 /* If it's an output descriptor, they're all supposed
2308 * to come before any input descriptors. */
2309 if (unlikely(*in_num)) {
2310 vq_err(vq, "Descriptor has out after in: "
2311 "idx %d\n", i);
2312 return -EINVAL;
2313 }
2314 *out_num += ret;
2315 }
2316 } while ((i = next_desc(vq, &desc)) != -1);
2317
2318 /* On success, increment avail index. */
2319 vq->last_avail_idx++;
2320
2321 /* Assume notifications from the guest are disabled at this point;
2322 * if they aren't, we would need to update the avail_event index. */
2323 BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
2324 return head;
2325}
2326EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
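
/*
 * Sketch of how a backend typically drives this (illustrative only;
 * real users such as vhost-net add locking, batching and their own
 * error handling):
 *
 *	vhost_disable_notify(dev, vq);
 *	for (;;) {
 *		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
 *					 &out, &in, NULL, NULL);
 *		if (head < 0)
 *			break;
 *		if (head == vq->num) {
 *			if (unlikely(vhost_enable_notify(dev, vq))) {
 *				vhost_disable_notify(dev, vq);
 *				continue;
 *			}
 *			break;
 *		}
 *		... consume the out/in iovecs, producing len bytes ...
 *		vhost_add_used_and_signal(dev, vq, head, len);
 *	}
 */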
2327
2328/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
2329void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
2330{
2331 vq->last_avail_idx -= n;
2332}
2333EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
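
/*
 * For example (illustrative), a backend that fetched one descriptor but
 * then hit a transient failure can call vhost_discard_vq_desc(vq, 1);
 * the same head is handed out again by the next vhost_get_vq_desc()
 * call, since only last_avail_idx is rewound.
 */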
2334
2335/* After we've used one of their buffers, we tell them about it. We'll then
2336 * want to notify the guest, using eventfd. */
2337int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
2338{
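 /* Positional initializer for struct vring_used_elem: .id is the
  * head descriptor index being returned, .len the number of bytes
  * written into the buffer (0 if we only read from it). */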
2339 struct vring_used_elem heads = {
2340 cpu_to_vhost32(vq, head),
2341 cpu_to_vhost32(vq, len)
2342 };
2343
2344 return vhost_add_used_n(vq, &heads, 1);
2345}
2346EXPORT_SYMBOL_GPL(vhost_add_used);
2347
2348static int __vhost_add_used_n(struct vhost_virtqueue *vq,
2349 struct vring_used_elem *heads,
2350 unsigned count)
2351{
2352 vring_used_elem_t __user *used;
2353 u16 old, new;
2354 int start;
2355
2356 start = vq->last_used_idx & (vq->num - 1);
2357 used = vq->used->ring + start;
2358 if (vhost_put_used(vq, heads, start, count)) {
2359 vq_err(vq, "Failed to write used");
2360 return -EFAULT;
2361 }
2362 if (unlikely(vq->log_used)) {
2363 /* Make sure data is seen before log. */
2364 smp_wmb();
2365 /* Log used ring entry write. */
2366 log_used(vq, ((void __user *)used - (void __user *)vq->used),
2367 count * sizeof *used);
2368 }
2369 old = vq->last_used_idx;
2370 new = (vq->last_used_idx += count);
2371 /* If the driver never bothers to signal in a very long while,
2372 * the used index might wrap around. If that happens, invalidate
2373 * the signalled_used index we stored. TODO: make sure the driver
2374 * signals at least once every 2^16 entries and remove this. */
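 /* Illustrative numbers for the check below: with old == 10 and
  * new == 15, a stored signalled_used of 12 gives (u16)(15 - 12) == 3,
  * which is < (u16)(15 - 10) == 5, i.e. signalled_used appears inside
  * (old, new]; that can only happen here if the used index wrapped
  * past it. */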
2375 if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
2376 vq->signalled_used_valid = false;
2377 return 0;
2378}
2379
2380/* After we've used one of their buffers, we tell them about it. We'll then
2381 * want to notify the guest, using eventfd. */
2382int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
2383 unsigned count)
2384{
2385 int start, n, r;
2386
2387 start = vq->last_used_idx & (vq->num - 1);
2388 n = vq->num - start;
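 /* E.g. (illustrative) with vq->num == 256, start == 250 and
  * count == 10: write 6 entries at slots 250..255 first, then the
  * remaining 4 at slots 0..3 once the ring wraps. */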
2389 if (n < count) {
2390 r = __vhost_add_used_n(vq, heads, n);
2391 if (r < 0)
2392 return r;
2393 heads += n;
2394 count -= n;
2395 }
2396 r = __vhost_add_used_n(vq, heads, count);
2397
2398 /* Make sure buffer is written before we update index. */
2399 smp_wmb();
2400 if (vhost_put_used_idx(vq)) {
2401 vq_err(vq, "Failed to increment used idx");
2402 return -EFAULT;
2403 }
2404 if (unlikely(vq->log_used)) {
2405 /* Make sure used idx is seen before log. */
2406 smp_wmb();
2407 /* Log used index update. */
2408 log_used(vq, offsetof(struct vring_used, idx),
2409 sizeof vq->used->idx);
2410 if (vq->log_ctx)
2411 eventfd_signal(vq->log_ctx, 1);
2412 }
2413 return r;
2414}
2415EXPORT_SYMBOL_GPL(vhost_add_used_n);
2416
2417static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2418{
2419 __u16 old, new;
2420 __virtio16 event;
2421 bool v;
2422 /* Flush out used index updates. This is paired
2423 * with the barrier that the Guest executes when enabling
2424 * interrupts. */
2425 smp_mb();
2426
2427 if (vhost_has_feature(vq, VIRTIO_F_NOTIFY_ON_EMPTY) &&
2428 unlikely(vq->avail_idx == vq->last_avail_idx))
2429 return true;
2430
2431 if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
2432 __virtio16 flags;
2433 if (vhost_get_avail_flags(vq, &flags)) {
2434 vq_err(vq, "Failed to get flags");
2435 return true;
2436 }
2437 return !(flags & cpu_to_vhost16(vq, VRING_AVAIL_F_NO_INTERRUPT));
2438 }
2439 old = vq->signalled_used;
2440 v = vq->signalled_used_valid;
2441 new = vq->signalled_used = vq->last_used_idx;
2442 vq->signalled_used_valid = true;
2443
2444 if (unlikely(!v))
2445 return true;
2446
2447 if (vhost_get_used_event(vq, &event)) {
2448 vq_err(vq, "Failed to get used event idx");
2449 return true;
2450 }
2451 return vring_need_event(vhost16_to_cpu(vq, event), new, old);
2452}
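
/*
 * For reference, vring_need_event() (include/uapi/linux/virtio_ring.h)
 * is roughly
 *
 *	(u16)(new - event_idx - 1) < (u16)(new - old)
 *
 * i.e. signal only if event_idx lies in the window [old, new) that the
 * used index just advanced over. Illustrative numbers: with old == 10
 * and new == 15, event_idx == 12 gives 2 < 5, so signal; event_idx ==
 * 20 gives (u16)(15 - 21), which is huge, so no signal.
 */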
2453
2454/* This actually signals the guest, using eventfd. */
2455void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2456{
2457 /* Signal the Guest to tell them we used something up. */
2458 if (vq->call_ctx.ctx && vhost_notify(dev, vq))
2459 eventfd_signal(vq->call_ctx.ctx, 1);
2460}
2461EXPORT_SYMBOL_GPL(vhost_signal);
2462
2463/* And here's the combo meal deal. Supersize me! */
2464void vhost_add_used_and_signal(struct vhost_dev *dev,
2465 struct vhost_virtqueue *vq,
2466 unsigned int head, int len)
2467{
2468 vhost_add_used(vq, head, len);
2469 vhost_signal(dev, vq);
2470}
2471EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
2472
2473/* multi-buffer version of vhost_add_used_and_signal */
2474void vhost_add_used_and_signal_n(struct vhost_dev *dev,
2475 struct vhost_virtqueue *vq,
2476 struct vring_used_elem *heads, unsigned count)
2477{
2478 vhost_add_used_n(vq, heads, count);
2479 vhost_signal(dev, vq);
2480}
2481EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
2482
2483/* Return true if we're sure that the available ring is empty */
2484bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2485{
2486 __virtio16 avail_idx;
2487 int r;
2488
2489 if (vq->avail_idx != vq->last_avail_idx)
2490 return false;
2491
2492 r = vhost_get_avail_idx(vq, &avail_idx);
2493 if (unlikely(r))
2494 return false;
2495 vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
2496
2497 return vq->avail_idx == vq->last_avail_idx;
2498}
2499EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
2500
2501/* OK, now we need to know about added descriptors. */
2502bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2503{
2504 __virtio16 avail_idx;
2505 int r;
2506
2507 if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
2508 return false;
2509 vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
2510 if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
2511 r = vhost_update_used_flags(vq);
2512 if (r) {
2513 vq_err(vq, "Failed to enable notification at %p: %d\n",
2514 &vq->used->flags, r);
2515 return false;
2516 }
2517 } else {
2518 r = vhost_update_avail_event(vq, vq->avail_idx);
2519 if (r) {
2520 vq_err(vq, "Failed to update avail event index at %p: %d\n",
2521 vhost_avail_event(vq), r);
2522 return false;
2523 }
2524 }
2525 /* They could have slipped one in as we were doing that: make
2526 * sure it's written, then check again. */
2527 smp_mb();
2528 r = vhost_get_avail_idx(vq, &avail_idx);
2529 if (r) {
2530 vq_err(vq, "Failed to check avail idx at %p: %d\n",
2531 &vq->avail->idx, r);
2532 return false;
2533 }
2534
2535 return vhost16_to_cpu(vq, avail_idx) != vq->avail_idx;
2536}
2537EXPORT_SYMBOL_GPL(vhost_enable_notify);
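
/*
 * Note on the return value (not stated above): true means new buffers
 * may already have slipped in, so callers normally re-disable
 * notification and resume processing rather than wait for a kick; see
 * the backend loop sketched after vhost_get_vq_desc().
 */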
2538
2539/* We don't need to be notified again. */
2540void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2541{
2542 int r;
2543
2544 if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
2545 return;
2546 vq->used_flags |= VRING_USED_F_NO_NOTIFY;
2547 if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
2548 r = vhost_update_used_flags(vq);
2549 if (r)
2550 vq_err(vq, "Failed to disable notification at %p: %d\n",
2551 &vq->used->flags, r);
2552 }
2553}
2554EXPORT_SYMBOL_GPL(vhost_disable_notify);
2555
2556/* Create a new message. */
2557struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
2558{
2559 struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL);
2560 if (!node)
2561 return NULL;
2562
2563 /* Make sure all padding within the structure is initialized. */
2564 memset(&node->msg, 0, sizeof node->msg);
2565 node->vq = vq;
2566 node->msg.type = type;
2567 return node;
2568}
2569EXPORT_SYMBOL_GPL(vhost_new_msg);
2570
2571void vhost_enqueue_msg(struct vhost_dev *dev, struct list_head *head,
2572 struct vhost_msg_node *node)
2573{
2574 spin_lock(&dev->iotlb_lock);
2575 list_add_tail(&node->node, head);
2576 spin_unlock(&dev->iotlb_lock);
2577
2578 wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
2579}
2580EXPORT_SYMBOL_GPL(vhost_enqueue_msg);
2581
2582struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
2583 struct list_head *head)
2584{
2585 struct vhost_msg_node *node = NULL;
2586
2587 spin_lock(&dev->iotlb_lock);
2588 if (!list_empty(head)) {
2589 node = list_first_entry(head, struct vhost_msg_node,
2590 node);
2591 list_del(&node->node);
2592 }
2593 spin_unlock(&dev->iotlb_lock);
2594
2595 return node;
2596}
2597EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
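
/*
 * These message helpers back the iotlb-miss path (a summary, not an
 * exhaustive description): vhost_iotlb_miss() builds a VHOST_IOTLB_MISS
 * message and enqueues it on the device's read list, the character
 * device read side dequeues it for userspace, and userspace replies
 * with an iotlb update so the stalled translation can be retried.
 */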
2598
2599void vhost_set_backend_features(struct vhost_dev *dev, u64 features)
2600{
2601 struct vhost_virtqueue *vq;
2602 int i;
2603
2604 mutex_lock(&dev->mutex);
2605 for (i = 0; i < dev->nvqs; ++i) {
2606 vq = dev->vqs[i];
2607 mutex_lock(&vq->mutex);
2608 vq->acked_backend_features = features;
2609 mutex_unlock(&vq->mutex);
2610 }
2611 mutex_unlock(&dev->mutex);
2612}
2613EXPORT_SYMBOL_GPL(vhost_set_backend_features);
2614
2615static int __init vhost_init(void)
2616{
2617 return 0;
2618}
2619
2620static void __exit vhost_exit(void)
2621{
2622}
2623
2624module_init(vhost_init);
2625module_exit(vhost_exit);
2626
2627MODULE_VERSION("0.0.1");
2628MODULE_LICENSE("GPL v2");
2629MODULE_AUTHOR("Michael S. Tsirkin");
2630MODULE_DESCRIPTION("Host kernel accelerator for virtio");