drivers/vhost/vhost.c at v6.4-rc1 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / drivers / vhost / vhost.c
at v6.4-rc1 2649 lines 64 kB view raw
   1// SPDX-License-Identifier: GPL-2.0-only
   2/* Copyright (C) 2009 Red Hat, Inc.
   3 * Copyright (C) 2006 Rusty Russell IBM Corporation
   4 *
   5 * Author: Michael S. Tsirkin <mst@redhat.com>
   6 *
   7 * Inspiration, some code, and most witty comments come from
   8 * Documentation/virtual/lguest/lguest.c, by Rusty Russell
   9 *
  10 * Generic code for virtio server in host kernel.
  11 */
  12
  13#include <linux/eventfd.h>
  14#include <linux/vhost.h>
  15#include <linux/uio.h>
  16#include <linux/mm.h>
  17#include <linux/miscdevice.h>
  18#include <linux/mutex.h>
  19#include <linux/poll.h>
  20#include <linux/file.h>
  21#include <linux/highmem.h>
  22#include <linux/slab.h>
  23#include <linux/vmalloc.h>
  24#include <linux/kthread.h>
  25#include <linux/module.h>
  26#include <linux/sort.h>
  27#include <linux/sched/mm.h>
  28#include <linux/sched/signal.h>
  29#include <linux/sched/vhost_task.h>
  30#include <linux/interval_tree_generic.h>
  31#include <linux/nospec.h>
  32#include <linux/kcov.h>
  33
  34#include "vhost.h"
  35
/* Module parameter, exposed with mode 0444; default 64 memory regions. */
static ushort max_mem_regions = 64;
module_param(max_mem_regions, ushort, 0444);
MODULE_PARM_DESC(max_mem_regions,
	"Maximum number of memory regions in memory map. (default: 64)");
/*
 * Module parameter, exposed with mode 0444; iotlb_alloc() passes it to
 * vhost_iotlb_alloc() as the entry limit.
 */
static int max_iotlb_entries = 2048;
module_param(max_iotlb_entries, int, 0444);
MODULE_PARM_DESC(max_iotlb_entries,
	"Maximum number of iotlb entries. (default: 2048)");
  44
/*
 * NOTE(review): judging by the name this is a memory-table flag requesting
 * logging; no user is visible in this part of the file - confirm.
 */
enum {
	VHOST_MEMORY_F_LOG = 0x1,
};
  48
  49#define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
  50#define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
  51
  52#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
  53static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
  54{
  55	vq->user_be = !virtio_legacy_is_little_endian();
  56}
  57
  58static void vhost_enable_cross_endian_big(struct vhost_virtqueue *vq)
  59{
  60	vq->user_be = true;
  61}
  62
  63static void vhost_enable_cross_endian_little(struct vhost_virtqueue *vq)
  64{
  65	vq->user_be = false;
  66}
  67
  68static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
  69{
  70	struct vhost_vring_state s;
  71
  72	if (vq->private_data)
  73		return -EBUSY;
  74
  75	if (copy_from_user(&s, argp, sizeof(s)))
  76		return -EFAULT;
  77
  78	if (s.num != VHOST_VRING_LITTLE_ENDIAN &&
  79	    s.num != VHOST_VRING_BIG_ENDIAN)
  80		return -EINVAL;
  81
  82	if (s.num == VHOST_VRING_BIG_ENDIAN)
  83		vhost_enable_cross_endian_big(vq);
  84	else
  85		vhost_enable_cross_endian_little(vq);
  86
  87	return 0;
  88}
  89
  90static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
  91				   int __user *argp)
  92{
  93	struct vhost_vring_state s = {
  94		.index = idx,
  95		.num = vq->user_be
  96	};
  97
  98	if (copy_to_user(argp, &s, sizeof(s)))
  99		return -EFAULT;
 100
 101	return 0;
 102}
 103
 104static void vhost_init_is_le(struct vhost_virtqueue *vq)
 105{
 106	/* Note for legacy virtio: user_be is initialized at reset time
 107	 * according to the host endianness. If userspace does not set an
 108	 * explicit endianness, the default behavior is native endian, as
 109	 * expected by legacy virtio.
 110	 */
 111	vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be;
 112}
 113#else
 114static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
 115{
 116}
 117
 118static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
 119{
 120	return -ENOIOCTLCMD;
 121}
 122
 123static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
 124				   int __user *argp)
 125{
 126	return -ENOIOCTLCMD;
 127}
 128
 129static void vhost_init_is_le(struct vhost_virtqueue *vq)
 130{
 131	vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1)
 132		|| virtio_legacy_is_little_endian();
 133}
 134#endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */
 135
 136static void vhost_reset_is_le(struct vhost_virtqueue *vq)
 137{
 138	vhost_init_is_le(vq);
 139}
 140
 141struct vhost_flush_struct {
 142	struct vhost_work work;
 143	struct completion wait_event;
 144};
 145
 146static void vhost_flush_work(struct vhost_work *work)
 147{
 148	struct vhost_flush_struct *s;
 149
 150	s = container_of(work, struct vhost_flush_struct, work);
 151	complete(&s->wait_event);
 152}
 153
 154static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 155			    poll_table *pt)
 156{
 157	struct vhost_poll *poll;
 158
 159	poll = container_of(pt, struct vhost_poll, table);
 160	poll->wqh = wqh;
 161	add_wait_queue(wqh, &poll->wait);
 162}
 163
 164static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync,
 165			     void *key)
 166{
 167	struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
 168	struct vhost_work *work = &poll->work;
 169
 170	if (!(key_to_poll(key) & poll->mask))
 171		return 0;
 172
 173	if (!poll->dev->use_worker)
 174		work->fn(work);
 175	else
 176		vhost_poll_queue(poll);
 177
 178	return 0;
 179}
 180
 181void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
 182{
 183	clear_bit(VHOST_WORK_QUEUED, &work->flags);
 184	work->fn = fn;
 185}
 186EXPORT_SYMBOL_GPL(vhost_work_init);
 187
 188/* Init poll structure */
 189void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 190		     __poll_t mask, struct vhost_dev *dev)
 191{
 192	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
 193	init_poll_funcptr(&poll->table, vhost_poll_func);
 194	poll->mask = mask;
 195	poll->dev = dev;
 196	poll->wqh = NULL;
 197
 198	vhost_work_init(&poll->work, fn);
 199}
 200EXPORT_SYMBOL_GPL(vhost_poll_init);
 201
 202/* Start polling a file. We add ourselves to file's wait queue. The caller must
 203 * keep a reference to a file until after vhost_poll_stop is called. */
 204int vhost_poll_start(struct vhost_poll *poll, struct file *file)
 205{
 206	__poll_t mask;
 207
 208	if (poll->wqh)
 209		return 0;
 210
 211	mask = vfs_poll(file, &poll->table);
 212	if (mask)
 213		vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
 214	if (mask & EPOLLERR) {
 215		vhost_poll_stop(poll);
 216		return -EINVAL;
 217	}
 218
 219	return 0;
 220}
 221EXPORT_SYMBOL_GPL(vhost_poll_start);
 222
 223/* Stop polling a file. After this function returns, it becomes safe to drop the
 224 * file reference. You must also flush afterwards. */
 225void vhost_poll_stop(struct vhost_poll *poll)
 226{
 227	if (poll->wqh) {
 228		remove_wait_queue(poll->wqh, &poll->wait);
 229		poll->wqh = NULL;
 230	}
 231}
 232EXPORT_SYMBOL_GPL(vhost_poll_stop);
 233
 234void vhost_dev_flush(struct vhost_dev *dev)
 235{
 236	struct vhost_flush_struct flush;
 237
 238	if (dev->worker) {
 239		init_completion(&flush.wait_event);
 240		vhost_work_init(&flush.work, vhost_flush_work);
 241
 242		vhost_work_queue(dev, &flush.work);
 243		wait_for_completion(&flush.wait_event);
 244	}
 245}
 246EXPORT_SYMBOL_GPL(vhost_dev_flush);
 247
 248void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
 249{
 250	if (!dev->worker)
 251		return;
 252
 253	if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
 254		/* We can only add the work to the list after we're
 255		 * sure it was not in the list.
 256		 * test_and_set_bit() implies a memory barrier.
 257		 */
 258		llist_add(&work->node, &dev->worker->work_list);
 259		wake_up_process(dev->worker->vtsk->task);
 260	}
 261}
 262EXPORT_SYMBOL_GPL(vhost_work_queue);
 263
 264/* A lockless hint for busy polling code to exit the loop */
 265bool vhost_has_work(struct vhost_dev *dev)
 266{
 267	return dev->worker && !llist_empty(&dev->worker->work_list);
 268}
 269EXPORT_SYMBOL_GPL(vhost_has_work);
 270
 271void vhost_poll_queue(struct vhost_poll *poll)
 272{
 273	vhost_work_queue(poll->dev, &poll->work);
 274}
 275EXPORT_SYMBOL_GPL(vhost_poll_queue);
 276
 277static void __vhost_vq_meta_reset(struct vhost_virtqueue *vq)
 278{
 279	int j;
 280
 281	for (j = 0; j < VHOST_NUM_ADDRS; j++)
 282		vq->meta_iotlb[j] = NULL;
 283}
 284
 285static void vhost_vq_meta_reset(struct vhost_dev *d)
 286{
 287	int i;
 288
 289	for (i = 0; i < d->nvqs; ++i)
 290		__vhost_vq_meta_reset(d->vqs[i]);
 291}
 292
 293static void vhost_vring_call_reset(struct vhost_vring_call *call_ctx)
 294{
 295	call_ctx->ctx = NULL;
 296	memset(&call_ctx->producer, 0x0, sizeof(struct irq_bypass_producer));
 297}
 298
 299bool vhost_vq_is_setup(struct vhost_virtqueue *vq)
 300{
 301	return vq->avail && vq->desc && vq->used && vhost_vq_access_ok(vq);
 302}
 303EXPORT_SYMBOL_GPL(vhost_vq_is_setup);
 304
 305static void vhost_vq_reset(struct vhost_dev *dev,
 306			   struct vhost_virtqueue *vq)
 307{
 308	vq->num = 1;
 309	vq->desc = NULL;
 310	vq->avail = NULL;
 311	vq->used = NULL;
 312	vq->last_avail_idx = 0;
 313	vq->avail_idx = 0;
 314	vq->last_used_idx = 0;
 315	vq->signalled_used = 0;
 316	vq->signalled_used_valid = false;
 317	vq->used_flags = 0;
 318	vq->log_used = false;
 319	vq->log_addr = -1ull;
 320	vq->private_data = NULL;
 321	vq->acked_features = 0;
 322	vq->acked_backend_features = 0;
 323	vq->log_base = NULL;
 324	vq->error_ctx = NULL;
 325	vq->kick = NULL;
 326	vq->log_ctx = NULL;
 327	vhost_disable_cross_endian(vq);
 328	vhost_reset_is_le(vq);
 329	vq->busyloop_timeout = 0;
 330	vq->umem = NULL;
 331	vq->iotlb = NULL;
 332	vhost_vring_call_reset(&vq->call_ctx);
 333	__vhost_vq_meta_reset(vq);
 334}
 335
 336static int vhost_worker(void *data)
 337{
 338	struct vhost_worker *worker = data;
 339	struct vhost_work *work, *work_next;
 340	struct llist_node *node;
 341
 342	for (;;) {
 343		/* mb paired w/ kthread_stop */
 344		set_current_state(TASK_INTERRUPTIBLE);
 345
 346		if (vhost_task_should_stop(worker->vtsk)) {
 347			__set_current_state(TASK_RUNNING);
 348			break;
 349		}
 350
 351		node = llist_del_all(&worker->work_list);
 352		if (!node)
 353			schedule();
 354
 355		node = llist_reverse_order(node);
 356		/* make sure flag is seen after deletion */
 357		smp_wmb();
 358		llist_for_each_entry_safe(work, work_next, node, node) {
 359			clear_bit(VHOST_WORK_QUEUED, &work->flags);
 360			__set_current_state(TASK_RUNNING);
 361			kcov_remote_start_common(worker->kcov_handle);
 362			work->fn(work);
 363			kcov_remote_stop();
 364			cond_resched();
 365		}
 366	}
 367
 368	return 0;
 369}
 370
 371static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
 372{
 373	kfree(vq->indirect);
 374	vq->indirect = NULL;
 375	kfree(vq->log);
 376	vq->log = NULL;
 377	kfree(vq->heads);
 378	vq->heads = NULL;
 379}
 380
 381/* Helper to allocate iovec buffers for all vqs. */
 382static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
 383{
 384	struct vhost_virtqueue *vq;
 385	int i;
 386
 387	for (i = 0; i < dev->nvqs; ++i) {
 388		vq = dev->vqs[i];
 389		vq->indirect = kmalloc_array(UIO_MAXIOV,
 390					     sizeof(*vq->indirect),
 391					     GFP_KERNEL);
 392		vq->log = kmalloc_array(dev->iov_limit, sizeof(*vq->log),
 393					GFP_KERNEL);
 394		vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads),
 395					  GFP_KERNEL);
 396		if (!vq->indirect || !vq->log || !vq->heads)
 397			goto err_nomem;
 398	}
 399	return 0;
 400
 401err_nomem:
 402	for (; i >= 0; --i)
 403		vhost_vq_free_iovecs(dev->vqs[i]);
 404	return -ENOMEM;
 405}
 406
 407static void vhost_dev_free_iovecs(struct vhost_dev *dev)
 408{
 409	int i;
 410
 411	for (i = 0; i < dev->nvqs; ++i)
 412		vhost_vq_free_iovecs(dev->vqs[i]);
 413}
 414
 415bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
 416			  int pkts, int total_len)
 417{
 418	struct vhost_dev *dev = vq->dev;
 419
 420	if ((dev->byte_weight && total_len >= dev->byte_weight) ||
 421	    pkts >= dev->weight) {
 422		vhost_poll_queue(&vq->poll);
 423		return true;
 424	}
 425
 426	return false;
 427}
 428EXPORT_SYMBOL_GPL(vhost_exceeds_weight);
 429
 430static size_t vhost_get_avail_size(struct vhost_virtqueue *vq,
 431				   unsigned int num)
 432{
 433	size_t event __maybe_unused =
 434	       vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
 435
 436	return size_add(struct_size(vq->avail, ring, num), event);
 437}
 438
 439static size_t vhost_get_used_size(struct vhost_virtqueue *vq,
 440				  unsigned int num)
 441{
 442	size_t event __maybe_unused =
 443	       vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
 444
 445	return size_add(struct_size(vq->used, ring, num), event);
 446}
 447
 448static size_t vhost_get_desc_size(struct vhost_virtqueue *vq,
 449				  unsigned int num)
 450{
 451	return sizeof(*vq->desc) * num;
 452}
 453
 454void vhost_dev_init(struct vhost_dev *dev,
 455		    struct vhost_virtqueue **vqs, int nvqs,
 456		    int iov_limit, int weight, int byte_weight,
 457		    bool use_worker,
 458		    int (*msg_handler)(struct vhost_dev *dev, u32 asid,
 459				       struct vhost_iotlb_msg *msg))
 460{
 461	struct vhost_virtqueue *vq;
 462	int i;
 463
 464	dev->vqs = vqs;
 465	dev->nvqs = nvqs;
 466	mutex_init(&dev->mutex);
 467	dev->log_ctx = NULL;
 468	dev->umem = NULL;
 469	dev->iotlb = NULL;
 470	dev->mm = NULL;
 471	dev->worker = NULL;
 472	dev->iov_limit = iov_limit;
 473	dev->weight = weight;
 474	dev->byte_weight = byte_weight;
 475	dev->use_worker = use_worker;
 476	dev->msg_handler = msg_handler;
 477	init_waitqueue_head(&dev->wait);
 478	INIT_LIST_HEAD(&dev->read_list);
 479	INIT_LIST_HEAD(&dev->pending_list);
 480	spin_lock_init(&dev->iotlb_lock);
 481
 482
 483	for (i = 0; i < dev->nvqs; ++i) {
 484		vq = dev->vqs[i];
 485		vq->log = NULL;
 486		vq->indirect = NULL;
 487		vq->heads = NULL;
 488		vq->dev = dev;
 489		mutex_init(&vq->mutex);
 490		vhost_vq_reset(dev, vq);
 491		if (vq->handle_kick)
 492			vhost_poll_init(&vq->poll, vq->handle_kick,
 493					EPOLLIN, dev);
 494	}
 495}
 496EXPORT_SYMBOL_GPL(vhost_dev_init);
 497
 498/* Caller should have device mutex */
 499long vhost_dev_check_owner(struct vhost_dev *dev)
 500{
 501	/* Are you the owner? If not, I don't think you mean to do that */
 502	return dev->mm == current->mm ? 0 : -EPERM;
 503}
 504EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
 505
 506/* Caller should have device mutex */
 507bool vhost_dev_has_owner(struct vhost_dev *dev)
 508{
 509	return dev->mm;
 510}
 511EXPORT_SYMBOL_GPL(vhost_dev_has_owner);
 512
 513static void vhost_attach_mm(struct vhost_dev *dev)
 514{
 515	/* No owner, become one */
 516	if (dev->use_worker) {
 517		dev->mm = get_task_mm(current);
 518	} else {
 519		/* vDPA device does not use worker thead, so there's
 520		 * no need to hold the address space for mm. This help
 521		 * to avoid deadlock in the case of mmap() which may
 522		 * held the refcnt of the file and depends on release
 523		 * method to remove vma.
 524		 */
 525		dev->mm = current->mm;
 526		mmgrab(dev->mm);
 527	}
 528}
 529
 530static void vhost_detach_mm(struct vhost_dev *dev)
 531{
 532	if (!dev->mm)
 533		return;
 534
 535	if (dev->use_worker)
 536		mmput(dev->mm);
 537	else
 538		mmdrop(dev->mm);
 539
 540	dev->mm = NULL;
 541}
 542
 543static void vhost_worker_free(struct vhost_dev *dev)
 544{
 545	struct vhost_worker *worker = dev->worker;
 546
 547	if (!worker)
 548		return;
 549
 550	dev->worker = NULL;
 551	WARN_ON(!llist_empty(&worker->work_list));
 552	vhost_task_stop(worker->vtsk);
 553	kfree(worker);
 554}
 555
 556static int vhost_worker_create(struct vhost_dev *dev)
 557{
 558	struct vhost_worker *worker;
 559	struct vhost_task *vtsk;
 560	char name[TASK_COMM_LEN];
 561	int ret;
 562
 563	worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
 564	if (!worker)
 565		return -ENOMEM;
 566
 567	dev->worker = worker;
 568	worker->kcov_handle = kcov_common_handle();
 569	init_llist_head(&worker->work_list);
 570	snprintf(name, sizeof(name), "vhost-%d", current->pid);
 571
 572	vtsk = vhost_task_create(vhost_worker, worker, name);
 573	if (!vtsk) {
 574		ret = -ENOMEM;
 575		goto free_worker;
 576	}
 577
 578	worker->vtsk = vtsk;
 579	vhost_task_start(vtsk);
 580	return 0;
 581
 582free_worker:
 583	kfree(worker);
 584	dev->worker = NULL;
 585	return ret;
 586}
 587
 588/* Caller should have device mutex */
 589long vhost_dev_set_owner(struct vhost_dev *dev)
 590{
 591	int err;
 592
 593	/* Is there an owner already? */
 594	if (vhost_dev_has_owner(dev)) {
 595		err = -EBUSY;
 596		goto err_mm;
 597	}
 598
 599	vhost_attach_mm(dev);
 600
 601	if (dev->use_worker) {
 602		err = vhost_worker_create(dev);
 603		if (err)
 604			goto err_worker;
 605	}
 606
 607	err = vhost_dev_alloc_iovecs(dev);
 608	if (err)
 609		goto err_iovecs;
 610
 611	return 0;
 612err_iovecs:
 613	vhost_worker_free(dev);
 614err_worker:
 615	vhost_detach_mm(dev);
 616err_mm:
 617	return err;
 618}
 619EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
 620
 621static struct vhost_iotlb *iotlb_alloc(void)
 622{
 623	return vhost_iotlb_alloc(max_iotlb_entries,
 624				 VHOST_IOTLB_FLAG_RETIRE);
 625}
 626
 627struct vhost_iotlb *vhost_dev_reset_owner_prepare(void)
 628{
 629	return iotlb_alloc();
 630}
 631EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);
 632
 633/* Caller should have device mutex */
 634void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
 635{
 636	int i;
 637
 638	vhost_dev_cleanup(dev);
 639
 640	dev->umem = umem;
 641	/* We don't need VQ locks below since vhost_dev_cleanup makes sure
 642	 * VQs aren't running.
 643	 */
 644	for (i = 0; i < dev->nvqs; ++i)
 645		dev->vqs[i]->umem = umem;
 646}
 647EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);
 648
 649void vhost_dev_stop(struct vhost_dev *dev)
 650{
 651	int i;
 652
 653	for (i = 0; i < dev->nvqs; ++i) {
 654		if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick)
 655			vhost_poll_stop(&dev->vqs[i]->poll);
 656	}
 657
 658	vhost_dev_flush(dev);
 659}
 660EXPORT_SYMBOL_GPL(vhost_dev_stop);
 661
 662void vhost_clear_msg(struct vhost_dev *dev)
 663{
 664	struct vhost_msg_node *node, *n;
 665
 666	spin_lock(&dev->iotlb_lock);
 667
 668	list_for_each_entry_safe(node, n, &dev->read_list, node) {
 669		list_del(&node->node);
 670		kfree(node);
 671	}
 672
 673	list_for_each_entry_safe(node, n, &dev->pending_list, node) {
 674		list_del(&node->node);
 675		kfree(node);
 676	}
 677
 678	spin_unlock(&dev->iotlb_lock);
 679}
 680EXPORT_SYMBOL_GPL(vhost_clear_msg);
 681
 682void vhost_dev_cleanup(struct vhost_dev *dev)
 683{
 684	int i;
 685
 686	for (i = 0; i < dev->nvqs; ++i) {
 687		if (dev->vqs[i]->error_ctx)
 688			eventfd_ctx_put(dev->vqs[i]->error_ctx);
 689		if (dev->vqs[i]->kick)
 690			fput(dev->vqs[i]->kick);
 691		if (dev->vqs[i]->call_ctx.ctx)
 692			eventfd_ctx_put(dev->vqs[i]->call_ctx.ctx);
 693		vhost_vq_reset(dev, dev->vqs[i]);
 694	}
 695	vhost_dev_free_iovecs(dev);
 696	if (dev->log_ctx)
 697		eventfd_ctx_put(dev->log_ctx);
 698	dev->log_ctx = NULL;
 699	/* No one will access memory at this point */
 700	vhost_iotlb_free(dev->umem);
 701	dev->umem = NULL;
 702	vhost_iotlb_free(dev->iotlb);
 703	dev->iotlb = NULL;
 704	vhost_clear_msg(dev);
 705	wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
 706	vhost_worker_free(dev);
 707	vhost_detach_mm(dev);
 708}
 709EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
 710
 711static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
 712{
 713	u64 a = addr / VHOST_PAGE_SIZE / 8;
 714
 715	/* Make sure 64 bit math will not overflow. */
 716	if (a > ULONG_MAX - (unsigned long)log_base ||
 717	    a + (unsigned long)log_base > ULONG_MAX)
 718		return false;
 719
 720	return access_ok(log_base + a,
 721			 (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);
 722}
 723
 724/* Make sure 64 bit math will not overflow. */
 725static bool vhost_overflow(u64 uaddr, u64 size)
 726{
 727	if (uaddr > ULONG_MAX || size > ULONG_MAX)
 728		return true;
 729
 730	if (!size)
 731		return false;
 732
 733	return uaddr > ULONG_MAX - size + 1;
 734}
 735
 736/* Caller should have vq mutex and device mutex. */
 737static bool vq_memory_access_ok(void __user *log_base, struct vhost_iotlb *umem,
 738				int log_all)
 739{
 740	struct vhost_iotlb_map *map;
 741
 742	if (!umem)
 743		return false;
 744
 745	list_for_each_entry(map, &umem->list, link) {
 746		unsigned long a = map->addr;
 747
 748		if (vhost_overflow(map->addr, map->size))
 749			return false;
 750
 751
 752		if (!access_ok((void __user *)a, map->size))
 753			return false;
 754		else if (log_all && !log_access_ok(log_base,
 755						   map->start,
 756						   map->size))
 757			return false;
 758	}
 759	return true;
 760}
 761
 762static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq,
 763					       u64 addr, unsigned int size,
 764					       int type)
 765{
 766	const struct vhost_iotlb_map *map = vq->meta_iotlb[type];
 767
 768	if (!map)
 769		return NULL;
 770
 771	return (void __user *)(uintptr_t)(map->addr + addr - map->start);
 772}
 773
 774/* Can we switch to this memory table? */
 775/* Caller should have device mutex but not vq mutex */
 776static bool memory_access_ok(struct vhost_dev *d, struct vhost_iotlb *umem,
 777			     int log_all)
 778{
 779	int i;
 780
 781	for (i = 0; i < d->nvqs; ++i) {
 782		bool ok;
 783		bool log;
 784
 785		mutex_lock(&d->vqs[i]->mutex);
 786		log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL);
 787		/* If ring is inactive, will check when it's enabled. */
 788		if (d->vqs[i]->private_data)
 789			ok = vq_memory_access_ok(d->vqs[i]->log_base,
 790						 umem, log);
 791		else
 792			ok = true;
 793		mutex_unlock(&d->vqs[i]->mutex);
 794		if (!ok)
 795			return false;
 796	}
 797	return true;
 798}
 799
/*
 * Forward declaration; the definition is not in this part of the file.
 * The callers below treat a negative return as an error and a non-negative
 * return as the number of iov[] entries filled in.
 */
static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
			  struct iovec iov[], int iov_size, int access);
 802
 803static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to,
 804			      const void *from, unsigned size)
 805{
 806	int ret;
 807
 808	if (!vq->iotlb)
 809		return __copy_to_user(to, from, size);
 810	else {
 811		/* This function should be called after iotlb
 812		 * prefetch, which means we're sure that all vq
 813		 * could be access through iotlb. So -EAGAIN should
 814		 * not happen in this case.
 815		 */
 816		struct iov_iter t;
 817		void __user *uaddr = vhost_vq_meta_fetch(vq,
 818				     (u64)(uintptr_t)to, size,
 819				     VHOST_ADDR_USED);
 820
 821		if (uaddr)
 822			return __copy_to_user(uaddr, from, size);
 823
 824		ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov,
 825				     ARRAY_SIZE(vq->iotlb_iov),
 826				     VHOST_ACCESS_WO);
 827		if (ret < 0)
 828			goto out;
 829		iov_iter_init(&t, ITER_DEST, vq->iotlb_iov, ret, size);
 830		ret = copy_to_iter(from, size, &t);
 831		if (ret == size)
 832			ret = 0;
 833	}
 834out:
 835	return ret;
 836}
 837
 838static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
 839				void __user *from, unsigned size)
 840{
 841	int ret;
 842
 843	if (!vq->iotlb)
 844		return __copy_from_user(to, from, size);
 845	else {
 846		/* This function should be called after iotlb
 847		 * prefetch, which means we're sure that vq
 848		 * could be access through iotlb. So -EAGAIN should
 849		 * not happen in this case.
 850		 */
 851		void __user *uaddr = vhost_vq_meta_fetch(vq,
 852				     (u64)(uintptr_t)from, size,
 853				     VHOST_ADDR_DESC);
 854		struct iov_iter f;
 855
 856		if (uaddr)
 857			return __copy_from_user(to, uaddr, size);
 858
 859		ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov,
 860				     ARRAY_SIZE(vq->iotlb_iov),
 861				     VHOST_ACCESS_RO);
 862		if (ret < 0) {
 863			vq_err(vq, "IOTLB translation failure: uaddr "
 864			       "%p size 0x%llx\n", from,
 865			       (unsigned long long) size);
 866			goto out;
 867		}
 868		iov_iter_init(&f, ITER_SOURCE, vq->iotlb_iov, ret, size);
 869		ret = copy_from_iter(to, size, &f);
 870		if (ret == size)
 871			ret = 0;
 872	}
 873
 874out:
 875	return ret;
 876}
 877
 878static void __user *__vhost_get_user_slow(struct vhost_virtqueue *vq,
 879					  void __user *addr, unsigned int size,
 880					  int type)
 881{
 882	int ret;
 883
 884	ret = translate_desc(vq, (u64)(uintptr_t)addr, size, vq->iotlb_iov,
 885			     ARRAY_SIZE(vq->iotlb_iov),
 886			     VHOST_ACCESS_RO);
 887	if (ret < 0) {
 888		vq_err(vq, "IOTLB translation failure: uaddr "
 889			"%p size 0x%llx\n", addr,
 890			(unsigned long long) size);
 891		return NULL;
 892	}
 893
 894	if (ret != 1 || vq->iotlb_iov[0].iov_len != size) {
 895		vq_err(vq, "Non atomic userspace memory access: uaddr "
 896			"%p size 0x%llx\n", addr,
 897			(unsigned long long) size);
 898		return NULL;
 899	}
 900
 901	return vq->iotlb_iov[0].iov_base;
 902}
 903
 904/* This function should be called after iotlb
 905 * prefetch, which means we're sure that vq
 906 * could be access through iotlb. So -EAGAIN should
 907 * not happen in this case.
 908 */
 909static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 910					    void __user *addr, unsigned int size,
 911					    int type)
 912{
 913	void __user *uaddr = vhost_vq_meta_fetch(vq,
 914			     (u64)(uintptr_t)addr, size, type);
 915	if (uaddr)
 916		return uaddr;
 917
 918	return __vhost_get_user_slow(vq, addr, size, type);
 919}
 920
 921#define vhost_put_user(vq, x, ptr)		\
 922({ \
 923	int ret; \
 924	if (!vq->iotlb) { \
 925		ret = __put_user(x, ptr); \
 926	} else { \
 927		__typeof__(ptr) to = \
 928			(__typeof__(ptr)) __vhost_get_user(vq, ptr,	\
 929					  sizeof(*ptr), VHOST_ADDR_USED); \
 930		if (to != NULL) \
 931			ret = __put_user(x, to); \
 932		else \
 933			ret = -EFAULT;	\
 934	} \
 935	ret; \
 936})
 937
 938static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
 939{
 940	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
 941			      vhost_avail_event(vq));
 942}
 943
 944static inline int vhost_put_used(struct vhost_virtqueue *vq,
 945				 struct vring_used_elem *head, int idx,
 946				 int count)
 947{
 948	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
 949				  count * sizeof(*head));
 950}
 951
 952static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 953
 954{
 955	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
 956			      &vq->used->flags);
 957}
 958
 959static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
 960
 961{
 962	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
 963			      &vq->used->idx);
 964}
 965
 966#define vhost_get_user(vq, x, ptr, type)		\
 967({ \
 968	int ret; \
 969	if (!vq->iotlb) { \
 970		ret = __get_user(x, ptr); \
 971	} else { \
 972		__typeof__(ptr) from = \
 973			(__typeof__(ptr)) __vhost_get_user(vq, ptr, \
 974							   sizeof(*ptr), \
 975							   type); \
 976		if (from != NULL) \
 977			ret = __get_user(x, from); \
 978		else \
 979			ret = -EFAULT; \
 980	} \
 981	ret; \
 982})
 983
 984#define vhost_get_avail(vq, x, ptr) \
 985	vhost_get_user(vq, x, ptr, VHOST_ADDR_AVAIL)
 986
 987#define vhost_get_used(vq, x, ptr) \
 988	vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
 989
 990static void vhost_dev_lock_vqs(struct vhost_dev *d)
 991{
 992	int i = 0;
 993	for (i = 0; i < d->nvqs; ++i)
 994		mutex_lock_nested(&d->vqs[i]->mutex, i);
 995}
 996
 997static void vhost_dev_unlock_vqs(struct vhost_dev *d)
 998{
 999	int i = 0;
1000	for (i = 0; i < d->nvqs; ++i)
1001		mutex_unlock(&d->vqs[i]->mutex);
1002}
1003
1004static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
1005				      __virtio16 *idx)
1006{
1007	return vhost_get_avail(vq, *idx, &vq->avail->idx);
1008}
1009
1010static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
1011				       __virtio16 *head, int idx)
1012{
1013	return vhost_get_avail(vq, *head,
1014			       &vq->avail->ring[idx & (vq->num - 1)]);
1015}
1016
1017static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
1018					__virtio16 *flags)
1019{
1020	return vhost_get_avail(vq, *flags, &vq->avail->flags);
1021}
1022
1023static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
1024				       __virtio16 *event)
1025{
1026	return vhost_get_avail(vq, *event, vhost_used_event(vq));
1027}
1028
1029static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
1030				     __virtio16 *idx)
1031{
1032	return vhost_get_used(vq, *idx, &vq->used->idx);
1033}
1034
1035static inline int vhost_get_desc(struct vhost_virtqueue *vq,
1036				 struct vring_desc *desc, int idx)
1037{
1038	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
1039}
1040
1041static void vhost_iotlb_notify_vq(struct vhost_dev *d,
1042				  struct vhost_iotlb_msg *msg)
1043{
1044	struct vhost_msg_node *node, *n;
1045
1046	spin_lock(&d->iotlb_lock);
1047
1048	list_for_each_entry_safe(node, n, &d->pending_list, node) {
1049		struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb;
1050		if (msg->iova <= vq_msg->iova &&
1051		    msg->iova + msg->size - 1 >= vq_msg->iova &&
1052		    vq_msg->type == VHOST_IOTLB_MISS) {
1053			vhost_poll_queue(&node->vq->poll);
1054			list_del(&node->node);
1055			kfree(node);
1056		}
1057	}
1058
1059	spin_unlock(&d->iotlb_lock);
1060}
1061
1062static bool umem_access_ok(u64 uaddr, u64 size, int access)
1063{
1064	unsigned long a = uaddr;
1065
1066	/* Make sure 64 bit math will not overflow. */
1067	if (vhost_overflow(uaddr, size))
1068		return false;
1069
1070	if ((access & VHOST_ACCESS_RO) &&
1071	    !access_ok((void __user *)a, size))
1072		return false;
1073	if ((access & VHOST_ACCESS_WO) &&
1074	    !access_ok((void __user *)a, size))
1075		return false;
1076	return true;
1077}
1078
1079static int vhost_process_iotlb_msg(struct vhost_dev *dev, u32 asid,
1080				   struct vhost_iotlb_msg *msg)
1081{
1082	int ret = 0;
1083
1084	if (asid != 0)
1085		return -EINVAL;
1086
1087	mutex_lock(&dev->mutex);
1088	vhost_dev_lock_vqs(dev);
1089	switch (msg->type) {
1090	case VHOST_IOTLB_UPDATE:
1091		if (!dev->iotlb) {
1092			ret = -EFAULT;
1093			break;
1094		}
1095		if (!umem_access_ok(msg->uaddr, msg->size, msg->perm)) {
1096			ret = -EFAULT;
1097			break;
1098		}
1099		vhost_vq_meta_reset(dev);
1100		if (vhost_iotlb_add_range(dev->iotlb, msg->iova,
1101					  msg->iova + msg->size - 1,
1102					  msg->uaddr, msg->perm)) {
1103			ret = -ENOMEM;
1104			break;
1105		}
1106		vhost_iotlb_notify_vq(dev, msg);
1107		break;
1108	case VHOST_IOTLB_INVALIDATE:
1109		if (!dev->iotlb) {
1110			ret = -EFAULT;
1111			break;
1112		}
1113		vhost_vq_meta_reset(dev);
1114		vhost_iotlb_del_range(dev->iotlb, msg->iova,
1115				      msg->iova + msg->size - 1);
1116		break;
1117	default:
1118		ret = -EINVAL;
1119		break;
1120	}
1121
1122	vhost_dev_unlock_vqs(dev);
1123	mutex_unlock(&dev->mutex);
1124
1125	return ret;
1126}
1127ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
1128			     struct iov_iter *from)
1129{
1130	struct vhost_iotlb_msg msg;
1131	size_t offset;
1132	int type, ret;
1133	u32 asid = 0;
1134
1135	ret = copy_from_iter(&type, sizeof(type), from);
1136	if (ret != sizeof(type)) {
1137		ret = -EINVAL;
1138		goto done;
1139	}
1140
1141	switch (type) {
1142	case VHOST_IOTLB_MSG:
1143		/* There maybe a hole after type for V1 message type,
1144		 * so skip it here.
1145		 */
1146		offset = offsetof(struct vhost_msg, iotlb) - sizeof(int);
1147		break;
1148	case VHOST_IOTLB_MSG_V2:
1149		if (vhost_backend_has_feature(dev->vqs[0],
1150					      VHOST_BACKEND_F_IOTLB_ASID)) {
1151			ret = copy_from_iter(&asid, sizeof(asid), from);
1152			if (ret != sizeof(asid)) {
1153				ret = -EINVAL;
1154				goto done;
1155			}
1156			offset = 0;
1157		} else
1158			offset = sizeof(__u32);
1159		break;
1160	default:
1161		ret = -EINVAL;
1162		goto done;
1163	}
1164
1165	iov_iter_advance(from, offset);
1166	ret = copy_from_iter(&msg, sizeof(msg), from);
1167	if (ret != sizeof(msg)) {
1168		ret = -EINVAL;
1169		goto done;
1170	}
1171
1172	if ((msg.type == VHOST_IOTLB_UPDATE ||
1173	     msg.type == VHOST_IOTLB_INVALIDATE) &&
1174	     msg.size == 0) {
1175		ret = -EINVAL;
1176		goto done;
1177	}
1178
1179	if (dev->msg_handler)
1180		ret = dev->msg_handler(dev, asid, &msg);
1181	else
1182		ret = vhost_process_iotlb_msg(dev, asid, &msg);
1183	if (ret) {
1184		ret = -EFAULT;
1185		goto done;
1186	}
1187
1188	ret = (type == VHOST_IOTLB_MSG) ? sizeof(struct vhost_msg) :
1189	      sizeof(struct vhost_msg_v2);
1190done:
1191	return ret;
1192}
1193EXPORT_SYMBOL(vhost_chr_write_iter);
1194
1195__poll_t vhost_chr_poll(struct file *file, struct vhost_dev *dev,
1196			    poll_table *wait)
1197{
1198	__poll_t mask = 0;
1199
1200	poll_wait(file, &dev->wait, wait);
1201
1202	if (!list_empty(&dev->read_list))
1203		mask |= EPOLLIN | EPOLLRDNORM;
1204
1205	return mask;
1206}
1207EXPORT_SYMBOL(vhost_chr_poll);
1208
1209ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
1210			    int noblock)
1211{
1212	DEFINE_WAIT(wait);
1213	struct vhost_msg_node *node;
1214	ssize_t ret = 0;
1215	unsigned size = sizeof(struct vhost_msg);
1216
1217	if (iov_iter_count(to) < size)
1218		return 0;
1219
1220	while (1) {
1221		if (!noblock)
1222			prepare_to_wait(&dev->wait, &wait,
1223					TASK_INTERRUPTIBLE);
1224
1225		node = vhost_dequeue_msg(dev, &dev->read_list);
1226		if (node)
1227			break;
1228		if (noblock) {
1229			ret = -EAGAIN;
1230			break;
1231		}
1232		if (signal_pending(current)) {
1233			ret = -ERESTARTSYS;
1234			break;
1235		}
1236		if (!dev->iotlb) {
1237			ret = -EBADFD;
1238			break;
1239		}
1240
1241		schedule();
1242	}
1243
1244	if (!noblock)
1245		finish_wait(&dev->wait, &wait);
1246
1247	if (node) {
1248		struct vhost_iotlb_msg *msg;
1249		void *start = &node->msg;
1250
1251		switch (node->msg.type) {
1252		case VHOST_IOTLB_MSG:
1253			size = sizeof(node->msg);
1254			msg = &node->msg.iotlb;
1255			break;
1256		case VHOST_IOTLB_MSG_V2:
1257			size = sizeof(node->msg_v2);
1258			msg = &node->msg_v2.iotlb;
1259			break;
1260		default:
1261			BUG();
1262			break;
1263		}
1264
1265		ret = copy_to_iter(start, size, to);
1266		if (ret != size || msg->type != VHOST_IOTLB_MISS) {
1267			kfree(node);
1268			return ret;
1269		}
1270		vhost_enqueue_msg(dev, &dev->pending_list, node);
1271	}
1272
1273	return ret;
1274}
1275EXPORT_SYMBOL_GPL(vhost_chr_read_iter);
1276
1277static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
1278{
1279	struct vhost_dev *dev = vq->dev;
1280	struct vhost_msg_node *node;
1281	struct vhost_iotlb_msg *msg;
1282	bool v2 = vhost_backend_has_feature(vq, VHOST_BACKEND_F_IOTLB_MSG_V2);
1283
1284	node = vhost_new_msg(vq, v2 ? VHOST_IOTLB_MSG_V2 : VHOST_IOTLB_MSG);
1285	if (!node)
1286		return -ENOMEM;
1287
1288	if (v2) {
1289		node->msg_v2.type = VHOST_IOTLB_MSG_V2;
1290		msg = &node->msg_v2.iotlb;
1291	} else {
1292		msg = &node->msg.iotlb;
1293	}
1294
1295	msg->type = VHOST_IOTLB_MISS;
1296	msg->iova = iova;
1297	msg->perm = access;
1298
1299	vhost_enqueue_msg(dev, &dev->read_list, node);
1300
1301	return 0;
1302}
1303
1304static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
1305			 vring_desc_t __user *desc,
1306			 vring_avail_t __user *avail,
1307			 vring_used_t __user *used)
1308
1309{
1310	/* If an IOTLB device is present, the vring addresses are
1311	 * GIOVAs. Access validation occurs at prefetch time. */
1312	if (vq->iotlb)
1313		return true;
1314
1315	return access_ok(desc, vhost_get_desc_size(vq, num)) &&
1316	       access_ok(avail, vhost_get_avail_size(vq, num)) &&
1317	       access_ok(used, vhost_get_used_size(vq, num));
1318}
1319
1320static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
1321				 const struct vhost_iotlb_map *map,
1322				 int type)
1323{
1324	int access = (type == VHOST_ADDR_USED) ?
1325		     VHOST_ACCESS_WO : VHOST_ACCESS_RO;
1326
1327	if (likely(map->perm & access))
1328		vq->meta_iotlb[type] = map;
1329}
1330
1331static bool iotlb_access_ok(struct vhost_virtqueue *vq,
1332			    int access, u64 addr, u64 len, int type)
1333{
1334	const struct vhost_iotlb_map *map;
1335	struct vhost_iotlb *umem = vq->iotlb;
1336	u64 s = 0, size, orig_addr = addr, last = addr + len - 1;
1337
1338	if (vhost_vq_meta_fetch(vq, addr, len, type))
1339		return true;
1340
1341	while (len > s) {
1342		map = vhost_iotlb_itree_first(umem, addr, last);
1343		if (map == NULL || map->start > addr) {
1344			vhost_iotlb_miss(vq, addr, access);
1345			return false;
1346		} else if (!(map->perm & access)) {
1347			/* Report the possible access violation by
1348			 * request another translation from userspace.
1349			 */
1350			return false;
1351		}
1352
1353		size = map->size - addr + map->start;
1354
1355		if (orig_addr == addr && size >= len)
1356			vhost_vq_meta_update(vq, map, type);
1357
1358		s += size;
1359		addr += size;
1360	}
1361
1362	return true;
1363}
1364
1365int vq_meta_prefetch(struct vhost_virtqueue *vq)
1366{
1367	unsigned int num = vq->num;
1368
1369	if (!vq->iotlb)
1370		return 1;
1371
1372	return iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->desc,
1373			       vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
1374	       iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->avail,
1375			       vhost_get_avail_size(vq, num),
1376			       VHOST_ADDR_AVAIL) &&
1377	       iotlb_access_ok(vq, VHOST_MAP_WO, (u64)(uintptr_t)vq->used,
1378			       vhost_get_used_size(vq, num), VHOST_ADDR_USED);
1379}
1380EXPORT_SYMBOL_GPL(vq_meta_prefetch);
1381
1382/* Can we log writes? */
1383/* Caller should have device mutex but not vq mutex */
1384bool vhost_log_access_ok(struct vhost_dev *dev)
1385{
1386	return memory_access_ok(dev, dev->umem, 1);
1387}
1388EXPORT_SYMBOL_GPL(vhost_log_access_ok);
1389
1390static bool vq_log_used_access_ok(struct vhost_virtqueue *vq,
1391				  void __user *log_base,
1392				  bool log_used,
1393				  u64 log_addr)
1394{
1395	/* If an IOTLB device is present, log_addr is a GIOVA that
1396	 * will never be logged by log_used(). */
1397	if (vq->iotlb)
1398		return true;
1399
1400	return !log_used || log_access_ok(log_base, log_addr,
1401					  vhost_get_used_size(vq, vq->num));
1402}
1403
1404/* Verify access for write logging. */
1405/* Caller should have vq mutex and device mutex */
1406static bool vq_log_access_ok(struct vhost_virtqueue *vq,
1407			     void __user *log_base)
1408{
1409	return vq_memory_access_ok(log_base, vq->umem,
1410				   vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
1411		vq_log_used_access_ok(vq, log_base, vq->log_used, vq->log_addr);
1412}
1413
1414/* Can we start vq? */
1415/* Caller should have vq mutex and device mutex */
1416bool vhost_vq_access_ok(struct vhost_virtqueue *vq)
1417{
1418	if (!vq_log_access_ok(vq, vq->log_base))
1419		return false;
1420
1421	return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
1422}
1423EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
1424
1425static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
1426{
1427	struct vhost_memory mem, *newmem;
1428	struct vhost_memory_region *region;
1429	struct vhost_iotlb *newumem, *oldumem;
1430	unsigned long size = offsetof(struct vhost_memory, regions);
1431	int i;
1432
1433	if (copy_from_user(&mem, m, size))
1434		return -EFAULT;
1435	if (mem.padding)
1436		return -EOPNOTSUPP;
1437	if (mem.nregions > max_mem_regions)
1438		return -E2BIG;
1439	newmem = kvzalloc(struct_size(newmem, regions, mem.nregions),
1440			GFP_KERNEL);
1441	if (!newmem)
1442		return -ENOMEM;
1443
1444	memcpy(newmem, &mem, size);
1445	if (copy_from_user(newmem->regions, m->regions,
1446			   flex_array_size(newmem, regions, mem.nregions))) {
1447		kvfree(newmem);
1448		return -EFAULT;
1449	}
1450
1451	newumem = iotlb_alloc();
1452	if (!newumem) {
1453		kvfree(newmem);
1454		return -ENOMEM;
1455	}
1456
1457	for (region = newmem->regions;
1458	     region < newmem->regions + mem.nregions;
1459	     region++) {
1460		if (vhost_iotlb_add_range(newumem,
1461					  region->guest_phys_addr,
1462					  region->guest_phys_addr +
1463					  region->memory_size - 1,
1464					  region->userspace_addr,
1465					  VHOST_MAP_RW))
1466			goto err;
1467	}
1468
1469	if (!memory_access_ok(d, newumem, 0))
1470		goto err;
1471
1472	oldumem = d->umem;
1473	d->umem = newumem;
1474
1475	/* All memory accesses are done under some VQ mutex. */
1476	for (i = 0; i < d->nvqs; ++i) {
1477		mutex_lock(&d->vqs[i]->mutex);
1478		d->vqs[i]->umem = newumem;
1479		mutex_unlock(&d->vqs[i]->mutex);
1480	}
1481
1482	kvfree(newmem);
1483	vhost_iotlb_free(oldumem);
1484	return 0;
1485
1486err:
1487	vhost_iotlb_free(newumem);
1488	kvfree(newmem);
1489	return -EFAULT;
1490}
1491
1492static long vhost_vring_set_num(struct vhost_dev *d,
1493				struct vhost_virtqueue *vq,
1494				void __user *argp)
1495{
1496	struct vhost_vring_state s;
1497
1498	/* Resizing ring with an active backend?
1499	 * You don't want to do that. */
1500	if (vq->private_data)
1501		return -EBUSY;
1502
1503	if (copy_from_user(&s, argp, sizeof s))
1504		return -EFAULT;
1505
1506	if (!s.num || s.num > 0xffff || (s.num & (s.num - 1)))
1507		return -EINVAL;
1508	vq->num = s.num;
1509
1510	return 0;
1511}
1512
1513static long vhost_vring_set_addr(struct vhost_dev *d,
1514				 struct vhost_virtqueue *vq,
1515				 void __user *argp)
1516{
1517	struct vhost_vring_addr a;
1518
1519	if (copy_from_user(&a, argp, sizeof a))
1520		return -EFAULT;
1521	if (a.flags & ~(0x1 << VHOST_VRING_F_LOG))
1522		return -EOPNOTSUPP;
1523
1524	/* For 32bit, verify that the top 32bits of the user
1525	   data are set to zero. */
1526	if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr ||
1527	    (u64)(unsigned long)a.used_user_addr != a.used_user_addr ||
1528	    (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr)
1529		return -EFAULT;
1530
1531	/* Make sure it's safe to cast pointers to vring types. */
1532	BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE);
1533	BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE);
1534	if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) ||
1535	    (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) ||
1536	    (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1)))
1537		return -EINVAL;
1538
1539	/* We only verify access here if backend is configured.
1540	 * If it is not, we don't as size might not have been setup.
1541	 * We will verify when backend is configured. */
1542	if (vq->private_data) {
1543		if (!vq_access_ok(vq, vq->num,
1544			(void __user *)(unsigned long)a.desc_user_addr,
1545			(void __user *)(unsigned long)a.avail_user_addr,
1546			(void __user *)(unsigned long)a.used_user_addr))
1547			return -EINVAL;
1548
1549		/* Also validate log access for used ring if enabled. */
1550		if (!vq_log_used_access_ok(vq, vq->log_base,
1551				a.flags & (0x1 << VHOST_VRING_F_LOG),
1552				a.log_guest_addr))
1553			return -EINVAL;
1554	}
1555
1556	vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
1557	vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
1558	vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
1559	vq->log_addr = a.log_guest_addr;
1560	vq->used = (void __user *)(unsigned long)a.used_user_addr;
1561
1562	return 0;
1563}
1564
1565static long vhost_vring_set_num_addr(struct vhost_dev *d,
1566				     struct vhost_virtqueue *vq,
1567				     unsigned int ioctl,
1568				     void __user *argp)
1569{
1570	long r;
1571
1572	mutex_lock(&vq->mutex);
1573
1574	switch (ioctl) {
1575	case VHOST_SET_VRING_NUM:
1576		r = vhost_vring_set_num(d, vq, argp);
1577		break;
1578	case VHOST_SET_VRING_ADDR:
1579		r = vhost_vring_set_addr(d, vq, argp);
1580		break;
1581	default:
1582		BUG();
1583	}
1584
1585	mutex_unlock(&vq->mutex);
1586
1587	return r;
1588}
1589long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
1590{
1591	struct file *eventfp, *filep = NULL;
1592	bool pollstart = false, pollstop = false;
1593	struct eventfd_ctx *ctx = NULL;
1594	u32 __user *idxp = argp;
1595	struct vhost_virtqueue *vq;
1596	struct vhost_vring_state s;
1597	struct vhost_vring_file f;
1598	u32 idx;
1599	long r;
1600
1601	r = get_user(idx, idxp);
1602	if (r < 0)
1603		return r;
1604	if (idx >= d->nvqs)
1605		return -ENOBUFS;
1606
1607	idx = array_index_nospec(idx, d->nvqs);
1608	vq = d->vqs[idx];
1609
1610	if (ioctl == VHOST_SET_VRING_NUM ||
1611	    ioctl == VHOST_SET_VRING_ADDR) {
1612		return vhost_vring_set_num_addr(d, vq, ioctl, argp);
1613	}
1614
1615	mutex_lock(&vq->mutex);
1616
1617	switch (ioctl) {
1618	case VHOST_SET_VRING_BASE:
1619		/* Moving base with an active backend?
1620		 * You don't want to do that. */
1621		if (vq->private_data) {
1622			r = -EBUSY;
1623			break;
1624		}
1625		if (copy_from_user(&s, argp, sizeof s)) {
1626			r = -EFAULT;
1627			break;
1628		}
1629		if (s.num > 0xffff) {
1630			r = -EINVAL;
1631			break;
1632		}
1633		vq->last_avail_idx = s.num;
1634		/* Forget the cached index value. */
1635		vq->avail_idx = vq->last_avail_idx;
1636		break;
1637	case VHOST_GET_VRING_BASE:
1638		s.index = idx;
1639		s.num = vq->last_avail_idx;
1640		if (copy_to_user(argp, &s, sizeof s))
1641			r = -EFAULT;
1642		break;
1643	case VHOST_SET_VRING_KICK:
1644		if (copy_from_user(&f, argp, sizeof f)) {
1645			r = -EFAULT;
1646			break;
1647		}
1648		eventfp = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_fget(f.fd);
1649		if (IS_ERR(eventfp)) {
1650			r = PTR_ERR(eventfp);
1651			break;
1652		}
1653		if (eventfp != vq->kick) {
1654			pollstop = (filep = vq->kick) != NULL;
1655			pollstart = (vq->kick = eventfp) != NULL;
1656		} else
1657			filep = eventfp;
1658		break;
1659	case VHOST_SET_VRING_CALL:
1660		if (copy_from_user(&f, argp, sizeof f)) {
1661			r = -EFAULT;
1662			break;
1663		}
1664		ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
1665		if (IS_ERR(ctx)) {
1666			r = PTR_ERR(ctx);
1667			break;
1668		}
1669
1670		swap(ctx, vq->call_ctx.ctx);
1671		break;
1672	case VHOST_SET_VRING_ERR:
1673		if (copy_from_user(&f, argp, sizeof f)) {
1674			r = -EFAULT;
1675			break;
1676		}
1677		ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
1678		if (IS_ERR(ctx)) {
1679			r = PTR_ERR(ctx);
1680			break;
1681		}
1682		swap(ctx, vq->error_ctx);
1683		break;
1684	case VHOST_SET_VRING_ENDIAN:
1685		r = vhost_set_vring_endian(vq, argp);
1686		break;
1687	case VHOST_GET_VRING_ENDIAN:
1688		r = vhost_get_vring_endian(vq, idx, argp);
1689		break;
1690	case VHOST_SET_VRING_BUSYLOOP_TIMEOUT:
1691		if (copy_from_user(&s, argp, sizeof(s))) {
1692			r = -EFAULT;
1693			break;
1694		}
1695		vq->busyloop_timeout = s.num;
1696		break;
1697	case VHOST_GET_VRING_BUSYLOOP_TIMEOUT:
1698		s.index = idx;
1699		s.num = vq->busyloop_timeout;
1700		if (copy_to_user(argp, &s, sizeof(s)))
1701			r = -EFAULT;
1702		break;
1703	default:
1704		r = -ENOIOCTLCMD;
1705	}
1706
1707	if (pollstop && vq->handle_kick)
1708		vhost_poll_stop(&vq->poll);
1709
1710	if (!IS_ERR_OR_NULL(ctx))
1711		eventfd_ctx_put(ctx);
1712	if (filep)
1713		fput(filep);
1714
1715	if (pollstart && vq->handle_kick)
1716		r = vhost_poll_start(&vq->poll, vq->kick);
1717
1718	mutex_unlock(&vq->mutex);
1719
1720	if (pollstop && vq->handle_kick)
1721		vhost_dev_flush(vq->poll.dev);
1722	return r;
1723}
1724EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
1725
1726int vhost_init_device_iotlb(struct vhost_dev *d)
1727{
1728	struct vhost_iotlb *niotlb, *oiotlb;
1729	int i;
1730
1731	niotlb = iotlb_alloc();
1732	if (!niotlb)
1733		return -ENOMEM;
1734
1735	oiotlb = d->iotlb;
1736	d->iotlb = niotlb;
1737
1738	for (i = 0; i < d->nvqs; ++i) {
1739		struct vhost_virtqueue *vq = d->vqs[i];
1740
1741		mutex_lock(&vq->mutex);
1742		vq->iotlb = niotlb;
1743		__vhost_vq_meta_reset(vq);
1744		mutex_unlock(&vq->mutex);
1745	}
1746
1747	vhost_iotlb_free(oiotlb);
1748
1749	return 0;
1750}
1751EXPORT_SYMBOL_GPL(vhost_init_device_iotlb);
1752
1753/* Caller must have device mutex */
1754long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
1755{
1756	struct eventfd_ctx *ctx;
1757	u64 p;
1758	long r;
1759	int i, fd;
1760
1761	/* If you are not the owner, you can become one */
1762	if (ioctl == VHOST_SET_OWNER) {
1763		r = vhost_dev_set_owner(d);
1764		goto done;
1765	}
1766
1767	/* You must be the owner to do anything else */
1768	r = vhost_dev_check_owner(d);
1769	if (r)
1770		goto done;
1771
1772	switch (ioctl) {
1773	case VHOST_SET_MEM_TABLE:
1774		r = vhost_set_memory(d, argp);
1775		break;
1776	case VHOST_SET_LOG_BASE:
1777		if (copy_from_user(&p, argp, sizeof p)) {
1778			r = -EFAULT;
1779			break;
1780		}
1781		if ((u64)(unsigned long)p != p) {
1782			r = -EFAULT;
1783			break;
1784		}
1785		for (i = 0; i < d->nvqs; ++i) {
1786			struct vhost_virtqueue *vq;
1787			void __user *base = (void __user *)(unsigned long)p;
1788			vq = d->vqs[i];
1789			mutex_lock(&vq->mutex);
1790			/* If ring is inactive, will check when it's enabled. */
1791			if (vq->private_data && !vq_log_access_ok(vq, base))
1792				r = -EFAULT;
1793			else
1794				vq->log_base = base;
1795			mutex_unlock(&vq->mutex);
1796		}
1797		break;
1798	case VHOST_SET_LOG_FD:
1799		r = get_user(fd, (int __user *)argp);
1800		if (r < 0)
1801			break;
1802		ctx = fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(fd);
1803		if (IS_ERR(ctx)) {
1804			r = PTR_ERR(ctx);
1805			break;
1806		}
1807		swap(ctx, d->log_ctx);
1808		for (i = 0; i < d->nvqs; ++i) {
1809			mutex_lock(&d->vqs[i]->mutex);
1810			d->vqs[i]->log_ctx = d->log_ctx;
1811			mutex_unlock(&d->vqs[i]->mutex);
1812		}
1813		if (ctx)
1814			eventfd_ctx_put(ctx);
1815		break;
1816	default:
1817		r = -ENOIOCTLCMD;
1818		break;
1819	}
1820done:
1821	return r;
1822}
1823EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
1824
1825/* TODO: This is really inefficient.  We need something like get_user()
1826 * (instruction directly accesses the data, with an exception table entry
1827 * returning -EFAULT). See Documentation/arch/x86/exception-tables.rst.
1828 */
1829static int set_bit_to_user(int nr, void __user *addr)
1830{
1831	unsigned long log = (unsigned long)addr;
1832	struct page *page;
1833	void *base;
1834	int bit = nr + (log % PAGE_SIZE) * 8;
1835	int r;
1836
1837	r = pin_user_pages_fast(log, 1, FOLL_WRITE, &page);
1838	if (r < 0)
1839		return r;
1840	BUG_ON(r != 1);
1841	base = kmap_atomic(page);
1842	set_bit(bit, base);
1843	kunmap_atomic(base);
1844	unpin_user_pages_dirty_lock(&page, 1, true);
1845	return 0;
1846}
1847
1848static int log_write(void __user *log_base,
1849		     u64 write_address, u64 write_length)
1850{
1851	u64 write_page = write_address / VHOST_PAGE_SIZE;
1852	int r;
1853
1854	if (!write_length)
1855		return 0;
1856	write_length += write_address % VHOST_PAGE_SIZE;
1857	for (;;) {
1858		u64 base = (u64)(unsigned long)log_base;
1859		u64 log = base + write_page / 8;
1860		int bit = write_page % 8;
1861		if ((u64)(unsigned long)log != log)
1862			return -EFAULT;
1863		r = set_bit_to_user(bit, (void __user *)(unsigned long)log);
1864		if (r < 0)
1865			return r;
1866		if (write_length <= VHOST_PAGE_SIZE)
1867			break;
1868		write_length -= VHOST_PAGE_SIZE;
1869		write_page += 1;
1870	}
1871	return r;
1872}
1873
1874static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
1875{
1876	struct vhost_iotlb *umem = vq->umem;
1877	struct vhost_iotlb_map *u;
1878	u64 start, end, l, min;
1879	int r;
1880	bool hit = false;
1881
1882	while (len) {
1883		min = len;
1884		/* More than one GPAs can be mapped into a single HVA. So
1885		 * iterate all possible umems here to be safe.
1886		 */
1887		list_for_each_entry(u, &umem->list, link) {
1888			if (u->addr > hva - 1 + len ||
1889			    u->addr - 1 + u->size < hva)
1890				continue;
1891			start = max(u->addr, hva);
1892			end = min(u->addr - 1 + u->size, hva - 1 + len);
1893			l = end - start + 1;
1894			r = log_write(vq->log_base,
1895				      u->start + start - u->addr,
1896				      l);
1897			if (r < 0)
1898				return r;
1899			hit = true;
1900			min = min(l, min);
1901		}
1902
1903		if (!hit)
1904			return -EFAULT;
1905
1906		len -= min;
1907		hva += min;
1908	}
1909
1910	return 0;
1911}
1912
1913static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len)
1914{
1915	struct iovec *iov = vq->log_iov;
1916	int i, ret;
1917
1918	if (!vq->iotlb)
1919		return log_write(vq->log_base, vq->log_addr + used_offset, len);
1920
1921	ret = translate_desc(vq, (uintptr_t)vq->used + used_offset,
1922			     len, iov, 64, VHOST_ACCESS_WO);
1923	if (ret < 0)
1924		return ret;
1925
1926	for (i = 0; i < ret; i++) {
1927		ret = log_write_hva(vq,	(uintptr_t)iov[i].iov_base,
1928				    iov[i].iov_len);
1929		if (ret)
1930			return ret;
1931	}
1932
1933	return 0;
1934}
1935
1936int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
1937		    unsigned int log_num, u64 len, struct iovec *iov, int count)
1938{
1939	int i, r;
1940
1941	/* Make sure data written is seen before log. */
1942	smp_wmb();
1943
1944	if (vq->iotlb) {
1945		for (i = 0; i < count; i++) {
1946			r = log_write_hva(vq, (uintptr_t)iov[i].iov_base,
1947					  iov[i].iov_len);
1948			if (r < 0)
1949				return r;
1950		}
1951		return 0;
1952	}
1953
1954	for (i = 0; i < log_num; ++i) {
1955		u64 l = min(log[i].len, len);
1956		r = log_write(vq->log_base, log[i].addr, l);
1957		if (r < 0)
1958			return r;
1959		len -= l;
1960		if (!len) {
1961			if (vq->log_ctx)
1962				eventfd_signal(vq->log_ctx, 1);
1963			return 0;
1964		}
1965	}
1966	/* Length written exceeds what we have stored. This is a bug. */
1967	BUG();
1968	return 0;
1969}
1970EXPORT_SYMBOL_GPL(vhost_log_write);
1971
1972static int vhost_update_used_flags(struct vhost_virtqueue *vq)
1973{
1974	void __user *used;
1975	if (vhost_put_used_flags(vq))
1976		return -EFAULT;
1977	if (unlikely(vq->log_used)) {
1978		/* Make sure the flag is seen before log. */
1979		smp_wmb();
1980		/* Log used flag write. */
1981		used = &vq->used->flags;
1982		log_used(vq, (used - (void __user *)vq->used),
1983			 sizeof vq->used->flags);
1984		if (vq->log_ctx)
1985			eventfd_signal(vq->log_ctx, 1);
1986	}
1987	return 0;
1988}
1989
1990static int vhost_update_avail_event(struct vhost_virtqueue *vq)
1991{
1992	if (vhost_put_avail_event(vq))
1993		return -EFAULT;
1994	if (unlikely(vq->log_used)) {
1995		void __user *used;
1996		/* Make sure the event is seen before log. */
1997		smp_wmb();
1998		/* Log avail event write */
1999		used = vhost_avail_event(vq);
2000		log_used(vq, (used - (void __user *)vq->used),
2001			 sizeof *vhost_avail_event(vq));
2002		if (vq->log_ctx)
2003			eventfd_signal(vq->log_ctx, 1);
2004	}
2005	return 0;
2006}
2007
2008int vhost_vq_init_access(struct vhost_virtqueue *vq)
2009{
2010	__virtio16 last_used_idx;
2011	int r;
2012	bool is_le = vq->is_le;
2013
2014	if (!vq->private_data)
2015		return 0;
2016
2017	vhost_init_is_le(vq);
2018
2019	r = vhost_update_used_flags(vq);
2020	if (r)
2021		goto err;
2022	vq->signalled_used_valid = false;
2023	if (!vq->iotlb &&
2024	    !access_ok(&vq->used->idx, sizeof vq->used->idx)) {
2025		r = -EFAULT;
2026		goto err;
2027	}
2028	r = vhost_get_used_idx(vq, &last_used_idx);
2029	if (r) {
2030		vq_err(vq, "Can't access used idx at %p\n",
2031		       &vq->used->idx);
2032		goto err;
2033	}
2034	vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);
2035	return 0;
2036
2037err:
2038	vq->is_le = is_le;
2039	return r;
2040}
2041EXPORT_SYMBOL_GPL(vhost_vq_init_access);
2042
2043static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
2044			  struct iovec iov[], int iov_size, int access)
2045{
2046	const struct vhost_iotlb_map *map;
2047	struct vhost_dev *dev = vq->dev;
2048	struct vhost_iotlb *umem = dev->iotlb ? dev->iotlb : dev->umem;
2049	struct iovec *_iov;
2050	u64 s = 0, last = addr + len - 1;
2051	int ret = 0;
2052
2053	while ((u64)len > s) {
2054		u64 size;
2055		if (unlikely(ret >= iov_size)) {
2056			ret = -ENOBUFS;
2057			break;
2058		}
2059
2060		map = vhost_iotlb_itree_first(umem, addr, last);
2061		if (map == NULL || map->start > addr) {
2062			if (umem != dev->iotlb) {
2063				ret = -EFAULT;
2064				break;
2065			}
2066			ret = -EAGAIN;
2067			break;
2068		} else if (!(map->perm & access)) {
2069			ret = -EPERM;
2070			break;
2071		}
2072
2073		_iov = iov + ret;
2074		size = map->size - addr + map->start;
2075		_iov->iov_len = min((u64)len - s, size);
2076		_iov->iov_base = (void __user *)(unsigned long)
2077				 (map->addr + addr - map->start);
2078		s += size;
2079		addr += size;
2080		++ret;
2081	}
2082
2083	if (ret == -EAGAIN)
2084		vhost_iotlb_miss(vq, addr, access);
2085	return ret;
2086}
2087
2088/* Each buffer in the virtqueues is actually a chain of descriptors.  This
2089 * function returns the next descriptor in the chain,
2090 * or -1U if we're at the end. */
2091static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
2092{
2093	unsigned int next;
2094
2095	/* If this descriptor says it doesn't chain, we're done. */
2096	if (!(desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT)))
2097		return -1U;
2098
2099	/* Check they're not leading us off end of descriptors. */
2100	next = vhost16_to_cpu(vq, READ_ONCE(desc->next));
2101	return next;
2102}
2103
2104static int get_indirect(struct vhost_virtqueue *vq,
2105			struct iovec iov[], unsigned int iov_size,
2106			unsigned int *out_num, unsigned int *in_num,
2107			struct vhost_log *log, unsigned int *log_num,
2108			struct vring_desc *indirect)
2109{
2110	struct vring_desc desc;
2111	unsigned int i = 0, count, found = 0;
2112	u32 len = vhost32_to_cpu(vq, indirect->len);
2113	struct iov_iter from;
2114	int ret, access;
2115
2116	/* Sanity check */
2117	if (unlikely(len % sizeof desc)) {
2118		vq_err(vq, "Invalid length in indirect descriptor: "
2119		       "len 0x%llx not multiple of 0x%zx\n",
2120		       (unsigned long long)len,
2121		       sizeof desc);
2122		return -EINVAL;
2123	}
2124
2125	ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
2126			     UIO_MAXIOV, VHOST_ACCESS_RO);
2127	if (unlikely(ret < 0)) {
2128		if (ret != -EAGAIN)
2129			vq_err(vq, "Translation failure %d in indirect.\n", ret);
2130		return ret;
2131	}
2132	iov_iter_init(&from, ITER_SOURCE, vq->indirect, ret, len);
2133	count = len / sizeof desc;
2134	/* Buffers are chained via a 16 bit next field, so
2135	 * we can have at most 2^16 of these. */
2136	if (unlikely(count > USHRT_MAX + 1)) {
2137		vq_err(vq, "Indirect buffer length too big: %d\n",
2138		       indirect->len);
2139		return -E2BIG;
2140	}
2141
2142	do {
2143		unsigned iov_count = *in_num + *out_num;
2144		if (unlikely(++found > count)) {
2145			vq_err(vq, "Loop detected: last one at %u "
2146			       "indirect size %u\n",
2147			       i, count);
2148			return -EINVAL;
2149		}
2150		if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
2151			vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
2152			       i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
2153			return -EINVAL;
2154		}
2155		if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
2156			vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
2157			       i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
2158			return -EINVAL;
2159		}
2160
2161		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
2162			access = VHOST_ACCESS_WO;
2163		else
2164			access = VHOST_ACCESS_RO;
2165
2166		ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
2167				     vhost32_to_cpu(vq, desc.len), iov + iov_count,
2168				     iov_size - iov_count, access);
2169		if (unlikely(ret < 0)) {
2170			if (ret != -EAGAIN)
2171				vq_err(vq, "Translation failure %d indirect idx %d\n",
2172					ret, i);
2173			return ret;
2174		}
2175		/* If this is an input descriptor, increment that count. */
2176		if (access == VHOST_ACCESS_WO) {
2177			*in_num += ret;
2178			if (unlikely(log && ret)) {
2179				log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
2180				log[*log_num].len = vhost32_to_cpu(vq, desc.len);
2181				++*log_num;
2182			}
2183		} else {
2184			/* If it's an output descriptor, they're all supposed
2185			 * to come before any input descriptors. */
2186			if (unlikely(*in_num)) {
2187				vq_err(vq, "Indirect descriptor "
2188				       "has out after in: idx %d\n", i);
2189				return -EINVAL;
2190			}
2191			*out_num += ret;
2192		}
2193	} while ((i = next_desc(vq, &desc)) != -1);
2194	return 0;
2195}
2196
2197/* This looks in the virtqueue and for the first available buffer, and converts
2198 * it to an iovec for convenient access.  Since descriptors consist of some
2199 * number of output then some number of input descriptors, it's actually two
2200 * iovecs, but we pack them into one and note how many of each there were.
2201 *
2202 * This function returns the descriptor number found, or vq->num (which is
2203 * never a valid descriptor number) if none was found.  A negative code is
2204 * returned on error. */
2205int vhost_get_vq_desc(struct vhost_virtqueue *vq,
2206		      struct iovec iov[], unsigned int iov_size,
2207		      unsigned int *out_num, unsigned int *in_num,
2208		      struct vhost_log *log, unsigned int *log_num)
2209{
2210	struct vring_desc desc;
2211	unsigned int i, head, found = 0;
2212	u16 last_avail_idx;
2213	__virtio16 avail_idx;
2214	__virtio16 ring_head;
2215	int ret, access;
2216
2217	/* Check it isn't doing very strange things with descriptor numbers. */
2218	last_avail_idx = vq->last_avail_idx;
2219
2220	if (vq->avail_idx == vq->last_avail_idx) {
2221		if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
2222			vq_err(vq, "Failed to access avail idx at %p\n",
2223				&vq->avail->idx);
2224			return -EFAULT;
2225		}
2226		vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
2227
2228		if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
2229			vq_err(vq, "Guest moved used index from %u to %u",
2230				last_avail_idx, vq->avail_idx);
2231			return -EFAULT;
2232		}
2233
2234		/* If there's nothing new since last we looked, return
2235		 * invalid.
2236		 */
2237		if (vq->avail_idx == last_avail_idx)
2238			return vq->num;
2239
2240		/* Only get avail ring entries after they have been
2241		 * exposed by guest.
2242		 */
2243		smp_rmb();
2244	}
2245
2246	/* Grab the next descriptor number they're advertising, and increment
2247	 * the index we've seen. */
2248	if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
2249		vq_err(vq, "Failed to read head: idx %d address %p\n",
2250		       last_avail_idx,
2251		       &vq->avail->ring[last_avail_idx % vq->num]);
2252		return -EFAULT;
2253	}
2254
2255	head = vhost16_to_cpu(vq, ring_head);
2256
2257	/* If their number is silly, that's an error. */
2258	if (unlikely(head >= vq->num)) {
2259		vq_err(vq, "Guest says index %u > %u is available",
2260		       head, vq->num);
2261		return -EINVAL;
2262	}
2263
2264	/* When we start there are none of either input nor output. */
2265	*out_num = *in_num = 0;
2266	if (unlikely(log))
2267		*log_num = 0;
2268
2269	i = head;
2270	do {
2271		unsigned iov_count = *in_num + *out_num;
2272		if (unlikely(i >= vq->num)) {
2273			vq_err(vq, "Desc index is %u > %u, head = %u",
2274			       i, vq->num, head);
2275			return -EINVAL;
2276		}
2277		if (unlikely(++found > vq->num)) {
2278			vq_err(vq, "Loop detected: last one at %u "
2279			       "vq size %u head %u\n",
2280			       i, vq->num, head);
2281			return -EINVAL;
2282		}
2283		ret = vhost_get_desc(vq, &desc, i);
2284		if (unlikely(ret)) {
2285			vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
2286			       i, vq->desc + i);
2287			return -EFAULT;
2288		}
2289		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
2290			ret = get_indirect(vq, iov, iov_size,
2291					   out_num, in_num,
2292					   log, log_num, &desc);
2293			if (unlikely(ret < 0)) {
2294				if (ret != -EAGAIN)
2295					vq_err(vq, "Failure detected "
2296						"in indirect descriptor at idx %d\n", i);
2297				return ret;
2298			}
2299			continue;
2300		}
2301
2302		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
2303			access = VHOST_ACCESS_WO;
2304		else
2305			access = VHOST_ACCESS_RO;
2306		ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
2307				     vhost32_to_cpu(vq, desc.len), iov + iov_count,
2308				     iov_size - iov_count, access);
2309		if (unlikely(ret < 0)) {
2310			if (ret != -EAGAIN)
2311				vq_err(vq, "Translation failure %d descriptor idx %d\n",
2312					ret, i);
2313			return ret;
2314		}
2315		if (access == VHOST_ACCESS_WO) {
2316			/* If this is an input descriptor,
2317			 * increment that count. */
2318			*in_num += ret;
2319			if (unlikely(log && ret)) {
2320				log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
2321				log[*log_num].len = vhost32_to_cpu(vq, desc.len);
2322				++*log_num;
2323			}
2324		} else {
2325			/* If it's an output descriptor, they're all supposed
2326			 * to come before any input descriptors. */
2327			if (unlikely(*in_num)) {
2328				vq_err(vq, "Descriptor has out after in: "
2329				       "idx %d\n", i);
2330				return -EINVAL;
2331			}
2332			*out_num += ret;
2333		}
2334	} while ((i = next_desc(vq, &desc)) != -1);
2335
2336	/* On success, increment avail index. */
2337	vq->last_avail_idx++;
2338
2339	/* Assume notifications from guest are disabled at this point,
2340	 * if they aren't we would need to update avail_event index. */
2341	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
2342	return head;
2343}
2344EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
2345
2346/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
2347void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
2348{
2349	vq->last_avail_idx -= n;
2350}
2351EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
2352
2353/* After we've used one of their buffers, we tell them about it.  We'll then
2354 * want to notify the guest, using eventfd. */
2355int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
2356{
2357	struct vring_used_elem heads = {
2358		cpu_to_vhost32(vq, head),
2359		cpu_to_vhost32(vq, len)
2360	};
2361
2362	return vhost_add_used_n(vq, &heads, 1);
2363}
2364EXPORT_SYMBOL_GPL(vhost_add_used);
2365
2366static int __vhost_add_used_n(struct vhost_virtqueue *vq,
2367			    struct vring_used_elem *heads,
2368			    unsigned count)
2369{
2370	vring_used_elem_t __user *used;
2371	u16 old, new;
2372	int start;
2373
2374	start = vq->last_used_idx & (vq->num - 1);
2375	used = vq->used->ring + start;
2376	if (vhost_put_used(vq, heads, start, count)) {
2377		vq_err(vq, "Failed to write used");
2378		return -EFAULT;
2379	}
2380	if (unlikely(vq->log_used)) {
2381		/* Make sure data is seen before log. */
2382		smp_wmb();
2383		/* Log used ring entry write. */
2384		log_used(vq, ((void __user *)used - (void __user *)vq->used),
2385			 count * sizeof *used);
2386	}
2387	old = vq->last_used_idx;
2388	new = (vq->last_used_idx += count);
2389	/* If the driver never bothers to signal in a very long while,
2390	 * used index might wrap around. If that happens, invalidate
2391	 * signalled_used index we stored. TODO: make sure driver
2392	 * signals at least once in 2^16 and remove this. */
2393	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
2394		vq->signalled_used_valid = false;
2395	return 0;
2396}
2397
2398/* After we've used one of their buffers, we tell them about it.  We'll then
2399 * want to notify the guest, using eventfd. */
2400int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
2401		     unsigned count)
2402{
2403	int start, n, r;
2404
2405	start = vq->last_used_idx & (vq->num - 1);
2406	n = vq->num - start;
2407	if (n < count) {
2408		r = __vhost_add_used_n(vq, heads, n);
2409		if (r < 0)
2410			return r;
2411		heads += n;
2412		count -= n;
2413	}
2414	r = __vhost_add_used_n(vq, heads, count);
2415
2416	/* Make sure buffer is written before we update index. */
2417	smp_wmb();
2418	if (vhost_put_used_idx(vq)) {
2419		vq_err(vq, "Failed to increment used idx");
2420		return -EFAULT;
2421	}
2422	if (unlikely(vq->log_used)) {
2423		/* Make sure used idx is seen before log. */
2424		smp_wmb();
2425		/* Log used index update. */
2426		log_used(vq, offsetof(struct vring_used, idx),
2427			 sizeof vq->used->idx);
2428		if (vq->log_ctx)
2429			eventfd_signal(vq->log_ctx, 1);
2430	}
2431	return r;
2432}
2433EXPORT_SYMBOL_GPL(vhost_add_used_n);
2434
2435static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2436{
2437	__u16 old, new;
2438	__virtio16 event;
2439	bool v;
2440	/* Flush out used index updates. This is paired
2441	 * with the barrier that the Guest executes when enabling
2442	 * interrupts. */
2443	smp_mb();
2444
2445	if (vhost_has_feature(vq, VIRTIO_F_NOTIFY_ON_EMPTY) &&
2446	    unlikely(vq->avail_idx == vq->last_avail_idx))
2447		return true;
2448
2449	if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
2450		__virtio16 flags;
2451		if (vhost_get_avail_flags(vq, &flags)) {
2452			vq_err(vq, "Failed to get flags");
2453			return true;
2454		}
2455		return !(flags & cpu_to_vhost16(vq, VRING_AVAIL_F_NO_INTERRUPT));
2456	}
2457	old = vq->signalled_used;
2458	v = vq->signalled_used_valid;
2459	new = vq->signalled_used = vq->last_used_idx;
2460	vq->signalled_used_valid = true;
2461
2462	if (unlikely(!v))
2463		return true;
2464
2465	if (vhost_get_used_event(vq, &event)) {
2466		vq_err(vq, "Failed to get used event idx");
2467		return true;
2468	}
2469	return vring_need_event(vhost16_to_cpu(vq, event), new, old);
2470}
2471
2472/* This actually signals the guest, using eventfd. */
2473void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2474{
2475	/* Signal the Guest tell them we used something up. */
2476	if (vq->call_ctx.ctx && vhost_notify(dev, vq))
2477		eventfd_signal(vq->call_ctx.ctx, 1);
2478}
2479EXPORT_SYMBOL_GPL(vhost_signal);
2480
2481/* And here's the combo meal deal.  Supersize me! */
2482void vhost_add_used_and_signal(struct vhost_dev *dev,
2483			       struct vhost_virtqueue *vq,
2484			       unsigned int head, int len)
2485{
2486	vhost_add_used(vq, head, len);
2487	vhost_signal(dev, vq);
2488}
2489EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
2490
/* Batch variant of vhost_add_used_and_signal(): add @count used elements
 * from @heads, then signal the guest. The result of adding the elements is
 * not propagated to the caller. */
void vhost_add_used_and_signal_n(struct vhost_dev *dev,
				 struct vhost_virtqueue *vq,
				 struct vring_used_elem *heads, unsigned count)
{
	vhost_add_used_n(vq, heads, count);

	vhost_signal(dev, vq);
}
EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
2500
2501/* return true if we're sure that avaiable ring is empty */
2502bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2503{
2504	__virtio16 avail_idx;
2505	int r;
2506
2507	if (vq->avail_idx != vq->last_avail_idx)
2508		return false;
2509
2510	r = vhost_get_avail_idx(vq, &avail_idx);
2511	if (unlikely(r))
2512		return false;
2513	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
2514
2515	return vq->avail_idx == vq->last_avail_idx;
2516}
2517EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
2518
2519/* OK, now we need to know about added descriptors. */
2520bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2521{
2522	__virtio16 avail_idx;
2523	int r;
2524
2525	if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
2526		return false;
2527	vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
2528	if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
2529		r = vhost_update_used_flags(vq);
2530		if (r) {
2531			vq_err(vq, "Failed to enable notification at %p: %d\n",
2532			       &vq->used->flags, r);
2533			return false;
2534		}
2535	} else {
2536		r = vhost_update_avail_event(vq);
2537		if (r) {
2538			vq_err(vq, "Failed to update avail event index at %p: %d\n",
2539			       vhost_avail_event(vq), r);
2540			return false;
2541		}
2542	}
2543	/* They could have slipped one in as we were doing that: make
2544	 * sure it's written, then check again. */
2545	smp_mb();
2546	r = vhost_get_avail_idx(vq, &avail_idx);
2547	if (r) {
2548		vq_err(vq, "Failed to check avail idx at %p: %d\n",
2549		       &vq->avail->idx, r);
2550		return false;
2551	}
2552	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
2553
2554	return vq->avail_idx != vq->last_avail_idx;
2555}
2556EXPORT_SYMBOL_GPL(vhost_enable_notify);
2557
2558/* We don't need to be notified again. */
2559void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2560{
2561	int r;
2562
2563	if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
2564		return;
2565	vq->used_flags |= VRING_USED_F_NO_NOTIFY;
2566	if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
2567		r = vhost_update_used_flags(vq);
2568		if (r)
2569			vq_err(vq, "Failed to disable notification at %p: %d\n",
2570			       &vq->used->flags, r);
2571	}
2572}
2573EXPORT_SYMBOL_GPL(vhost_disable_notify);
2574
2575/* Create a new message. */
2576struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
2577{
2578	struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL);
2579	if (!node)
2580		return NULL;
2581
2582	/* Make sure all padding within the structure is initialized. */
2583	memset(&node->msg, 0, sizeof node->msg);
2584	node->vq = vq;
2585	node->msg.type = type;
2586	return node;
2587}
2588EXPORT_SYMBOL_GPL(vhost_new_msg);
2589
2590void vhost_enqueue_msg(struct vhost_dev *dev, struct list_head *head,
2591		       struct vhost_msg_node *node)
2592{
2593	spin_lock(&dev->iotlb_lock);
2594	list_add_tail(&node->node, head);
2595	spin_unlock(&dev->iotlb_lock);
2596
2597	wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
2598}
2599EXPORT_SYMBOL_GPL(vhost_enqueue_msg);
2600
2601struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
2602					 struct list_head *head)
2603{
2604	struct vhost_msg_node *node = NULL;
2605
2606	spin_lock(&dev->iotlb_lock);
2607	if (!list_empty(head)) {
2608		node = list_first_entry(head, struct vhost_msg_node,
2609					node);
2610		list_del(&node->node);
2611	}
2612	spin_unlock(&dev->iotlb_lock);
2613
2614	return node;
2615}
2616EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
2617
2618void vhost_set_backend_features(struct vhost_dev *dev, u64 features)
2619{
2620	struct vhost_virtqueue *vq;
2621	int i;
2622
2623	mutex_lock(&dev->mutex);
2624	for (i = 0; i < dev->nvqs; ++i) {
2625		vq = dev->vqs[i];
2626		mutex_lock(&vq->mutex);
2627		vq->acked_backend_features = features;
2628		mutex_unlock(&vq->mutex);
2629	}
2630	mutex_unlock(&dev->mutex);
2631}
2632EXPORT_SYMBOL_GPL(vhost_set_backend_features);
2633
2634static int __init vhost_init(void)
2635{
2636	return 0;
2637}
2638
2639static void __exit vhost_exit(void)
2640{
2641}
2642
2643module_init(vhost_init);
2644module_exit(vhost_exit);
2645
2646MODULE_VERSION("0.0.1");
2647MODULE_LICENSE("GPL v2");
2648MODULE_AUTHOR("Michael S. Tsirkin");
2649MODULE_DESCRIPTION("Host kernel accelerator for virtio");