drivers/vhost/vhost.c at v6.4-rc6 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / drivers / vhost / vhost.c
at v6.4-rc6 2630 lines 64 kB view raw
   1// SPDX-License-Identifier: GPL-2.0-only
   2/* Copyright (C) 2009 Red Hat, Inc.
   3 * Copyright (C) 2006 Rusty Russell IBM Corporation
   4 *
   5 * Author: Michael S. Tsirkin <mst@redhat.com>
   6 *
   7 * Inspiration, some code, and most witty comments come from
   8 * Documentation/virtual/lguest/lguest.c, by Rusty Russell
   9 *
  10 * Generic code for virtio server in host kernel.
  11 */
  12
  13#include <linux/eventfd.h>
  14#include <linux/vhost.h>
  15#include <linux/uio.h>
  16#include <linux/mm.h>
  17#include <linux/miscdevice.h>
  18#include <linux/mutex.h>
  19#include <linux/poll.h>
  20#include <linux/file.h>
  21#include <linux/highmem.h>
  22#include <linux/slab.h>
  23#include <linux/vmalloc.h>
  24#include <linux/kthread.h>
  25#include <linux/module.h>
  26#include <linux/sort.h>
  27#include <linux/sched/mm.h>
  28#include <linux/sched/signal.h>
  29#include <linux/sched/vhost_task.h>
  30#include <linux/interval_tree_generic.h>
  31#include <linux/nospec.h>
  32#include <linux/kcov.h>
  33
  34#include "vhost.h"
  35
/* Module parameter (mode 0444): cap on regions in a memory map. */
static ushort max_mem_regions = 64;
module_param(max_mem_regions, ushort, 0444);
MODULE_PARM_DESC(max_mem_regions,
	"Maximum number of memory regions in memory map. (default: 64)");
/* Module parameter (mode 0444): entry limit passed to vhost_iotlb_alloc(). */
static int max_iotlb_entries = 2048;
module_param(max_iotlb_entries, int, 0444);
MODULE_PARM_DESC(max_iotlb_entries,
	"Maximum number of iotlb entries. (default: 2048)");
  44
/*
 * Memory-table flag bits.
 * NOTE(review): presumably marks a memory table/region as subject to dirty
 * logging; no user of this flag is visible in this part of the file.
 */
enum {
	VHOST_MEMORY_F_LOG = 0x1,
};
  48
  49#define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
  50#define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
  51
  52#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
  53static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
  54{
  55	vq->user_be = !virtio_legacy_is_little_endian();
  56}
  57
/* Record that userspace asked for a big-endian legacy ring on this vq. */
static void vhost_enable_cross_endian_big(struct vhost_virtqueue *vq)
{
	vq->user_be = true;
}
  62
/* Record that userspace asked for a little-endian legacy ring on this vq. */
static void vhost_enable_cross_endian_little(struct vhost_virtqueue *vq)
{
	vq->user_be = false;
}
  67
  68static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
  69{
  70	struct vhost_vring_state s;
  71
  72	if (vq->private_data)
  73		return -EBUSY;
  74
  75	if (copy_from_user(&s, argp, sizeof(s)))
  76		return -EFAULT;
  77
  78	if (s.num != VHOST_VRING_LITTLE_ENDIAN &&
  79	    s.num != VHOST_VRING_BIG_ENDIAN)
  80		return -EINVAL;
  81
  82	if (s.num == VHOST_VRING_BIG_ENDIAN)
  83		vhost_enable_cross_endian_big(vq);
  84	else
  85		vhost_enable_cross_endian_little(vq);
  86
  87	return 0;
  88}
  89
  90static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
  91				   int __user *argp)
  92{
  93	struct vhost_vring_state s = {
  94		.index = idx,
  95		.num = vq->user_be
  96	};
  97
  98	if (copy_to_user(argp, &s, sizeof(s)))
  99		return -EFAULT;
 100
 101	return 0;
 102}
 103
 104static void vhost_init_is_le(struct vhost_virtqueue *vq)
 105{
 106	/* Note for legacy virtio: user_be is initialized at reset time
 107	 * according to the host endianness. If userspace does not set an
 108	 * explicit endianness, the default behavior is native endian, as
 109	 * expected by legacy virtio.
 110	 */
 111	vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be;
 112}
 113#else
/* Without CONFIG_VHOST_CROSS_ENDIAN_LEGACY there is no state to reset. */
static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
}
 117
/*
 * Without CONFIG_VHOST_CROSS_ENDIAN_LEGACY the set-endian request is not
 * implemented: always returns -ENOIOCTLCMD.
 */
static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
{
	return -ENOIOCTLCMD;
}
 122
/*
 * Without CONFIG_VHOST_CROSS_ENDIAN_LEGACY the get-endian request is not
 * implemented: always returns -ENOIOCTLCMD.
 */
static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
				   int __user *argp)
{
	return -ENOIOCTLCMD;
}
 128
 129static void vhost_init_is_le(struct vhost_virtqueue *vq)
 130{
 131	vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1)
 132		|| virtio_legacy_is_little_endian();
 133}
 134#endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */
 135
/* Recompute vq->is_le from the vq's current feature/endian state. */
static void vhost_reset_is_le(struct vhost_virtqueue *vq)
{
	vhost_init_is_le(vq);
}
 140
/*
 * Work item used by vhost_dev_flush(): when the worker runs @work,
 * vhost_flush_work() completes @wait_event, which the flusher is waiting on.
 */
struct vhost_flush_struct {
	struct vhost_work work;
	struct completion wait_event;
};
 145
 146static void vhost_flush_work(struct vhost_work *work)
 147{
 148	struct vhost_flush_struct *s;
 149
 150	s = container_of(work, struct vhost_flush_struct, work);
 151	complete(&s->wait_event);
 152}
 153
 154static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 155			    poll_table *pt)
 156{
 157	struct vhost_poll *poll;
 158
 159	poll = container_of(pt, struct vhost_poll, table);
 160	poll->wqh = wqh;
 161	add_wait_queue(wqh, &poll->wait);
 162}
 163
 164static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync,
 165			     void *key)
 166{
 167	struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
 168	struct vhost_work *work = &poll->work;
 169
 170	if (!(key_to_poll(key) & poll->mask))
 171		return 0;
 172
 173	if (!poll->dev->use_worker)
 174		work->fn(work);
 175	else
 176		vhost_poll_queue(poll);
 177
 178	return 0;
 179}
 180
/*
 * Prepare @work for use: mark it as not queued and set @fn as the function
 * the worker will call for it.
 */
void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
{
	clear_bit(VHOST_WORK_QUEUED, &work->flags);
	work->fn = fn;
}
EXPORT_SYMBOL_GPL(vhost_work_init);
 187
 188/* Init poll structure */
 189void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 190		     __poll_t mask, struct vhost_dev *dev)
 191{
 192	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
 193	init_poll_funcptr(&poll->table, vhost_poll_func);
 194	poll->mask = mask;
 195	poll->dev = dev;
 196	poll->wqh = NULL;
 197
 198	vhost_work_init(&poll->work, fn);
 199}
 200EXPORT_SYMBOL_GPL(vhost_poll_init);
 201
 202/* Start polling a file. We add ourselves to file's wait queue. The caller must
 203 * keep a reference to a file until after vhost_poll_stop is called. */
 204int vhost_poll_start(struct vhost_poll *poll, struct file *file)
 205{
 206	__poll_t mask;
 207
 208	if (poll->wqh)
 209		return 0;
 210
 211	mask = vfs_poll(file, &poll->table);
 212	if (mask)
 213		vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
 214	if (mask & EPOLLERR) {
 215		vhost_poll_stop(poll);
 216		return -EINVAL;
 217	}
 218
 219	return 0;
 220}
 221EXPORT_SYMBOL_GPL(vhost_poll_start);
 222
 223/* Stop polling a file. After this function returns, it becomes safe to drop the
 224 * file reference. You must also flush afterwards. */
 225void vhost_poll_stop(struct vhost_poll *poll)
 226{
 227	if (poll->wqh) {
 228		remove_wait_queue(poll->wqh, &poll->wait);
 229		poll->wqh = NULL;
 230	}
 231}
 232EXPORT_SYMBOL_GPL(vhost_poll_stop);
 233
 234void vhost_dev_flush(struct vhost_dev *dev)
 235{
 236	struct vhost_flush_struct flush;
 237
 238	if (dev->worker.vtsk) {
 239		init_completion(&flush.wait_event);
 240		vhost_work_init(&flush.work, vhost_flush_work);
 241
 242		vhost_work_queue(dev, &flush.work);
 243		wait_for_completion(&flush.wait_event);
 244	}
 245}
 246EXPORT_SYMBOL_GPL(vhost_dev_flush);
 247
 248void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
 249{
 250	if (!dev->worker.vtsk)
 251		return;
 252
 253	if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
 254		/* We can only add the work to the list after we're
 255		 * sure it was not in the list.
 256		 * test_and_set_bit() implies a memory barrier.
 257		 */
 258		llist_add(&work->node, &dev->worker.work_list);
 259		vhost_task_wake(dev->worker.vtsk);
 260	}
 261}
 262EXPORT_SYMBOL_GPL(vhost_work_queue);
 263
/*
 * A lockless hint for busy polling code to exit the loop.
 *
 * Returns true if the worker's work list is currently non-empty. The check
 * takes no lock, so the answer may already be stale when it is used.
 */
bool vhost_has_work(struct vhost_dev *dev)
{
	return !llist_empty(&dev->worker.work_list);
}
EXPORT_SYMBOL_GPL(vhost_has_work);
 270
/* Queue the work item embedded in @poll on its device's worker. */
void vhost_poll_queue(struct vhost_poll *poll)
{
	vhost_work_queue(poll->dev, &poll->work);
}
EXPORT_SYMBOL_GPL(vhost_poll_queue);
 276
 277static void __vhost_vq_meta_reset(struct vhost_virtqueue *vq)
 278{
 279	int j;
 280
 281	for (j = 0; j < VHOST_NUM_ADDRS; j++)
 282		vq->meta_iotlb[j] = NULL;
 283}
 284
 285static void vhost_vq_meta_reset(struct vhost_dev *d)
 286{
 287	int i;
 288
 289	for (i = 0; i < d->nvqs; ++i)
 290		__vhost_vq_meta_reset(d->vqs[i]);
 291}
 292
 293static void vhost_vring_call_reset(struct vhost_vring_call *call_ctx)
 294{
 295	call_ctx->ctx = NULL;
 296	memset(&call_ctx->producer, 0x0, sizeof(struct irq_bypass_producer));
 297}
 298
 299bool vhost_vq_is_setup(struct vhost_virtqueue *vq)
 300{
 301	return vq->avail && vq->desc && vq->used && vhost_vq_access_ok(vq);
 302}
 303EXPORT_SYMBOL_GPL(vhost_vq_is_setup);
 304
/*
 * Put @vq back into its pristine state.
 *
 * Only overwrites fields; any object still referenced by them (files,
 * eventfd contexts, ...) is not released here, so callers that own such
 * references drop them first, as vhost_dev_cleanup() does.
 */
static void vhost_vq_reset(struct vhost_dev *dev,
			   struct vhost_virtqueue *vq)
{
	/* Ring geometry and addresses. */
	vq->num = 1;
	vq->desc = NULL;
	vq->avail = NULL;
	vq->used = NULL;
	/* Ring indices and notification state. */
	vq->last_avail_idx = 0;
	vq->avail_idx = 0;
	vq->last_used_idx = 0;
	vq->signalled_used = 0;
	vq->signalled_used_valid = false;
	vq->used_flags = 0;
	/* Logging, backend data and negotiated features. */
	vq->log_used = false;
	vq->log_addr = -1ull;
	vq->private_data = NULL;
	vq->acked_features = 0;
	vq->acked_backend_features = 0;
	vq->log_base = NULL;
	vq->error_ctx = NULL;
	vq->kick = NULL;
	vq->log_ctx = NULL;
	/* Endianness: reset user_be first, is_le is derived from it. */
	vhost_disable_cross_endian(vq);
	vhost_reset_is_le(vq);
	vq->busyloop_timeout = 0;
	/* Address translation state. */
	vq->umem = NULL;
	vq->iotlb = NULL;
	vhost_vring_call_reset(&vq->call_ctx);
	__vhost_vq_meta_reset(vq);
}
 335
/*
 * Worker function passed to vhost_task_create(): run every work item that is
 * on the worker's list at the time of the call.
 *
 * @data: the struct vhost_worker whose work_list is processed.
 *
 * Returns true if at least one item was taken off the list, false if the
 * list was empty.
 */
static bool vhost_worker(void *data)
{
	struct vhost_worker *worker = data;
	struct vhost_work *work, *work_next;
	struct llist_node *node;

	/* Detach the whole pending list in one step. */
	node = llist_del_all(&worker->work_list);
	if (node) {
		__set_current_state(TASK_RUNNING);

		/* llist_add() pushes at the head; reverse to run in queueing order. */
		node = llist_reverse_order(node);
		/* make sure flag is seen after deletion */
		smp_wmb();
		llist_for_each_entry_safe(work, work_next, node, node) {
			/* Clear QUEUED before calling fn so the item can be queued again. */
			clear_bit(VHOST_WORK_QUEUED, &work->flags);
			kcov_remote_start_common(worker->kcov_handle);
			work->fn(work);
			kcov_remote_stop();
			/* Give other tasks a chance between work items. */
			cond_resched();
		}
	}

	return !!node;
}
 360
/*
 * Free the per-vq indirect, log and heads arrays and reset the pointers to
 * NULL, so calling this again (or on a partially allocated vq) is harmless.
 */
static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
{
	kfree(vq->indirect);
	vq->indirect = NULL;
	kfree(vq->log);
	vq->log = NULL;
	kfree(vq->heads);
	vq->heads = NULL;
}
 370
 371/* Helper to allocate iovec buffers for all vqs. */
 372static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
 373{
 374	struct vhost_virtqueue *vq;
 375	int i;
 376
 377	for (i = 0; i < dev->nvqs; ++i) {
 378		vq = dev->vqs[i];
 379		vq->indirect = kmalloc_array(UIO_MAXIOV,
 380					     sizeof(*vq->indirect),
 381					     GFP_KERNEL);
 382		vq->log = kmalloc_array(dev->iov_limit, sizeof(*vq->log),
 383					GFP_KERNEL);
 384		vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads),
 385					  GFP_KERNEL);
 386		if (!vq->indirect || !vq->log || !vq->heads)
 387			goto err_nomem;
 388	}
 389	return 0;
 390
 391err_nomem:
 392	for (; i >= 0; --i)
 393		vhost_vq_free_iovecs(dev->vqs[i]);
 394	return -ENOMEM;
 395}
 396
 397static void vhost_dev_free_iovecs(struct vhost_dev *dev)
 398{
 399	int i;
 400
 401	for (i = 0; i < dev->nvqs; ++i)
 402		vhost_vq_free_iovecs(dev->vqs[i]);
 403}
 404
 405bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
 406			  int pkts, int total_len)
 407{
 408	struct vhost_dev *dev = vq->dev;
 409
 410	if ((dev->byte_weight && total_len >= dev->byte_weight) ||
 411	    pkts >= dev->weight) {
 412		vhost_poll_queue(&vq->poll);
 413		return true;
 414	}
 415
 416	return false;
 417}
 418EXPORT_SYMBOL_GPL(vhost_exceeds_weight);
 419
 420static size_t vhost_get_avail_size(struct vhost_virtqueue *vq,
 421				   unsigned int num)
 422{
 423	size_t event __maybe_unused =
 424	       vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
 425
 426	return size_add(struct_size(vq->avail, ring, num), event);
 427}
 428
 429static size_t vhost_get_used_size(struct vhost_virtqueue *vq,
 430				  unsigned int num)
 431{
 432	size_t event __maybe_unused =
 433	       vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
 434
 435	return size_add(struct_size(vq->used, ring, num), event);
 436}
 437
 438static size_t vhost_get_desc_size(struct vhost_virtqueue *vq,
 439				  unsigned int num)
 440{
 441	return sizeof(*vq->desc) * num;
 442}
 443
 444void vhost_dev_init(struct vhost_dev *dev,
 445		    struct vhost_virtqueue **vqs, int nvqs,
 446		    int iov_limit, int weight, int byte_weight,
 447		    bool use_worker,
 448		    int (*msg_handler)(struct vhost_dev *dev, u32 asid,
 449				       struct vhost_iotlb_msg *msg))
 450{
 451	struct vhost_virtqueue *vq;
 452	int i;
 453
 454	dev->vqs = vqs;
 455	dev->nvqs = nvqs;
 456	mutex_init(&dev->mutex);
 457	dev->log_ctx = NULL;
 458	dev->umem = NULL;
 459	dev->iotlb = NULL;
 460	dev->mm = NULL;
 461	memset(&dev->worker, 0, sizeof(dev->worker));
 462	init_llist_head(&dev->worker.work_list);
 463	dev->iov_limit = iov_limit;
 464	dev->weight = weight;
 465	dev->byte_weight = byte_weight;
 466	dev->use_worker = use_worker;
 467	dev->msg_handler = msg_handler;
 468	init_waitqueue_head(&dev->wait);
 469	INIT_LIST_HEAD(&dev->read_list);
 470	INIT_LIST_HEAD(&dev->pending_list);
 471	spin_lock_init(&dev->iotlb_lock);
 472
 473
 474	for (i = 0; i < dev->nvqs; ++i) {
 475		vq = dev->vqs[i];
 476		vq->log = NULL;
 477		vq->indirect = NULL;
 478		vq->heads = NULL;
 479		vq->dev = dev;
 480		mutex_init(&vq->mutex);
 481		vhost_vq_reset(dev, vq);
 482		if (vq->handle_kick)
 483			vhost_poll_init(&vq->poll, vq->handle_kick,
 484					EPOLLIN, dev);
 485	}
 486}
 487EXPORT_SYMBOL_GPL(vhost_dev_init);
 488
 489/* Caller should have device mutex */
 490long vhost_dev_check_owner(struct vhost_dev *dev)
 491{
 492	/* Are you the owner? If not, I don't think you mean to do that */
 493	return dev->mm == current->mm ? 0 : -EPERM;
 494}
 495EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
 496
 497/* Caller should have device mutex */
 498bool vhost_dev_has_owner(struct vhost_dev *dev)
 499{
 500	return dev->mm;
 501}
 502EXPORT_SYMBOL_GPL(vhost_dev_has_owner);
 503
 504static void vhost_attach_mm(struct vhost_dev *dev)
 505{
 506	/* No owner, become one */
 507	if (dev->use_worker) {
 508		dev->mm = get_task_mm(current);
 509	} else {
 510		/* vDPA device does not use worker thead, so there's
 511		 * no need to hold the address space for mm. This help
 512		 * to avoid deadlock in the case of mmap() which may
 513		 * held the refcnt of the file and depends on release
 514		 * method to remove vma.
 515		 */
 516		dev->mm = current->mm;
 517		mmgrab(dev->mm);
 518	}
 519}
 520
 521static void vhost_detach_mm(struct vhost_dev *dev)
 522{
 523	if (!dev->mm)
 524		return;
 525
 526	if (dev->use_worker)
 527		mmput(dev->mm);
 528	else
 529		mmdrop(dev->mm);
 530
 531	dev->mm = NULL;
 532}
 533
 534static void vhost_worker_free(struct vhost_dev *dev)
 535{
 536	if (!dev->worker.vtsk)
 537		return;
 538
 539	WARN_ON(!llist_empty(&dev->worker.work_list));
 540	vhost_task_stop(dev->worker.vtsk);
 541	dev->worker.kcov_handle = 0;
 542	dev->worker.vtsk = NULL;
 543}
 544
 545static int vhost_worker_create(struct vhost_dev *dev)
 546{
 547	struct vhost_task *vtsk;
 548	char name[TASK_COMM_LEN];
 549
 550	snprintf(name, sizeof(name), "vhost-%d", current->pid);
 551
 552	vtsk = vhost_task_create(vhost_worker, &dev->worker, name);
 553	if (!vtsk)
 554		return -ENOMEM;
 555
 556	dev->worker.kcov_handle = kcov_common_handle();
 557	dev->worker.vtsk = vtsk;
 558	vhost_task_start(vtsk);
 559	return 0;
 560}
 561
 562/* Caller should have device mutex */
 563long vhost_dev_set_owner(struct vhost_dev *dev)
 564{
 565	int err;
 566
 567	/* Is there an owner already? */
 568	if (vhost_dev_has_owner(dev)) {
 569		err = -EBUSY;
 570		goto err_mm;
 571	}
 572
 573	vhost_attach_mm(dev);
 574
 575	if (dev->use_worker) {
 576		err = vhost_worker_create(dev);
 577		if (err)
 578			goto err_worker;
 579	}
 580
 581	err = vhost_dev_alloc_iovecs(dev);
 582	if (err)
 583		goto err_iovecs;
 584
 585	return 0;
 586err_iovecs:
 587	vhost_worker_free(dev);
 588err_worker:
 589	vhost_detach_mm(dev);
 590err_mm:
 591	return err;
 592}
 593EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
 594
/*
 * Allocate an IOTLB limited to max_iotlb_entries entries, created with
 * VHOST_IOTLB_FLAG_RETIRE. Returns whatever vhost_iotlb_alloc() returns.
 */
static struct vhost_iotlb *iotlb_alloc(void)
{
	return vhost_iotlb_alloc(max_iotlb_entries,
				 VHOST_IOTLB_FLAG_RETIRE);
}
 600
/*
 * Allocate the fresh IOTLB that a subsequent vhost_dev_reset_owner() call
 * installs as the device's umem.
 */
struct vhost_iotlb *vhost_dev_reset_owner_prepare(void)
{
	return iotlb_alloc();
}
EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);
 606
 607/* Caller should have device mutex */
 608void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
 609{
 610	int i;
 611
 612	vhost_dev_cleanup(dev);
 613
 614	dev->umem = umem;
 615	/* We don't need VQ locks below since vhost_dev_cleanup makes sure
 616	 * VQs aren't running.
 617	 */
 618	for (i = 0; i < dev->nvqs; ++i)
 619		dev->vqs[i]->umem = umem;
 620}
 621EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);
 622
 623void vhost_dev_stop(struct vhost_dev *dev)
 624{
 625	int i;
 626
 627	for (i = 0; i < dev->nvqs; ++i) {
 628		if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick)
 629			vhost_poll_stop(&dev->vqs[i]->poll);
 630	}
 631
 632	vhost_dev_flush(dev);
 633}
 634EXPORT_SYMBOL_GPL(vhost_dev_stop);
 635
 636void vhost_clear_msg(struct vhost_dev *dev)
 637{
 638	struct vhost_msg_node *node, *n;
 639
 640	spin_lock(&dev->iotlb_lock);
 641
 642	list_for_each_entry_safe(node, n, &dev->read_list, node) {
 643		list_del(&node->node);
 644		kfree(node);
 645	}
 646
 647	list_for_each_entry_safe(node, n, &dev->pending_list, node) {
 648		list_del(&node->node);
 649		kfree(node);
 650	}
 651
 652	spin_unlock(&dev->iotlb_lock);
 653}
 654EXPORT_SYMBOL_GPL(vhost_clear_msg);
 655
 656void vhost_dev_cleanup(struct vhost_dev *dev)
 657{
 658	int i;
 659
 660	for (i = 0; i < dev->nvqs; ++i) {
 661		if (dev->vqs[i]->error_ctx)
 662			eventfd_ctx_put(dev->vqs[i]->error_ctx);
 663		if (dev->vqs[i]->kick)
 664			fput(dev->vqs[i]->kick);
 665		if (dev->vqs[i]->call_ctx.ctx)
 666			eventfd_ctx_put(dev->vqs[i]->call_ctx.ctx);
 667		vhost_vq_reset(dev, dev->vqs[i]);
 668	}
 669	vhost_dev_free_iovecs(dev);
 670	if (dev->log_ctx)
 671		eventfd_ctx_put(dev->log_ctx);
 672	dev->log_ctx = NULL;
 673	/* No one will access memory at this point */
 674	vhost_iotlb_free(dev->umem);
 675	dev->umem = NULL;
 676	vhost_iotlb_free(dev->iotlb);
 677	dev->iotlb = NULL;
 678	vhost_clear_msg(dev);
 679	wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
 680	vhost_worker_free(dev);
 681	vhost_detach_mm(dev);
 682}
 683EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
 684
 685static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
 686{
 687	u64 a = addr / VHOST_PAGE_SIZE / 8;
 688
 689	/* Make sure 64 bit math will not overflow. */
 690	if (a > ULONG_MAX - (unsigned long)log_base ||
 691	    a + (unsigned long)log_base > ULONG_MAX)
 692		return false;
 693
 694	return access_ok(log_base + a,
 695			 (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);
 696}
 697
 698/* Make sure 64 bit math will not overflow. */
 699static bool vhost_overflow(u64 uaddr, u64 size)
 700{
 701	if (uaddr > ULONG_MAX || size > ULONG_MAX)
 702		return true;
 703
 704	if (!size)
 705		return false;
 706
 707	return uaddr > ULONG_MAX - size + 1;
 708}
 709
 710/* Caller should have vq mutex and device mutex. */
 711static bool vq_memory_access_ok(void __user *log_base, struct vhost_iotlb *umem,
 712				int log_all)
 713{
 714	struct vhost_iotlb_map *map;
 715
 716	if (!umem)
 717		return false;
 718
 719	list_for_each_entry(map, &umem->list, link) {
 720		unsigned long a = map->addr;
 721
 722		if (vhost_overflow(map->addr, map->size))
 723			return false;
 724
 725
 726		if (!access_ok((void __user *)a, map->size))
 727			return false;
 728		else if (log_all && !log_access_ok(log_base,
 729						   map->start,
 730						   map->size))
 731			return false;
 732	}
 733	return true;
 734}
 735
 736static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq,
 737					       u64 addr, unsigned int size,
 738					       int type)
 739{
 740	const struct vhost_iotlb_map *map = vq->meta_iotlb[type];
 741
 742	if (!map)
 743		return NULL;
 744
 745	return (void __user *)(uintptr_t)(map->addr + addr - map->start);
 746}
 747
 748/* Can we switch to this memory table? */
 749/* Caller should have device mutex but not vq mutex */
 750static bool memory_access_ok(struct vhost_dev *d, struct vhost_iotlb *umem,
 751			     int log_all)
 752{
 753	int i;
 754
 755	for (i = 0; i < d->nvqs; ++i) {
 756		bool ok;
 757		bool log;
 758
 759		mutex_lock(&d->vqs[i]->mutex);
 760		log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL);
 761		/* If ring is inactive, will check when it's enabled. */
 762		if (d->vqs[i]->private_data)
 763			ok = vq_memory_access_ok(d->vqs[i]->log_base,
 764						 umem, log);
 765		else
 766			ok = true;
 767		mutex_unlock(&d->vqs[i]->mutex);
 768		if (!ok)
 769			return false;
 770	}
 771	return true;
 772}
 773
/* Forward declaration: used by the copy/get helpers that follow. */
static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
			  struct iovec iov[], int iov_size, int access);
 776
 777static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to,
 778			      const void *from, unsigned size)
 779{
 780	int ret;
 781
 782	if (!vq->iotlb)
 783		return __copy_to_user(to, from, size);
 784	else {
 785		/* This function should be called after iotlb
 786		 * prefetch, which means we're sure that all vq
 787		 * could be access through iotlb. So -EAGAIN should
 788		 * not happen in this case.
 789		 */
 790		struct iov_iter t;
 791		void __user *uaddr = vhost_vq_meta_fetch(vq,
 792				     (u64)(uintptr_t)to, size,
 793				     VHOST_ADDR_USED);
 794
 795		if (uaddr)
 796			return __copy_to_user(uaddr, from, size);
 797
 798		ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov,
 799				     ARRAY_SIZE(vq->iotlb_iov),
 800				     VHOST_ACCESS_WO);
 801		if (ret < 0)
 802			goto out;
 803		iov_iter_init(&t, ITER_DEST, vq->iotlb_iov, ret, size);
 804		ret = copy_to_iter(from, size, &t);
 805		if (ret == size)
 806			ret = 0;
 807	}
 808out:
 809	return ret;
 810}
 811
 812static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
 813				void __user *from, unsigned size)
 814{
 815	int ret;
 816
 817	if (!vq->iotlb)
 818		return __copy_from_user(to, from, size);
 819	else {
 820		/* This function should be called after iotlb
 821		 * prefetch, which means we're sure that vq
 822		 * could be access through iotlb. So -EAGAIN should
 823		 * not happen in this case.
 824		 */
 825		void __user *uaddr = vhost_vq_meta_fetch(vq,
 826				     (u64)(uintptr_t)from, size,
 827				     VHOST_ADDR_DESC);
 828		struct iov_iter f;
 829
 830		if (uaddr)
 831			return __copy_from_user(to, uaddr, size);
 832
 833		ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov,
 834				     ARRAY_SIZE(vq->iotlb_iov),
 835				     VHOST_ACCESS_RO);
 836		if (ret < 0) {
 837			vq_err(vq, "IOTLB translation failure: uaddr "
 838			       "%p size 0x%llx\n", from,
 839			       (unsigned long long) size);
 840			goto out;
 841		}
 842		iov_iter_init(&f, ITER_SOURCE, vq->iotlb_iov, ret, size);
 843		ret = copy_from_iter(to, size, &f);
 844		if (ret == size)
 845			ret = 0;
 846	}
 847
 848out:
 849	return ret;
 850}
 851
/*
 * Slow-path translation of a metadata address through translate_desc().
 *
 * Returns the translated user pointer, or NULL (after logging with vq_err())
 * if the translation fails or does not yield exactly one iovec covering all
 * @size bytes.
 *
 * NOTE(review): @type is not used here and the translation always requests
 * VHOST_ACCESS_RO, including when reached from vhost_put_user(), which
 * writes through the returned pointer - confirm this is intended.
 */
static void __user *__vhost_get_user_slow(struct vhost_virtqueue *vq,
					  void __user *addr, unsigned int size,
					  int type)
{
	int ret;

	ret = translate_desc(vq, (u64)(uintptr_t)addr, size, vq->iotlb_iov,
			     ARRAY_SIZE(vq->iotlb_iov),
			     VHOST_ACCESS_RO);
	if (ret < 0) {
		vq_err(vq, "IOTLB translation failure: uaddr "
			"%p size 0x%llx\n", addr,
			(unsigned long long) size);
		return NULL;
	}

	/* The access must be satisfied by one contiguous chunk. */
	if (ret != 1 || vq->iotlb_iov[0].iov_len != size) {
		vq_err(vq, "Non atomic userspace memory access: uaddr "
			"%p size 0x%llx\n", addr,
			(unsigned long long) size);
		return NULL;
	}

	return vq->iotlb_iov[0].iov_base;
}
 877
 878/* This function should be called after iotlb
 879 * prefetch, which means we're sure that vq
 880 * could be access through iotlb. So -EAGAIN should
 881 * not happen in this case.
 882 */
 883static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 884					    void __user *addr, unsigned int size,
 885					    int type)
 886{
 887	void __user *uaddr = vhost_vq_meta_fetch(vq,
 888			     (u64)(uintptr_t)addr, size, type);
 889	if (uaddr)
 890		return uaddr;
 891
 892	return __vhost_get_user_slow(vq, addr, size, type);
 893}
 894
 895#define vhost_put_user(vq, x, ptr)		\
 896({ \
 897	int ret; \
 898	if (!vq->iotlb) { \
 899		ret = __put_user(x, ptr); \
 900	} else { \
 901		__typeof__(ptr) to = \
 902			(__typeof__(ptr)) __vhost_get_user(vq, ptr,	\
 903					  sizeof(*ptr), VHOST_ADDR_USED); \
 904		if (to != NULL) \
 905			ret = __put_user(x, to); \
 906		else \
 907			ret = -EFAULT;	\
 908	} \
 909	ret; \
 910})
 911
/*
 * Write vq->avail_idx, converted with cpu_to_vhost16(), to the
 * vhost_avail_event() slot. Returns the vhost_put_user() result.
 */
static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
{
	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
			      vhost_avail_event(vq));
}
 917
/*
 * Copy @count used elements starting at @head into the used ring at
 * position @idx. The caller supplies an in-range @idx/@count; no wrapping
 * is done here. Returns the vhost_copy_to_user() result.
 */
static inline int vhost_put_used(struct vhost_virtqueue *vq,
				 struct vring_used_elem *head, int idx,
				 int count)
{
	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
				  count * sizeof(*head));
}
 925
/*
 * Write vq->used_flags, converted with cpu_to_vhost16(), to the used ring's
 * flags field. Returns the vhost_put_user() result.
 */
static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)

{
	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
			      &vq->used->flags);
}
 932
/*
 * Write vq->last_used_idx, converted with cpu_to_vhost16(), to the used
 * ring's idx field. Returns the vhost_put_user() result.
 */
static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)

{
	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
			      &vq->used->idx);
}
 939
 940#define vhost_get_user(vq, x, ptr, type)		\
 941({ \
 942	int ret; \
 943	if (!vq->iotlb) { \
 944		ret = __get_user(x, ptr); \
 945	} else { \
 946		__typeof__(ptr) from = \
 947			(__typeof__(ptr)) __vhost_get_user(vq, ptr, \
 948							   sizeof(*ptr), \
 949							   type); \
 950		if (from != NULL) \
 951			ret = __get_user(x, from); \
 952		else \
 953			ret = -EFAULT; \
 954	} \
 955	ret; \
 956})
 957
/* vhost_get_user() for a location in the avail ring. */
#define vhost_get_avail(vq, x, ptr) \
	vhost_get_user(vq, x, ptr, VHOST_ADDR_AVAIL)

/* vhost_get_user() for a location in the used ring. */
#define vhost_get_used(vq, x, ptr) \
	vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
 963
 964static void vhost_dev_lock_vqs(struct vhost_dev *d)
 965{
 966	int i = 0;
 967	for (i = 0; i < d->nvqs; ++i)
 968		mutex_lock_nested(&d->vqs[i]->mutex, i);
 969}
 970
 971static void vhost_dev_unlock_vqs(struct vhost_dev *d)
 972{
 973	int i = 0;
 974	for (i = 0; i < d->nvqs; ++i)
 975		mutex_unlock(&d->vqs[i]->mutex);
 976}
 977
/*
 * Read the avail ring's idx field into *@idx (raw __virtio16, no endian
 * conversion). Returns the vhost_get_user() result.
 */
static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
				      __virtio16 *idx)
{
	return vhost_get_avail(vq, *idx, &vq->avail->idx);
}
 983
/*
 * Read avail ring entry @idx into *@head (raw __virtio16).
 *
 * @idx is reduced with "& (vq->num - 1)", which wraps correctly only when
 * vq->num is a power of two - presumably enforced where num is set; verify.
 * Returns the vhost_get_user() result.
 */
static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
				       __virtio16 *head, int idx)
{
	return vhost_get_avail(vq, *head,
			       &vq->avail->ring[idx & (vq->num - 1)]);
}
 990
/*
 * Read the avail ring's flags field into *@flags (raw __virtio16).
 * Returns the vhost_get_user() result.
 */
static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
					__virtio16 *flags)
{
	return vhost_get_avail(vq, *flags, &vq->avail->flags);
}
 996
/*
 * Read the vhost_used_event() slot, which lives at the end of the avail
 * ring, into *@event (raw __virtio16). Returns the vhost_get_user() result.
 */
static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
				       __virtio16 *event)
{
	return vhost_get_avail(vq, *event, vhost_used_event(vq));
}
1002
/*
 * Read the used ring's idx field into *@idx (raw __virtio16).
 * Returns the vhost_get_user() result.
 */
static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
				     __virtio16 *idx)
{
	return vhost_get_used(vq, *idx, &vq->used->idx);
}
1008
/*
 * Copy descriptor number @idx of the descriptor table into *@desc.
 * @idx is not range-checked here. Returns the vhost_copy_from_user() result.
 */
static inline int vhost_get_desc(struct vhost_virtqueue *vq,
				 struct vring_desc *desc, int idx)
{
	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
}
1014
1015static void vhost_iotlb_notify_vq(struct vhost_dev *d,
1016				  struct vhost_iotlb_msg *msg)
1017{
1018	struct vhost_msg_node *node, *n;
1019
1020	spin_lock(&d->iotlb_lock);
1021
1022	list_for_each_entry_safe(node, n, &d->pending_list, node) {
1023		struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb;
1024		if (msg->iova <= vq_msg->iova &&
1025		    msg->iova + msg->size - 1 >= vq_msg->iova &&
1026		    vq_msg->type == VHOST_IOTLB_MISS) {
1027			vhost_poll_queue(&node->vq->poll);
1028			list_del(&node->node);
1029			kfree(node);
1030		}
1031	}
1032
1033	spin_unlock(&d->iotlb_lock);
1034}
1035
1036static bool umem_access_ok(u64 uaddr, u64 size, int access)
1037{
1038	unsigned long a = uaddr;
1039
1040	/* Make sure 64 bit math will not overflow. */
1041	if (vhost_overflow(uaddr, size))
1042		return false;
1043
1044	if ((access & VHOST_ACCESS_RO) &&
1045	    !access_ok((void __user *)a, size))
1046		return false;
1047	if ((access & VHOST_ACCESS_WO) &&
1048	    !access_ok((void __user *)a, size))
1049		return false;
1050	return true;
1051}
1052
/*
 * Default IOTLB message handler (used when the device has no msg_handler).
 *
 * Only address space 0 is supported. Runs with the device mutex and all vq
 * mutexes held.
 *
 * VHOST_IOTLB_UPDATE adds the range to the device IOTLB and wakes vqs that
 * were waiting for it; VHOST_IOTLB_INVALIDATE removes the range. Both drop
 * the cached metadata mappings of every vq first.
 *
 * Returns 0 on success, -EINVAL for a non-zero @asid or unknown message
 * type, -EFAULT if the device has no IOTLB or the user range is invalid,
 * -ENOMEM if the range could not be added.
 */
static int vhost_process_iotlb_msg(struct vhost_dev *dev, u32 asid,
				   struct vhost_iotlb_msg *msg)
{
	int ret = 0;

	if (asid != 0)
		return -EINVAL;

	/* Device mutex first, then every vq mutex. */
	mutex_lock(&dev->mutex);
	vhost_dev_lock_vqs(dev);
	switch (msg->type) {
	case VHOST_IOTLB_UPDATE:
		if (!dev->iotlb) {
			ret = -EFAULT;
			break;
		}
		if (!umem_access_ok(msg->uaddr, msg->size, msg->perm)) {
			ret = -EFAULT;
			break;
		}
		/* Cached translations may be stale once the IOTLB changes. */
		vhost_vq_meta_reset(dev);
		if (vhost_iotlb_add_range(dev->iotlb, msg->iova,
					  msg->iova + msg->size - 1,
					  msg->uaddr, msg->perm)) {
			ret = -ENOMEM;
			break;
		}
		/* Re-queue vqs whose pending miss is covered by this update. */
		vhost_iotlb_notify_vq(dev, msg);
		break;
	case VHOST_IOTLB_INVALIDATE:
		if (!dev->iotlb) {
			ret = -EFAULT;
			break;
		}
		vhost_vq_meta_reset(dev);
		vhost_iotlb_del_range(dev->iotlb, msg->iova,
				      msg->iova + msg->size - 1);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	vhost_dev_unlock_vqs(dev);
	mutex_unlock(&dev->mutex);

	return ret;
}
1101ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
1102			     struct iov_iter *from)
1103{
1104	struct vhost_iotlb_msg msg;
1105	size_t offset;
1106	int type, ret;
1107	u32 asid = 0;
1108
1109	ret = copy_from_iter(&type, sizeof(type), from);
1110	if (ret != sizeof(type)) {
1111		ret = -EINVAL;
1112		goto done;
1113	}
1114
1115	switch (type) {
1116	case VHOST_IOTLB_MSG:
1117		/* There maybe a hole after type for V1 message type,
1118		 * so skip it here.
1119		 */
1120		offset = offsetof(struct vhost_msg, iotlb) - sizeof(int);
1121		break;
1122	case VHOST_IOTLB_MSG_V2:
1123		if (vhost_backend_has_feature(dev->vqs[0],
1124					      VHOST_BACKEND_F_IOTLB_ASID)) {
1125			ret = copy_from_iter(&asid, sizeof(asid), from);
1126			if (ret != sizeof(asid)) {
1127				ret = -EINVAL;
1128				goto done;
1129			}
1130			offset = 0;
1131		} else
1132			offset = sizeof(__u32);
1133		break;
1134	default:
1135		ret = -EINVAL;
1136		goto done;
1137	}
1138
1139	iov_iter_advance(from, offset);
1140	ret = copy_from_iter(&msg, sizeof(msg), from);
1141	if (ret != sizeof(msg)) {
1142		ret = -EINVAL;
1143		goto done;
1144	}
1145
1146	if ((msg.type == VHOST_IOTLB_UPDATE ||
1147	     msg.type == VHOST_IOTLB_INVALIDATE) &&
1148	     msg.size == 0) {
1149		ret = -EINVAL;
1150		goto done;
1151	}
1152
1153	if (dev->msg_handler)
1154		ret = dev->msg_handler(dev, asid, &msg);
1155	else
1156		ret = vhost_process_iotlb_msg(dev, asid, &msg);
1157	if (ret) {
1158		ret = -EFAULT;
1159		goto done;
1160	}
1161
1162	ret = (type == VHOST_IOTLB_MSG) ? sizeof(struct vhost_msg) :
1163	      sizeof(struct vhost_msg_v2);
1164done:
1165	return ret;
1166}
1167EXPORT_SYMBOL(vhost_chr_write_iter);
1168
1169__poll_t vhost_chr_poll(struct file *file, struct vhost_dev *dev,
1170			    poll_table *wait)
1171{
1172	__poll_t mask = 0;
1173
1174	poll_wait(file, &dev->wait, wait);
1175
1176	if (!list_empty(&dev->read_list))
1177		mask |= EPOLLIN | EPOLLRDNORM;
1178
1179	return mask;
1180}
1181EXPORT_SYMBOL(vhost_chr_poll);
1182
1183ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
1184			    int noblock)
1185{
1186	DEFINE_WAIT(wait);
1187	struct vhost_msg_node *node;
1188	ssize_t ret = 0;
1189	unsigned size = sizeof(struct vhost_msg);
1190
1191	if (iov_iter_count(to) < size)
1192		return 0;
1193
1194	while (1) {
1195		if (!noblock)
1196			prepare_to_wait(&dev->wait, &wait,
1197					TASK_INTERRUPTIBLE);
1198
1199		node = vhost_dequeue_msg(dev, &dev->read_list);
1200		if (node)
1201			break;
1202		if (noblock) {
1203			ret = -EAGAIN;
1204			break;
1205		}
1206		if (signal_pending(current)) {
1207			ret = -ERESTARTSYS;
1208			break;
1209		}
1210		if (!dev->iotlb) {
1211			ret = -EBADFD;
1212			break;
1213		}
1214
1215		schedule();
1216	}
1217
1218	if (!noblock)
1219		finish_wait(&dev->wait, &wait);
1220
1221	if (node) {
1222		struct vhost_iotlb_msg *msg;
1223		void *start = &node->msg;
1224
1225		switch (node->msg.type) {
1226		case VHOST_IOTLB_MSG:
1227			size = sizeof(node->msg);
1228			msg = &node->msg.iotlb;
1229			break;
1230		case VHOST_IOTLB_MSG_V2:
1231			size = sizeof(node->msg_v2);
1232			msg = &node->msg_v2.iotlb;
1233			break;
1234		default:
1235			BUG();
1236			break;
1237		}
1238
1239		ret = copy_to_iter(start, size, to);
1240		if (ret != size || msg->type != VHOST_IOTLB_MISS) {
1241			kfree(node);
1242			return ret;
1243		}
1244		vhost_enqueue_msg(dev, &dev->pending_list, node);
1245	}
1246
1247	return ret;
1248}
1249EXPORT_SYMBOL_GPL(vhost_chr_read_iter);
1250
1251static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
1252{
1253	struct vhost_dev *dev = vq->dev;
1254	struct vhost_msg_node *node;
1255	struct vhost_iotlb_msg *msg;
1256	bool v2 = vhost_backend_has_feature(vq, VHOST_BACKEND_F_IOTLB_MSG_V2);
1257
1258	node = vhost_new_msg(vq, v2 ? VHOST_IOTLB_MSG_V2 : VHOST_IOTLB_MSG);
1259	if (!node)
1260		return -ENOMEM;
1261
1262	if (v2) {
1263		node->msg_v2.type = VHOST_IOTLB_MSG_V2;
1264		msg = &node->msg_v2.iotlb;
1265	} else {
1266		msg = &node->msg.iotlb;
1267	}
1268
1269	msg->type = VHOST_IOTLB_MISS;
1270	msg->iova = iova;
1271	msg->perm = access;
1272
1273	vhost_enqueue_msg(dev, &dev->read_list, node);
1274
1275	return 0;
1276}
1277
1278static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
1279			 vring_desc_t __user *desc,
1280			 vring_avail_t __user *avail,
1281			 vring_used_t __user *used)
1282
1283{
1284	/* If an IOTLB device is present, the vring addresses are
1285	 * GIOVAs. Access validation occurs at prefetch time. */
1286	if (vq->iotlb)
1287		return true;
1288
1289	return access_ok(desc, vhost_get_desc_size(vq, num)) &&
1290	       access_ok(avail, vhost_get_avail_size(vq, num)) &&
1291	       access_ok(used, vhost_get_used_size(vq, num));
1292}
1293
1294static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
1295				 const struct vhost_iotlb_map *map,
1296				 int type)
1297{
1298	int access = (type == VHOST_ADDR_USED) ?
1299		     VHOST_ACCESS_WO : VHOST_ACCESS_RO;
1300
1301	if (likely(map->perm & access))
1302		vq->meta_iotlb[type] = map;
1303}
1304
1305static bool iotlb_access_ok(struct vhost_virtqueue *vq,
1306			    int access, u64 addr, u64 len, int type)
1307{
1308	const struct vhost_iotlb_map *map;
1309	struct vhost_iotlb *umem = vq->iotlb;
1310	u64 s = 0, size, orig_addr = addr, last = addr + len - 1;
1311
1312	if (vhost_vq_meta_fetch(vq, addr, len, type))
1313		return true;
1314
1315	while (len > s) {
1316		map = vhost_iotlb_itree_first(umem, addr, last);
1317		if (map == NULL || map->start > addr) {
1318			vhost_iotlb_miss(vq, addr, access);
1319			return false;
1320		} else if (!(map->perm & access)) {
1321			/* Report the possible access violation by
1322			 * request another translation from userspace.
1323			 */
1324			return false;
1325		}
1326
1327		size = map->size - addr + map->start;
1328
1329		if (orig_addr == addr && size >= len)
1330			vhost_vq_meta_update(vq, map, type);
1331
1332		s += size;
1333		addr += size;
1334	}
1335
1336	return true;
1337}
1338
1339int vq_meta_prefetch(struct vhost_virtqueue *vq)
1340{
1341	unsigned int num = vq->num;
1342
1343	if (!vq->iotlb)
1344		return 1;
1345
1346	return iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->desc,
1347			       vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
1348	       iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->avail,
1349			       vhost_get_avail_size(vq, num),
1350			       VHOST_ADDR_AVAIL) &&
1351	       iotlb_access_ok(vq, VHOST_MAP_WO, (u64)(uintptr_t)vq->used,
1352			       vhost_get_used_size(vq, num), VHOST_ADDR_USED);
1353}
1354EXPORT_SYMBOL_GPL(vq_meta_prefetch);
1355
1356/* Can we log writes? */
1357/* Caller should have device mutex but not vq mutex */
1358bool vhost_log_access_ok(struct vhost_dev *dev)
1359{
1360	return memory_access_ok(dev, dev->umem, 1);
1361}
1362EXPORT_SYMBOL_GPL(vhost_log_access_ok);
1363
1364static bool vq_log_used_access_ok(struct vhost_virtqueue *vq,
1365				  void __user *log_base,
1366				  bool log_used,
1367				  u64 log_addr)
1368{
1369	/* If an IOTLB device is present, log_addr is a GIOVA that
1370	 * will never be logged by log_used(). */
1371	if (vq->iotlb)
1372		return true;
1373
1374	return !log_used || log_access_ok(log_base, log_addr,
1375					  vhost_get_used_size(vq, vq->num));
1376}
1377
1378/* Verify access for write logging. */
1379/* Caller should have vq mutex and device mutex */
1380static bool vq_log_access_ok(struct vhost_virtqueue *vq,
1381			     void __user *log_base)
1382{
1383	return vq_memory_access_ok(log_base, vq->umem,
1384				   vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
1385		vq_log_used_access_ok(vq, log_base, vq->log_used, vq->log_addr);
1386}
1387
1388/* Can we start vq? */
1389/* Caller should have vq mutex and device mutex */
1390bool vhost_vq_access_ok(struct vhost_virtqueue *vq)
1391{
1392	if (!vq_log_access_ok(vq, vq->log_base))
1393		return false;
1394
1395	return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
1396}
1397EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
1398
1399static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
1400{
1401	struct vhost_memory mem, *newmem;
1402	struct vhost_memory_region *region;
1403	struct vhost_iotlb *newumem, *oldumem;
1404	unsigned long size = offsetof(struct vhost_memory, regions);
1405	int i;
1406
1407	if (copy_from_user(&mem, m, size))
1408		return -EFAULT;
1409	if (mem.padding)
1410		return -EOPNOTSUPP;
1411	if (mem.nregions > max_mem_regions)
1412		return -E2BIG;
1413	newmem = kvzalloc(struct_size(newmem, regions, mem.nregions),
1414			GFP_KERNEL);
1415	if (!newmem)
1416		return -ENOMEM;
1417
1418	memcpy(newmem, &mem, size);
1419	if (copy_from_user(newmem->regions, m->regions,
1420			   flex_array_size(newmem, regions, mem.nregions))) {
1421		kvfree(newmem);
1422		return -EFAULT;
1423	}
1424
1425	newumem = iotlb_alloc();
1426	if (!newumem) {
1427		kvfree(newmem);
1428		return -ENOMEM;
1429	}
1430
1431	for (region = newmem->regions;
1432	     region < newmem->regions + mem.nregions;
1433	     region++) {
1434		if (vhost_iotlb_add_range(newumem,
1435					  region->guest_phys_addr,
1436					  region->guest_phys_addr +
1437					  region->memory_size - 1,
1438					  region->userspace_addr,
1439					  VHOST_MAP_RW))
1440			goto err;
1441	}
1442
1443	if (!memory_access_ok(d, newumem, 0))
1444		goto err;
1445
1446	oldumem = d->umem;
1447	d->umem = newumem;
1448
1449	/* All memory accesses are done under some VQ mutex. */
1450	for (i = 0; i < d->nvqs; ++i) {
1451		mutex_lock(&d->vqs[i]->mutex);
1452		d->vqs[i]->umem = newumem;
1453		mutex_unlock(&d->vqs[i]->mutex);
1454	}
1455
1456	kvfree(newmem);
1457	vhost_iotlb_free(oldumem);
1458	return 0;
1459
1460err:
1461	vhost_iotlb_free(newumem);
1462	kvfree(newmem);
1463	return -EFAULT;
1464}
1465
1466static long vhost_vring_set_num(struct vhost_dev *d,
1467				struct vhost_virtqueue *vq,
1468				void __user *argp)
1469{
1470	struct vhost_vring_state s;
1471
1472	/* Resizing ring with an active backend?
1473	 * You don't want to do that. */
1474	if (vq->private_data)
1475		return -EBUSY;
1476
1477	if (copy_from_user(&s, argp, sizeof s))
1478		return -EFAULT;
1479
1480	if (!s.num || s.num > 0xffff || (s.num & (s.num - 1)))
1481		return -EINVAL;
1482	vq->num = s.num;
1483
1484	return 0;
1485}
1486
1487static long vhost_vring_set_addr(struct vhost_dev *d,
1488				 struct vhost_virtqueue *vq,
1489				 void __user *argp)
1490{
1491	struct vhost_vring_addr a;
1492
1493	if (copy_from_user(&a, argp, sizeof a))
1494		return -EFAULT;
1495	if (a.flags & ~(0x1 << VHOST_VRING_F_LOG))
1496		return -EOPNOTSUPP;
1497
1498	/* For 32bit, verify that the top 32bits of the user
1499	   data are set to zero. */
1500	if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr ||
1501	    (u64)(unsigned long)a.used_user_addr != a.used_user_addr ||
1502	    (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr)
1503		return -EFAULT;
1504
1505	/* Make sure it's safe to cast pointers to vring types. */
1506	BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE);
1507	BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE);
1508	if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) ||
1509	    (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) ||
1510	    (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1)))
1511		return -EINVAL;
1512
1513	/* We only verify access here if backend is configured.
1514	 * If it is not, we don't as size might not have been setup.
1515	 * We will verify when backend is configured. */
1516	if (vq->private_data) {
1517		if (!vq_access_ok(vq, vq->num,
1518			(void __user *)(unsigned long)a.desc_user_addr,
1519			(void __user *)(unsigned long)a.avail_user_addr,
1520			(void __user *)(unsigned long)a.used_user_addr))
1521			return -EINVAL;
1522
1523		/* Also validate log access for used ring if enabled. */
1524		if (!vq_log_used_access_ok(vq, vq->log_base,
1525				a.flags & (0x1 << VHOST_VRING_F_LOG),
1526				a.log_guest_addr))
1527			return -EINVAL;
1528	}
1529
1530	vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
1531	vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
1532	vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
1533	vq->log_addr = a.log_guest_addr;
1534	vq->used = (void __user *)(unsigned long)a.used_user_addr;
1535
1536	return 0;
1537}
1538
1539static long vhost_vring_set_num_addr(struct vhost_dev *d,
1540				     struct vhost_virtqueue *vq,
1541				     unsigned int ioctl,
1542				     void __user *argp)
1543{
1544	long r;
1545
1546	mutex_lock(&vq->mutex);
1547
1548	switch (ioctl) {
1549	case VHOST_SET_VRING_NUM:
1550		r = vhost_vring_set_num(d, vq, argp);
1551		break;
1552	case VHOST_SET_VRING_ADDR:
1553		r = vhost_vring_set_addr(d, vq, argp);
1554		break;
1555	default:
1556		BUG();
1557	}
1558
1559	mutex_unlock(&vq->mutex);
1560
1561	return r;
1562}
1563long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
1564{
1565	struct file *eventfp, *filep = NULL;
1566	bool pollstart = false, pollstop = false;
1567	struct eventfd_ctx *ctx = NULL;
1568	u32 __user *idxp = argp;
1569	struct vhost_virtqueue *vq;
1570	struct vhost_vring_state s;
1571	struct vhost_vring_file f;
1572	u32 idx;
1573	long r;
1574
1575	r = get_user(idx, idxp);
1576	if (r < 0)
1577		return r;
1578	if (idx >= d->nvqs)
1579		return -ENOBUFS;
1580
1581	idx = array_index_nospec(idx, d->nvqs);
1582	vq = d->vqs[idx];
1583
1584	if (ioctl == VHOST_SET_VRING_NUM ||
1585	    ioctl == VHOST_SET_VRING_ADDR) {
1586		return vhost_vring_set_num_addr(d, vq, ioctl, argp);
1587	}
1588
1589	mutex_lock(&vq->mutex);
1590
1591	switch (ioctl) {
1592	case VHOST_SET_VRING_BASE:
1593		/* Moving base with an active backend?
1594		 * You don't want to do that. */
1595		if (vq->private_data) {
1596			r = -EBUSY;
1597			break;
1598		}
1599		if (copy_from_user(&s, argp, sizeof s)) {
1600			r = -EFAULT;
1601			break;
1602		}
1603		if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
1604			vq->last_avail_idx = s.num & 0xffff;
1605			vq->last_used_idx = (s.num >> 16) & 0xffff;
1606		} else {
1607			if (s.num > 0xffff) {
1608				r = -EINVAL;
1609				break;
1610			}
1611			vq->last_avail_idx = s.num;
1612		}
1613		/* Forget the cached index value. */
1614		vq->avail_idx = vq->last_avail_idx;
1615		break;
1616	case VHOST_GET_VRING_BASE:
1617		s.index = idx;
1618		if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
1619			s.num = (u32)vq->last_avail_idx | ((u32)vq->last_used_idx << 16);
1620		else
1621			s.num = vq->last_avail_idx;
1622		if (copy_to_user(argp, &s, sizeof s))
1623			r = -EFAULT;
1624		break;
1625	case VHOST_SET_VRING_KICK:
1626		if (copy_from_user(&f, argp, sizeof f)) {
1627			r = -EFAULT;
1628			break;
1629		}
1630		eventfp = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_fget(f.fd);
1631		if (IS_ERR(eventfp)) {
1632			r = PTR_ERR(eventfp);
1633			break;
1634		}
1635		if (eventfp != vq->kick) {
1636			pollstop = (filep = vq->kick) != NULL;
1637			pollstart = (vq->kick = eventfp) != NULL;
1638		} else
1639			filep = eventfp;
1640		break;
1641	case VHOST_SET_VRING_CALL:
1642		if (copy_from_user(&f, argp, sizeof f)) {
1643			r = -EFAULT;
1644			break;
1645		}
1646		ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
1647		if (IS_ERR(ctx)) {
1648			r = PTR_ERR(ctx);
1649			break;
1650		}
1651
1652		swap(ctx, vq->call_ctx.ctx);
1653		break;
1654	case VHOST_SET_VRING_ERR:
1655		if (copy_from_user(&f, argp, sizeof f)) {
1656			r = -EFAULT;
1657			break;
1658		}
1659		ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
1660		if (IS_ERR(ctx)) {
1661			r = PTR_ERR(ctx);
1662			break;
1663		}
1664		swap(ctx, vq->error_ctx);
1665		break;
1666	case VHOST_SET_VRING_ENDIAN:
1667		r = vhost_set_vring_endian(vq, argp);
1668		break;
1669	case VHOST_GET_VRING_ENDIAN:
1670		r = vhost_get_vring_endian(vq, idx, argp);
1671		break;
1672	case VHOST_SET_VRING_BUSYLOOP_TIMEOUT:
1673		if (copy_from_user(&s, argp, sizeof(s))) {
1674			r = -EFAULT;
1675			break;
1676		}
1677		vq->busyloop_timeout = s.num;
1678		break;
1679	case VHOST_GET_VRING_BUSYLOOP_TIMEOUT:
1680		s.index = idx;
1681		s.num = vq->busyloop_timeout;
1682		if (copy_to_user(argp, &s, sizeof(s)))
1683			r = -EFAULT;
1684		break;
1685	default:
1686		r = -ENOIOCTLCMD;
1687	}
1688
1689	if (pollstop && vq->handle_kick)
1690		vhost_poll_stop(&vq->poll);
1691
1692	if (!IS_ERR_OR_NULL(ctx))
1693		eventfd_ctx_put(ctx);
1694	if (filep)
1695		fput(filep);
1696
1697	if (pollstart && vq->handle_kick)
1698		r = vhost_poll_start(&vq->poll, vq->kick);
1699
1700	mutex_unlock(&vq->mutex);
1701
1702	if (pollstop && vq->handle_kick)
1703		vhost_dev_flush(vq->poll.dev);
1704	return r;
1705}
1706EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
1707
1708int vhost_init_device_iotlb(struct vhost_dev *d)
1709{
1710	struct vhost_iotlb *niotlb, *oiotlb;
1711	int i;
1712
1713	niotlb = iotlb_alloc();
1714	if (!niotlb)
1715		return -ENOMEM;
1716
1717	oiotlb = d->iotlb;
1718	d->iotlb = niotlb;
1719
1720	for (i = 0; i < d->nvqs; ++i) {
1721		struct vhost_virtqueue *vq = d->vqs[i];
1722
1723		mutex_lock(&vq->mutex);
1724		vq->iotlb = niotlb;
1725		__vhost_vq_meta_reset(vq);
1726		mutex_unlock(&vq->mutex);
1727	}
1728
1729	vhost_iotlb_free(oiotlb);
1730
1731	return 0;
1732}
1733EXPORT_SYMBOL_GPL(vhost_init_device_iotlb);
1734
1735/* Caller must have device mutex */
1736long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
1737{
1738	struct eventfd_ctx *ctx;
1739	u64 p;
1740	long r;
1741	int i, fd;
1742
1743	/* If you are not the owner, you can become one */
1744	if (ioctl == VHOST_SET_OWNER) {
1745		r = vhost_dev_set_owner(d);
1746		goto done;
1747	}
1748
1749	/* You must be the owner to do anything else */
1750	r = vhost_dev_check_owner(d);
1751	if (r)
1752		goto done;
1753
1754	switch (ioctl) {
1755	case VHOST_SET_MEM_TABLE:
1756		r = vhost_set_memory(d, argp);
1757		break;
1758	case VHOST_SET_LOG_BASE:
1759		if (copy_from_user(&p, argp, sizeof p)) {
1760			r = -EFAULT;
1761			break;
1762		}
1763		if ((u64)(unsigned long)p != p) {
1764			r = -EFAULT;
1765			break;
1766		}
1767		for (i = 0; i < d->nvqs; ++i) {
1768			struct vhost_virtqueue *vq;
1769			void __user *base = (void __user *)(unsigned long)p;
1770			vq = d->vqs[i];
1771			mutex_lock(&vq->mutex);
1772			/* If ring is inactive, will check when it's enabled. */
1773			if (vq->private_data && !vq_log_access_ok(vq, base))
1774				r = -EFAULT;
1775			else
1776				vq->log_base = base;
1777			mutex_unlock(&vq->mutex);
1778		}
1779		break;
1780	case VHOST_SET_LOG_FD:
1781		r = get_user(fd, (int __user *)argp);
1782		if (r < 0)
1783			break;
1784		ctx = fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(fd);
1785		if (IS_ERR(ctx)) {
1786			r = PTR_ERR(ctx);
1787			break;
1788		}
1789		swap(ctx, d->log_ctx);
1790		for (i = 0; i < d->nvqs; ++i) {
1791			mutex_lock(&d->vqs[i]->mutex);
1792			d->vqs[i]->log_ctx = d->log_ctx;
1793			mutex_unlock(&d->vqs[i]->mutex);
1794		}
1795		if (ctx)
1796			eventfd_ctx_put(ctx);
1797		break;
1798	default:
1799		r = -ENOIOCTLCMD;
1800		break;
1801	}
1802done:
1803	return r;
1804}
1805EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
1806
1807/* TODO: This is really inefficient.  We need something like get_user()
1808 * (instruction directly accesses the data, with an exception table entry
1809 * returning -EFAULT). See Documentation/arch/x86/exception-tables.rst.
1810 */
1811static int set_bit_to_user(int nr, void __user *addr)
1812{
1813	unsigned long log = (unsigned long)addr;
1814	struct page *page;
1815	void *base;
1816	int bit = nr + (log % PAGE_SIZE) * 8;
1817	int r;
1818
1819	r = pin_user_pages_fast(log, 1, FOLL_WRITE, &page);
1820	if (r < 0)
1821		return r;
1822	BUG_ON(r != 1);
1823	base = kmap_atomic(page);
1824	set_bit(bit, base);
1825	kunmap_atomic(base);
1826	unpin_user_pages_dirty_lock(&page, 1, true);
1827	return 0;
1828}
1829
1830static int log_write(void __user *log_base,
1831		     u64 write_address, u64 write_length)
1832{
1833	u64 write_page = write_address / VHOST_PAGE_SIZE;
1834	int r;
1835
1836	if (!write_length)
1837		return 0;
1838	write_length += write_address % VHOST_PAGE_SIZE;
1839	for (;;) {
1840		u64 base = (u64)(unsigned long)log_base;
1841		u64 log = base + write_page / 8;
1842		int bit = write_page % 8;
1843		if ((u64)(unsigned long)log != log)
1844			return -EFAULT;
1845		r = set_bit_to_user(bit, (void __user *)(unsigned long)log);
1846		if (r < 0)
1847			return r;
1848		if (write_length <= VHOST_PAGE_SIZE)
1849			break;
1850		write_length -= VHOST_PAGE_SIZE;
1851		write_page += 1;
1852	}
1853	return r;
1854}
1855
1856static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
1857{
1858	struct vhost_iotlb *umem = vq->umem;
1859	struct vhost_iotlb_map *u;
1860	u64 start, end, l, min;
1861	int r;
1862	bool hit = false;
1863
1864	while (len) {
1865		min = len;
1866		/* More than one GPAs can be mapped into a single HVA. So
1867		 * iterate all possible umems here to be safe.
1868		 */
1869		list_for_each_entry(u, &umem->list, link) {
1870			if (u->addr > hva - 1 + len ||
1871			    u->addr - 1 + u->size < hva)
1872				continue;
1873			start = max(u->addr, hva);
1874			end = min(u->addr - 1 + u->size, hva - 1 + len);
1875			l = end - start + 1;
1876			r = log_write(vq->log_base,
1877				      u->start + start - u->addr,
1878				      l);
1879			if (r < 0)
1880				return r;
1881			hit = true;
1882			min = min(l, min);
1883		}
1884
1885		if (!hit)
1886			return -EFAULT;
1887
1888		len -= min;
1889		hva += min;
1890	}
1891
1892	return 0;
1893}
1894
1895static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len)
1896{
1897	struct iovec *iov = vq->log_iov;
1898	int i, ret;
1899
1900	if (!vq->iotlb)
1901		return log_write(vq->log_base, vq->log_addr + used_offset, len);
1902
1903	ret = translate_desc(vq, (uintptr_t)vq->used + used_offset,
1904			     len, iov, 64, VHOST_ACCESS_WO);
1905	if (ret < 0)
1906		return ret;
1907
1908	for (i = 0; i < ret; i++) {
1909		ret = log_write_hva(vq,	(uintptr_t)iov[i].iov_base,
1910				    iov[i].iov_len);
1911		if (ret)
1912			return ret;
1913	}
1914
1915	return 0;
1916}
1917
1918int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
1919		    unsigned int log_num, u64 len, struct iovec *iov, int count)
1920{
1921	int i, r;
1922
1923	/* Make sure data written is seen before log. */
1924	smp_wmb();
1925
1926	if (vq->iotlb) {
1927		for (i = 0; i < count; i++) {
1928			r = log_write_hva(vq, (uintptr_t)iov[i].iov_base,
1929					  iov[i].iov_len);
1930			if (r < 0)
1931				return r;
1932		}
1933		return 0;
1934	}
1935
1936	for (i = 0; i < log_num; ++i) {
1937		u64 l = min(log[i].len, len);
1938		r = log_write(vq->log_base, log[i].addr, l);
1939		if (r < 0)
1940			return r;
1941		len -= l;
1942		if (!len) {
1943			if (vq->log_ctx)
1944				eventfd_signal(vq->log_ctx, 1);
1945			return 0;
1946		}
1947	}
1948	/* Length written exceeds what we have stored. This is a bug. */
1949	BUG();
1950	return 0;
1951}
1952EXPORT_SYMBOL_GPL(vhost_log_write);
1953
1954static int vhost_update_used_flags(struct vhost_virtqueue *vq)
1955{
1956	void __user *used;
1957	if (vhost_put_used_flags(vq))
1958		return -EFAULT;
1959	if (unlikely(vq->log_used)) {
1960		/* Make sure the flag is seen before log. */
1961		smp_wmb();
1962		/* Log used flag write. */
1963		used = &vq->used->flags;
1964		log_used(vq, (used - (void __user *)vq->used),
1965			 sizeof vq->used->flags);
1966		if (vq->log_ctx)
1967			eventfd_signal(vq->log_ctx, 1);
1968	}
1969	return 0;
1970}
1971
1972static int vhost_update_avail_event(struct vhost_virtqueue *vq)
1973{
1974	if (vhost_put_avail_event(vq))
1975		return -EFAULT;
1976	if (unlikely(vq->log_used)) {
1977		void __user *used;
1978		/* Make sure the event is seen before log. */
1979		smp_wmb();
1980		/* Log avail event write */
1981		used = vhost_avail_event(vq);
1982		log_used(vq, (used - (void __user *)vq->used),
1983			 sizeof *vhost_avail_event(vq));
1984		if (vq->log_ctx)
1985			eventfd_signal(vq->log_ctx, 1);
1986	}
1987	return 0;
1988}
1989
1990int vhost_vq_init_access(struct vhost_virtqueue *vq)
1991{
1992	__virtio16 last_used_idx;
1993	int r;
1994	bool is_le = vq->is_le;
1995
1996	if (!vq->private_data)
1997		return 0;
1998
1999	vhost_init_is_le(vq);
2000
2001	r = vhost_update_used_flags(vq);
2002	if (r)
2003		goto err;
2004	vq->signalled_used_valid = false;
2005	if (!vq->iotlb &&
2006	    !access_ok(&vq->used->idx, sizeof vq->used->idx)) {
2007		r = -EFAULT;
2008		goto err;
2009	}
2010	r = vhost_get_used_idx(vq, &last_used_idx);
2011	if (r) {
2012		vq_err(vq, "Can't access used idx at %p\n",
2013		       &vq->used->idx);
2014		goto err;
2015	}
2016	vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);
2017	return 0;
2018
2019err:
2020	vq->is_le = is_le;
2021	return r;
2022}
2023EXPORT_SYMBOL_GPL(vhost_vq_init_access);
2024
2025static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
2026			  struct iovec iov[], int iov_size, int access)
2027{
2028	const struct vhost_iotlb_map *map;
2029	struct vhost_dev *dev = vq->dev;
2030	struct vhost_iotlb *umem = dev->iotlb ? dev->iotlb : dev->umem;
2031	struct iovec *_iov;
2032	u64 s = 0, last = addr + len - 1;
2033	int ret = 0;
2034
2035	while ((u64)len > s) {
2036		u64 size;
2037		if (unlikely(ret >= iov_size)) {
2038			ret = -ENOBUFS;
2039			break;
2040		}
2041
2042		map = vhost_iotlb_itree_first(umem, addr, last);
2043		if (map == NULL || map->start > addr) {
2044			if (umem != dev->iotlb) {
2045				ret = -EFAULT;
2046				break;
2047			}
2048			ret = -EAGAIN;
2049			break;
2050		} else if (!(map->perm & access)) {
2051			ret = -EPERM;
2052			break;
2053		}
2054
2055		_iov = iov + ret;
2056		size = map->size - addr + map->start;
2057		_iov->iov_len = min((u64)len - s, size);
2058		_iov->iov_base = (void __user *)(unsigned long)
2059				 (map->addr + addr - map->start);
2060		s += size;
2061		addr += size;
2062		++ret;
2063	}
2064
2065	if (ret == -EAGAIN)
2066		vhost_iotlb_miss(vq, addr, access);
2067	return ret;
2068}
2069
2070/* Each buffer in the virtqueues is actually a chain of descriptors.  This
2071 * function returns the next descriptor in the chain,
2072 * or -1U if we're at the end. */
2073static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
2074{
2075	unsigned int next;
2076
2077	/* If this descriptor says it doesn't chain, we're done. */
2078	if (!(desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT)))
2079		return -1U;
2080
2081	/* Check they're not leading us off end of descriptors. */
2082	next = vhost16_to_cpu(vq, READ_ONCE(desc->next));
2083	return next;
2084}
2085
2086static int get_indirect(struct vhost_virtqueue *vq,
2087			struct iovec iov[], unsigned int iov_size,
2088			unsigned int *out_num, unsigned int *in_num,
2089			struct vhost_log *log, unsigned int *log_num,
2090			struct vring_desc *indirect)
2091{
2092	struct vring_desc desc;
2093	unsigned int i = 0, count, found = 0;
2094	u32 len = vhost32_to_cpu(vq, indirect->len);
2095	struct iov_iter from;
2096	int ret, access;
2097
2098	/* Sanity check */
2099	if (unlikely(len % sizeof desc)) {
2100		vq_err(vq, "Invalid length in indirect descriptor: "
2101		       "len 0x%llx not multiple of 0x%zx\n",
2102		       (unsigned long long)len,
2103		       sizeof desc);
2104		return -EINVAL;
2105	}
2106
2107	ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
2108			     UIO_MAXIOV, VHOST_ACCESS_RO);
2109	if (unlikely(ret < 0)) {
2110		if (ret != -EAGAIN)
2111			vq_err(vq, "Translation failure %d in indirect.\n", ret);
2112		return ret;
2113	}
2114	iov_iter_init(&from, ITER_SOURCE, vq->indirect, ret, len);
2115	count = len / sizeof desc;
2116	/* Buffers are chained via a 16 bit next field, so
2117	 * we can have at most 2^16 of these. */
2118	if (unlikely(count > USHRT_MAX + 1)) {
2119		vq_err(vq, "Indirect buffer length too big: %d\n",
2120		       indirect->len);
2121		return -E2BIG;
2122	}
2123
2124	do {
2125		unsigned iov_count = *in_num + *out_num;
2126		if (unlikely(++found > count)) {
2127			vq_err(vq, "Loop detected: last one at %u "
2128			       "indirect size %u\n",
2129			       i, count);
2130			return -EINVAL;
2131		}
2132		if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
2133			vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
2134			       i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
2135			return -EINVAL;
2136		}
2137		if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
2138			vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
2139			       i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
2140			return -EINVAL;
2141		}
2142
2143		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
2144			access = VHOST_ACCESS_WO;
2145		else
2146			access = VHOST_ACCESS_RO;
2147
2148		ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
2149				     vhost32_to_cpu(vq, desc.len), iov + iov_count,
2150				     iov_size - iov_count, access);
2151		if (unlikely(ret < 0)) {
2152			if (ret != -EAGAIN)
2153				vq_err(vq, "Translation failure %d indirect idx %d\n",
2154					ret, i);
2155			return ret;
2156		}
2157		/* If this is an input descriptor, increment that count. */
2158		if (access == VHOST_ACCESS_WO) {
2159			*in_num += ret;
2160			if (unlikely(log && ret)) {
2161				log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
2162				log[*log_num].len = vhost32_to_cpu(vq, desc.len);
2163				++*log_num;
2164			}
2165		} else {
2166			/* If it's an output descriptor, they're all supposed
2167			 * to come before any input descriptors. */
2168			if (unlikely(*in_num)) {
2169				vq_err(vq, "Indirect descriptor "
2170				       "has out after in: idx %d\n", i);
2171				return -EINVAL;
2172			}
2173			*out_num += ret;
2174		}
2175	} while ((i = next_desc(vq, &desc)) != -1);
2176	return 0;
2177}
2178
2179/* This looks in the virtqueue and for the first available buffer, and converts
2180 * it to an iovec for convenient access.  Since descriptors consist of some
2181 * number of output then some number of input descriptors, it's actually two
2182 * iovecs, but we pack them into one and note how many of each there were.
2183 *
2184 * This function returns the descriptor number found, or vq->num (which is
2185 * never a valid descriptor number) if none was found.  A negative code is
2186 * returned on error. */
2187int vhost_get_vq_desc(struct vhost_virtqueue *vq,
2188		      struct iovec iov[], unsigned int iov_size,
2189		      unsigned int *out_num, unsigned int *in_num,
2190		      struct vhost_log *log, unsigned int *log_num)
2191{
2192	struct vring_desc desc;
2193	unsigned int i, head, found = 0;
2194	u16 last_avail_idx;
2195	__virtio16 avail_idx;
2196	__virtio16 ring_head;
2197	int ret, access;
2198
2199	/* Check it isn't doing very strange things with descriptor numbers. */
2200	last_avail_idx = vq->last_avail_idx;
2201
2202	if (vq->avail_idx == vq->last_avail_idx) {
2203		if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
2204			vq_err(vq, "Failed to access avail idx at %p\n",
2205				&vq->avail->idx);
2206			return -EFAULT;
2207		}
2208		vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
2209
2210		if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
2211			vq_err(vq, "Guest moved used index from %u to %u",
2212				last_avail_idx, vq->avail_idx);
2213			return -EFAULT;
2214		}
2215
2216		/* If there's nothing new since last we looked, return
2217		 * invalid.
2218		 */
2219		if (vq->avail_idx == last_avail_idx)
2220			return vq->num;
2221
2222		/* Only get avail ring entries after they have been
2223		 * exposed by guest.
2224		 */
2225		smp_rmb();
2226	}
2227
2228	/* Grab the next descriptor number they're advertising, and increment
2229	 * the index we've seen. */
2230	if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
2231		vq_err(vq, "Failed to read head: idx %d address %p\n",
2232		       last_avail_idx,
2233		       &vq->avail->ring[last_avail_idx % vq->num]);
2234		return -EFAULT;
2235	}
2236
2237	head = vhost16_to_cpu(vq, ring_head);
2238
2239	/* If their number is silly, that's an error. */
2240	if (unlikely(head >= vq->num)) {
2241		vq_err(vq, "Guest says index %u > %u is available",
2242		       head, vq->num);
2243		return -EINVAL;
2244	}
2245
2246	/* When we start there are none of either input nor output. */
2247	*out_num = *in_num = 0;
2248	if (unlikely(log))
2249		*log_num = 0;
2250
2251	i = head;
2252	do {
2253		unsigned iov_count = *in_num + *out_num;
2254		if (unlikely(i >= vq->num)) {
2255			vq_err(vq, "Desc index is %u > %u, head = %u",
2256			       i, vq->num, head);
2257			return -EINVAL;
2258		}
2259		if (unlikely(++found > vq->num)) {
2260			vq_err(vq, "Loop detected: last one at %u "
2261			       "vq size %u head %u\n",
2262			       i, vq->num, head);
2263			return -EINVAL;
2264		}
2265		ret = vhost_get_desc(vq, &desc, i);
2266		if (unlikely(ret)) {
2267			vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
2268			       i, vq->desc + i);
2269			return -EFAULT;
2270		}
2271		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
2272			ret = get_indirect(vq, iov, iov_size,
2273					   out_num, in_num,
2274					   log, log_num, &desc);
2275			if (unlikely(ret < 0)) {
2276				if (ret != -EAGAIN)
2277					vq_err(vq, "Failure detected "
2278						"in indirect descriptor at idx %d\n", i);
2279				return ret;
2280			}
2281			continue;
2282		}
2283
2284		if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
2285			access = VHOST_ACCESS_WO;
2286		else
2287			access = VHOST_ACCESS_RO;
2288		ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
2289				     vhost32_to_cpu(vq, desc.len), iov + iov_count,
2290				     iov_size - iov_count, access);
2291		if (unlikely(ret < 0)) {
2292			if (ret != -EAGAIN)
2293				vq_err(vq, "Translation failure %d descriptor idx %d\n",
2294					ret, i);
2295			return ret;
2296		}
2297		if (access == VHOST_ACCESS_WO) {
2298			/* If this is an input descriptor,
2299			 * increment that count. */
2300			*in_num += ret;
2301			if (unlikely(log && ret)) {
2302				log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
2303				log[*log_num].len = vhost32_to_cpu(vq, desc.len);
2304				++*log_num;
2305			}
2306		} else {
2307			/* If it's an output descriptor, they're all supposed
2308			 * to come before any input descriptors. */
2309			if (unlikely(*in_num)) {
2310				vq_err(vq, "Descriptor has out after in: "
2311				       "idx %d\n", i);
2312				return -EINVAL;
2313			}
2314			*out_num += ret;
2315		}
2316	} while ((i = next_desc(vq, &desc)) != -1);
2317
2318	/* On success, increment avail index. */
2319	vq->last_avail_idx++;
2320
2321	/* Assume notifications from guest are disabled at this point,
2322	 * if they aren't we would need to update avail_event index. */
2323	BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
2324	return head;
2325}
2326EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
2327
2328/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
2329void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
2330{
2331	vq->last_avail_idx -= n;
2332}
2333EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
2334
2335/* After we've used one of their buffers, we tell them about it.  We'll then
2336 * want to notify the guest, using eventfd. */
2337int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
2338{
2339	struct vring_used_elem heads = {
2340		cpu_to_vhost32(vq, head),
2341		cpu_to_vhost32(vq, len)
2342	};
2343
2344	return vhost_add_used_n(vq, &heads, 1);
2345}
2346EXPORT_SYMBOL_GPL(vhost_add_used);
2347
2348static int __vhost_add_used_n(struct vhost_virtqueue *vq,
2349			    struct vring_used_elem *heads,
2350			    unsigned count)
2351{
2352	vring_used_elem_t __user *used;
2353	u16 old, new;
2354	int start;
2355
2356	start = vq->last_used_idx & (vq->num - 1);
2357	used = vq->used->ring + start;
2358	if (vhost_put_used(vq, heads, start, count)) {
2359		vq_err(vq, "Failed to write used");
2360		return -EFAULT;
2361	}
2362	if (unlikely(vq->log_used)) {
2363		/* Make sure data is seen before log. */
2364		smp_wmb();
2365		/* Log used ring entry write. */
2366		log_used(vq, ((void __user *)used - (void __user *)vq->used),
2367			 count * sizeof *used);
2368	}
2369	old = vq->last_used_idx;
2370	new = (vq->last_used_idx += count);
2371	/* If the driver never bothers to signal in a very long while,
2372	 * used index might wrap around. If that happens, invalidate
2373	 * signalled_used index we stored. TODO: make sure driver
2374	 * signals at least once in 2^16 and remove this. */
2375	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
2376		vq->signalled_used_valid = false;
2377	return 0;
2378}
2379
2380/* After we've used one of their buffers, we tell them about it.  We'll then
2381 * want to notify the guest, using eventfd. */
2382int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
2383		     unsigned count)
2384{
2385	int start, n, r;
2386
2387	start = vq->last_used_idx & (vq->num - 1);
2388	n = vq->num - start;
2389	if (n < count) {
2390		r = __vhost_add_used_n(vq, heads, n);
2391		if (r < 0)
2392			return r;
2393		heads += n;
2394		count -= n;
2395	}
2396	r = __vhost_add_used_n(vq, heads, count);
2397
2398	/* Make sure buffer is written before we update index. */
2399	smp_wmb();
2400	if (vhost_put_used_idx(vq)) {
2401		vq_err(vq, "Failed to increment used idx");
2402		return -EFAULT;
2403	}
2404	if (unlikely(vq->log_used)) {
2405		/* Make sure used idx is seen before log. */
2406		smp_wmb();
2407		/* Log used index update. */
2408		log_used(vq, offsetof(struct vring_used, idx),
2409			 sizeof vq->used->idx);
2410		if (vq->log_ctx)
2411			eventfd_signal(vq->log_ctx, 1);
2412	}
2413	return r;
2414}
2415EXPORT_SYMBOL_GPL(vhost_add_used_n);
2416
2417static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2418{
2419	__u16 old, new;
2420	__virtio16 event;
2421	bool v;
2422	/* Flush out used index updates. This is paired
2423	 * with the barrier that the Guest executes when enabling
2424	 * interrupts. */
2425	smp_mb();
2426
2427	if (vhost_has_feature(vq, VIRTIO_F_NOTIFY_ON_EMPTY) &&
2428	    unlikely(vq->avail_idx == vq->last_avail_idx))
2429		return true;
2430
2431	if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
2432		__virtio16 flags;
2433		if (vhost_get_avail_flags(vq, &flags)) {
2434			vq_err(vq, "Failed to get flags");
2435			return true;
2436		}
2437		return !(flags & cpu_to_vhost16(vq, VRING_AVAIL_F_NO_INTERRUPT));
2438	}
2439	old = vq->signalled_used;
2440	v = vq->signalled_used_valid;
2441	new = vq->signalled_used = vq->last_used_idx;
2442	vq->signalled_used_valid = true;
2443
2444	if (unlikely(!v))
2445		return true;
2446
2447	if (vhost_get_used_event(vq, &event)) {
2448		vq_err(vq, "Failed to get used event idx");
2449		return true;
2450	}
2451	return vring_need_event(vhost16_to_cpu(vq, event), new, old);
2452}
2453
2454/* This actually signals the guest, using eventfd. */
2455void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2456{
2457	/* Signal the Guest tell them we used something up. */
2458	if (vq->call_ctx.ctx && vhost_notify(dev, vq))
2459		eventfd_signal(vq->call_ctx.ctx, 1);
2460}
2461EXPORT_SYMBOL_GPL(vhost_signal);
2462
/*
 * Combo helper: add one used entry for @head/@len, then signal the guest
 * if needed.  The result of vhost_add_used() is not propagated.
 */
void vhost_add_used_and_signal(struct vhost_dev *dev,
			       struct vhost_virtqueue *vq,
			       unsigned int head, int len)
{
	vhost_add_used(vq, head, len);

	vhost_signal(dev, vq);
}
EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
2472
/*
 * Multi-buffer variant of vhost_add_used_and_signal(): add @count used
 * entries from @heads, then signal the guest if needed.  The result of
 * vhost_add_used_n() is not propagated.
 */
void vhost_add_used_and_signal_n(struct vhost_dev *dev,
				 struct vhost_virtqueue *vq,
				 struct vring_used_elem *heads, unsigned count)
{
	vhost_add_used_n(vq, heads, count);

	vhost_signal(dev, vq);
}
EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
2482
2483/* return true if we're sure that avaiable ring is empty */
2484bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2485{
2486	__virtio16 avail_idx;
2487	int r;
2488
2489	if (vq->avail_idx != vq->last_avail_idx)
2490		return false;
2491
2492	r = vhost_get_avail_idx(vq, &avail_idx);
2493	if (unlikely(r))
2494		return false;
2495	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
2496
2497	return vq->avail_idx == vq->last_avail_idx;
2498}
2499EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
2500
2501/* OK, now we need to know about added descriptors. */
2502bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2503{
2504	__virtio16 avail_idx;
2505	int r;
2506
2507	if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
2508		return false;
2509	vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
2510	if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
2511		r = vhost_update_used_flags(vq);
2512		if (r) {
2513			vq_err(vq, "Failed to enable notification at %p: %d\n",
2514			       &vq->used->flags, r);
2515			return false;
2516		}
2517	} else {
2518		r = vhost_update_avail_event(vq);
2519		if (r) {
2520			vq_err(vq, "Failed to update avail event index at %p: %d\n",
2521			       vhost_avail_event(vq), r);
2522			return false;
2523		}
2524	}
2525	/* They could have slipped one in as we were doing that: make
2526	 * sure it's written, then check again. */
2527	smp_mb();
2528	r = vhost_get_avail_idx(vq, &avail_idx);
2529	if (r) {
2530		vq_err(vq, "Failed to check avail idx at %p: %d\n",
2531		       &vq->avail->idx, r);
2532		return false;
2533	}
2534	vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
2535
2536	return vq->avail_idx != vq->last_avail_idx;
2537}
2538EXPORT_SYMBOL_GPL(vhost_enable_notify);
2539
2540/* We don't need to be notified again. */
2541void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
2542{
2543	int r;
2544
2545	if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
2546		return;
2547	vq->used_flags |= VRING_USED_F_NO_NOTIFY;
2548	if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
2549		r = vhost_update_used_flags(vq);
2550		if (r)
2551			vq_err(vq, "Failed to disable notification at %p: %d\n",
2552			       &vq->used->flags, r);
2553	}
2554}
2555EXPORT_SYMBOL_GPL(vhost_disable_notify);
2556
2557/* Create a new message. */
2558struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
2559{
2560	/* Make sure all padding within the structure is initialized. */
2561	struct vhost_msg_node *node = kzalloc(sizeof(*node), GFP_KERNEL);
2562	if (!node)
2563		return NULL;
2564
2565	node->vq = vq;
2566	node->msg.type = type;
2567	return node;
2568}
2569EXPORT_SYMBOL_GPL(vhost_new_msg);
2570
2571void vhost_enqueue_msg(struct vhost_dev *dev, struct list_head *head,
2572		       struct vhost_msg_node *node)
2573{
2574	spin_lock(&dev->iotlb_lock);
2575	list_add_tail(&node->node, head);
2576	spin_unlock(&dev->iotlb_lock);
2577
2578	wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
2579}
2580EXPORT_SYMBOL_GPL(vhost_enqueue_msg);
2581
2582struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
2583					 struct list_head *head)
2584{
2585	struct vhost_msg_node *node = NULL;
2586
2587	spin_lock(&dev->iotlb_lock);
2588	if (!list_empty(head)) {
2589		node = list_first_entry(head, struct vhost_msg_node,
2590					node);
2591		list_del(&node->node);
2592	}
2593	spin_unlock(&dev->iotlb_lock);
2594
2595	return node;
2596}
2597EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
2598
2599void vhost_set_backend_features(struct vhost_dev *dev, u64 features)
2600{
2601	struct vhost_virtqueue *vq;
2602	int i;
2603
2604	mutex_lock(&dev->mutex);
2605	for (i = 0; i < dev->nvqs; ++i) {
2606		vq = dev->vqs[i];
2607		mutex_lock(&vq->mutex);
2608		vq->acked_backend_features = features;
2609		mutex_unlock(&vq->mutex);
2610	}
2611	mutex_unlock(&dev->mutex);
2612}
2613EXPORT_SYMBOL_GPL(vhost_set_backend_features);
2614
2615static int __init vhost_init(void)
2616{
2617	return 0;
2618}
2619
2620static void __exit vhost_exit(void)
2621{
2622}
2623
2624module_init(vhost_init);
2625module_exit(vhost_exit);
2626
2627MODULE_VERSION("0.0.1");
2628MODULE_LICENSE("GPL v2");
2629MODULE_AUTHOR("Michael S. Tsirkin");
2630MODULE_DESCRIPTION("Host kernel accelerator for virtio");