drivers/vfio/vfio.c at v5.13-rc1

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / drivers / vfio / vfio.c
at v5.13-rc1 2363 lines 61 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO core
   4 *
   5 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6 *     Author: Alex Williamson <alex.williamson@redhat.com>
   7 *
   8 * Derived from original vfio:
   9 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10 * Author: Tom Lyon, pugs@cisco.com
  11 */
  12
  13#include <linux/cdev.h>
  14#include <linux/compat.h>
  15#include <linux/device.h>
  16#include <linux/file.h>
  17#include <linux/anon_inodes.h>
  18#include <linux/fs.h>
  19#include <linux/idr.h>
  20#include <linux/iommu.h>
  21#include <linux/list.h>
  22#include <linux/miscdevice.h>
  23#include <linux/module.h>
  24#include <linux/mutex.h>
  25#include <linux/pci.h>
  26#include <linux/rwsem.h>
  27#include <linux/sched.h>
  28#include <linux/slab.h>
  29#include <linux/stat.h>
  30#include <linux/string.h>
  31#include <linux/uaccess.h>
  32#include <linux/vfio.h>
  33#include <linux/wait.h>
  34#include <linux/sched/signal.h>
  35
  36#define DRIVER_VERSION	"0.3"
  37#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
  38#define DRIVER_DESC	"VFIO - User Level meta-driver"
  39
/*
 * Global state of the VFIO core.  The type is declared together with its
 * single file-scope instance, "vfio".
 */
static struct vfio {
	/* Device class handed to device_create()/device_destroy() for groups */
	struct class			*class;
	/* Registered IOMMU backends, linked via vfio_iommu_driver.vfio_next */
	struct list_head		iommu_drivers_list;
	/* Taken around every walk or update of iommu_drivers_list */
	struct mutex			iommu_drivers_lock;
	/* All known vfio_groups, linked via vfio_group.vfio_next */
	struct list_head		group_list;
	/* Maps an allocated minor number to its vfio_group */
	struct idr			group_idr;
	/* Taken around group_list and group_idr lookups and updates */
	struct mutex			group_lock;
	/* NOTE(review): presumably the char device for group nodes; unused in this part of the file */
	struct cdev			group_cdev;
	/* Device number whose major is used for every group device node */
	dev_t				group_devt;
} vfio;
  50
/* One registered IOMMU backend, as tracked on vfio.iommu_drivers_list. */
struct vfio_iommu_driver {
	/* Callback table supplied to vfio_register_iommu_driver() */
	const struct vfio_iommu_driver_ops	*ops;
	/* Link in vfio.iommu_drivers_list */
	struct list_head			vfio_next;
};
  55
/* A container: a set of groups sharing one IOMMU backend instance. */
struct vfio_container {
	/* Lifetime reference count; the container is kfree()d on last put */
	struct kref			kref;
	/* Groups attached to this container (vfio_group.container_next) */
	struct list_head		group_list;
	/* Read-held for extension queries, write-held while setting the IOMMU */
	struct rw_semaphore		group_lock;
	/* Backend selected by VFIO_SET_IOMMU handling, NULL while unset */
	struct vfio_iommu_driver	*iommu_driver;
	/* Opaque value returned by the backend's ->open() */
	void				*iommu_data;
	/* Compared against "backend is vfio-noiommu" when choosing a backend */
	bool				noiommu;
};
  64
/*
 * Records a device that has been unregistered from its vfio_group but
 * whose driver unbind has not completed yet (see group->unbound_list).
 */
struct vfio_unbound_dev {
	/* The device in transition */
	struct device			*dev;
	/* Link in vfio_group.unbound_list */
	struct list_head		unbound_next;
};
  69
/* VFIO's per-IOMMU-group bookkeeping object. */
struct vfio_group {
	/* Lifetime reference count (vfio_group_get()/vfio_group_put()) */
	struct kref			kref;
	/* Minor number allocated from vfio.group_idr */
	int				minor;
	/* Non-zero while the group is in use; the notifier checks are skipped at zero */
	atomic_t			container_users;
	/* Underlying IOMMU group; its reference is dropped at group release */
	struct iommu_group		*iommu_group;
	/* Container the group is attached to; unregister waits for this to become NULL */
	struct vfio_container		*container;
	/* Registered vfio_devices (vfio_device.group_next) */
	struct list_head		device_list;
	/* Protects device_list and dev_counter */
	struct mutex			device_lock;
	/* Device returned by device_create() for the group node */
	struct device			*dev;
	/* Notifier block registered on the IOMMU group */
	struct notifier_block		nb;
	/* Link in vfio.group_list */
	struct list_head		vfio_next;
	/* Link in vfio_container.group_list */
	struct list_head		container_next;
	/* Devices unregistered but not yet unbound (vfio_unbound_dev) */
	struct list_head		unbound_list;
	/* Protects unbound_list */
	struct mutex			unbound_lock;
	/* Initialised to 0 at creation; NOTE(review): users are outside this part of the file */
	atomic_t			opened;
	/* Waited on by vfio_unregister_group_dev() until container is NULL */
	wait_queue_head_t		container_q;
	/* True when the IOMMU group carries the vfio no-IOMMU tag */
	bool				noiommu;
	/* Number of devices currently on device_list */
	unsigned int			dev_counter;
	/* NOTE(review): not referenced in this part of the file */
	struct kvm			*kvm;
	/* Blocking notifier chain; expected to be empty at group release */
	struct blocking_notifier_head	notifier;
};
  91
#ifdef CONFIG_VFIO_NOIOMMU
/*
 * Runtime switch for the unsafe no-IOMMU mode, exposed as the module
 * parameter "enable_unsafe_noiommu_mode".  The address of this variable
 * also serves as the iommudata tag that marks VFIO-created dummy groups.
 */
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
  98
  99/*
 100 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 101 * and remove functions, any use cases other than acquiring the first
 102 * reference for the purpose of calling vfio_register_group_dev() or removing
 103 * that symmetric reference after vfio_unregister_group_dev() should use the raw
 104 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 105 * removes the device from the dummy group and cannot be nested.
 106 */
 107struct iommu_group *vfio_iommu_group_get(struct device *dev)
 108{
 109	struct iommu_group *group;
 110	int __maybe_unused ret;
 111
 112	group = iommu_group_get(dev);
 113
 114#ifdef CONFIG_VFIO_NOIOMMU
 115	/*
 116	 * With noiommu enabled, an IOMMU group will be created for a device
 117	 * that doesn't already have one and doesn't have an iommu_ops on their
 118	 * bus.  We set iommudata simply to be able to identify these groups
 119	 * as special use and for reclamation later.
 120	 */
 121	if (group || !noiommu || iommu_present(dev->bus))
 122		return group;
 123
 124	group = iommu_group_alloc();
 125	if (IS_ERR(group))
 126		return NULL;
 127
 128	iommu_group_set_name(group, "vfio-noiommu");
 129	iommu_group_set_iommudata(group, &noiommu, NULL);
 130	ret = iommu_group_add_device(group, dev);
 131	if (ret) {
 132		iommu_group_put(group);
 133		return NULL;
 134	}
 135
 136	/*
 137	 * Where to taint?  At this point we've added an IOMMU group for a
 138	 * device that is not backed by iommu_ops, therefore any iommu_
 139	 * callback using iommu_ops can legitimately Oops.  So, while we may
 140	 * be about to give a DMA capable device to a user without IOMMU
 141	 * protection, which is clearly taint-worthy, let's go ahead and do
 142	 * it here.
 143	 */
 144	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
 145	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
 146#endif
 147
 148	return group;
 149}
 150EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
 151
 152void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
 153{
 154#ifdef CONFIG_VFIO_NOIOMMU
 155	if (iommu_group_get_iommudata(group) == &noiommu)
 156		iommu_group_remove_device(dev);
 157#endif
 158
 159	iommu_group_put(group);
 160}
 161EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
 162
 163#ifdef CONFIG_VFIO_NOIOMMU
 164static void *vfio_noiommu_open(unsigned long arg)
 165{
 166	if (arg != VFIO_NOIOMMU_IOMMU)
 167		return ERR_PTR(-EINVAL);
 168	if (!capable(CAP_SYS_RAWIO))
 169		return ERR_PTR(-EPERM);
 170
 171	return NULL;
 172}
 173
/* ->release() for the no-IOMMU backend: nothing to do, ->open() returns no data. */
static void vfio_noiommu_release(void *iommu_data)
{
}
 177
 178static long vfio_noiommu_ioctl(void *iommu_data,
 179			       unsigned int cmd, unsigned long arg)
 180{
 181	if (cmd == VFIO_CHECK_EXTENSION)
 182		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
 183
 184	return -ENOTTY;
 185}
 186
 187static int vfio_noiommu_attach_group(void *iommu_data,
 188				     struct iommu_group *iommu_group)
 189{
 190	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
 191}
 192
/* ->detach_group() for the no-IOMMU backend: attach keeps no state, so nothing to undo. */
static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}
 197
/*
 * Callback table of the built-in no-IOMMU backend.  Its address is what
 * the container code compares against to tell this backend apart.
 */
static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
 207#endif
 208
 209
 210/**
 211 * IOMMU driver registration
 212 */
 213int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 214{
 215	struct vfio_iommu_driver *driver, *tmp;
 216
 217	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
 218	if (!driver)
 219		return -ENOMEM;
 220
 221	driver->ops = ops;
 222
 223	mutex_lock(&vfio.iommu_drivers_lock);
 224
 225	/* Check for duplicates */
 226	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
 227		if (tmp->ops == ops) {
 228			mutex_unlock(&vfio.iommu_drivers_lock);
 229			kfree(driver);
 230			return -EINVAL;
 231		}
 232	}
 233
 234	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
 235
 236	mutex_unlock(&vfio.iommu_drivers_lock);
 237
 238	return 0;
 239}
 240EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
 241
 242void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 243{
 244	struct vfio_iommu_driver *driver;
 245
 246	mutex_lock(&vfio.iommu_drivers_lock);
 247	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
 248		if (driver->ops == ops) {
 249			list_del(&driver->vfio_next);
 250			mutex_unlock(&vfio.iommu_drivers_lock);
 251			kfree(driver);
 252			return;
 253		}
 254	}
 255	mutex_unlock(&vfio.iommu_drivers_lock);
 256}
 257EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
 258
 259/**
 260 * Group minor allocation/free - both called with vfio.group_lock held
 261 */
 262static int vfio_alloc_group_minor(struct vfio_group *group)
 263{
 264	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
 265}
 266
 267static void vfio_free_group_minor(int minor)
 268{
 269	idr_remove(&vfio.group_idr, minor);
 270}
 271
/*
 * Forward declarations: both functions are defined further down but are
 * needed by the group creation code that precedes them.
 */
static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);
 276/**
 277 * Container objects - containers are created when /dev/vfio/vfio is
 278 * opened, but their lifecycle extends until the last user is done, so
 279 * it's freed via kref.  Must support container/group/device being
 280 * closed in any order.
 281 */
 282static void vfio_container_get(struct vfio_container *container)
 283{
 284	kref_get(&container->kref);
 285}
 286
 287static void vfio_container_release(struct kref *kref)
 288{
 289	struct vfio_container *container;
 290	container = container_of(kref, struct vfio_container, kref);
 291
 292	kfree(container);
 293}
 294
 295static void vfio_container_put(struct vfio_container *container)
 296{
 297	kref_put(&container->kref, vfio_container_release);
 298}
 299
 300static void vfio_group_unlock_and_free(struct vfio_group *group)
 301{
 302	mutex_unlock(&vfio.group_lock);
 303	/*
 304	 * Unregister outside of lock.  A spurious callback is harmless now
 305	 * that the group is no longer in vfio.group_list.
 306	 */
 307	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
 308	kfree(group);
 309}
 310
 311/**
 312 * Group objects - create, release, get, put, search
 313 */
 314static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 315{
 316	struct vfio_group *group, *tmp;
 317	struct device *dev;
 318	int ret, minor;
 319
 320	group = kzalloc(sizeof(*group), GFP_KERNEL);
 321	if (!group)
 322		return ERR_PTR(-ENOMEM);
 323
 324	kref_init(&group->kref);
 325	INIT_LIST_HEAD(&group->device_list);
 326	mutex_init(&group->device_lock);
 327	INIT_LIST_HEAD(&group->unbound_list);
 328	mutex_init(&group->unbound_lock);
 329	atomic_set(&group->container_users, 0);
 330	atomic_set(&group->opened, 0);
 331	init_waitqueue_head(&group->container_q);
 332	group->iommu_group = iommu_group;
 333#ifdef CONFIG_VFIO_NOIOMMU
 334	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
 335#endif
 336	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
 337
 338	group->nb.notifier_call = vfio_iommu_group_notifier;
 339
 340	/*
 341	 * blocking notifiers acquire a rwsem around registering and hold
 342	 * it around callback.  Therefore, need to register outside of
 343	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
 344	 * do anything unless it can find the group in vfio.group_list, so
 345	 * no harm in registering early.
 346	 */
 347	ret = iommu_group_register_notifier(iommu_group, &group->nb);
 348	if (ret) {
 349		kfree(group);
 350		return ERR_PTR(ret);
 351	}
 352
 353	mutex_lock(&vfio.group_lock);
 354
 355	/* Did we race creating this group? */
 356	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
 357		if (tmp->iommu_group == iommu_group) {
 358			vfio_group_get(tmp);
 359			vfio_group_unlock_and_free(group);
 360			return tmp;
 361		}
 362	}
 363
 364	minor = vfio_alloc_group_minor(group);
 365	if (minor < 0) {
 366		vfio_group_unlock_and_free(group);
 367		return ERR_PTR(minor);
 368	}
 369
 370	dev = device_create(vfio.class, NULL,
 371			    MKDEV(MAJOR(vfio.group_devt), minor),
 372			    group, "%s%d", group->noiommu ? "noiommu-" : "",
 373			    iommu_group_id(iommu_group));
 374	if (IS_ERR(dev)) {
 375		vfio_free_group_minor(minor);
 376		vfio_group_unlock_and_free(group);
 377		return ERR_CAST(dev);
 378	}
 379
 380	group->minor = minor;
 381	group->dev = dev;
 382
 383	list_add(&group->vfio_next, &vfio.group_list);
 384
 385	mutex_unlock(&vfio.group_lock);
 386
 387	return group;
 388}
 389
 390/* called with vfio.group_lock held */
 391static void vfio_group_release(struct kref *kref)
 392{
 393	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
 394	struct vfio_unbound_dev *unbound, *tmp;
 395	struct iommu_group *iommu_group = group->iommu_group;
 396
 397	WARN_ON(!list_empty(&group->device_list));
 398	WARN_ON(group->notifier.head);
 399
 400	list_for_each_entry_safe(unbound, tmp,
 401				 &group->unbound_list, unbound_next) {
 402		list_del(&unbound->unbound_next);
 403		kfree(unbound);
 404	}
 405
 406	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 407	list_del(&group->vfio_next);
 408	vfio_free_group_minor(group->minor);
 409	vfio_group_unlock_and_free(group);
 410	iommu_group_put(iommu_group);
 411}
 412
 413static void vfio_group_put(struct vfio_group *group)
 414{
 415	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
 416}
 417
/* Work item used to drop a group reference from workqueue context. */
struct vfio_group_put_work {
	/* Embedded work item; its handler is vfio_group_put_bg() */
	struct work_struct work;
	/* Group whose reference the handler will put */
	struct vfio_group *group;
};
 422
 423static void vfio_group_put_bg(struct work_struct *work)
 424{
 425	struct vfio_group_put_work *do_work;
 426
 427	do_work = container_of(work, struct vfio_group_put_work, work);
 428
 429	vfio_group_put(do_work->group);
 430	kfree(do_work);
 431}
 432
 433static void vfio_group_schedule_put(struct vfio_group *group)
 434{
 435	struct vfio_group_put_work *do_work;
 436
 437	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
 438	if (WARN_ON(!do_work))
 439		return;
 440
 441	INIT_WORK(&do_work->work, vfio_group_put_bg);
 442	do_work->group = group;
 443	schedule_work(&do_work->work);
 444}
 445
 446/* Assume group_lock or group reference is held */
 447static void vfio_group_get(struct vfio_group *group)
 448{
 449	kref_get(&group->kref);
 450}
 451
 452/*
 453 * Not really a try as we will sleep for mutex, but we need to make
 454 * sure the group pointer is valid under lock and get a reference.
 455 */
 456static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
 457{
 458	struct vfio_group *target = group;
 459
 460	mutex_lock(&vfio.group_lock);
 461	list_for_each_entry(group, &vfio.group_list, vfio_next) {
 462		if (group == target) {
 463			vfio_group_get(group);
 464			mutex_unlock(&vfio.group_lock);
 465			return group;
 466		}
 467	}
 468	mutex_unlock(&vfio.group_lock);
 469
 470	return NULL;
 471}
 472
 473static
 474struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
 475{
 476	struct vfio_group *group;
 477
 478	mutex_lock(&vfio.group_lock);
 479	list_for_each_entry(group, &vfio.group_list, vfio_next) {
 480		if (group->iommu_group == iommu_group) {
 481			vfio_group_get(group);
 482			mutex_unlock(&vfio.group_lock);
 483			return group;
 484		}
 485	}
 486	mutex_unlock(&vfio.group_lock);
 487
 488	return NULL;
 489}
 490
 491static struct vfio_group *vfio_group_get_from_minor(int minor)
 492{
 493	struct vfio_group *group;
 494
 495	mutex_lock(&vfio.group_lock);
 496	group = idr_find(&vfio.group_idr, minor);
 497	if (!group) {
 498		mutex_unlock(&vfio.group_lock);
 499		return NULL;
 500	}
 501	vfio_group_get(group);
 502	mutex_unlock(&vfio.group_lock);
 503
 504	return group;
 505}
 506
 507static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
 508{
 509	struct iommu_group *iommu_group;
 510	struct vfio_group *group;
 511
 512	iommu_group = iommu_group_get(dev);
 513	if (!iommu_group)
 514		return NULL;
 515
 516	group = vfio_group_get_from_iommu(iommu_group);
 517	iommu_group_put(iommu_group);
 518
 519	return group;
 520}
 521
 522/**
 523 * Device objects - create, release, get, put, search
 524 */
 525/* Device reference always implies a group reference */
 526void vfio_device_put(struct vfio_device *device)
 527{
 528	if (refcount_dec_and_test(&device->refcount))
 529		complete(&device->comp);
 530}
 531EXPORT_SYMBOL_GPL(vfio_device_put);
 532
 533static bool vfio_device_try_get(struct vfio_device *device)
 534{
 535	return refcount_inc_not_zero(&device->refcount);
 536}
 537
 538static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
 539						 struct device *dev)
 540{
 541	struct vfio_device *device;
 542
 543	mutex_lock(&group->device_lock);
 544	list_for_each_entry(device, &group->device_list, group_next) {
 545		if (device->dev == dev && vfio_device_try_get(device)) {
 546			mutex_unlock(&group->device_lock);
 547			return device;
 548		}
 549	}
 550	mutex_unlock(&group->device_lock);
 551	return NULL;
 552}
 553
 554/*
 555 * Some drivers, like pci-stub, are only used to prevent other drivers from
 556 * claiming a device and are therefore perfectly legitimate for a user owned
 557 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 558 * of the device, but it does prevent the user from having direct access to
 559 * the device, which is useful in some circumstances.
 560 *
 561 * We also assume that we can include PCI interconnect devices, ie. bridges.
 562 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 563 * then all of the downstream devices will be part of the same IOMMU group as
 564 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 565 * breaks anything, it only does so for user owned devices downstream.  Note
 566 * that error notification via MSI can be affected for platforms that handle
 567 * MSI within the same IOVA space as DMA.
 568 */
 569static const char * const vfio_driver_allowed[] = { "pci-stub" };
 570
 571static bool vfio_dev_driver_allowed(struct device *dev,
 572				    struct device_driver *drv)
 573{
 574	if (dev_is_pci(dev)) {
 575		struct pci_dev *pdev = to_pci_dev(dev);
 576
 577		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
 578			return true;
 579	}
 580
 581	return match_string(vfio_driver_allowed,
 582			    ARRAY_SIZE(vfio_driver_allowed),
 583			    drv->name) >= 0;
 584}
 585
 586/*
 587 * A vfio group is viable for use by userspace if all devices are in
 588 * one of the following states:
 589 *  - driver-less
 590 *  - bound to a vfio driver
 591 *  - bound to an otherwise allowed driver
 592 *  - a PCI interconnect device
 593 *
 594 * We use two methods to determine whether a device is bound to a vfio
 595 * driver.  The first is to test whether the device exists in the vfio
 596 * group.  The second is to test if the device exists on the group
 597 * unbound_list, indicating it's in the middle of transitioning from
 598 * a vfio driver to driver-less.
 599 */
 600static int vfio_dev_viable(struct device *dev, void *data)
 601{
 602	struct vfio_group *group = data;
 603	struct vfio_device *device;
 604	struct device_driver *drv = READ_ONCE(dev->driver);
 605	struct vfio_unbound_dev *unbound;
 606	int ret = -EINVAL;
 607
 608	mutex_lock(&group->unbound_lock);
 609	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
 610		if (dev == unbound->dev) {
 611			ret = 0;
 612			break;
 613		}
 614	}
 615	mutex_unlock(&group->unbound_lock);
 616
 617	if (!ret || !drv || vfio_dev_driver_allowed(dev, drv))
 618		return 0;
 619
 620	device = vfio_group_get_device(group, dev);
 621	if (device) {
 622		vfio_device_put(device);
 623		return 0;
 624	}
 625
 626	return ret;
 627}
 628
 629/**
 630 * Async device support
 631 */
 632static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
 633{
 634	struct vfio_device *device;
 635
 636	/* Do we already know about it?  We shouldn't */
 637	device = vfio_group_get_device(group, dev);
 638	if (WARN_ON_ONCE(device)) {
 639		vfio_device_put(device);
 640		return 0;
 641	}
 642
 643	/* Nothing to do for idle groups */
 644	if (!atomic_read(&group->container_users))
 645		return 0;
 646
 647	/* TODO Prevent device auto probing */
 648	dev_WARN(dev, "Device added to live group %d!\n",
 649		 iommu_group_id(group->iommu_group));
 650
 651	return 0;
 652}
 653
 654static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
 655{
 656	/* We don't care what happens when the group isn't in use */
 657	if (!atomic_read(&group->container_users))
 658		return 0;
 659
 660	return vfio_dev_viable(dev, group);
 661}
 662
 663static int vfio_iommu_group_notifier(struct notifier_block *nb,
 664				     unsigned long action, void *data)
 665{
 666	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
 667	struct device *dev = data;
 668	struct vfio_unbound_dev *unbound;
 669
 670	/*
 671	 * Need to go through a group_lock lookup to get a reference or we
 672	 * risk racing a group being removed.  Ignore spurious notifies.
 673	 */
 674	group = vfio_group_try_get(group);
 675	if (!group)
 676		return NOTIFY_OK;
 677
 678	switch (action) {
 679	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
 680		vfio_group_nb_add_dev(group, dev);
 681		break;
 682	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
 683		/*
 684		 * Nothing to do here.  If the device is in use, then the
 685		 * vfio sub-driver should block the remove callback until
 686		 * it is unused.  If the device is unused or attached to a
 687		 * stub driver, then it should be released and we don't
 688		 * care that it will be going away.
 689		 */
 690		break;
 691	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
 692		dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
 693			iommu_group_id(group->iommu_group));
 694		break;
 695	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
 696		dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
 697			iommu_group_id(group->iommu_group), dev->driver->name);
 698		BUG_ON(vfio_group_nb_verify(group, dev));
 699		break;
 700	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
 701		dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
 702			__func__, iommu_group_id(group->iommu_group),
 703			dev->driver->name);
 704		break;
 705	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
 706		dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
 707			iommu_group_id(group->iommu_group));
 708		/*
 709		 * XXX An unbound device in a live group is ok, but we'd
 710		 * really like to avoid the above BUG_ON by preventing other
 711		 * drivers from binding to it.  Once that occurs, we have to
 712		 * stop the system to maintain isolation.  At a minimum, we'd
 713		 * want a toggle to disable driver auto probe for this device.
 714		 */
 715
 716		mutex_lock(&group->unbound_lock);
 717		list_for_each_entry(unbound,
 718				    &group->unbound_list, unbound_next) {
 719			if (dev == unbound->dev) {
 720				list_del(&unbound->unbound_next);
 721				kfree(unbound);
 722				break;
 723			}
 724		}
 725		mutex_unlock(&group->unbound_lock);
 726		break;
 727	}
 728
 729	/*
 730	 * If we're the last reference to the group, the group will be
 731	 * released, which includes unregistering the iommu group notifier.
 732	 * We hold a read-lock on that notifier list, unregistering needs
 733	 * a write-lock... deadlock.  Release our reference asynchronously
 734	 * to avoid that situation.
 735	 */
 736	vfio_group_schedule_put(group);
 737	return NOTIFY_OK;
 738}
 739
 740/**
 741 * VFIO driver API
 742 */
 743void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
 744			 const struct vfio_device_ops *ops)
 745{
 746	init_completion(&device->comp);
 747	device->dev = dev;
 748	device->ops = ops;
 749}
 750EXPORT_SYMBOL_GPL(vfio_init_group_dev);
 751
 752int vfio_register_group_dev(struct vfio_device *device)
 753{
 754	struct vfio_device *existing_device;
 755	struct iommu_group *iommu_group;
 756	struct vfio_group *group;
 757
 758	iommu_group = iommu_group_get(device->dev);
 759	if (!iommu_group)
 760		return -EINVAL;
 761
 762	group = vfio_group_get_from_iommu(iommu_group);
 763	if (!group) {
 764		group = vfio_create_group(iommu_group);
 765		if (IS_ERR(group)) {
 766			iommu_group_put(iommu_group);
 767			return PTR_ERR(group);
 768		}
 769	} else {
 770		/*
 771		 * A found vfio_group already holds a reference to the
 772		 * iommu_group.  A created vfio_group keeps the reference.
 773		 */
 774		iommu_group_put(iommu_group);
 775	}
 776
 777	existing_device = vfio_group_get_device(group, device->dev);
 778	if (existing_device) {
 779		dev_WARN(device->dev, "Device already exists on group %d\n",
 780			 iommu_group_id(iommu_group));
 781		vfio_device_put(existing_device);
 782		vfio_group_put(group);
 783		return -EBUSY;
 784	}
 785
 786	/* Our reference on group is moved to the device */
 787	device->group = group;
 788
 789	/* Refcounting can't start until the driver calls register */
 790	refcount_set(&device->refcount, 1);
 791
 792	mutex_lock(&group->device_lock);
 793	list_add(&device->group_next, &group->device_list);
 794	group->dev_counter++;
 795	mutex_unlock(&group->device_lock);
 796
 797	return 0;
 798}
 799EXPORT_SYMBOL_GPL(vfio_register_group_dev);
 800
 801/**
 802 * Get a reference to the vfio_device for a device.  Even if the
 803 * caller thinks they own the device, they could be racing with a
 804 * release call path, so we can't trust drvdata for the shortcut.
 805 * Go the long way around, from the iommu_group to the vfio_group
 806 * to the vfio_device.
 807 */
 808struct vfio_device *vfio_device_get_from_dev(struct device *dev)
 809{
 810	struct vfio_group *group;
 811	struct vfio_device *device;
 812
 813	group = vfio_group_get_from_dev(dev);
 814	if (!group)
 815		return NULL;
 816
 817	device = vfio_group_get_device(group, dev);
 818	vfio_group_put(group);
 819
 820	return device;
 821}
 822EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
 823
 824static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
 825						     char *buf)
 826{
 827	struct vfio_device *it, *device = ERR_PTR(-ENODEV);
 828
 829	mutex_lock(&group->device_lock);
 830	list_for_each_entry(it, &group->device_list, group_next) {
 831		int ret;
 832
 833		if (it->ops->match) {
 834			ret = it->ops->match(it, buf);
 835			if (ret < 0) {
 836				device = ERR_PTR(ret);
 837				break;
 838			}
 839		} else {
 840			ret = !strcmp(dev_name(it->dev), buf);
 841		}
 842
 843		if (ret && vfio_device_try_get(it)) {
 844			device = it;
 845			break;
 846		}
 847	}
 848	mutex_unlock(&group->device_lock);
 849
 850	return device;
 851}
 852
 853/*
 854 * Decrement the device reference count and wait for the device to be
 855 * removed.  Open file descriptors for the device... */
 856void vfio_unregister_group_dev(struct vfio_device *device)
 857{
 858	struct vfio_group *group = device->group;
 859	struct vfio_unbound_dev *unbound;
 860	unsigned int i = 0;
 861	bool interrupted = false;
 862	long rc;
 863
 864	/*
 865	 * When the device is removed from the group, the group suddenly
 866	 * becomes non-viable; the device has a driver (until the unbind
 867	 * completes), but it's not present in the group.  This is bad news
 868	 * for any external users that need to re-acquire a group reference
 869	 * in order to match and release their existing reference.  To
 870	 * solve this, we track such devices on the unbound_list to bridge
 871	 * the gap until they're fully unbound.
 872	 */
 873	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
 874	if (unbound) {
 875		unbound->dev = device->dev;
 876		mutex_lock(&group->unbound_lock);
 877		list_add(&unbound->unbound_next, &group->unbound_list);
 878		mutex_unlock(&group->unbound_lock);
 879	}
 880	WARN_ON(!unbound);
 881
 882	vfio_device_put(device);
 883	rc = try_wait_for_completion(&device->comp);
 884	while (rc <= 0) {
 885		if (device->ops->request)
 886			device->ops->request(device, i++);
 887
 888		if (interrupted) {
 889			rc = wait_for_completion_timeout(&device->comp,
 890							 HZ * 10);
 891		} else {
 892			rc = wait_for_completion_interruptible_timeout(
 893				&device->comp, HZ * 10);
 894			if (rc < 0) {
 895				interrupted = true;
 896				dev_warn(device->dev,
 897					 "Device is currently in use, task"
 898					 " \"%s\" (%d) "
 899					 "blocked until device is released",
 900					 current->comm, task_pid_nr(current));
 901			}
 902		}
 903	}
 904
 905	mutex_lock(&group->device_lock);
 906	list_del(&device->group_next);
 907	group->dev_counter--;
 908	mutex_unlock(&group->device_lock);
 909
 910	/*
 911	 * In order to support multiple devices per group, devices can be
 912	 * plucked from the group while other devices in the group are still
 913	 * in use.  The container persists with this group and those remaining
 914	 * devices still attached.  If the user creates an isolation violation
 915	 * by binding this device to another driver while the group is still in
 916	 * use, that's their fault.  However, in the case of removing the last,
 917	 * or potentially the only, device in the group there can be no other
 918	 * in-use devices in the group.  The user has done their due diligence
 919	 * and we should lay no claims to those devices.  In order to do that,
 920	 * we need to make sure the group is detached from the container.
 921	 * Without this stall, we're potentially racing with a user process
 922	 * that may attempt to immediately bind this device to another driver.
 923	 */
 924	if (list_empty(&group->device_list))
 925		wait_event(group->container_q, !group->container);
 926
 927	/* Matches the get in vfio_register_group_dev() */
 928	vfio_group_put(group);
 929}
 930EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
 931
 932/**
 933 * VFIO base fd, /dev/vfio/vfio
 934 */
 935static long vfio_ioctl_check_extension(struct vfio_container *container,
 936				       unsigned long arg)
 937{
 938	struct vfio_iommu_driver *driver;
 939	long ret = 0;
 940
 941	down_read(&container->group_lock);
 942
 943	driver = container->iommu_driver;
 944
 945	switch (arg) {
 946		/* No base extensions yet */
 947	default:
 948		/*
 949		 * If no driver is set, poll all registered drivers for
 950		 * extensions and return the first positive result.  If
 951		 * a driver is already set, further queries will be passed
 952		 * only to that driver.
 953		 */
 954		if (!driver) {
 955			mutex_lock(&vfio.iommu_drivers_lock);
 956			list_for_each_entry(driver, &vfio.iommu_drivers_list,
 957					    vfio_next) {
 958
 959#ifdef CONFIG_VFIO_NOIOMMU
 960				if (!list_empty(&container->group_list) &&
 961				    (container->noiommu !=
 962				     (driver->ops == &vfio_noiommu_ops)))
 963					continue;
 964#endif
 965
 966				if (!try_module_get(driver->ops->owner))
 967					continue;
 968
 969				ret = driver->ops->ioctl(NULL,
 970							 VFIO_CHECK_EXTENSION,
 971							 arg);
 972				module_put(driver->ops->owner);
 973				if (ret > 0)
 974					break;
 975			}
 976			mutex_unlock(&vfio.iommu_drivers_lock);
 977		} else
 978			ret = driver->ops->ioctl(container->iommu_data,
 979						 VFIO_CHECK_EXTENSION, arg);
 980	}
 981
 982	up_read(&container->group_lock);
 983
 984	return ret;
 985}
 986
 987/* hold write lock on container->group_lock */
 988static int __vfio_container_attach_groups(struct vfio_container *container,
 989					  struct vfio_iommu_driver *driver,
 990					  void *data)
 991{
 992	struct vfio_group *group;
 993	int ret = -ENODEV;
 994
 995	list_for_each_entry(group, &container->group_list, container_next) {
 996		ret = driver->ops->attach_group(data, group->iommu_group);
 997		if (ret)
 998			goto unwind;
 999	}
1000
1001	return ret;
1002
1003unwind:
1004	list_for_each_entry_continue_reverse(group, &container->group_list,
1005					     container_next) {
1006		driver->ops->detach_group(data, group->iommu_group);
1007	}
1008
1009	return ret;
1010}
1011
1012static long vfio_ioctl_set_iommu(struct vfio_container *container,
1013				 unsigned long arg)
1014{
1015	struct vfio_iommu_driver *driver;
1016	long ret = -ENODEV;
1017
1018	down_write(&container->group_lock);
1019
1020	/*
1021	 * The container is designed to be an unprivileged interface while
1022	 * the group can be assigned to specific users.  Therefore, only by
1023	 * adding a group to a container does the user get the privilege of
1024	 * enabling the iommu, which may allocate finite resources.  There
1025	 * is no unset_iommu, but by removing all the groups from a container,
1026	 * the container is deprivileged and returns to an unset state.
1027	 */
1028	if (list_empty(&container->group_list) || container->iommu_driver) {
1029		up_write(&container->group_lock);
1030		return -EINVAL;
1031	}
1032
1033	mutex_lock(&vfio.iommu_drivers_lock);
1034	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1035		void *data;
1036
1037#ifdef CONFIG_VFIO_NOIOMMU
1038		/*
1039		 * Only noiommu containers can use vfio-noiommu and noiommu
1040		 * containers can only use vfio-noiommu.
1041		 */
1042		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1043			continue;
1044#endif
1045
1046		if (!try_module_get(driver->ops->owner))
1047			continue;
1048
1049		/*
1050		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1051		 * so test which iommu driver reported support for this
1052		 * extension and call open on them.  We also pass them the
1053		 * magic, allowing a single driver to support multiple
1054		 * interfaces if they'd like.
1055		 */
1056		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1057			module_put(driver->ops->owner);
1058			continue;
1059		}
1060
1061		data = driver->ops->open(arg);
1062		if (IS_ERR(data)) {
1063			ret = PTR_ERR(data);
1064			module_put(driver->ops->owner);
1065			continue;
1066		}
1067
1068		ret = __vfio_container_attach_groups(container, driver, data);
1069		if (ret) {
1070			driver->ops->release(data);
1071			module_put(driver->ops->owner);
1072			continue;
1073		}
1074
1075		container->iommu_driver = driver;
1076		container->iommu_data = data;
1077		break;
1078	}
1079
1080	mutex_unlock(&vfio.iommu_drivers_lock);
1081	up_write(&container->group_lock);
1082
1083	return ret;
1084}
1085
1086static long vfio_fops_unl_ioctl(struct file *filep,
1087				unsigned int cmd, unsigned long arg)
1088{
1089	struct vfio_container *container = filep->private_data;
1090	struct vfio_iommu_driver *driver;
1091	void *data;
1092	long ret = -EINVAL;
1093
1094	if (!container)
1095		return ret;
1096
1097	switch (cmd) {
1098	case VFIO_GET_API_VERSION:
1099		ret = VFIO_API_VERSION;
1100		break;
1101	case VFIO_CHECK_EXTENSION:
1102		ret = vfio_ioctl_check_extension(container, arg);
1103		break;
1104	case VFIO_SET_IOMMU:
1105		ret = vfio_ioctl_set_iommu(container, arg);
1106		break;
1107	default:
1108		driver = container->iommu_driver;
1109		data = container->iommu_data;
1110
1111		if (driver) /* passthrough all unrecognized ioctls */
1112			ret = driver->ops->ioctl(data, cmd, arg);
1113	}
1114
1115	return ret;
1116}
1117
1118static int vfio_fops_open(struct inode *inode, struct file *filep)
1119{
1120	struct vfio_container *container;
1121
1122	container = kzalloc(sizeof(*container), GFP_KERNEL);
1123	if (!container)
1124		return -ENOMEM;
1125
1126	INIT_LIST_HEAD(&container->group_list);
1127	init_rwsem(&container->group_lock);
1128	kref_init(&container->kref);
1129
1130	filep->private_data = container;
1131
1132	return 0;
1133}
1134
1135static int vfio_fops_release(struct inode *inode, struct file *filep)
1136{
1137	struct vfio_container *container = filep->private_data;
1138	struct vfio_iommu_driver *driver = container->iommu_driver;
1139
1140	if (driver && driver->ops->notify)
1141		driver->ops->notify(container->iommu_data,
1142				    VFIO_IOMMU_CONTAINER_CLOSE);
1143
1144	filep->private_data = NULL;
1145
1146	vfio_container_put(container);
1147
1148	return 0;
1149}
1150
1151/*
1152 * Once an iommu driver is set, we optionally pass read/write/mmap
1153 * on to the driver, allowing management interfaces beyond ioctl.
1154 */
1155static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1156			      size_t count, loff_t *ppos)
1157{
1158	struct vfio_container *container = filep->private_data;
1159	struct vfio_iommu_driver *driver;
1160	ssize_t ret = -EINVAL;
1161
1162	driver = container->iommu_driver;
1163	if (likely(driver && driver->ops->read))
1164		ret = driver->ops->read(container->iommu_data,
1165					buf, count, ppos);
1166
1167	return ret;
1168}
1169
1170static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1171			       size_t count, loff_t *ppos)
1172{
1173	struct vfio_container *container = filep->private_data;
1174	struct vfio_iommu_driver *driver;
1175	ssize_t ret = -EINVAL;
1176
1177	driver = container->iommu_driver;
1178	if (likely(driver && driver->ops->write))
1179		ret = driver->ops->write(container->iommu_data,
1180					 buf, count, ppos);
1181
1182	return ret;
1183}
1184
1185static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1186{
1187	struct vfio_container *container = filep->private_data;
1188	struct vfio_iommu_driver *driver;
1189	int ret = -EINVAL;
1190
1191	driver = container->iommu_driver;
1192	if (likely(driver && driver->ops->mmap))
1193		ret = driver->ops->mmap(container->iommu_data, vma);
1194
1195	return ret;
1196}
1197
1198static const struct file_operations vfio_fops = {
1199	.owner		= THIS_MODULE,
1200	.open		= vfio_fops_open,
1201	.release	= vfio_fops_release,
1202	.read		= vfio_fops_read,
1203	.write		= vfio_fops_write,
1204	.unlocked_ioctl	= vfio_fops_unl_ioctl,
1205	.compat_ioctl	= compat_ptr_ioctl,
1206	.mmap		= vfio_fops_mmap,
1207};
1208
1209/**
1210 * VFIO Group fd, /dev/vfio/$GROUP
1211 */
1212static void __vfio_group_unset_container(struct vfio_group *group)
1213{
1214	struct vfio_container *container = group->container;
1215	struct vfio_iommu_driver *driver;
1216
1217	down_write(&container->group_lock);
1218
1219	driver = container->iommu_driver;
1220	if (driver)
1221		driver->ops->detach_group(container->iommu_data,
1222					  group->iommu_group);
1223
1224	group->container = NULL;
1225	wake_up(&group->container_q);
1226	list_del(&group->container_next);
1227
1228	/* Detaching the last group deprivileges a container, remove iommu */
1229	if (driver && list_empty(&container->group_list)) {
1230		driver->ops->release(container->iommu_data);
1231		module_put(driver->ops->owner);
1232		container->iommu_driver = NULL;
1233		container->iommu_data = NULL;
1234	}
1235
1236	up_write(&container->group_lock);
1237
1238	vfio_container_put(container);
1239}
1240
1241/*
1242 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1243 * if there was no container to unset.  Since the ioctl is called on
1244 * the group, we know that still exists, therefore the only valid
1245 * transition here is 1->0.
1246 */
1247static int vfio_group_unset_container(struct vfio_group *group)
1248{
1249	int users = atomic_cmpxchg(&group->container_users, 1, 0);
1250
1251	if (!users)
1252		return -EINVAL;
1253	if (users != 1)
1254		return -EBUSY;
1255
1256	__vfio_group_unset_container(group);
1257
1258	return 0;
1259}
1260
1261/*
1262 * When removing container users, anything that removes the last user
1263 * implicitly removes the group from the container.  That is, if the
1264 * group file descriptor is closed, as well as any device file descriptors,
1265 * the group is free.
1266 */
1267static void vfio_group_try_dissolve_container(struct vfio_group *group)
1268{
1269	if (0 == atomic_dec_if_positive(&group->container_users))
1270		__vfio_group_unset_container(group);
1271}
1272
1273static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1274{
1275	struct fd f;
1276	struct vfio_container *container;
1277	struct vfio_iommu_driver *driver;
1278	int ret = 0;
1279
1280	if (atomic_read(&group->container_users))
1281		return -EINVAL;
1282
1283	if (group->noiommu && !capable(CAP_SYS_RAWIO))
1284		return -EPERM;
1285
1286	f = fdget(container_fd);
1287	if (!f.file)
1288		return -EBADF;
1289
1290	/* Sanity check, is this really our fd? */
1291	if (f.file->f_op != &vfio_fops) {
1292		fdput(f);
1293		return -EINVAL;
1294	}
1295
1296	container = f.file->private_data;
1297	WARN_ON(!container); /* fget ensures we don't race vfio_release */
1298
1299	down_write(&container->group_lock);
1300
1301	/* Real groups and fake groups cannot mix */
1302	if (!list_empty(&container->group_list) &&
1303	    container->noiommu != group->noiommu) {
1304		ret = -EPERM;
1305		goto unlock_out;
1306	}
1307
1308	driver = container->iommu_driver;
1309	if (driver) {
1310		ret = driver->ops->attach_group(container->iommu_data,
1311						group->iommu_group);
1312		if (ret)
1313			goto unlock_out;
1314	}
1315
1316	group->container = container;
1317	container->noiommu = group->noiommu;
1318	list_add(&group->container_next, &container->group_list);
1319
1320	/* Get a reference on the container and mark a user within the group */
1321	vfio_container_get(container);
1322	atomic_inc(&group->container_users);
1323
1324unlock_out:
1325	up_write(&container->group_lock);
1326	fdput(f);
1327	return ret;
1328}
1329
1330static bool vfio_group_viable(struct vfio_group *group)
1331{
1332	return (iommu_group_for_each_dev(group->iommu_group,
1333					 group, vfio_dev_viable) == 0);
1334}
1335
1336static int vfio_group_add_container_user(struct vfio_group *group)
1337{
1338	if (!atomic_inc_not_zero(&group->container_users))
1339		return -EINVAL;
1340
1341	if (group->noiommu) {
1342		atomic_dec(&group->container_users);
1343		return -EPERM;
1344	}
1345	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1346		atomic_dec(&group->container_users);
1347		return -EINVAL;
1348	}
1349
1350	return 0;
1351}
1352
1353static const struct file_operations vfio_device_fops;
1354
1355static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1356{
1357	struct vfio_device *device;
1358	struct file *filep;
1359	int ret;
1360
1361	if (0 == atomic_read(&group->container_users) ||
1362	    !group->container->iommu_driver || !vfio_group_viable(group))
1363		return -EINVAL;
1364
1365	if (group->noiommu && !capable(CAP_SYS_RAWIO))
1366		return -EPERM;
1367
1368	device = vfio_device_get_from_name(group, buf);
1369	if (IS_ERR(device))
1370		return PTR_ERR(device);
1371
1372	ret = device->ops->open(device);
1373	if (ret) {
1374		vfio_device_put(device);
1375		return ret;
1376	}
1377
1378	/*
1379	 * We can't use anon_inode_getfd() because we need to modify
1380	 * the f_mode flags directly to allow more than just ioctls
1381	 */
1382	ret = get_unused_fd_flags(O_CLOEXEC);
1383	if (ret < 0) {
1384		device->ops->release(device);
1385		vfio_device_put(device);
1386		return ret;
1387	}
1388
1389	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1390				   device, O_RDWR);
1391	if (IS_ERR(filep)) {
1392		put_unused_fd(ret);
1393		ret = PTR_ERR(filep);
1394		device->ops->release(device);
1395		vfio_device_put(device);
1396		return ret;
1397	}
1398
1399	/*
1400	 * TODO: add an anon_inode interface to do this.
1401	 * Appears to be missing by lack of need rather than
1402	 * explicitly prevented.  Now there's need.
1403	 */
1404	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1405
1406	atomic_inc(&group->container_users);
1407
1408	fd_install(ret, filep);
1409
1410	if (group->noiommu)
1411		dev_warn(device->dev, "vfio-noiommu device opened by user "
1412			 "(%s:%d)\n", current->comm, task_pid_nr(current));
1413
1414	return ret;
1415}
1416
1417static long vfio_group_fops_unl_ioctl(struct file *filep,
1418				      unsigned int cmd, unsigned long arg)
1419{
1420	struct vfio_group *group = filep->private_data;
1421	long ret = -ENOTTY;
1422
1423	switch (cmd) {
1424	case VFIO_GROUP_GET_STATUS:
1425	{
1426		struct vfio_group_status status;
1427		unsigned long minsz;
1428
1429		minsz = offsetofend(struct vfio_group_status, flags);
1430
1431		if (copy_from_user(&status, (void __user *)arg, minsz))
1432			return -EFAULT;
1433
1434		if (status.argsz < minsz)
1435			return -EINVAL;
1436
1437		status.flags = 0;
1438
1439		if (vfio_group_viable(group))
1440			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1441
1442		if (group->container)
1443			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1444
1445		if (copy_to_user((void __user *)arg, &status, minsz))
1446			return -EFAULT;
1447
1448		ret = 0;
1449		break;
1450	}
1451	case VFIO_GROUP_SET_CONTAINER:
1452	{
1453		int fd;
1454
1455		if (get_user(fd, (int __user *)arg))
1456			return -EFAULT;
1457
1458		if (fd < 0)
1459			return -EINVAL;
1460
1461		ret = vfio_group_set_container(group, fd);
1462		break;
1463	}
1464	case VFIO_GROUP_UNSET_CONTAINER:
1465		ret = vfio_group_unset_container(group);
1466		break;
1467	case VFIO_GROUP_GET_DEVICE_FD:
1468	{
1469		char *buf;
1470
1471		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1472		if (IS_ERR(buf))
1473			return PTR_ERR(buf);
1474
1475		ret = vfio_group_get_device_fd(group, buf);
1476		kfree(buf);
1477		break;
1478	}
1479	}
1480
1481	return ret;
1482}
1483
1484static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1485{
1486	struct vfio_group *group;
1487	int opened;
1488
1489	group = vfio_group_get_from_minor(iminor(inode));
1490	if (!group)
1491		return -ENODEV;
1492
1493	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1494		vfio_group_put(group);
1495		return -EPERM;
1496	}
1497
1498	/* Do we need multiple instances of the group open?  Seems not. */
1499	opened = atomic_cmpxchg(&group->opened, 0, 1);
1500	if (opened) {
1501		vfio_group_put(group);
1502		return -EBUSY;
1503	}
1504
1505	/* Is something still in use from a previous open? */
1506	if (group->container) {
1507		atomic_dec(&group->opened);
1508		vfio_group_put(group);
1509		return -EBUSY;
1510	}
1511
1512	/* Warn if previous user didn't cleanup and re-init to drop them */
1513	if (WARN_ON(group->notifier.head))
1514		BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1515
1516	filep->private_data = group;
1517
1518	return 0;
1519}
1520
1521static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1522{
1523	struct vfio_group *group = filep->private_data;
1524
1525	filep->private_data = NULL;
1526
1527	vfio_group_try_dissolve_container(group);
1528
1529	atomic_dec(&group->opened);
1530
1531	vfio_group_put(group);
1532
1533	return 0;
1534}
1535
1536static const struct file_operations vfio_group_fops = {
1537	.owner		= THIS_MODULE,
1538	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
1539	.compat_ioctl	= compat_ptr_ioctl,
1540	.open		= vfio_group_fops_open,
1541	.release	= vfio_group_fops_release,
1542};
1543
1544/**
1545 * VFIO Device fd
1546 */
1547static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1548{
1549	struct vfio_device *device = filep->private_data;
1550
1551	device->ops->release(device);
1552
1553	vfio_group_try_dissolve_container(device->group);
1554
1555	vfio_device_put(device);
1556
1557	return 0;
1558}
1559
1560static long vfio_device_fops_unl_ioctl(struct file *filep,
1561				       unsigned int cmd, unsigned long arg)
1562{
1563	struct vfio_device *device = filep->private_data;
1564
1565	if (unlikely(!device->ops->ioctl))
1566		return -EINVAL;
1567
1568	return device->ops->ioctl(device, cmd, arg);
1569}
1570
1571static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1572				     size_t count, loff_t *ppos)
1573{
1574	struct vfio_device *device = filep->private_data;
1575
1576	if (unlikely(!device->ops->read))
1577		return -EINVAL;
1578
1579	return device->ops->read(device, buf, count, ppos);
1580}
1581
1582static ssize_t vfio_device_fops_write(struct file *filep,
1583				      const char __user *buf,
1584				      size_t count, loff_t *ppos)
1585{
1586	struct vfio_device *device = filep->private_data;
1587
1588	if (unlikely(!device->ops->write))
1589		return -EINVAL;
1590
1591	return device->ops->write(device, buf, count, ppos);
1592}
1593
1594static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1595{
1596	struct vfio_device *device = filep->private_data;
1597
1598	if (unlikely(!device->ops->mmap))
1599		return -EINVAL;
1600
1601	return device->ops->mmap(device, vma);
1602}
1603
1604static const struct file_operations vfio_device_fops = {
1605	.owner		= THIS_MODULE,
1606	.release	= vfio_device_fops_release,
1607	.read		= vfio_device_fops_read,
1608	.write		= vfio_device_fops_write,
1609	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1610	.compat_ioctl	= compat_ptr_ioctl,
1611	.mmap		= vfio_device_fops_mmap,
1612};
1613
1614/**
1615 * External user API, exported by symbols to be linked dynamically.
1616 *
1617 * The protocol includes:
1618 *  1. do normal VFIO init operation:
1619 *	- opening a new container;
1620 *	- attaching group(s) to it;
1621 *	- setting an IOMMU driver for a container.
1622 * When IOMMU is set for a container, all groups in it are
1623 * considered ready to use by an external user.
1624 *
1625 * 2. User space passes a group fd to an external user.
1626 * The external user calls vfio_group_get_external_user()
1627 * to verify that:
1628 *	- the group is initialized;
1629 *	- IOMMU is set for it.
1630 * If both checks passed, vfio_group_get_external_user()
1631 * increments the container user counter to prevent
1632 * the VFIO group from disposal before KVM exits.
1633 *
1634 * 3. The external user calls vfio_external_user_iommu_id()
1635 * to know an IOMMU ID.
1636 *
1637 * 4. When the external KVM finishes, it calls
1638 * vfio_group_put_external_user() to release the VFIO group.
1639 * This call decrements the container user counter.
1640 */
1641struct vfio_group *vfio_group_get_external_user(struct file *filep)
1642{
1643	struct vfio_group *group = filep->private_data;
1644	int ret;
1645
1646	if (filep->f_op != &vfio_group_fops)
1647		return ERR_PTR(-EINVAL);
1648
1649	ret = vfio_group_add_container_user(group);
1650	if (ret)
1651		return ERR_PTR(ret);
1652
1653	vfio_group_get(group);
1654
1655	return group;
1656}
1657EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1658
1659/**
1660 * External user API, exported by symbols to be linked dynamically.
1661 * The external user passes in a device pointer
1662 * to verify that:
1663 *	- A VFIO group is assiciated with the device;
1664 *	- IOMMU is set for the group.
1665 * If both checks passed, vfio_group_get_external_user_from_dev()
1666 * increments the container user counter to prevent the VFIO group
1667 * from disposal before external user exits and returns the pointer
1668 * to the VFIO group.
1669 *
1670 * When the external user finishes using the VFIO group, it calls
1671 * vfio_group_put_external_user() to release the VFIO group and
1672 * decrement the container user counter.
1673 *
1674 * @dev [in]	: device
1675 * Return error PTR or pointer to VFIO group.
1676 */
1677
1678struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
1679{
1680	struct vfio_group *group;
1681	int ret;
1682
1683	group = vfio_group_get_from_dev(dev);
1684	if (!group)
1685		return ERR_PTR(-ENODEV);
1686
1687	ret = vfio_group_add_container_user(group);
1688	if (ret) {
1689		vfio_group_put(group);
1690		return ERR_PTR(ret);
1691	}
1692
1693	return group;
1694}
1695EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev);
1696
/*
 * Release a group obtained through vfio_group_get_external_user() or
 * vfio_group_get_external_user_from_dev(): give up the container user
 * first, then put the group itself.
 */
void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);

	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1703
1704bool vfio_external_group_match_file(struct vfio_group *test_group,
1705				    struct file *filep)
1706{
1707	struct vfio_group *group = filep->private_data;
1708
1709	return (filep->f_op == &vfio_group_fops) && (group == test_group);
1710}
1711EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1712
1713int vfio_external_user_iommu_id(struct vfio_group *group)
1714{
1715	return iommu_group_id(group->iommu_group);
1716}
1717EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1718
1719long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1720{
1721	return vfio_ioctl_check_extension(group->container, arg);
1722}
1723EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1724
1725/**
1726 * Sub-module support
1727 */
1728/*
1729 * Helper for managing a buffer of info chain capabilities, allocate or
1730 * reallocate a buffer with additional @size, filling in @id and @version
1731 * of the capability.  A pointer to the new capability is returned.
1732 *
1733 * NB. The chain is based at the head of the buffer, so new entries are
1734 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1735 * next offsets prior to copying to the user buffer.
1736 */
1737struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1738					       size_t size, u16 id, u16 version)
1739{
1740	void *buf;
1741	struct vfio_info_cap_header *header, *tmp;
1742
1743	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1744	if (!buf) {
1745		kfree(caps->buf);
1746		caps->size = 0;
1747		return ERR_PTR(-ENOMEM);
1748	}
1749
1750	caps->buf = buf;
1751	header = buf + caps->size;
1752
1753	/* Eventually copied to user buffer, zero */
1754	memset(header, 0, size);
1755
1756	header->id = id;
1757	header->version = version;
1758
1759	/* Add to the end of the capability chain */
1760	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1761		; /* nothing */
1762
1763	tmp->next = caps->size;
1764	caps->size += size;
1765
1766	return header;
1767}
1768EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1769
1770void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1771{
1772	struct vfio_info_cap_header *tmp;
1773	void *buf = (void *)caps->buf;
1774
1775	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1776		tmp->next += offset;
1777}
1778EXPORT_SYMBOL(vfio_info_cap_shift);
1779
1780int vfio_info_add_capability(struct vfio_info_cap *caps,
1781			     struct vfio_info_cap_header *cap, size_t size)
1782{
1783	struct vfio_info_cap_header *header;
1784
1785	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1786	if (IS_ERR(header))
1787		return PTR_ERR(header);
1788
1789	memcpy(header + 1, cap + 1, size - sizeof(*header));
1790
1791	return 0;
1792}
1793EXPORT_SYMBOL(vfio_info_add_capability);
1794
1795int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1796				       int max_irq_type, size_t *data_size)
1797{
1798	unsigned long minsz;
1799	size_t size;
1800
1801	minsz = offsetofend(struct vfio_irq_set, count);
1802
1803	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1804	    (hdr->count >= (U32_MAX - hdr->start)) ||
1805	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1806				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1807		return -EINVAL;
1808
1809	if (data_size)
1810		*data_size = 0;
1811
1812	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1813		return -EINVAL;
1814
1815	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1816	case VFIO_IRQ_SET_DATA_NONE:
1817		size = 0;
1818		break;
1819	case VFIO_IRQ_SET_DATA_BOOL:
1820		size = sizeof(uint8_t);
1821		break;
1822	case VFIO_IRQ_SET_DATA_EVENTFD:
1823		size = sizeof(int32_t);
1824		break;
1825	default:
1826		return -EINVAL;
1827	}
1828
1829	if (size) {
1830		if (hdr->argsz - minsz < hdr->count * size)
1831			return -EINVAL;
1832
1833		if (!data_size)
1834			return -EINVAL;
1835
1836		*data_size = hdr->count * size;
1837	}
1838
1839	return 0;
1840}
1841EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1842
1843/*
1844 * Pin a set of guest PFNs and return their associated host PFNs for local
1845 * domain only.
1846 * @dev [in]     : device
1847 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1848 * @npage [in]   : count of elements in user_pfn array.  This count should not
1849 *		   be greater VFIO_PIN_PAGES_MAX_ENTRIES.
1850 * @prot [in]    : protection flags
1851 * @phys_pfn[out]: array of host PFNs
1852 * Return error or number of pages pinned.
1853 */
1854int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1855		   int prot, unsigned long *phys_pfn)
1856{
1857	struct vfio_container *container;
1858	struct vfio_group *group;
1859	struct vfio_iommu_driver *driver;
1860	int ret;
1861
1862	if (!dev || !user_pfn || !phys_pfn || !npage)
1863		return -EINVAL;
1864
1865	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1866		return -E2BIG;
1867
1868	group = vfio_group_get_from_dev(dev);
1869	if (!group)
1870		return -ENODEV;
1871
1872	if (group->dev_counter > 1) {
1873		ret = -EINVAL;
1874		goto err_pin_pages;
1875	}
1876
1877	ret = vfio_group_add_container_user(group);
1878	if (ret)
1879		goto err_pin_pages;
1880
1881	container = group->container;
1882	driver = container->iommu_driver;
1883	if (likely(driver && driver->ops->pin_pages))
1884		ret = driver->ops->pin_pages(container->iommu_data,
1885					     group->iommu_group, user_pfn,
1886					     npage, prot, phys_pfn);
1887	else
1888		ret = -ENOTTY;
1889
1890	vfio_group_try_dissolve_container(group);
1891
1892err_pin_pages:
1893	vfio_group_put(group);
1894	return ret;
1895}
1896EXPORT_SYMBOL(vfio_pin_pages);
1897
1898/*
1899 * Unpin set of host PFNs for local domain only.
1900 * @dev [in]     : device
1901 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1902 *		   PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1903 * @npage [in]   : count of elements in user_pfn array.  This count should not
1904 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1905 * Return error or number of pages unpinned.
1906 */
1907int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
1908{
1909	struct vfio_container *container;
1910	struct vfio_group *group;
1911	struct vfio_iommu_driver *driver;
1912	int ret;
1913
1914	if (!dev || !user_pfn || !npage)
1915		return -EINVAL;
1916
1917	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1918		return -E2BIG;
1919
1920	group = vfio_group_get_from_dev(dev);
1921	if (!group)
1922		return -ENODEV;
1923
1924	ret = vfio_group_add_container_user(group);
1925	if (ret)
1926		goto err_unpin_pages;
1927
1928	container = group->container;
1929	driver = container->iommu_driver;
1930	if (likely(driver && driver->ops->unpin_pages))
1931		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
1932					       npage);
1933	else
1934		ret = -ENOTTY;
1935
1936	vfio_group_try_dissolve_container(group);
1937
1938err_unpin_pages:
1939	vfio_group_put(group);
1940	return ret;
1941}
1942EXPORT_SYMBOL(vfio_unpin_pages);
1943
1944/*
1945 * Pin a set of guest IOVA PFNs and return their associated host PFNs for a
1946 * VFIO group.
1947 *
1948 * The caller needs to call vfio_group_get_external_user() or
1949 * vfio_group_get_external_user_from_dev() prior to calling this interface,
1950 * so as to prevent the VFIO group from disposal in the middle of the call.
1951 * But it can keep the reference to the VFIO group for several calls into
1952 * this interface.
1953 * After finishing using of the VFIO group, the caller needs to release the
1954 * VFIO group by calling vfio_group_put_external_user().
1955 *
1956 * @group [in]		: VFIO group
1957 * @user_iova_pfn [in]	: array of user/guest IOVA PFNs to be pinned.
1958 * @npage [in]		: count of elements in user_iova_pfn array.
1959 *			  This count should not be greater
1960 *			  VFIO_PIN_PAGES_MAX_ENTRIES.
1961 * @prot [in]		: protection flags
1962 * @phys_pfn [out]	: array of host PFNs
1963 * Return error or number of pages pinned.
1964 */
1965int vfio_group_pin_pages(struct vfio_group *group,
1966			 unsigned long *user_iova_pfn, int npage,
1967			 int prot, unsigned long *phys_pfn)
1968{
1969	struct vfio_container *container;
1970	struct vfio_iommu_driver *driver;
1971	int ret;
1972
1973	if (!group || !user_iova_pfn || !phys_pfn || !npage)
1974		return -EINVAL;
1975
1976	if (group->dev_counter > 1)
1977		return -EINVAL;
1978
1979	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1980		return -E2BIG;
1981
1982	container = group->container;
1983	driver = container->iommu_driver;
1984	if (likely(driver && driver->ops->pin_pages))
1985		ret = driver->ops->pin_pages(container->iommu_data,
1986					     group->iommu_group, user_iova_pfn,
1987					     npage, prot, phys_pfn);
1988	else
1989		ret = -ENOTTY;
1990
1991	return ret;
1992}
1993EXPORT_SYMBOL(vfio_group_pin_pages);
1994
1995/*
1996 * Unpin a set of guest IOVA PFNs for a VFIO group.
1997 *
1998 * The caller needs to call vfio_group_get_external_user() or
1999 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2000 * so as to prevent the VFIO group from disposal in the middle of the call.
2001 * But it can keep the reference to the VFIO group for several calls into
2002 * this interface.
2003 * After finishing using of the VFIO group, the caller needs to release the
2004 * VFIO group by calling vfio_group_put_external_user().
2005 *
2006 * @group [in]		: vfio group
2007 * @user_iova_pfn [in]	: array of user/guest IOVA PFNs to be unpinned.
2008 * @npage [in]		: count of elements in user_iova_pfn array.
2009 *			  This count should not be greater than
2010 *			  VFIO_PIN_PAGES_MAX_ENTRIES.
2011 * Return error or number of pages unpinned.
2012 */
2013int vfio_group_unpin_pages(struct vfio_group *group,
2014			   unsigned long *user_iova_pfn, int npage)
2015{
2016	struct vfio_container *container;
2017	struct vfio_iommu_driver *driver;
2018	int ret;
2019
2020	if (!group || !user_iova_pfn || !npage)
2021		return -EINVAL;
2022
2023	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2024		return -E2BIG;
2025
2026	container = group->container;
2027	driver = container->iommu_driver;
2028	if (likely(driver && driver->ops->unpin_pages))
2029		ret = driver->ops->unpin_pages(container->iommu_data,
2030					       user_iova_pfn, npage);
2031	else
2032		ret = -ENOTTY;
2033
2034	return ret;
2035}
2036EXPORT_SYMBOL(vfio_group_unpin_pages);
2037
2038
2039/*
2040 * This interface allows the CPUs to perform some sort of virtual DMA on
2041 * behalf of the device.
2042 *
2043 * CPUs read/write from/into a range of IOVAs pointing to user space memory
2044 * into/from a kernel buffer.
2045 *
2046 * As the read/write of user space memory is conducted via the CPUs and is
2047 * not a real device DMA, it is not necessary to pin the user space memory.
2048 *
2049 * The caller needs to call vfio_group_get_external_user() or
2050 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2051 * so as to prevent the VFIO group from disposal in the middle of the call.
2052 * But it can keep the reference to the VFIO group for several calls into
2053 * this interface.
2054 * After finishing using of the VFIO group, the caller needs to release the
2055 * VFIO group by calling vfio_group_put_external_user().
2056 *
2057 * @group [in]		: VFIO group
2058 * @user_iova [in]	: base IOVA of a user space buffer
2059 * @data [in]		: pointer to kernel buffer
2060 * @len [in]		: kernel buffer length
2061 * @write		: indicate read or write
2062 * Return error code on failure or 0 on success.
2063 */
2064int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
2065		void *data, size_t len, bool write)
2066{
2067	struct vfio_container *container;
2068	struct vfio_iommu_driver *driver;
2069	int ret = 0;
2070
2071	if (!group || !data || len <= 0)
2072		return -EINVAL;
2073
2074	container = group->container;
2075	driver = container->iommu_driver;
2076
2077	if (likely(driver && driver->ops->dma_rw))
2078		ret = driver->ops->dma_rw(container->iommu_data,
2079					  user_iova, data, len, write);
2080	else
2081		ret = -ENOTTY;
2082
2083	return ret;
2084}
2085EXPORT_SYMBOL(vfio_dma_rw);
2086
2087static int vfio_register_iommu_notifier(struct vfio_group *group,
2088					unsigned long *events,
2089					struct notifier_block *nb)
2090{
2091	struct vfio_container *container;
2092	struct vfio_iommu_driver *driver;
2093	int ret;
2094
2095	ret = vfio_group_add_container_user(group);
2096	if (ret)
2097		return -EINVAL;
2098
2099	container = group->container;
2100	driver = container->iommu_driver;
2101	if (likely(driver && driver->ops->register_notifier))
2102		ret = driver->ops->register_notifier(container->iommu_data,
2103						     events, nb);
2104	else
2105		ret = -ENOTTY;
2106
2107	vfio_group_try_dissolve_container(group);
2108
2109	return ret;
2110}
2111
2112static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2113					  struct notifier_block *nb)
2114{
2115	struct vfio_container *container;
2116	struct vfio_iommu_driver *driver;
2117	int ret;
2118
2119	ret = vfio_group_add_container_user(group);
2120	if (ret)
2121		return -EINVAL;
2122
2123	container = group->container;
2124	driver = container->iommu_driver;
2125	if (likely(driver && driver->ops->unregister_notifier))
2126		ret = driver->ops->unregister_notifier(container->iommu_data,
2127						       nb);
2128	else
2129		ret = -ENOTTY;
2130
2131	vfio_group_try_dissolve_container(group);
2132
2133	return ret;
2134}
2135
2136void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2137{
2138	group->kvm = kvm;
2139	blocking_notifier_call_chain(&group->notifier,
2140				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2141}
2142EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2143
2144static int vfio_register_group_notifier(struct vfio_group *group,
2145					unsigned long *events,
2146					struct notifier_block *nb)
2147{
2148	int ret;
2149	bool set_kvm = false;
2150
2151	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2152		set_kvm = true;
2153
2154	/* clear known events */
2155	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2156
2157	/* refuse to continue if still events remaining */
2158	if (*events)
2159		return -EINVAL;
2160
2161	ret = vfio_group_add_container_user(group);
2162	if (ret)
2163		return -EINVAL;
2164
2165	ret = blocking_notifier_chain_register(&group->notifier, nb);
2166
2167	/*
2168	 * The attaching of kvm and vfio_group might already happen, so
2169	 * here we replay once upon registration.
2170	 */
2171	if (!ret && set_kvm && group->kvm)
2172		blocking_notifier_call_chain(&group->notifier,
2173					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2174
2175	vfio_group_try_dissolve_container(group);
2176
2177	return ret;
2178}
2179
2180static int vfio_unregister_group_notifier(struct vfio_group *group,
2181					 struct notifier_block *nb)
2182{
2183	int ret;
2184
2185	ret = vfio_group_add_container_user(group);
2186	if (ret)
2187		return -EINVAL;
2188
2189	ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2190
2191	vfio_group_try_dissolve_container(group);
2192
2193	return ret;
2194}
2195
2196int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2197			   unsigned long *events, struct notifier_block *nb)
2198{
2199	struct vfio_group *group;
2200	int ret;
2201
2202	if (!dev || !nb || !events || (*events == 0))
2203		return -EINVAL;
2204
2205	group = vfio_group_get_from_dev(dev);
2206	if (!group)
2207		return -ENODEV;
2208
2209	switch (type) {
2210	case VFIO_IOMMU_NOTIFY:
2211		ret = vfio_register_iommu_notifier(group, events, nb);
2212		break;
2213	case VFIO_GROUP_NOTIFY:
2214		ret = vfio_register_group_notifier(group, events, nb);
2215		break;
2216	default:
2217		ret = -EINVAL;
2218	}
2219
2220	vfio_group_put(group);
2221	return ret;
2222}
2223EXPORT_SYMBOL(vfio_register_notifier);
2224
2225int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2226			     struct notifier_block *nb)
2227{
2228	struct vfio_group *group;
2229	int ret;
2230
2231	if (!dev || !nb)
2232		return -EINVAL;
2233
2234	group = vfio_group_get_from_dev(dev);
2235	if (!group)
2236		return -ENODEV;
2237
2238	switch (type) {
2239	case VFIO_IOMMU_NOTIFY:
2240		ret = vfio_unregister_iommu_notifier(group, nb);
2241		break;
2242	case VFIO_GROUP_NOTIFY:
2243		ret = vfio_unregister_group_notifier(group, nb);
2244		break;
2245	default:
2246		ret = -EINVAL;
2247	}
2248
2249	vfio_group_put(group);
2250	return ret;
2251}
2252EXPORT_SYMBOL(vfio_unregister_notifier);
2253
2254struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
2255{
2256	struct vfio_container *container;
2257	struct vfio_iommu_driver *driver;
2258
2259	if (!group)
2260		return ERR_PTR(-EINVAL);
2261
2262	container = group->container;
2263	driver = container->iommu_driver;
2264	if (likely(driver && driver->ops->group_iommu_domain))
2265		return driver->ops->group_iommu_domain(container->iommu_data,
2266						       group->iommu_group);
2267
2268	return ERR_PTR(-ENOTTY);
2269}
2270EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
2271
/*
 * Module/class support
 */
2275static char *vfio_devnode(struct device *dev, umode_t *mode)
2276{
2277	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2278}
2279
2280static struct miscdevice vfio_dev = {
2281	.minor = VFIO_MINOR,
2282	.name = "vfio",
2283	.fops = &vfio_fops,
2284	.nodename = "vfio/vfio",
2285	.mode = S_IRUGO | S_IWUGO,
2286};
2287
2288static int __init vfio_init(void)
2289{
2290	int ret;
2291
2292	idr_init(&vfio.group_idr);
2293	mutex_init(&vfio.group_lock);
2294	mutex_init(&vfio.iommu_drivers_lock);
2295	INIT_LIST_HEAD(&vfio.group_list);
2296	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2297
2298	ret = misc_register(&vfio_dev);
2299	if (ret) {
2300		pr_err("vfio: misc device register failed\n");
2301		return ret;
2302	}
2303
2304	/* /dev/vfio/$GROUP */
2305	vfio.class = class_create(THIS_MODULE, "vfio");
2306	if (IS_ERR(vfio.class)) {
2307		ret = PTR_ERR(vfio.class);
2308		goto err_class;
2309	}
2310
2311	vfio.class->devnode = vfio_devnode;
2312
2313	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2314	if (ret)
2315		goto err_alloc_chrdev;
2316
2317	cdev_init(&vfio.group_cdev, &vfio_group_fops);
2318	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
2319	if (ret)
2320		goto err_cdev_add;
2321
2322	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2323
2324#ifdef CONFIG_VFIO_NOIOMMU
2325	vfio_register_iommu_driver(&vfio_noiommu_ops);
2326#endif
2327	return 0;
2328
2329err_cdev_add:
2330	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2331err_alloc_chrdev:
2332	class_destroy(vfio.class);
2333	vfio.class = NULL;
2334err_class:
2335	misc_deregister(&vfio_dev);
2336	return ret;
2337}
2338
2339static void __exit vfio_cleanup(void)
2340{
2341	WARN_ON(!list_empty(&vfio.group_list));
2342
2343#ifdef CONFIG_VFIO_NOIOMMU
2344	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2345#endif
2346	idr_destroy(&vfio.group_idr);
2347	cdev_del(&vfio.group_cdev);
2348	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2349	class_destroy(vfio.class);
2350	vfio.class = NULL;
2351	misc_deregister(&vfio_dev);
2352}
2353
2354module_init(vfio_init);
2355module_exit(vfio_cleanup);
2356
2357MODULE_VERSION(DRIVER_VERSION);
2358MODULE_LICENSE("GPL v2");
2359MODULE_AUTHOR(DRIVER_AUTHOR);
2360MODULE_DESCRIPTION(DRIVER_DESC);
2361MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2362MODULE_ALIAS("devname:vfio/vfio");
2363MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
Configure Feed

Configure Feed