Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

vfio: VFIO core

VFIO is a secure user level driver for use with both virtual machines
and user level drivers. VFIO makes use of IOMMU groups to ensure the
isolation of devices in use, allowing unprivileged user access. It's
intended that VFIO will replace KVM device assignment and UIO drivers
(in cases where the target platform includes a sufficiently capable
IOMMU).

New in this version of VFIO is support for IOMMU groups managed
through the IOMMU core as well as a rework of the API, removing the
group merge interface. We now go back to a model more similar to
original VFIO with UIOMMU support where the file descriptor obtained
from /dev/vfio/vfio allows access to the IOMMU, but only after a
group is added, avoiding the previous privilege issues with this type
of model. IOMMU support is also now fully modular as IOMMUs have
vastly different interface requirements on different platforms. VFIO
users are able to query and initialize the IOMMU model of their
choice.

Please see the follow-on Documentation commit for further description
and usage example.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>

+1801
+1
Documentation/ioctl/ioctl-number.txt
··· 88 88 and kernel/power/user.c 89 89 '8' all SNP8023 advanced NIC card 90 90 <mailto:mcr@solidum.com> 91 + ';' 64-7F linux/vfio.h 91 92 '@' 00-0F linux/radeonfb.h conflict! 92 93 '@' 00-0F drivers/video/aty/aty128fb.c conflict! 93 94 'A' 00-1F linux/apm_bios.h conflict!
+8
MAINTAINERS
··· 7411 7411 F: Documentation/filesystems/vfat.txt 7412 7412 F: fs/fat/ 7413 7413 7414 + VFIO DRIVER 7415 + M: Alex Williamson <alex.williamson@redhat.com> 7416 + L: kvm@vger.kernel.org 7417 + S: Maintained 7418 + F: Documentation/vfio.txt 7419 + F: drivers/vfio/ 7420 + F: include/linux/vfio.h 7421 + 7414 7422 VIDEOBUF2 FRAMEWORK 7415 7423 M: Pawel Osciak <pawel@osciak.com> 7416 7424 M: Marek Szyprowski <m.szyprowski@samsung.com>
+2
drivers/Kconfig
··· 112 112 113 113 source "drivers/uio/Kconfig" 114 114 115 + source "drivers/vfio/Kconfig" 116 + 115 117 source "drivers/vlynq/Kconfig" 116 118 117 119 source "drivers/virtio/Kconfig"
+1
drivers/Makefile
··· 60 60 obj-$(CONFIG_FUSION) += message/ 61 61 obj-y += firewire/ 62 62 obj-$(CONFIG_UIO) += uio/ 63 + obj-$(CONFIG_VFIO) += vfio/ 63 64 obj-y += cdrom/ 64 65 obj-y += auxdisplay/ 65 66 obj-$(CONFIG_PCCARD) += pcmcia/
+8
drivers/vfio/Kconfig
··· 1 + menuconfig VFIO 2 + tristate "VFIO Non-Privileged userspace driver framework" 3 + depends on IOMMU_API 4 + help 5 + VFIO provides a framework for secure userspace device drivers. 6 + See Documentation/vfio.txt for more details. 7 + 8 + If you don't know what to do here, say N.
+1
drivers/vfio/Makefile
··· 1 + obj-$(CONFIG_VFIO) += vfio.o
+1413
drivers/vfio/vfio.c
··· 1 + /* 2 + * VFIO core 3 + * 4 + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 5 + * Author: Alex Williamson <alex.williamson@redhat.com> 6 + * 7 + * This program is free software; you can redistribute it and/or modify 8 + * it under the terms of the GNU General Public License version 2 as 9 + * published by the Free Software Foundation. 10 + * 11 + * Derived from original vfio: 12 + * Copyright 2010 Cisco Systems, Inc. All rights reserved. 13 + * Author: Tom Lyon, pugs@cisco.com 14 + */ 15 + 16 + #include <linux/cdev.h> 17 + #include <linux/compat.h> 18 + #include <linux/device.h> 19 + #include <linux/file.h> 20 + #include <linux/anon_inodes.h> 21 + #include <linux/fs.h> 22 + #include <linux/idr.h> 23 + #include <linux/iommu.h> 24 + #include <linux/list.h> 25 + #include <linux/module.h> 26 + #include <linux/mutex.h> 27 + #include <linux/sched.h> 28 + #include <linux/slab.h> 29 + #include <linux/string.h> 30 + #include <linux/uaccess.h> 31 + #include <linux/vfio.h> 32 + #include <linux/wait.h> 33 + 34 + #define DRIVER_VERSION "0.3" 35 + #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" 36 + #define DRIVER_DESC "VFIO - User Level meta-driver" 37 + 38 + static struct vfio { 39 + struct class *class; 40 + struct list_head iommu_drivers_list; 41 + struct mutex iommu_drivers_lock; 42 + struct list_head group_list; 43 + struct idr group_idr; 44 + struct mutex group_lock; 45 + struct cdev group_cdev; 46 + struct device *dev; 47 + dev_t devt; 48 + struct cdev cdev; 49 + wait_queue_head_t release_q; 50 + } vfio; 51 + 52 + struct vfio_iommu_driver { 53 + const struct vfio_iommu_driver_ops *ops; 54 + struct list_head vfio_next; 55 + }; 56 + 57 + struct vfio_container { 58 + struct kref kref; 59 + struct list_head group_list; 60 + struct mutex group_lock; 61 + struct vfio_iommu_driver *iommu_driver; 62 + void *iommu_data; 63 + }; 64 + 65 + struct vfio_group { 66 + struct kref kref; 67 + int minor; 68 + atomic_t container_users; 69 + struct 
iommu_group *iommu_group; 70 + struct vfio_container *container; 71 + struct list_head device_list; 72 + struct mutex device_lock; 73 + struct device *dev; 74 + struct notifier_block nb; 75 + struct list_head vfio_next; 76 + struct list_head container_next; 77 + }; 78 + 79 + struct vfio_device { 80 + struct kref kref; 81 + struct device *dev; 82 + const struct vfio_device_ops *ops; 83 + struct vfio_group *group; 84 + struct list_head group_next; 85 + void *device_data; 86 + }; 87 + 88 + /** 89 + * IOMMU driver registration 90 + */ 91 + int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops) 92 + { 93 + struct vfio_iommu_driver *driver, *tmp; 94 + 95 + driver = kzalloc(sizeof(*driver), GFP_KERNEL); 96 + if (!driver) 97 + return -ENOMEM; 98 + 99 + driver->ops = ops; 100 + 101 + mutex_lock(&vfio.iommu_drivers_lock); 102 + 103 + /* Check for duplicates */ 104 + list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) { 105 + if (tmp->ops == ops) { 106 + mutex_unlock(&vfio.iommu_drivers_lock); 107 + kfree(driver); 108 + return -EINVAL; 109 + } 110 + } 111 + 112 + list_add(&driver->vfio_next, &vfio.iommu_drivers_list); 113 + 114 + mutex_unlock(&vfio.iommu_drivers_lock); 115 + 116 + return 0; 117 + } 118 + EXPORT_SYMBOL_GPL(vfio_register_iommu_driver); 119 + 120 + void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops) 121 + { 122 + struct vfio_iommu_driver *driver; 123 + 124 + mutex_lock(&vfio.iommu_drivers_lock); 125 + list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { 126 + if (driver->ops == ops) { 127 + list_del(&driver->vfio_next); 128 + mutex_unlock(&vfio.iommu_drivers_lock); 129 + kfree(driver); 130 + return; 131 + } 132 + } 133 + mutex_unlock(&vfio.iommu_drivers_lock); 134 + } 135 + EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver); 136 + 137 + /** 138 + * Group minor allocation/free - both called with vfio.group_lock held 139 + */ 140 + static int vfio_alloc_group_minor(struct vfio_group *group) 141 + { 142 + 
int ret, minor; 143 + 144 + again: 145 + if (unlikely(idr_pre_get(&vfio.group_idr, GFP_KERNEL) == 0)) 146 + return -ENOMEM; 147 + 148 + /* index 0 is used by /dev/vfio/vfio */ 149 + ret = idr_get_new_above(&vfio.group_idr, group, 1, &minor); 150 + if (ret == -EAGAIN) 151 + goto again; 152 + if (ret || minor > MINORMASK) { 153 + if (minor > MINORMASK) 154 + idr_remove(&vfio.group_idr, minor); 155 + return -ENOSPC; 156 + } 157 + 158 + return minor; 159 + } 160 + 161 + static void vfio_free_group_minor(int minor) 162 + { 163 + idr_remove(&vfio.group_idr, minor); 164 + } 165 + 166 + static int vfio_iommu_group_notifier(struct notifier_block *nb, 167 + unsigned long action, void *data); 168 + static void vfio_group_get(struct vfio_group *group); 169 + 170 + /** 171 + * Container objects - containers are created when /dev/vfio/vfio is 172 + * opened, but their lifecycle extends until the last user is done, so 173 + * it's freed via kref. Must support container/group/device being 174 + * closed in any order. 
175 + */ 176 + static void vfio_container_get(struct vfio_container *container) 177 + { 178 + kref_get(&container->kref); 179 + } 180 + 181 + static void vfio_container_release(struct kref *kref) 182 + { 183 + struct vfio_container *container; 184 + container = container_of(kref, struct vfio_container, kref); 185 + 186 + kfree(container); 187 + } 188 + 189 + static void vfio_container_put(struct vfio_container *container) 190 + { 191 + kref_put(&container->kref, vfio_container_release); 192 + } 193 + 194 + /** 195 + * Group objects - create, release, get, put, search 196 + */ 197 + static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) 198 + { 199 + struct vfio_group *group, *tmp; 200 + struct device *dev; 201 + int ret, minor; 202 + 203 + group = kzalloc(sizeof(*group), GFP_KERNEL); 204 + if (!group) 205 + return ERR_PTR(-ENOMEM); 206 + 207 + kref_init(&group->kref); 208 + INIT_LIST_HEAD(&group->device_list); 209 + mutex_init(&group->device_lock); 210 + atomic_set(&group->container_users, 0); 211 + group->iommu_group = iommu_group; 212 + 213 + group->nb.notifier_call = vfio_iommu_group_notifier; 214 + 215 + /* 216 + * blocking notifiers acquire a rwsem around registering and hold 217 + * it around callback. Therefore, need to register outside of 218 + * vfio.group_lock to avoid A-B/B-A contention. Our callback won't 219 + * do anything unless it can find the group in vfio.group_list, so 220 + * no harm in registering early. 221 + */ 222 + ret = iommu_group_register_notifier(iommu_group, &group->nb); 223 + if (ret) { 224 + kfree(group); 225 + return ERR_PTR(ret); 226 + } 227 + 228 + mutex_lock(&vfio.group_lock); 229 + 230 + minor = vfio_alloc_group_minor(group); 231 + if (minor < 0) { 232 + mutex_unlock(&vfio.group_lock); 233 + kfree(group); 234 + return ERR_PTR(minor); 235 + } 236 + 237 + /* Did we race creating this group? 
*/ 238 + list_for_each_entry(tmp, &vfio.group_list, vfio_next) { 239 + if (tmp->iommu_group == iommu_group) { 240 + vfio_group_get(tmp); 241 + vfio_free_group_minor(minor); 242 + mutex_unlock(&vfio.group_lock); 243 + kfree(group); 244 + return tmp; 245 + } 246 + } 247 + 248 + dev = device_create(vfio.class, NULL, MKDEV(MAJOR(vfio.devt), minor), 249 + group, "%d", iommu_group_id(iommu_group)); 250 + if (IS_ERR(dev)) { 251 + vfio_free_group_minor(minor); 252 + mutex_unlock(&vfio.group_lock); 253 + kfree(group); 254 + return (struct vfio_group *)dev; /* ERR_PTR */ 255 + } 256 + 257 + group->minor = minor; 258 + group->dev = dev; 259 + 260 + list_add(&group->vfio_next, &vfio.group_list); 261 + 262 + mutex_unlock(&vfio.group_lock); 263 + 264 + return group; 265 + } 266 + 267 + static void vfio_group_release(struct kref *kref) 268 + { 269 + struct vfio_group *group = container_of(kref, struct vfio_group, kref); 270 + 271 + WARN_ON(!list_empty(&group->device_list)); 272 + 273 + device_destroy(vfio.class, MKDEV(MAJOR(vfio.devt), group->minor)); 274 + list_del(&group->vfio_next); 275 + vfio_free_group_minor(group->minor); 276 + 277 + mutex_unlock(&vfio.group_lock); 278 + 279 + /* 280 + * Unregister outside of lock. A spurious callback is harmless now 281 + * that the group is no longer in vfio.group_list. 282 + */ 283 + iommu_group_unregister_notifier(group->iommu_group, &group->nb); 284 + 285 + kfree(group); 286 + } 287 + 288 + static void vfio_group_put(struct vfio_group *group) 289 + { 290 + mutex_lock(&vfio.group_lock); 291 + /* 292 + * Release needs to unlock to unregister the notifier, so only 293 + * unlock if not released. 
294 + */ 295 + if (!kref_put(&group->kref, vfio_group_release)) 296 + mutex_unlock(&vfio.group_lock); 297 + } 298 + 299 + /* Assume group_lock or group reference is held */ 300 + static void vfio_group_get(struct vfio_group *group) 301 + { 302 + kref_get(&group->kref); 303 + } 304 + 305 + /* 306 + * Not really a try as we will sleep for mutex, but we need to make 307 + * sure the group pointer is valid under lock and get a reference. 308 + */ 309 + static struct vfio_group *vfio_group_try_get(struct vfio_group *group) 310 + { 311 + struct vfio_group *target = group; 312 + 313 + mutex_lock(&vfio.group_lock); 314 + list_for_each_entry(group, &vfio.group_list, vfio_next) { 315 + if (group == target) { 316 + vfio_group_get(group); 317 + mutex_unlock(&vfio.group_lock); 318 + return group; 319 + } 320 + } 321 + mutex_unlock(&vfio.group_lock); 322 + 323 + return NULL; 324 + } 325 + 326 + static 327 + struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group) 328 + { 329 + struct vfio_group *group; 330 + 331 + mutex_lock(&vfio.group_lock); 332 + list_for_each_entry(group, &vfio.group_list, vfio_next) { 333 + if (group->iommu_group == iommu_group) { 334 + vfio_group_get(group); 335 + mutex_unlock(&vfio.group_lock); 336 + return group; 337 + } 338 + } 339 + mutex_unlock(&vfio.group_lock); 340 + 341 + return NULL; 342 + } 343 + 344 + static struct vfio_group *vfio_group_get_from_minor(int minor) 345 + { 346 + struct vfio_group *group; 347 + 348 + mutex_lock(&vfio.group_lock); 349 + group = idr_find(&vfio.group_idr, minor); 350 + if (!group) { 351 + mutex_unlock(&vfio.group_lock); 352 + return NULL; 353 + } 354 + vfio_group_get(group); 355 + mutex_unlock(&vfio.group_lock); 356 + 357 + return group; 358 + } 359 + 360 + /** 361 + * Device objects - create, release, get, put, search 362 + */ 363 + static 364 + struct vfio_device *vfio_group_create_device(struct vfio_group *group, 365 + struct device *dev, 366 + const struct vfio_device_ops *ops, 367 + void 
*device_data) 368 + { 369 + struct vfio_device *device; 370 + int ret; 371 + 372 + device = kzalloc(sizeof(*device), GFP_KERNEL); 373 + if (!device) 374 + return ERR_PTR(-ENOMEM); 375 + 376 + kref_init(&device->kref); 377 + device->dev = dev; 378 + device->group = group; 379 + device->ops = ops; 380 + device->device_data = device_data; 381 + 382 + ret = dev_set_drvdata(dev, device); 383 + if (ret) { 384 + kfree(device); 385 + return ERR_PTR(ret); 386 + } 387 + 388 + /* No need to get group_lock, caller has group reference */ 389 + vfio_group_get(group); 390 + 391 + mutex_lock(&group->device_lock); 392 + list_add(&device->group_next, &group->device_list); 393 + mutex_unlock(&group->device_lock); 394 + 395 + return device; 396 + } 397 + 398 + static void vfio_device_release(struct kref *kref) 399 + { 400 + struct vfio_device *device = container_of(kref, 401 + struct vfio_device, kref); 402 + struct vfio_group *group = device->group; 403 + 404 + mutex_lock(&group->device_lock); 405 + list_del(&device->group_next); 406 + mutex_unlock(&group->device_lock); 407 + 408 + dev_set_drvdata(device->dev, NULL); 409 + 410 + kfree(device); 411 + 412 + /* vfio_del_group_dev may be waiting for this device */ 413 + wake_up(&vfio.release_q); 414 + } 415 + 416 + /* Device reference always implies a group reference */ 417 + static void vfio_device_put(struct vfio_device *device) 418 + { 419 + kref_put(&device->kref, vfio_device_release); 420 + vfio_group_put(device->group); 421 + } 422 + 423 + static void vfio_device_get(struct vfio_device *device) 424 + { 425 + vfio_group_get(device->group); 426 + kref_get(&device->kref); 427 + } 428 + 429 + static struct vfio_device *vfio_group_get_device(struct vfio_group *group, 430 + struct device *dev) 431 + { 432 + struct vfio_device *device; 433 + 434 + mutex_lock(&group->device_lock); 435 + list_for_each_entry(device, &group->device_list, group_next) { 436 + if (device->dev == dev) { 437 + vfio_device_get(device); 438 + 
mutex_unlock(&group->device_lock); 439 + return device; 440 + } 441 + } 442 + mutex_unlock(&group->device_lock); 443 + return NULL; 444 + } 445 + 446 + /* 447 + * Whitelist some drivers that we know are safe (no dma) or just sit on 448 + * a device. It's not always practical to leave a device within a group 449 + * driverless as it could get re-bound to something unsafe. 450 + */ 451 + static const char * const vfio_driver_whitelist[] = { "pci-stub" }; 452 + 453 + static bool vfio_whitelisted_driver(struct device_driver *drv) 454 + { 455 + int i; 456 + 457 + for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) { 458 + if (!strcmp(drv->name, vfio_driver_whitelist[i])) 459 + return true; 460 + } 461 + 462 + return false; 463 + } 464 + 465 + /* 466 + * A vfio group is viable for use by userspace if all devices are either 467 + * driver-less or bound to a vfio or whitelisted driver. We test the 468 + * latter by the existence of a struct vfio_device matching the dev. 469 + */ 470 + static int vfio_dev_viable(struct device *dev, void *data) 471 + { 472 + struct vfio_group *group = data; 473 + struct vfio_device *device; 474 + 475 + if (!dev->driver || vfio_whitelisted_driver(dev->driver)) 476 + return 0; 477 + 478 + device = vfio_group_get_device(group, dev); 479 + if (device) { 480 + vfio_device_put(device); 481 + return 0; 482 + } 483 + 484 + return -EINVAL; 485 + } 486 + 487 + /** 488 + * Async device support 489 + */ 490 + static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev) 491 + { 492 + struct vfio_device *device; 493 + 494 + /* Do we already know about it? 
We shouldn't */ 495 + device = vfio_group_get_device(group, dev); 496 + if (WARN_ON_ONCE(device)) { 497 + vfio_device_put(device); 498 + return 0; 499 + } 500 + 501 + /* Nothing to do for idle groups */ 502 + if (!atomic_read(&group->container_users)) 503 + return 0; 504 + 505 + /* TODO Prevent device auto probing */ 506 + WARN("Device %s added to live group %d!\n", dev_name(dev), 507 + iommu_group_id(group->iommu_group)); 508 + 509 + return 0; 510 + } 511 + 512 + static int vfio_group_nb_del_dev(struct vfio_group *group, struct device *dev) 513 + { 514 + struct vfio_device *device; 515 + 516 + /* 517 + * Expect to fall out here. If a device was in use, it would 518 + * have been bound to a vfio sub-driver, which would have blocked 519 + * in .remove at vfio_del_group_dev. Sanity check that we no 520 + * longer track the device, so it's safe to remove. 521 + */ 522 + device = vfio_group_get_device(group, dev); 523 + if (likely(!device)) 524 + return 0; 525 + 526 + WARN("Device %s removed from live group %d!\n", dev_name(dev), 527 + iommu_group_id(group->iommu_group)); 528 + 529 + vfio_device_put(device); 530 + return 0; 531 + } 532 + 533 + static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev) 534 + { 535 + /* We don't care what happens when the group isn't in use */ 536 + if (!atomic_read(&group->container_users)) 537 + return 0; 538 + 539 + return vfio_dev_viable(dev, group); 540 + } 541 + 542 + static int vfio_iommu_group_notifier(struct notifier_block *nb, 543 + unsigned long action, void *data) 544 + { 545 + struct vfio_group *group = container_of(nb, struct vfio_group, nb); 546 + struct device *dev = data; 547 + 548 + /* 549 + * Need to go through a group_lock lookup to get a reference or 550 + * we risk racing a group being removed. Leave a WARN_ON for 551 + * debuging, but if the group no longer exists, a spurious notify 552 + * is harmless. 
553 + */ 554 + group = vfio_group_try_get(group); 555 + if (WARN_ON(!group)) 556 + return NOTIFY_OK; 557 + 558 + switch (action) { 559 + case IOMMU_GROUP_NOTIFY_ADD_DEVICE: 560 + vfio_group_nb_add_dev(group, dev); 561 + break; 562 + case IOMMU_GROUP_NOTIFY_DEL_DEVICE: 563 + vfio_group_nb_del_dev(group, dev); 564 + break; 565 + case IOMMU_GROUP_NOTIFY_BIND_DRIVER: 566 + pr_debug("%s: Device %s, group %d binding to driver\n", 567 + __func__, dev_name(dev), 568 + iommu_group_id(group->iommu_group)); 569 + break; 570 + case IOMMU_GROUP_NOTIFY_BOUND_DRIVER: 571 + pr_debug("%s: Device %s, group %d bound to driver %s\n", 572 + __func__, dev_name(dev), 573 + iommu_group_id(group->iommu_group), dev->driver->name); 574 + BUG_ON(vfio_group_nb_verify(group, dev)); 575 + break; 576 + case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER: 577 + pr_debug("%s: Device %s, group %d unbinding from driver %s\n", 578 + __func__, dev_name(dev), 579 + iommu_group_id(group->iommu_group), dev->driver->name); 580 + break; 581 + case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER: 582 + pr_debug("%s: Device %s, group %d unbound from driver\n", 583 + __func__, dev_name(dev), 584 + iommu_group_id(group->iommu_group)); 585 + /* 586 + * XXX An unbound device in a live group is ok, but we'd 587 + * really like to avoid the above BUG_ON by preventing other 588 + * drivers from binding to it. Once that occurs, we have to 589 + * stop the system to maintain isolation. At a minimum, we'd 590 + * want a toggle to disable driver auto probe for this device. 
591 + */ 592 + break; 593 + } 594 + 595 + vfio_group_put(group); 596 + return NOTIFY_OK; 597 + } 598 + 599 + /** 600 + * VFIO driver API 601 + */ 602 + int vfio_add_group_dev(struct device *dev, 603 + const struct vfio_device_ops *ops, void *device_data) 604 + { 605 + struct iommu_group *iommu_group; 606 + struct vfio_group *group; 607 + struct vfio_device *device; 608 + 609 + iommu_group = iommu_group_get(dev); 610 + if (!iommu_group) 611 + return -EINVAL; 612 + 613 + group = vfio_group_get_from_iommu(iommu_group); 614 + if (!group) { 615 + group = vfio_create_group(iommu_group); 616 + if (IS_ERR(group)) { 617 + iommu_group_put(iommu_group); 618 + return PTR_ERR(group); 619 + } 620 + } 621 + 622 + device = vfio_group_get_device(group, dev); 623 + if (device) { 624 + WARN(1, "Device %s already exists on group %d\n", 625 + dev_name(dev), iommu_group_id(iommu_group)); 626 + vfio_device_put(device); 627 + vfio_group_put(group); 628 + iommu_group_put(iommu_group); 629 + return -EBUSY; 630 + } 631 + 632 + device = vfio_group_create_device(group, dev, ops, device_data); 633 + if (IS_ERR(device)) { 634 + vfio_group_put(group); 635 + iommu_group_put(iommu_group); 636 + return PTR_ERR(device); 637 + } 638 + 639 + /* 640 + * Added device holds reference to iommu_group and vfio_device 641 + * (which in turn holds reference to vfio_group). Drop extra 642 + * group reference used while acquiring device. 
643 + */ 644 + vfio_group_put(group); 645 + 646 + return 0; 647 + } 648 + EXPORT_SYMBOL_GPL(vfio_add_group_dev); 649 + 650 + /* Test whether a struct device is present in our tracking */ 651 + static bool vfio_dev_present(struct device *dev) 652 + { 653 + struct iommu_group *iommu_group; 654 + struct vfio_group *group; 655 + struct vfio_device *device; 656 + 657 + iommu_group = iommu_group_get(dev); 658 + if (!iommu_group) 659 + return false; 660 + 661 + group = vfio_group_get_from_iommu(iommu_group); 662 + if (!group) { 663 + iommu_group_put(iommu_group); 664 + return false; 665 + } 666 + 667 + device = vfio_group_get_device(group, dev); 668 + if (!device) { 669 + vfio_group_put(group); 670 + iommu_group_put(iommu_group); 671 + return false; 672 + } 673 + 674 + vfio_device_put(device); 675 + vfio_group_put(group); 676 + iommu_group_put(iommu_group); 677 + return true; 678 + } 679 + 680 + /* 681 + * Decrement the device reference count and wait for the device to be 682 + * removed. Open file descriptors for the device... 
*/ 683 + void *vfio_del_group_dev(struct device *dev) 684 + { 685 + struct vfio_device *device = dev_get_drvdata(dev); 686 + struct vfio_group *group = device->group; 687 + struct iommu_group *iommu_group = group->iommu_group; 688 + void *device_data = device->device_data; 689 + 690 + vfio_device_put(device); 691 + 692 + /* TODO send a signal to encourage this to be released */ 693 + wait_event(vfio.release_q, !vfio_dev_present(dev)); 694 + 695 + iommu_group_put(iommu_group); 696 + 697 + return device_data; 698 + } 699 + EXPORT_SYMBOL_GPL(vfio_del_group_dev); 700 + 701 + /** 702 + * VFIO base fd, /dev/vfio/vfio 703 + */ 704 + static long vfio_ioctl_check_extension(struct vfio_container *container, 705 + unsigned long arg) 706 + { 707 + struct vfio_iommu_driver *driver = container->iommu_driver; 708 + long ret = 0; 709 + 710 + switch (arg) { 711 + /* No base extensions yet */ 712 + default: 713 + /* 714 + * If no driver is set, poll all registered drivers for 715 + * extensions and return the first positive result. If 716 + * a driver is already set, further queries will be passed 717 + * only to that driver. 
718 + */ 719 + if (!driver) { 720 + mutex_lock(&vfio.iommu_drivers_lock); 721 + list_for_each_entry(driver, &vfio.iommu_drivers_list, 722 + vfio_next) { 723 + if (!try_module_get(driver->ops->owner)) 724 + continue; 725 + 726 + ret = driver->ops->ioctl(NULL, 727 + VFIO_CHECK_EXTENSION, 728 + arg); 729 + module_put(driver->ops->owner); 730 + if (ret > 0) 731 + break; 732 + } 733 + mutex_unlock(&vfio.iommu_drivers_lock); 734 + } else 735 + ret = driver->ops->ioctl(container->iommu_data, 736 + VFIO_CHECK_EXTENSION, arg); 737 + } 738 + 739 + return ret; 740 + } 741 + 742 + /* hold container->group_lock */ 743 + static int __vfio_container_attach_groups(struct vfio_container *container, 744 + struct vfio_iommu_driver *driver, 745 + void *data) 746 + { 747 + struct vfio_group *group; 748 + int ret = -ENODEV; 749 + 750 + list_for_each_entry(group, &container->group_list, container_next) { 751 + ret = driver->ops->attach_group(data, group->iommu_group); 752 + if (ret) 753 + goto unwind; 754 + } 755 + 756 + return ret; 757 + 758 + unwind: 759 + list_for_each_entry_continue_reverse(group, &container->group_list, 760 + container_next) { 761 + driver->ops->detach_group(data, group->iommu_group); 762 + } 763 + 764 + return ret; 765 + } 766 + 767 + static long vfio_ioctl_set_iommu(struct vfio_container *container, 768 + unsigned long arg) 769 + { 770 + struct vfio_iommu_driver *driver; 771 + long ret = -ENODEV; 772 + 773 + mutex_lock(&container->group_lock); 774 + 775 + /* 776 + * The container is designed to be an unprivileged interface while 777 + * the group can be assigned to specific users. Therefore, only by 778 + * adding a group to a container does the user get the privilege of 779 + * enabling the iommu, which may allocate finite resources. There 780 + * is no unset_iommu, but by removing all the groups from a container, 781 + * the container is deprivileged and returns to an unset state. 
782 + */ 783 + if (list_empty(&container->group_list) || container->iommu_driver) { 784 + mutex_unlock(&container->group_lock); 785 + return -EINVAL; 786 + } 787 + 788 + mutex_lock(&vfio.iommu_drivers_lock); 789 + list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { 790 + void *data; 791 + 792 + if (!try_module_get(driver->ops->owner)) 793 + continue; 794 + 795 + /* 796 + * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION, 797 + * so test which iommu driver reported support for this 798 + * extension and call open on them. We also pass them the 799 + * magic, allowing a single driver to support multiple 800 + * interfaces if they'd like. 801 + */ 802 + if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) { 803 + module_put(driver->ops->owner); 804 + continue; 805 + } 806 + 807 + /* module reference holds the driver we're working on */ 808 + mutex_unlock(&vfio.iommu_drivers_lock); 809 + 810 + data = driver->ops->open(arg); 811 + if (IS_ERR(data)) { 812 + ret = PTR_ERR(data); 813 + module_put(driver->ops->owner); 814 + goto skip_drivers_unlock; 815 + } 816 + 817 + ret = __vfio_container_attach_groups(container, driver, data); 818 + if (!ret) { 819 + container->iommu_driver = driver; 820 + container->iommu_data = data; 821 + } else { 822 + driver->ops->release(data); 823 + module_put(driver->ops->owner); 824 + } 825 + 826 + goto skip_drivers_unlock; 827 + } 828 + 829 + mutex_unlock(&vfio.iommu_drivers_lock); 830 + skip_drivers_unlock: 831 + mutex_unlock(&container->group_lock); 832 + 833 + return ret; 834 + } 835 + 836 + static long vfio_fops_unl_ioctl(struct file *filep, 837 + unsigned int cmd, unsigned long arg) 838 + { 839 + struct vfio_container *container = filep->private_data; 840 + struct vfio_iommu_driver *driver; 841 + void *data; 842 + long ret = -EINVAL; 843 + 844 + if (!container) 845 + return ret; 846 + 847 + driver = container->iommu_driver; 848 + data = container->iommu_data; 849 + 850 + switch (cmd) { 851 + case 
VFIO_GET_API_VERSION: 852 + ret = VFIO_API_VERSION; 853 + break; 854 + case VFIO_CHECK_EXTENSION: 855 + ret = vfio_ioctl_check_extension(container, arg); 856 + break; 857 + case VFIO_SET_IOMMU: 858 + ret = vfio_ioctl_set_iommu(container, arg); 859 + break; 860 + default: 861 + if (driver) /* passthrough all unrecognized ioctls */ 862 + ret = driver->ops->ioctl(data, cmd, arg); 863 + } 864 + 865 + return ret; 866 + } 867 + 868 + #ifdef CONFIG_COMPAT 869 + static long vfio_fops_compat_ioctl(struct file *filep, 870 + unsigned int cmd, unsigned long arg) 871 + { 872 + arg = (unsigned long)compat_ptr(arg); 873 + return vfio_fops_unl_ioctl(filep, cmd, arg); 874 + } 875 + #endif /* CONFIG_COMPAT */ 876 + 877 + static int vfio_fops_open(struct inode *inode, struct file *filep) 878 + { 879 + struct vfio_container *container; 880 + 881 + container = kzalloc(sizeof(*container), GFP_KERNEL); 882 + if (!container) 883 + return -ENOMEM; 884 + 885 + INIT_LIST_HEAD(&container->group_list); 886 + mutex_init(&container->group_lock); 887 + kref_init(&container->kref); 888 + 889 + filep->private_data = container; 890 + 891 + return 0; 892 + } 893 + 894 + static int vfio_fops_release(struct inode *inode, struct file *filep) 895 + { 896 + struct vfio_container *container = filep->private_data; 897 + 898 + filep->private_data = NULL; 899 + 900 + vfio_container_put(container); 901 + 902 + return 0; 903 + } 904 + 905 + /* 906 + * Once an iommu driver is set, we optionally pass read/write/mmap 907 + * on to the driver, allowing management interfaces beyond ioctl. 
 */
/*
 * read(2) on the container fd.  Forwards to the read callback of the
 * container's IOMMU driver, handing it container->iommu_data.
 *
 * Return: whatever the driver callback returns, or -EINVAL when no IOMMU
 * driver is set on the container or the driver has no read callback.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver = container->iommu_driver;

	if (unlikely(!driver || !driver->ops->read))
		return -EINVAL;

	return driver->ops->read(container->iommu_data, buf, count, ppos);
}

/*
 * write(2) on the container fd.  Same forwarding scheme as the read
 * handler above, using the driver's write callback.
 *
 * Return: the driver callback's result, or -EINVAL when no IOMMU driver
 * is set or it has no write callback.
 */
static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver = container->iommu_driver;

	if (unlikely(!driver || !driver->ops->write))
		return -EINVAL;

	return driver->ops->write(container->iommu_data, buf, count, ppos);
}

/*
 * mmap(2) on the container fd.  Forwards @vma to the IOMMU driver's mmap
 * callback together with container->iommu_data.
 *
 * Return: the driver callback's result, or -EINVAL when no IOMMU driver
 * is set or it has no mmap callback.
 */
static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver = container->iommu_driver;

	if (unlikely(!driver || !driver->ops->mmap))
		return -EINVAL;

	return driver->ops->mmap(container->iommu_data, vma);
}

/*
 * File operations of the container fd.  vfio_init() registers them on
 * vfio.cdev, and vfio_group_set_container() compares f_op against this
 * table to recognise a container file.
 */
static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_fops_compat_ioctl,
#endif
	.mmap		= vfio_fops_mmap,
};

/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */

/*
 * Unconditionally remove @group from the container it is attached to.
 * Callers in this file only get here after moving
 * group->container_users to zero.
 *
 * Under container->group_lock: the group is detached from the IOMMU driver
 * (if one is set), unlinked from the container's group list, and, if that
 * leaves the list empty, the IOMMU driver state is released and the
 * container's driver/data pointers are cleared.  The container reference
 * is dropped only after the lock has been released.
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	mutex_lock(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	mutex_unlock(&container->group_lock);

	/* Pairs with the vfio_container_get() in vfio_group_set_container() */
	vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 *
 * Return: 0 on success, -EINVAL if no container was set (count was 0),
 * -EBUSY if the user count was anything other than 1.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	/* atomic_cmpxchg() returns the old value; the swap happened iff it was 1 */
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 *
 * atomic_dec_if_positive() leaves a zero count untouched and returns a
 * negative value in that case, so only the 1->0 transition unsets the
 * container.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
	if (0 == atomic_dec_if_positive(&group->container_users))
		__vfio_group_unset_container(group);
}

/*
 * VFIO_GROUP_SET_CONTAINER backend: attach @group to the container behind
 * @container_fd.
 *
 * If the container already has an IOMMU driver, the group is first offered
 * to the driver's attach_group callback; a failure there aborts the
 * operation.  On success the group is linked into the container's group
 * list, a container reference is taken and container_users goes to 1.
 *
 * Return: 0 on success, -EINVAL if the group already has container users
 * or @container_fd is not a VFIO container file, -EBADF if @container_fd
 * is not an open file, or the attach_group error.
 */
static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct file *filep;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;

	filep = fget(container_fd);
	if (!filep)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (filep->f_op != &vfio_fops) {
		fput(filep);
		return -EINVAL;
	}

	container = filep->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	mutex_lock(&container->group_lock);

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	mutex_unlock(&container->group_lock);
	/* The file reference was only needed to reach the container */
	fput(filep);

	return ret;
}

/*
 * A group is reported viable when vfio_dev_viable (defined outside this
 * view) returns 0 for every device iommu_group_for_each_dev() walks in
 * the IOMMU group.
 */
static bool vfio_group_viable(struct vfio_group *group)
{
	return (iommu_group_for_each_dev(group->iommu_group,
					 group, vfio_dev_viable) == 0);
}

/* Forward declaration; the table is defined after the device fd handlers */
static const struct file_operations vfio_device_fops;

/*
 * VFIO_GROUP_GET_DEVICE_FD backend: create a new file descriptor for the
 * device in @group whose dev_name() equals @buf.
 *
 * Requires the group to have a container user, the container to have an
 * IOMMU driver, and the group to be viable.  The matching device's open
 * callback is invoked before the fd is created and its release callback
 * undoes that if fd or file allocation fails.  On success a device
 * reference is taken and the group's container_users count is raised.
 *
 * Return: the new fd on success; -EINVAL if the preconditions are not met;
 * -ENODEV if no device matches; otherwise the error from the open
 * callback, get_unused_fd() or anon_inode_getfile().
 */
static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int ret = -ENODEV;

	if (0 == atomic_read(&group->container_users) ||
	    !group->container->iommu_driver || !vfio_group_viable(group))
		return -EINVAL;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (strcmp(dev_name(device->dev), buf))
			continue;

		ret = device->ops->open(device->device_data);
		if (ret)
			break;
		/*
		 * We can't use anon_inode_getfd() because we need to modify
		 * the f_mode flags directly to allow more than just ioctls
		 */
		ret = get_unused_fd();
		if (ret < 0) {
			device->ops->release(device->device_data);
			break;
		}

		filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
					   device, O_RDWR);
		if (IS_ERR(filep)) {
			put_unused_fd(ret);
			ret = PTR_ERR(filep);
			device->ops->release(device->device_data);
			break;
		}

		/*
		 * TODO: add an anon_inode interface to do this.
		 * Appears to be missing by lack of need rather than
		 * explicitly prevented.  Now there's need.
		 */
		filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

		fd_install(ret, filep);

		vfio_device_get(device);
		atomic_inc(&group->container_users);
		break;
	}
	mutex_unlock(&group->device_lock);

	return ret;
}

/*
 * ioctl(2) dispatcher for the group fd.
 *
 * VFIO_GROUP_GET_STATUS     - copies in argsz/flags, rejects an argsz
 *                             smaller than the structure up to 'flags',
 *                             then reports the VIABLE and CONTAINER_SET
 *                             flags back to userspace.
 * VFIO_GROUP_SET_CONTAINER  - reads an int fd from userspace and attaches
 *                             the group to that container.
 * VFIO_GROUP_UNSET_CONTAINER - detaches the group from its container.
 * VFIO_GROUP_GET_DEVICE_FD  - copies in a device name string (bounded by
 *                             PAGE_SIZE) and returns a new device fd.
 *
 * Return: the per-command result, -EFAULT on a failed user copy, or
 * -ENOTTY for any command not listed above.
 */
static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		/* Flags are output only; whatever userspace passed is discarded */
		status.flags = 0;

		if (vfio_group_viable(group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;

		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		ret = vfio_group_set_container(group, fd);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		ret = vfio_group_unset_container(group);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}

#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry for the group fd: converts @arg with compat_ptr()
 * and reuses the native handler.
 */
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_group_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

/*
 * open(2) on /dev/vfio/$GROUP.  Resolves the group from the inode's minor
 * number and stores it in filep->private_data.
 *
 * NOTE(review): the vfio_group_put() calls here and in the release
 * handler suggest vfio_group_get_from_minor() returns with a reference
 * held; its definition is outside this view.
 *
 * Return: 0 on success, -ENODEV if no group matches the minor, -EBUSY if
 * the group is already attached to a container.
 */
static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group;

	group = vfio_group_get_from_minor(iminor(inode));
	if (!group)
		return -ENODEV;

	if (group->container) {
		vfio_group_put(group);
		return -EBUSY;
	}

	filep->private_data = group;

	return 0;
}

/*
 * release for the group fd: gives up this fd's container user (dissolving
 * the container attachment if it was the last one) and then drops the
 * group reference stored in private_data.
 */
static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	vfio_group_try_dissolve_container(group);

	vfio_group_put(group);

	return 0;
}

/* File operations of the group fd; registered on vfio.group_cdev in vfio_init() */
static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_group_fops_compat_ioctl,
#endif
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};

/**
 * VFIO Device fd
 */

/*
 * release for a device fd: calls the bus driver's release callback, gives
 * up the container user taken in vfio_group_get_device_fd(), and drops the
 * device reference taken there.
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	device->ops->release(device->device_data);

	vfio_group_try_dissolve_container(device->group);

	vfio_device_put(device);

	return 0;
}

/*
 * ioctl(2) on a device fd: forwarded unchanged to the bus driver's ioctl
 * callback.  Returns -EINVAL if the driver provides none.
 */
static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->ioctl))
		return -EINVAL;

	return device->ops->ioctl(device->device_data, cmd, arg);
}

/*
 * read(2) on a device fd: forwarded to the bus driver's read callback.
 * Returns -EINVAL if the driver provides none.
 */
static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device->device_data, buf, count, ppos);
}

/*
 * write(2) on a device fd: forwarded to the bus driver's write callback.
 * Returns -EINVAL if the driver provides none.
 */
static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device->device_data, buf, count, ppos);
}

/*
 * mmap(2) on a device fd: forwarded to the bus driver's mmap callback.
 * Returns -EINVAL if the driver provides none.
 */
static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device->device_data, vma);
}

#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry for a device fd: converts @arg with compat_ptr() and
 * reuses the native handler.
 */
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_device_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

/*
 * File operations of the anonymous-inode device fd created by
 * vfio_group_get_device_fd().  There is no .open: the bus driver's open
 * callback is invoked directly when the fd is created.
 */
static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_device_fops_compat_ioctl,
#endif
	.mmap		= vfio_device_fops_mmap,
};

/**
 * Module/class support
 */

/*
 * devnode callback of the vfio class: names each device node
 * "vfio/<dev_name>".  @mode is left untouched.  The string is allocated
 * with kasprintf(); NULL is returned on allocation failure.
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

/*
 * Module init: initialise the global vfio state, create the "vfio" class,
 * reserve MINORMASK minors, register the container cdev on the first one
 * together with its "vfio" device, and register the group cdev on the
 * remaining MINORMASK - 1 minors starting at minor 1.
 *
 * The error labels unwind in exact reverse order of setup; each label
 * undoes the step that preceded the failing one.
 *
 * Return: 0 on success or the error of the failing step.
 */
static int __init vfio_init(void)
{
	int ret;

	idr_init(&vfio.group_idr);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
	init_waitqueue_head(&vfio.release_q);

	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio");
	if (ret)
		goto err_base_chrdev;

	cdev_init(&vfio.cdev, &vfio_fops);
	ret = cdev_add(&vfio.cdev, vfio.devt, 1);
	if (ret)
		goto err_base_cdev;

	vfio.dev = device_create(vfio.class, NULL, vfio.devt, NULL, "vfio");
	if (IS_ERR(vfio.dev)) {
		ret = PTR_ERR(vfio.dev);
		goto err_base_dev;
	}

	/* /dev/vfio/$GROUP */
	cdev_init(&vfio.group_cdev, &vfio_group_fops);
	ret = cdev_add(&vfio.group_cdev,
		       MKDEV(MAJOR(vfio.devt), 1), MINORMASK - 1);
	if (ret)
		goto err_groups_cdev;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

	return 0;

err_groups_cdev:
	device_destroy(vfio.class, vfio.devt);
err_base_dev:
	cdev_del(&vfio.cdev);
err_base_cdev:
	unregister_chrdev_region(vfio.devt, MINORMASK);
err_base_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_class:
	return ret;
}

/*
 * Module exit: warns if any group is still registered, then tears down
 * everything vfio_init() set up.
 */
static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

	idr_destroy(&vfio.group_idr);
	cdev_del(&vfio.group_cdev);
	device_destroy(vfio.class, vfio.devt);
	cdev_del(&vfio.cdev);
	unregister_chrdev_region(vfio.devt, MINORMASK);
	class_destroy(vfio.class);
	vfio.class = NULL;
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
+367
include/linux/vfio.h
/*
 * VFIO API definition
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#ifndef VFIO_H
#define VFIO_H

#include <linux/types.h>
#include <linux/ioctl.h>

#define VFIO_API_VERSION	0

#ifdef __KERNEL__	/* Internal VFIO-core/bus driver API */

#include <linux/iommu.h>
#include <linux/mm.h>

/**
 * struct vfio_device_ops - VFIO bus driver device callbacks
 *
 * @name: Name of the bus driver
 * @open: Called when userspace creates new file descriptor for device
 * @release: Called when userspace releases file descriptor for device
 * @read: Perform read(2) on device file descriptor
 * @write: Perform write(2) on device file descriptor
 * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_*
 *         operations documented below
 * @mmap: Perform mmap(2) on a region of the device file descriptor
 *
 * Every callback receives the device_data pointer that was registered with
 * vfio_add_group_dev().  The VFIO core returns -EINVAL to userspace for
 * read, write, ioctl and mmap when the corresponding callback is NULL;
 * open and release are called unconditionally.
 */
struct vfio_device_ops {
	char	*name;
	int	(*open)(void *device_data);
	void	(*release)(void *device_data);
	ssize_t	(*read)(void *device_data, char __user *buf,
			size_t count, loff_t *ppos);
	ssize_t	(*write)(void *device_data, const char __user *buf,
			 size_t count, loff_t *ppos);
	long	(*ioctl)(void *device_data, unsigned int cmd,
			 unsigned long arg);
	int	(*mmap)(void *device_data, struct vm_area_struct *vma);
};

extern int vfio_add_group_dev(struct device *dev,
			      const struct vfio_device_ops *ops,
			      void *device_data);

extern void *vfio_del_group_dev(struct device *dev);

/**
 * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
 *
 * @name: Name of the IOMMU driver
 * @owner: Module providing the driver; the VFIO core drops a module
 *         reference on it when the last group leaves a container
 * @open: Creates the driver's per-container state and returns it as the
 *        iommu_data handed to the other callbacks (NOTE(review): call
 *        site and meaning of @arg are in the VFIO core, confirm there)
 * @release: Called with iommu_data when the last group is detached from
 *           the container
 * @read: Optional; backs read(2) on the container file descriptor
 * @write: Optional; backs write(2) on the container file descriptor
 * @ioctl: ioctl(2) on the container file descriptor (NOTE(review):
 *         presumably forwarded once an IOMMU driver is set, confirm)
 * @mmap: Optional; backs mmap(2) on the container file descriptor
 * @attach_group: Called when a group is added to a container that already
 *                has this driver set; a non-zero return rejects the group
 * @detach_group: Called when a group is removed from the container
 */
struct vfio_iommu_driver_ops {
	char		*name;
	struct module	*owner;
	void		*(*open)(unsigned long arg);
	void		(*release)(void *iommu_data);
	ssize_t		(*read)(void *iommu_data, char __user *buf,
				size_t count, loff_t *ppos);
	ssize_t		(*write)(void *iommu_data, const char __user *buf,
				 size_t count, loff_t *ppos);
	long		(*ioctl)(void *iommu_data, unsigned int cmd,
				 unsigned long arg);
	int		(*mmap)(void *iommu_data, struct vm_area_struct *vma);
	int		(*attach_group)(void *iommu_data,
					struct iommu_group *group);
	void		(*detach_group)(void *iommu_data,
					struct iommu_group *group);

};

extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);

extern void vfio_unregister_iommu_driver(
				const struct vfio_iommu_driver_ops *ops);

/**
 * offsetofend(TYPE, MEMBER)
 *
 * @TYPE: The type of the structure
 * @MEMBER: The member within the structure to get the end offset of
 *
 * Simple helper macro for dealing with variable sized structures passed
 * from user space.  This allows us to easily determine if the provided
 * structure is sized to include various fields.
 *
 * The result is an integer constant expression: the operand of sizeof is
 * never evaluated, so no object of TYPE is created or dereferenced.
 */
#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

#endif /* __KERNEL__ */

/* Kernel & User level defines for VFIO IOCTLs. */

/* Extensions */

/* None yet */

/*
 * The IOCTL interface is designed for extensibility by embedding the
 * structure length (argsz) and flags into structures passed between
 * kernel and userspace.  We therefore use the _IO() macro for these
 * defines to avoid implicitly embedding a size into the ioctl request.
 * As structure fields are added, argsz will increase to match and flag
 * bits will be defined to indicate additional fields with valid data.
 * It's *always* the caller's responsibility to indicate the size of
 * the structure passed by setting argsz appropriately.
 */

#define VFIO_TYPE	(';')
#define VFIO_BASE	100

/* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */

/**
 * VFIO_GET_API_VERSION - _IO(VFIO_TYPE, VFIO_BASE + 0)
 *
 * Report the version of the VFIO API.  This allows us to bump the entire
 * API version should we later need to add or change features in incompatible
 * ways.
 * Return: VFIO_API_VERSION
 * Availability: Always
 */
#define VFIO_GET_API_VERSION		_IO(VFIO_TYPE, VFIO_BASE + 0)

/**
 * VFIO_CHECK_EXTENSION - _IOW(VFIO_TYPE, VFIO_BASE + 1, __u32)
 *
 * Check whether an extension is supported.
 * Return: 0 if not supported, 1 (or some other positive integer) if supported.
 * Availability: Always
 */
#define VFIO_CHECK_EXTENSION		_IO(VFIO_TYPE, VFIO_BASE + 1)

/**
 * VFIO_SET_IOMMU - _IOW(VFIO_TYPE, VFIO_BASE + 2, __s32)
 *
 * Set the iommu to the given type.  The type must be supported by an
 * iommu driver as verified by calling CHECK_EXTENSION using the same
 * type.  A group must be set to this file descriptor before this
 * ioctl is available.  The IOMMU interfaces enabled by this call are
 * specific to the value set.
 * Return: 0 on success, -errno on failure
 * Availability: When VFIO group attached
 */
#define VFIO_SET_IOMMU			_IO(VFIO_TYPE, VFIO_BASE + 2)

/* -------- IOCTLs for GROUP file descriptors (/dev/vfio/$GROUP) -------- */

/**
 * VFIO_GROUP_GET_STATUS - _IOR(VFIO_TYPE, VFIO_BASE + 3,
 *						struct vfio_group_status)
 *
 * Retrieve information about the group.  Fills in provided
 * struct vfio_group_status.  Caller sets argsz.
 * Return: 0 on success, -errno on failure.
 * Availability: Always
 */
struct vfio_group_status {
	__u32	argsz;
	__u32	flags;
#define VFIO_GROUP_FLAGS_VIABLE		(1 << 0)
#define VFIO_GROUP_FLAGS_CONTAINER_SET	(1 << 1)
};
#define VFIO_GROUP_GET_STATUS		_IO(VFIO_TYPE, VFIO_BASE + 3)

/**
 * VFIO_GROUP_SET_CONTAINER - _IOW(VFIO_TYPE, VFIO_BASE + 4, __s32)
 *
 * Set the container for the VFIO group to the open VFIO file
 * descriptor provided.  Groups may only belong to a single
 * container.  Containers may, at their discretion, support multiple
 * groups.  Only when a container is set are all of the interfaces
 * of the VFIO file descriptor and the VFIO group file descriptor
 * available to the user.
 * Return: 0 on success, -errno on failure.
 * Availability: Always
 */
#define VFIO_GROUP_SET_CONTAINER	_IO(VFIO_TYPE, VFIO_BASE + 4)

/**
 * VFIO_GROUP_UNSET_CONTAINER - _IO(VFIO_TYPE, VFIO_BASE + 5)
 *
 * Remove the group from the attached container.  This is the
 * opposite of the SET_CONTAINER call and returns the group to
 * an initial state.  All device file descriptors must be released
 * prior to calling this interface.  When removing the last group
 * from a container, the IOMMU will be disabled and all state lost,
 * effectively also returning the VFIO file descriptor to an initial
 * state.
 * Return: 0 on success, -errno on failure.
 * Availability: When attached to container
 */
#define VFIO_GROUP_UNSET_CONTAINER	_IO(VFIO_TYPE, VFIO_BASE + 5)

/**
 * VFIO_GROUP_GET_DEVICE_FD - _IOW(VFIO_TYPE, VFIO_BASE + 6, char)
 *
 * Return a new file descriptor for the device object described by
 * the provided string.  The string should match a device listed in
 * the devices subdirectory of the IOMMU group sysfs entry.  The
 * group containing the device must already be added to this context.
 * Return: new file descriptor on success, -errno on failure.
 * Availability: When attached to container
 */
#define VFIO_GROUP_GET_DEVICE_FD	_IO(VFIO_TYPE, VFIO_BASE + 6)

/* --------------- IOCTLs for DEVICE file descriptors --------------- */

/**
 * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
 *						struct vfio_device_info)
 *
 * Retrieve information about the device.  Fills in provided
 * struct vfio_device_info.  Caller sets argsz.
 * Return: 0 on success, -errno on failure.
 */
struct vfio_device_info {
	__u32	argsz;
	__u32	flags;
#define VFIO_DEVICE_FLAGS_RESET	(1 << 0)	/* Device supports reset */
	__u32	num_regions;	/* Max region index + 1 */
	__u32	num_irqs;	/* Max IRQ index + 1 */
};
#define VFIO_DEVICE_GET_INFO		_IO(VFIO_TYPE, VFIO_BASE + 7)

/**
 * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
 *				       struct vfio_region_info)
 *
 * Retrieve information about a device region.  Caller provides
 * struct vfio_region_info with index value set.  Caller sets argsz.
 * Implementation of region mapping is bus driver specific.  This is
 * intended to describe MMIO, I/O port, as well as bus specific
 * regions (ex. PCI config space).  Zero sized regions may be used
 * to describe unimplemented regions (ex. unimplemented PCI BARs).
 * Return: 0 on success, -errno on failure.
 */
struct vfio_region_info {
	__u32	argsz;
	__u32	flags;
#define VFIO_REGION_INFO_FLAG_READ	(1 << 0) /* Region supports read */
#define VFIO_REGION_INFO_FLAG_WRITE	(1 << 1) /* Region supports write */
#define VFIO_REGION_INFO_FLAG_MMAP	(1 << 2) /* Region supports mmap */
	__u32	index;		/* Region index */
	__u32	resv;		/* Reserved for alignment */
	__u64	size;		/* Region size (bytes) */
	__u64	offset;		/* Region offset from start of device fd */
};
#define VFIO_DEVICE_GET_REGION_INFO	_IO(VFIO_TYPE, VFIO_BASE + 8)

/**
 * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
 *				    struct vfio_irq_info)
 *
 * Retrieve information about a device IRQ.  Caller provides
 * struct vfio_irq_info with index value set.  Caller sets argsz.
 * Implementation of IRQ mapping is bus driver specific.  Indexes
 * using multiple IRQs are primarily intended to support MSI-like
 * interrupt blocks.  Zero count irq blocks may be used to describe
 * unimplemented interrupt types.
 *
 * The EVENTFD flag indicates the interrupt index supports eventfd based
 * signaling.
 *
 * The MASKABLE flag indicates the index supports MASK and UNMASK
 * actions described below.
 *
 * AUTOMASKED indicates that after signaling, the interrupt line is
 * automatically masked by VFIO and the user needs to unmask the line
 * to receive new interrupts.  This is primarily intended to distinguish
 * level triggered interrupts.
 *
 * The NORESIZE flag indicates that the interrupt lines within the index
 * are setup as a set and new subindexes cannot be enabled without first
 * disabling the entire index.  This is used for interrupts like PCI MSI
 * and MSI-X where the driver may only use a subset of the available
 * indexes, but VFIO needs to enable a specific number of vectors
 * upfront.  In the case of MSI-X, where the user can enable MSI-X and
 * then add and unmask vectors, it's up to userspace to make the decision
 * whether to allocate the maximum supported number of vectors or tear
 * down setup and incrementally increase the vectors as each is enabled.
 */
struct vfio_irq_info {
	__u32	argsz;
	__u32	flags;
#define VFIO_IRQ_INFO_EVENTFD		(1 << 0)
#define VFIO_IRQ_INFO_MASKABLE		(1 << 1)
#define VFIO_IRQ_INFO_AUTOMASKED	(1 << 2)
#define VFIO_IRQ_INFO_NORESIZE		(1 << 3)
	__u32	index;		/* IRQ index */
	__u32	count;		/* Number of IRQs within this index */
};
#define VFIO_DEVICE_GET_IRQ_INFO	_IO(VFIO_TYPE, VFIO_BASE + 9)

/**
 * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set)
 *
 * Set signaling, masking, and unmasking of interrupts.  Caller provides
 * struct vfio_irq_set with all fields set.  'start' and 'count' indicate
 * the range of subindexes being specified.
 *
 * The DATA flags specify the type of data provided.  If DATA_NONE, the
 * operation performs the specified action immediately on the specified
 * interrupt(s).  For example, to unmask AUTOMASKED interrupt [0,0]:
 * flags = (DATA_NONE|ACTION_UNMASK), index = 0, start = 0, count = 1.
 *
 * DATA_BOOL allows sparse support for the same on arrays of interrupts.
 * For example, to mask interrupts [0,1] and [0,3] (but not [0,2]):
 * flags = (DATA_BOOL|ACTION_MASK), index = 0, start = 1, count = 3,
 * data = {1,0,1}
 *
 * DATA_EVENTFD binds the specified ACTION to the provided __s32 eventfd.
 * A value of -1 can be used to either de-assign interrupts if already
 * assigned or skip un-assigned interrupts.  For example, to set an eventfd
 * to be triggered for interrupts [0,0] and [0,2]:
 * flags = (DATA_EVENTFD|ACTION_TRIGGER), index = 0, start = 0, count = 3,
 * data = {fd1, -1, fd2}
 * If index [0,1] is previously set, two count = 1 ioctl calls would be
 * required to set [0,0] and [0,2] without changing [0,1].
 *
 * Once a signaling mechanism is set, DATA_BOOL or DATA_NONE can be used
 * with ACTION_TRIGGER to perform kernel level interrupt loopback testing
 * from userspace (ie. simulate hardware triggering).
 *
 * Setting of an event triggering mechanism to userspace for ACTION_TRIGGER
 * enables the interrupt index for the device.  Individual subindex interrupts
 * can be disabled using the -1 value for DATA_EVENTFD or the index can be
 * disabled as a whole with: flags = (DATA_NONE|ACTION_TRIGGER), count = 0.
 *
 * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while
 * ACTION_TRIGGER specifies kernel->user signaling.
 */
struct vfio_irq_set {
	__u32	argsz;
	__u32	flags;
#define VFIO_IRQ_SET_DATA_NONE		(1 << 0) /* Data not present */
#define VFIO_IRQ_SET_DATA_BOOL		(1 << 1) /* Data is bool (u8) */
#define VFIO_IRQ_SET_DATA_EVENTFD	(1 << 2) /* Data is eventfd (s32) */
#define VFIO_IRQ_SET_ACTION_MASK	(1 << 3) /* Mask interrupt */
#define VFIO_IRQ_SET_ACTION_UNMASK	(1 << 4) /* Unmask interrupt */
#define VFIO_IRQ_SET_ACTION_TRIGGER	(1 << 5) /* Trigger interrupt */
	__u32	index;
	__u32	start;
	__u32	count;
	__u8	data[];
};
#define VFIO_DEVICE_SET_IRQS		_IO(VFIO_TYPE, VFIO_BASE + 10)

#define VFIO_IRQ_SET_DATA_TYPE_MASK	(VFIO_IRQ_SET_DATA_NONE | \
					 VFIO_IRQ_SET_DATA_BOOL | \
					 VFIO_IRQ_SET_DATA_EVENTFD)
#define VFIO_IRQ_SET_ACTION_TYPE_MASK	(VFIO_IRQ_SET_ACTION_MASK | \
					 VFIO_IRQ_SET_ACTION_UNMASK | \
					 VFIO_IRQ_SET_ACTION_TRIGGER)
/**
 * VFIO_DEVICE_RESET - _IO(VFIO_TYPE, VFIO_BASE + 11)
 *
 * Reset a device.
 */
#define VFIO_DEVICE_RESET		_IO(VFIO_TYPE, VFIO_BASE + 11)

#endif /* VFIO_H */