vfio: VFIO core · tjh.dev/kernel@cba3345

+1

Documentation/ioctl/ioctl-number.txt

··· 88 88 and kernel/power/user.c 89 89 '8' all SNP8023 advanced NIC card 90 90 <mailto:mcr@solidum.com> 91 + ';' 64-7F linux/vfio.h 91 92 '@' 00-0F linux/radeonfb.h conflict! 92 93 '@' 00-0F drivers/video/aty/aty128fb.c conflict! 93 94 'A' 00-1F linux/apm_bios.h conflict!

+8

MAINTAINERS

··· 7411 7411 F: Documentation/filesystems/vfat.txt 7412 7412 F: fs/fat/ 7413 7413 7414 + VFIO DRIVER 7415 + M: Alex Williamson <alex.williamson@redhat.com> 7416 + L: kvm@vger.kernel.org 7417 + S: Maintained 7418 + F: Documentation/vfio.txt 7419 + F: drivers/vfio/ 7420 + F: include/linux/vfio.h 7421 + 7414 7422 VIDEOBUF2 FRAMEWORK 7415 7423 M: Pawel Osciak <pawel@osciak.com> 7416 7424 M: Marek Szyprowski <m.szyprowski@samsung.com>

+2

drivers/Kconfig

··· 112 112 113 113 source "drivers/uio/Kconfig" 114 114 115 + source "drivers/vfio/Kconfig" 116 + 115 117 source "drivers/vlynq/Kconfig" 116 118 117 119 source "drivers/virtio/Kconfig"

+1

drivers/Makefile

··· 60 60 obj-$(CONFIG_FUSION) += message/ 61 61 obj-y += firewire/ 62 62 obj-$(CONFIG_UIO) += uio/ 63 + obj-$(CONFIG_VFIO) += vfio/ 63 64 obj-y += cdrom/ 64 65 obj-y += auxdisplay/ 65 66 obj-$(CONFIG_PCCARD) += pcmcia/

+8

drivers/vfio/Kconfig

··· 1 + menuconfig VFIO 2 + tristate "VFIO Non-Privileged userspace driver framework" 3 + depends on IOMMU_API 4 + help 5 + VFIO provides a framework for secure userspace device drivers. 6 + See Documentation/vfio.txt for more details. 7 + 8 + If you don't know what to do here, say N.

+1

drivers/vfio/Makefile

··· 1 + obj-$(CONFIG_VFIO) += vfio.o

+1413

drivers/vfio/vfio.c

··· 1 + /* 2 + * VFIO core 3 + * 4 + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 5 + * Author: Alex Williamson <alex.williamson@redhat.com> 6 + * 7 + * This program is free software; you can redistribute it and/or modify 8 + * it under the terms of the GNU General Public License version 2 as 9 + * published by the Free Software Foundation. 10 + * 11 + * Derived from original vfio: 12 + * Copyright 2010 Cisco Systems, Inc. All rights reserved. 13 + * Author: Tom Lyon, pugs@cisco.com 14 + */ 15 + 16 + #include <linux/cdev.h> 17 + #include <linux/compat.h> 18 + #include <linux/device.h> 19 + #include <linux/file.h> 20 + #include <linux/anon_inodes.h> 21 + #include <linux/fs.h> 22 + #include <linux/idr.h> 23 + #include <linux/iommu.h> 24 + #include <linux/list.h> 25 + #include <linux/module.h> 26 + #include <linux/mutex.h> 27 + #include <linux/sched.h> 28 + #include <linux/slab.h> 29 + #include <linux/string.h> 30 + #include <linux/uaccess.h> 31 + #include <linux/vfio.h> 32 + #include <linux/wait.h> 33 + 34 + #define DRIVER_VERSION "0.3" 35 + #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" 36 + #define DRIVER_DESC "VFIO - User Level meta-driver" 37 + 38 + static struct vfio { 39 + struct class *class; 40 + struct list_head iommu_drivers_list; 41 + struct mutex iommu_drivers_lock; 42 + struct list_head group_list; 43 + struct idr group_idr; 44 + struct mutex group_lock; 45 + struct cdev group_cdev; 46 + struct device *dev; 47 + dev_t devt; 48 + struct cdev cdev; 49 + wait_queue_head_t release_q; 50 + } vfio; 51 + 52 + struct vfio_iommu_driver { 53 + const struct vfio_iommu_driver_ops *ops; 54 + struct list_head vfio_next; 55 + }; 56 + 57 + struct vfio_container { 58 + struct kref kref; 59 + struct list_head group_list; 60 + struct mutex group_lock; 61 + struct vfio_iommu_driver *iommu_driver; 62 + void *iommu_data; 63 + }; 64 + 65 + struct vfio_group { 66 + struct kref kref; 67 + int minor; 68 + atomic_t container_users; 69 + struct 
iommu_group *iommu_group; 70 + struct vfio_container *container; 71 + struct list_head device_list; 72 + struct mutex device_lock; 73 + struct device *dev; 74 + struct notifier_block nb; 75 + struct list_head vfio_next; 76 + struct list_head container_next; 77 + }; 78 + 79 + struct vfio_device { 80 + struct kref kref; 81 + struct device *dev; 82 + const struct vfio_device_ops *ops; 83 + struct vfio_group *group; 84 + struct list_head group_next; 85 + void *device_data; 86 + }; 87 + 88 + /** 89 + * IOMMU driver registration 90 + */ 91 + int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops) 92 + { 93 + struct vfio_iommu_driver *driver, *tmp; 94 + 95 + driver = kzalloc(sizeof(*driver), GFP_KERNEL); 96 + if (!driver) 97 + return -ENOMEM; 98 + 99 + driver->ops = ops; 100 + 101 + mutex_lock(&vfio.iommu_drivers_lock); 102 + 103 + /* Check for duplicates */ 104 + list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) { 105 + if (tmp->ops == ops) { 106 + mutex_unlock(&vfio.iommu_drivers_lock); 107 + kfree(driver); 108 + return -EINVAL; 109 + } 110 + } 111 + 112 + list_add(&driver->vfio_next, &vfio.iommu_drivers_list); 113 + 114 + mutex_unlock(&vfio.iommu_drivers_lock); 115 + 116 + return 0; 117 + } 118 + EXPORT_SYMBOL_GPL(vfio_register_iommu_driver); 119 + 120 + void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops) 121 + { 122 + struct vfio_iommu_driver *driver; 123 + 124 + mutex_lock(&vfio.iommu_drivers_lock); 125 + list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { 126 + if (driver->ops == ops) { 127 + list_del(&driver->vfio_next); 128 + mutex_unlock(&vfio.iommu_drivers_lock); 129 + kfree(driver); 130 + return; 131 + } 132 + } 133 + mutex_unlock(&vfio.iommu_drivers_lock); 134 + } 135 + EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver); 136 + 137 + /** 138 + * Group minor allocation/free - both called with vfio.group_lock held 139 + */ 140 + static int vfio_alloc_group_minor(struct vfio_group *group) 141 + { 142 + 
int ret, minor; 143 + 144 + again: 145 + if (unlikely(idr_pre_get(&vfio.group_idr, GFP_KERNEL) == 0)) 146 + return -ENOMEM; 147 + 148 + /* index 0 is used by /dev/vfio/vfio */ 149 + ret = idr_get_new_above(&vfio.group_idr, group, 1, &minor); 150 + if (ret == -EAGAIN) 151 + goto again; 152 + if (ret || minor > MINORMASK) { 153 + if (minor > MINORMASK) 154 + idr_remove(&vfio.group_idr, minor); 155 + return -ENOSPC; 156 + } 157 + 158 + return minor; 159 + } 160 + 161 + static void vfio_free_group_minor(int minor) 162 + { 163 + idr_remove(&vfio.group_idr, minor); 164 + } 165 + 166 + static int vfio_iommu_group_notifier(struct notifier_block *nb, 167 + unsigned long action, void *data); 168 + static void vfio_group_get(struct vfio_group *group); 169 + 170 + /** 171 + * Container objects - containers are created when /dev/vfio/vfio is 172 + * opened, but their lifecycle extends until the last user is done, so 173 + * it's freed via kref. Must support container/group/device being 174 + * closed in any order. 
175 + */ 176 + static void vfio_container_get(struct vfio_container *container) 177 + { 178 + kref_get(&container->kref); 179 + } 180 + 181 + static void vfio_container_release(struct kref *kref) 182 + { 183 + struct vfio_container *container; 184 + container = container_of(kref, struct vfio_container, kref); 185 + 186 + kfree(container); 187 + } 188 + 189 + static void vfio_container_put(struct vfio_container *container) 190 + { 191 + kref_put(&container->kref, vfio_container_release); 192 + } 193 + 194 + /** 195 + * Group objects - create, release, get, put, search 196 + */ 197 + static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) 198 + { 199 + struct vfio_group *group, *tmp; 200 + struct device *dev; 201 + int ret, minor; 202 + 203 + group = kzalloc(sizeof(*group), GFP_KERNEL); 204 + if (!group) 205 + return ERR_PTR(-ENOMEM); 206 + 207 + kref_init(&group->kref); 208 + INIT_LIST_HEAD(&group->device_list); 209 + mutex_init(&group->device_lock); 210 + atomic_set(&group->container_users, 0); 211 + group->iommu_group = iommu_group; 212 + 213 + group->nb.notifier_call = vfio_iommu_group_notifier; 214 + 215 + /* 216 + * blocking notifiers acquire a rwsem around registering and hold 217 + * it around callback. Therefore, need to register outside of 218 + * vfio.group_lock to avoid A-B/B-A contention. Our callback won't 219 + * do anything unless it can find the group in vfio.group_list, so 220 + * no harm in registering early. 221 + */ 222 + ret = iommu_group_register_notifier(iommu_group, &group->nb); 223 + if (ret) { 224 + kfree(group); 225 + return ERR_PTR(ret); 226 + } 227 + 228 + mutex_lock(&vfio.group_lock); 229 + 230 + minor = vfio_alloc_group_minor(group); 231 + if (minor < 0) { 232 + mutex_unlock(&vfio.group_lock); 233 + kfree(group); 234 + return ERR_PTR(minor); 235 + } 236 + 237 + /* Did we race creating this group? 
*/ 238 + list_for_each_entry(tmp, &vfio.group_list, vfio_next) { 239 + if (tmp->iommu_group == iommu_group) { 240 + vfio_group_get(tmp); 241 + vfio_free_group_minor(minor); 242 + mutex_unlock(&vfio.group_lock); 243 + kfree(group); 244 + return tmp; 245 + } 246 + } 247 + 248 + dev = device_create(vfio.class, NULL, MKDEV(MAJOR(vfio.devt), minor), 249 + group, "%d", iommu_group_id(iommu_group)); 250 + if (IS_ERR(dev)) { 251 + vfio_free_group_minor(minor); 252 + mutex_unlock(&vfio.group_lock); 253 + kfree(group); 254 + return (struct vfio_group *)dev; /* ERR_PTR */ 255 + } 256 + 257 + group->minor = minor; 258 + group->dev = dev; 259 + 260 + list_add(&group->vfio_next, &vfio.group_list); 261 + 262 + mutex_unlock(&vfio.group_lock); 263 + 264 + return group; 265 + } 266 + 267 + static void vfio_group_release(struct kref *kref) 268 + { 269 + struct vfio_group *group = container_of(kref, struct vfio_group, kref); 270 + 271 + WARN_ON(!list_empty(&group->device_list)); 272 + 273 + device_destroy(vfio.class, MKDEV(MAJOR(vfio.devt), group->minor)); 274 + list_del(&group->vfio_next); 275 + vfio_free_group_minor(group->minor); 276 + 277 + mutex_unlock(&vfio.group_lock); 278 + 279 + /* 280 + * Unregister outside of lock. A spurious callback is harmless now 281 + * that the group is no longer in vfio.group_list. 282 + */ 283 + iommu_group_unregister_notifier(group->iommu_group, &group->nb); 284 + 285 + kfree(group); 286 + } 287 + 288 + static void vfio_group_put(struct vfio_group *group) 289 + { 290 + mutex_lock(&vfio.group_lock); 291 + /* 292 + * Release needs to unlock to unregister the notifier, so only 293 + * unlock if not released. 
294 + */ 295 + if (!kref_put(&group->kref, vfio_group_release)) 296 + mutex_unlock(&vfio.group_lock); 297 + } 298 + 299 + /* Assume group_lock or group reference is held */ 300 + static void vfio_group_get(struct vfio_group *group) 301 + { 302 + kref_get(&group->kref); 303 + } 304 + 305 + /* 306 + * Not really a try as we will sleep for mutex, but we need to make 307 + * sure the group pointer is valid under lock and get a reference. 308 + */ 309 + static struct vfio_group *vfio_group_try_get(struct vfio_group *group) 310 + { 311 + struct vfio_group *target = group; 312 + 313 + mutex_lock(&vfio.group_lock); 314 + list_for_each_entry(group, &vfio.group_list, vfio_next) { 315 + if (group == target) { 316 + vfio_group_get(group); 317 + mutex_unlock(&vfio.group_lock); 318 + return group; 319 + } 320 + } 321 + mutex_unlock(&vfio.group_lock); 322 + 323 + return NULL; 324 + } 325 + 326 + static 327 + struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group) 328 + { 329 + struct vfio_group *group; 330 + 331 + mutex_lock(&vfio.group_lock); 332 + list_for_each_entry(group, &vfio.group_list, vfio_next) { 333 + if (group->iommu_group == iommu_group) { 334 + vfio_group_get(group); 335 + mutex_unlock(&vfio.group_lock); 336 + return group; 337 + } 338 + } 339 + mutex_unlock(&vfio.group_lock); 340 + 341 + return NULL; 342 + } 343 + 344 + static struct vfio_group *vfio_group_get_from_minor(int minor) 345 + { 346 + struct vfio_group *group; 347 + 348 + mutex_lock(&vfio.group_lock); 349 + group = idr_find(&vfio.group_idr, minor); 350 + if (!group) { 351 + mutex_unlock(&vfio.group_lock); 352 + return NULL; 353 + } 354 + vfio_group_get(group); 355 + mutex_unlock(&vfio.group_lock); 356 + 357 + return group; 358 + } 359 + 360 + /** 361 + * Device objects - create, release, get, put, search 362 + */ 363 + static 364 + struct vfio_device *vfio_group_create_device(struct vfio_group *group, 365 + struct device *dev, 366 + const struct vfio_device_ops *ops, 367 + void 
*device_data) 368 + { 369 + struct vfio_device *device; 370 + int ret; 371 + 372 + device = kzalloc(sizeof(*device), GFP_KERNEL); 373 + if (!device) 374 + return ERR_PTR(-ENOMEM); 375 + 376 + kref_init(&device->kref); 377 + device->dev = dev; 378 + device->group = group; 379 + device->ops = ops; 380 + device->device_data = device_data; 381 + 382 + ret = dev_set_drvdata(dev, device); 383 + if (ret) { 384 + kfree(device); 385 + return ERR_PTR(ret); 386 + } 387 + 388 + /* No need to get group_lock, caller has group reference */ 389 + vfio_group_get(group); 390 + 391 + mutex_lock(&group->device_lock); 392 + list_add(&device->group_next, &group->device_list); 393 + mutex_unlock(&group->device_lock); 394 + 395 + return device; 396 + } 397 + 398 + static void vfio_device_release(struct kref *kref) 399 + { 400 + struct vfio_device *device = container_of(kref, 401 + struct vfio_device, kref); 402 + struct vfio_group *group = device->group; 403 + 404 + mutex_lock(&group->device_lock); 405 + list_del(&device->group_next); 406 + mutex_unlock(&group->device_lock); 407 + 408 + dev_set_drvdata(device->dev, NULL); 409 + 410 + kfree(device); 411 + 412 + /* vfio_del_group_dev may be waiting for this device */ 413 + wake_up(&vfio.release_q); 414 + } 415 + 416 + /* Device reference always implies a group reference */ 417 + static void vfio_device_put(struct vfio_device *device) 418 + { 419 + kref_put(&device->kref, vfio_device_release); 420 + vfio_group_put(device->group); 421 + } 422 + 423 + static void vfio_device_get(struct vfio_device *device) 424 + { 425 + vfio_group_get(device->group); 426 + kref_get(&device->kref); 427 + } 428 + 429 + static struct vfio_device *vfio_group_get_device(struct vfio_group *group, 430 + struct device *dev) 431 + { 432 + struct vfio_device *device; 433 + 434 + mutex_lock(&group->device_lock); 435 + list_for_each_entry(device, &group->device_list, group_next) { 436 + if (device->dev == dev) { 437 + vfio_device_get(device); 438 + 
mutex_unlock(&group->device_lock); 439 + return device; 440 + } 441 + } 442 + mutex_unlock(&group->device_lock); 443 + return NULL; 444 + } 445 + 446 + /* 447 + * Whitelist some drivers that we know are safe (no dma) or just sit on 448 + * a device. It's not always practical to leave a device within a group 449 + * driverless as it could get re-bound to something unsafe. 450 + */ 451 + static const char * const vfio_driver_whitelist[] = { "pci-stub" }; 452 + 453 + static bool vfio_whitelisted_driver(struct device_driver *drv) 454 + { 455 + int i; 456 + 457 + for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) { 458 + if (!strcmp(drv->name, vfio_driver_whitelist[i])) 459 + return true; 460 + } 461 + 462 + return false; 463 + } 464 + 465 + /* 466 + * A vfio group is viable for use by userspace if all devices are either 467 + * driver-less or bound to a vfio or whitelisted driver. We test the 468 + * latter by the existence of a struct vfio_device matching the dev. 469 + */ 470 + static int vfio_dev_viable(struct device *dev, void *data) 471 + { 472 + struct vfio_group *group = data; 473 + struct vfio_device *device; 474 + 475 + if (!dev->driver || vfio_whitelisted_driver(dev->driver)) 476 + return 0; 477 + 478 + device = vfio_group_get_device(group, dev); 479 + if (device) { 480 + vfio_device_put(device); 481 + return 0; 482 + } 483 + 484 + return -EINVAL; 485 + } 486 + 487 + /** 488 + * Async device support 489 + */ 490 + static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev) 491 + { 492 + struct vfio_device *device; 493 + 494 + /* Do we already know about it? 
We shouldn't */ 495 + device = vfio_group_get_device(group, dev); 496 + if (WARN_ON_ONCE(device)) { 497 + vfio_device_put(device); 498 + return 0; 499 + } 500 + 501 + /* Nothing to do for idle groups */ 502 + if (!atomic_read(&group->container_users)) 503 + return 0; 504 + 505 + /* TODO Prevent device auto probing */ 506 + WARN("Device %s added to live group %d!\n", dev_name(dev), 507 + iommu_group_id(group->iommu_group)); 508 + 509 + return 0; 510 + } 511 + 512 + static int vfio_group_nb_del_dev(struct vfio_group *group, struct device *dev) 513 + { 514 + struct vfio_device *device; 515 + 516 + /* 517 + * Expect to fall out here. If a device was in use, it would 518 + * have been bound to a vfio sub-driver, which would have blocked 519 + * in .remove at vfio_del_group_dev. Sanity check that we no 520 + * longer track the device, so it's safe to remove. 521 + */ 522 + device = vfio_group_get_device(group, dev); 523 + if (likely(!device)) 524 + return 0; 525 + 526 + WARN("Device %s removed from live group %d!\n", dev_name(dev), 527 + iommu_group_id(group->iommu_group)); 528 + 529 + vfio_device_put(device); 530 + return 0; 531 + } 532 + 533 + static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev) 534 + { 535 + /* We don't care what happens when the group isn't in use */ 536 + if (!atomic_read(&group->container_users)) 537 + return 0; 538 + 539 + return vfio_dev_viable(dev, group); 540 + } 541 + 542 + static int vfio_iommu_group_notifier(struct notifier_block *nb, 543 + unsigned long action, void *data) 544 + { 545 + struct vfio_group *group = container_of(nb, struct vfio_group, nb); 546 + struct device *dev = data; 547 + 548 + /* 549 + * Need to go through a group_lock lookup to get a reference or 550 + * we risk racing a group being removed. Leave a WARN_ON for 551 + * debuging, but if the group no longer exists, a spurious notify 552 + * is harmless. 
553 + */ 554 + group = vfio_group_try_get(group); 555 + if (WARN_ON(!group)) 556 + return NOTIFY_OK; 557 + 558 + switch (action) { 559 + case IOMMU_GROUP_NOTIFY_ADD_DEVICE: 560 + vfio_group_nb_add_dev(group, dev); 561 + break; 562 + case IOMMU_GROUP_NOTIFY_DEL_DEVICE: 563 + vfio_group_nb_del_dev(group, dev); 564 + break; 565 + case IOMMU_GROUP_NOTIFY_BIND_DRIVER: 566 + pr_debug("%s: Device %s, group %d binding to driver\n", 567 + __func__, dev_name(dev), 568 + iommu_group_id(group->iommu_group)); 569 + break; 570 + case IOMMU_GROUP_NOTIFY_BOUND_DRIVER: 571 + pr_debug("%s: Device %s, group %d bound to driver %s\n", 572 + __func__, dev_name(dev), 573 + iommu_group_id(group->iommu_group), dev->driver->name); 574 + BUG_ON(vfio_group_nb_verify(group, dev)); 575 + break; 576 + case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER: 577 + pr_debug("%s: Device %s, group %d unbinding from driver %s\n", 578 + __func__, dev_name(dev), 579 + iommu_group_id(group->iommu_group), dev->driver->name); 580 + break; 581 + case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER: 582 + pr_debug("%s: Device %s, group %d unbound from driver\n", 583 + __func__, dev_name(dev), 584 + iommu_group_id(group->iommu_group)); 585 + /* 586 + * XXX An unbound device in a live group is ok, but we'd 587 + * really like to avoid the above BUG_ON by preventing other 588 + * drivers from binding to it. Once that occurs, we have to 589 + * stop the system to maintain isolation. At a minimum, we'd 590 + * want a toggle to disable driver auto probe for this device. 
591 + */ 592 + break; 593 + } 594 + 595 + vfio_group_put(group); 596 + return NOTIFY_OK; 597 + } 598 + 599 + /** 600 + * VFIO driver API 601 + */ 602 + int vfio_add_group_dev(struct device *dev, 603 + const struct vfio_device_ops *ops, void *device_data) 604 + { 605 + struct iommu_group *iommu_group; 606 + struct vfio_group *group; 607 + struct vfio_device *device; 608 + 609 + iommu_group = iommu_group_get(dev); 610 + if (!iommu_group) 611 + return -EINVAL; 612 + 613 + group = vfio_group_get_from_iommu(iommu_group); 614 + if (!group) { 615 + group = vfio_create_group(iommu_group); 616 + if (IS_ERR(group)) { 617 + iommu_group_put(iommu_group); 618 + return PTR_ERR(group); 619 + } 620 + } 621 + 622 + device = vfio_group_get_device(group, dev); 623 + if (device) { 624 + WARN(1, "Device %s already exists on group %d\n", 625 + dev_name(dev), iommu_group_id(iommu_group)); 626 + vfio_device_put(device); 627 + vfio_group_put(group); 628 + iommu_group_put(iommu_group); 629 + return -EBUSY; 630 + } 631 + 632 + device = vfio_group_create_device(group, dev, ops, device_data); 633 + if (IS_ERR(device)) { 634 + vfio_group_put(group); 635 + iommu_group_put(iommu_group); 636 + return PTR_ERR(device); 637 + } 638 + 639 + /* 640 + * Added device holds reference to iommu_group and vfio_device 641 + * (which in turn holds reference to vfio_group). Drop extra 642 + * group reference used while acquiring device. 
643 + */ 644 + vfio_group_put(group); 645 + 646 + return 0; 647 + } 648 + EXPORT_SYMBOL_GPL(vfio_add_group_dev); 649 + 650 + /* Test whether a struct device is present in our tracking */ 651 + static bool vfio_dev_present(struct device *dev) 652 + { 653 + struct iommu_group *iommu_group; 654 + struct vfio_group *group; 655 + struct vfio_device *device; 656 + 657 + iommu_group = iommu_group_get(dev); 658 + if (!iommu_group) 659 + return false; 660 + 661 + group = vfio_group_get_from_iommu(iommu_group); 662 + if (!group) { 663 + iommu_group_put(iommu_group); 664 + return false; 665 + } 666 + 667 + device = vfio_group_get_device(group, dev); 668 + if (!device) { 669 + vfio_group_put(group); 670 + iommu_group_put(iommu_group); 671 + return false; 672 + } 673 + 674 + vfio_device_put(device); 675 + vfio_group_put(group); 676 + iommu_group_put(iommu_group); 677 + return true; 678 + } 679 + 680 + /* 681 + * Decrement the device reference count and wait for the device to be 682 + * removed. Open file descriptors for the device... 
*/
/*
 * Drops the reference on the vfio_device stored in @dev's drvdata, then
 * sleeps on vfio.release_q until vfio_dev_present() reports the device is
 * no longer tracked.  Finally drops the iommu_group reference and returns
 * the device_data pointer that was stored in the vfio_device.
 */
void *vfio_del_group_dev(struct device *dev)
{
	struct vfio_device *device = dev_get_drvdata(dev);
	struct vfio_group *group = device->group;
	struct iommu_group *iommu_group = group->iommu_group;
	/* read before the put below, which may free the device */
	void *device_data = device->device_data;

	vfio_device_put(device);

	/* TODO send a signal to encourage this to be released */
	wait_event(vfio.release_q, !vfio_dev_present(dev));

	iommu_group_put(iommu_group);

	return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);

/**
 * VFIO base fd, /dev/vfio/vfio
 */

/*
 * VFIO_CHECK_EXTENSION handler.  Returns the result of the iommu
 * driver's ioctl for @arg, or 0 if no registered driver reports a
 * positive result.
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver = container->iommu_driver;
	long ret = 0;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {
				/* skip drivers whose module reference can't be taken */
				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	return ret;
}

/* hold container->group_lock */
/*
 * Attach every group on the container's list to @driver using @data.
 * On the first failure, detach the groups attached so far (walking the
 * list backwards from the failing entry) and return that error.
 * Returns -ENODEV if the list is empty.
 */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}

/*
 * VFIO_SET_IOMMU handler.  Uses the first registered driver that reports
 * support for @arg, opens it and attaches all of the container's groups.
 * Returns 0 on success, -EINVAL if the container has no groups or
 * already has a driver, -ENODEV if no driver matched, or the error from
 * the driver's open/attach.
 */
static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	mutex_lock(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		mutex_unlock(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		/* module reference holds the driver we're working on */
		mutex_unlock(&vfio.iommu_drivers_lock);

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			goto skip_drivers_unlock;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (!ret) {
			/* success: the module reference is kept with the container */
			container->iommu_driver = driver;
			container->iommu_data = data;
		} else {
			driver->ops->release(data);
			module_put(driver->ops->owner);
		}

		/* iommu_drivers_lock was already dropped above */
		goto skip_drivers_unlock;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
skip_drivers_unlock:
	mutex_unlock(&container->group_lock);

	return ret;
}

/*
 * ioctl entry point for the container fd.  Handles the base ioctls and
 * passes anything else to the iommu driver if one is set; returns
 * -EINVAL when there is no container or no handler for @cmd.
 */
static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	driver = container->iommu_driver;
	data = container->iommu_data;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);
	}

	return ret;
}

#ifdef CONFIG_COMPAT
/* Compat ioctl: convert @arg with compat_ptr() and reuse the native path. */
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

/*
 * open() on the container fd: allocate an empty container and stash it
 * in filep->private_data.  Returns 0 or -ENOMEM.
 */
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	mutex_init(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

/* release() on the container fd: drop the file's container reference. */
static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}

/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
*/
/* Forward read() to the iommu driver; -EINVAL if no driver or no handler. */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver = container->iommu_driver;

	if (unlikely(!driver || !driver->ops->read))
		return -EINVAL;

	return driver->ops->read(container->iommu_data, buf, count, ppos);
}

/* Forward write() to the iommu driver; -EINVAL if no driver or no handler. */
static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver = container->iommu_driver;

	if (unlikely(!driver || !driver->ops->write))
		return -EINVAL;

	return driver->ops->write(container->iommu_data, buf, count, ppos);
}

/* Forward mmap() to the iommu driver; -EINVAL if no driver or no handler. */
static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver = container->iommu_driver;

	if (unlikely(!driver || !driver->ops->mmap))
		return -EINVAL;

	return driver->ops->mmap(container->iommu_data, vma);
}

/* File operations for the container fd. */
static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_fops_compat_ioctl,
#endif
	.mmap		= vfio_fops_mmap,
};

/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */

/*
 * Detach @group from its container: detach it from the iommu driver if
 * one is set, unlink it from the container's group list, and drop the
 * container reference the group held.  Does not touch
 * group->container_users; both callers in this file adjust that counter
 * before calling.
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	mutex_lock(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	mutex_unlock(&container->group_lock);

	/* may free the container, so it must come after the unlock */
	vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 *
 * Returns 0 on success, -EINVAL if there were no users (no container
 * set), or -EBUSY if there was more than one user.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	/* atomically swap 1 -> 0; 'users' is the value seen before the swap */
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
1013 + */ 1014 + static void vfio_group_try_dissolve_container(struct vfio_group *group) 1015 + { 1016 + if (0 == atomic_dec_if_positive(&group->container_users)) 1017 + __vfio_group_unset_container(group); 1018 + } 1019 + 1020 + static int vfio_group_set_container(struct vfio_group *group, int container_fd) 1021 + { 1022 + struct file *filep; 1023 + struct vfio_container *container; 1024 + struct vfio_iommu_driver *driver; 1025 + int ret = 0; 1026 + 1027 + if (atomic_read(&group->container_users)) 1028 + return -EINVAL; 1029 + 1030 + filep = fget(container_fd); 1031 + if (!filep) 1032 + return -EBADF; 1033 + 1034 + /* Sanity check, is this really our fd? */ 1035 + if (filep->f_op != &vfio_fops) { 1036 + fput(filep); 1037 + return -EINVAL; 1038 + } 1039 + 1040 + container = filep->private_data; 1041 + WARN_ON(!container); /* fget ensures we don't race vfio_release */ 1042 + 1043 + mutex_lock(&container->group_lock); 1044 + 1045 + driver = container->iommu_driver; 1046 + if (driver) { 1047 + ret = driver->ops->attach_group(container->iommu_data, 1048 + group->iommu_group); 1049 + if (ret) 1050 + goto unlock_out; 1051 + } 1052 + 1053 + group->container = container; 1054 + list_add(&group->container_next, &container->group_list); 1055 + 1056 + /* Get a reference on the container and mark a user within the group */ 1057 + vfio_container_get(container); 1058 + atomic_inc(&group->container_users); 1059 + 1060 + unlock_out: 1061 + mutex_unlock(&container->group_lock); 1062 + fput(filep); 1063 + 1064 + return ret; 1065 + } 1066 + 1067 + static bool vfio_group_viable(struct vfio_group *group) 1068 + { 1069 + return (iommu_group_for_each_dev(group->iommu_group, 1070 + group, vfio_dev_viable) == 0); 1071 + } 1072 + 1073 + static const struct file_operations vfio_device_fops; 1074 + 1075 + static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) 1076 + { 1077 + struct vfio_device *device; 1078 + struct file *filep; 1079 + int ret = -ENODEV; 1080 + 1081 + if (0 
== atomic_read(&group->container_users) || 1082 + !group->container->iommu_driver || !vfio_group_viable(group)) 1083 + return -EINVAL; 1084 + 1085 + mutex_lock(&group->device_lock); 1086 + list_for_each_entry(device, &group->device_list, group_next) { 1087 + if (strcmp(dev_name(device->dev), buf)) 1088 + continue; 1089 + 1090 + ret = device->ops->open(device->device_data); 1091 + if (ret) 1092 + break; 1093 + /* 1094 + * We can't use anon_inode_getfd() because we need to modify 1095 + * the f_mode flags directly to allow more than just ioctls 1096 + */ 1097 + ret = get_unused_fd(); 1098 + if (ret < 0) { 1099 + device->ops->release(device->device_data); 1100 + break; 1101 + } 1102 + 1103 + filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, 1104 + device, O_RDWR); 1105 + if (IS_ERR(filep)) { 1106 + put_unused_fd(ret); 1107 + ret = PTR_ERR(filep); 1108 + device->ops->release(device->device_data); 1109 + break; 1110 + } 1111 + 1112 + /* 1113 + * TODO: add an anon_inode interface to do this. 1114 + * Appears to be missing by lack of need rather than 1115 + * explicitly prevented. Now there's need. 
1116 + */ 1117 + filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); 1118 + 1119 + fd_install(ret, filep); 1120 + 1121 + vfio_device_get(device); 1122 + atomic_inc(&group->container_users); 1123 + break; 1124 + } 1125 + mutex_unlock(&group->device_lock); 1126 + 1127 + return ret; 1128 + } 1129 + 1130 + static long vfio_group_fops_unl_ioctl(struct file *filep, 1131 + unsigned int cmd, unsigned long arg) 1132 + { 1133 + struct vfio_group *group = filep->private_data; 1134 + long ret = -ENOTTY; 1135 + 1136 + switch (cmd) { 1137 + case VFIO_GROUP_GET_STATUS: 1138 + { 1139 + struct vfio_group_status status; 1140 + unsigned long minsz; 1141 + 1142 + minsz = offsetofend(struct vfio_group_status, flags); 1143 + 1144 + if (copy_from_user(&status, (void __user *)arg, minsz)) 1145 + return -EFAULT; 1146 + 1147 + if (status.argsz < minsz) 1148 + return -EINVAL; 1149 + 1150 + status.flags = 0; 1151 + 1152 + if (vfio_group_viable(group)) 1153 + status.flags |= VFIO_GROUP_FLAGS_VIABLE; 1154 + 1155 + if (group->container) 1156 + status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET; 1157 + 1158 + if (copy_to_user((void __user *)arg, &status, minsz)) 1159 + return -EFAULT; 1160 + 1161 + ret = 0; 1162 + break; 1163 + } 1164 + case VFIO_GROUP_SET_CONTAINER: 1165 + { 1166 + int fd; 1167 + 1168 + if (get_user(fd, (int __user *)arg)) 1169 + return -EFAULT; 1170 + 1171 + if (fd < 0) 1172 + return -EINVAL; 1173 + 1174 + ret = vfio_group_set_container(group, fd); 1175 + break; 1176 + } 1177 + case VFIO_GROUP_UNSET_CONTAINER: 1178 + ret = vfio_group_unset_container(group); 1179 + break; 1180 + case VFIO_GROUP_GET_DEVICE_FD: 1181 + { 1182 + char *buf; 1183 + 1184 + buf = strndup_user((const char __user *)arg, PAGE_SIZE); 1185 + if (IS_ERR(buf)) 1186 + return PTR_ERR(buf); 1187 + 1188 + ret = vfio_group_get_device_fd(group, buf); 1189 + kfree(buf); 1190 + break; 1191 + } 1192 + } 1193 + 1194 + return ret; 1195 + } 1196 + 1197 + #ifdef CONFIG_COMPAT 1198 + static long 
vfio_group_fops_compat_ioctl(struct file *filep, 1199 + unsigned int cmd, unsigned long arg) 1200 + { 1201 + arg = (unsigned long)compat_ptr(arg); 1202 + return vfio_group_fops_unl_ioctl(filep, cmd, arg); 1203 + } 1204 + #endif /* CONFIG_COMPAT */ 1205 + 1206 + static int vfio_group_fops_open(struct inode *inode, struct file *filep) 1207 + { 1208 + struct vfio_group *group; 1209 + 1210 + group = vfio_group_get_from_minor(iminor(inode)); 1211 + if (!group) 1212 + return -ENODEV; 1213 + 1214 + if (group->container) { 1215 + vfio_group_put(group); 1216 + return -EBUSY; 1217 + } 1218 + 1219 + filep->private_data = group; 1220 + 1221 + return 0; 1222 + } 1223 + 1224 + static int vfio_group_fops_release(struct inode *inode, struct file *filep) 1225 + { 1226 + struct vfio_group *group = filep->private_data; 1227 + 1228 + filep->private_data = NULL; 1229 + 1230 + vfio_group_try_dissolve_container(group); 1231 + 1232 + vfio_group_put(group); 1233 + 1234 + return 0; 1235 + } 1236 + 1237 + static const struct file_operations vfio_group_fops = { 1238 + .owner = THIS_MODULE, 1239 + .unlocked_ioctl = vfio_group_fops_unl_ioctl, 1240 + #ifdef CONFIG_COMPAT 1241 + .compat_ioctl = vfio_group_fops_compat_ioctl, 1242 + #endif 1243 + .open = vfio_group_fops_open, 1244 + .release = vfio_group_fops_release, 1245 + }; 1246 + 1247 + /** 1248 + * VFIO Device fd 1249 + */ 1250 + static int vfio_device_fops_release(struct inode *inode, struct file *filep) 1251 + { 1252 + struct vfio_device *device = filep->private_data; 1253 + 1254 + device->ops->release(device->device_data); 1255 + 1256 + vfio_group_try_dissolve_container(device->group); 1257 + 1258 + vfio_device_put(device); 1259 + 1260 + return 0; 1261 + } 1262 + 1263 + static long vfio_device_fops_unl_ioctl(struct file *filep, 1264 + unsigned int cmd, unsigned long arg) 1265 + { 1266 + struct vfio_device *device = filep->private_data; 1267 + 1268 + if (unlikely(!device->ops->ioctl)) 1269 + return -EINVAL; 1270 + 1271 + return 
device->ops->ioctl(device->device_data, cmd, arg); 1272 + } 1273 + 1274 + static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, 1275 + size_t count, loff_t *ppos) 1276 + { 1277 + struct vfio_device *device = filep->private_data; 1278 + 1279 + if (unlikely(!device->ops->read)) 1280 + return -EINVAL; 1281 + 1282 + return device->ops->read(device->device_data, buf, count, ppos); 1283 + } 1284 + 1285 + static ssize_t vfio_device_fops_write(struct file *filep, 1286 + const char __user *buf, 1287 + size_t count, loff_t *ppos) 1288 + { 1289 + struct vfio_device *device = filep->private_data; 1290 + 1291 + if (unlikely(!device->ops->write)) 1292 + return -EINVAL; 1293 + 1294 + return device->ops->write(device->device_data, buf, count, ppos); 1295 + } 1296 + 1297 + static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) 1298 + { 1299 + struct vfio_device *device = filep->private_data; 1300 + 1301 + if (unlikely(!device->ops->mmap)) 1302 + return -EINVAL; 1303 + 1304 + return device->ops->mmap(device->device_data, vma); 1305 + } 1306 + 1307 + #ifdef CONFIG_COMPAT 1308 + static long vfio_device_fops_compat_ioctl(struct file *filep, 1309 + unsigned int cmd, unsigned long arg) 1310 + { 1311 + arg = (unsigned long)compat_ptr(arg); 1312 + return vfio_device_fops_unl_ioctl(filep, cmd, arg); 1313 + } 1314 + #endif /* CONFIG_COMPAT */ 1315 + 1316 + static const struct file_operations vfio_device_fops = { 1317 + .owner = THIS_MODULE, 1318 + .release = vfio_device_fops_release, 1319 + .read = vfio_device_fops_read, 1320 + .write = vfio_device_fops_write, 1321 + .unlocked_ioctl = vfio_device_fops_unl_ioctl, 1322 + #ifdef CONFIG_COMPAT 1323 + .compat_ioctl = vfio_device_fops_compat_ioctl, 1324 + #endif 1325 + .mmap = vfio_device_fops_mmap, 1326 + }; 1327 + 1328 + /** 1329 + * Module/class support 1330 + */ 1331 + static char *vfio_devnode(struct device *dev, umode_t *mode) 1332 + { 1333 + return kasprintf(GFP_KERNEL, "vfio/%s", 
dev_name(dev)); 1334 + } 1335 + 1336 + static int __init vfio_init(void) 1337 + { 1338 + int ret; 1339 + 1340 + idr_init(&vfio.group_idr); 1341 + mutex_init(&vfio.group_lock); 1342 + mutex_init(&vfio.iommu_drivers_lock); 1343 + INIT_LIST_HEAD(&vfio.group_list); 1344 + INIT_LIST_HEAD(&vfio.iommu_drivers_list); 1345 + init_waitqueue_head(&vfio.release_q); 1346 + 1347 + vfio.class = class_create(THIS_MODULE, "vfio"); 1348 + if (IS_ERR(vfio.class)) { 1349 + ret = PTR_ERR(vfio.class); 1350 + goto err_class; 1351 + } 1352 + 1353 + vfio.class->devnode = vfio_devnode; 1354 + 1355 + ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio"); 1356 + if (ret) 1357 + goto err_base_chrdev; 1358 + 1359 + cdev_init(&vfio.cdev, &vfio_fops); 1360 + ret = cdev_add(&vfio.cdev, vfio.devt, 1); 1361 + if (ret) 1362 + goto err_base_cdev; 1363 + 1364 + vfio.dev = device_create(vfio.class, NULL, vfio.devt, NULL, "vfio"); 1365 + if (IS_ERR(vfio.dev)) { 1366 + ret = PTR_ERR(vfio.dev); 1367 + goto err_base_dev; 1368 + } 1369 + 1370 + /* /dev/vfio/$GROUP */ 1371 + cdev_init(&vfio.group_cdev, &vfio_group_fops); 1372 + ret = cdev_add(&vfio.group_cdev, 1373 + MKDEV(MAJOR(vfio.devt), 1), MINORMASK - 1); 1374 + if (ret) 1375 + goto err_groups_cdev; 1376 + 1377 + pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); 1378 + 1379 + return 0; 1380 + 1381 + err_groups_cdev: 1382 + device_destroy(vfio.class, vfio.devt); 1383 + err_base_dev: 1384 + cdev_del(&vfio.cdev); 1385 + err_base_cdev: 1386 + unregister_chrdev_region(vfio.devt, MINORMASK); 1387 + err_base_chrdev: 1388 + class_destroy(vfio.class); 1389 + vfio.class = NULL; 1390 + err_class: 1391 + return ret; 1392 + } 1393 + 1394 + static void __exit vfio_cleanup(void) 1395 + { 1396 + WARN_ON(!list_empty(&vfio.group_list)); 1397 + 1398 + idr_destroy(&vfio.group_idr); 1399 + cdev_del(&vfio.group_cdev); 1400 + device_destroy(vfio.class, vfio.devt); 1401 + cdev_del(&vfio.cdev); 1402 + unregister_chrdev_region(vfio.devt, MINORMASK); 1403 + 
class_destroy(vfio.class); 1404 + vfio.class = NULL; 1405 + } 1406 + 1407 + module_init(vfio_init); 1408 + module_exit(vfio_cleanup); 1409 + 1410 + MODULE_VERSION(DRIVER_VERSION); 1411 + MODULE_LICENSE("GPL v2"); 1412 + MODULE_AUTHOR(DRIVER_AUTHOR); 1413 + MODULE_DESCRIPTION(DRIVER_DESC);

+367

include/linux/vfio.h

/*
 * VFIO API definition
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#ifndef VFIO_H
#define VFIO_H

#include <linux/types.h>
#include <linux/ioctl.h>

#define VFIO_API_VERSION	0

#ifdef __KERNEL__	/* Internal VFIO-core/bus driver API */

#include <linux/iommu.h>
#include <linux/mm.h>

/**
 * struct vfio_device_ops - VFIO bus driver device callbacks
 *
 * @open: Called when userspace creates new file descriptor for device
 * @release: Called when userspace releases file descriptor for device
 * @read: Perform read(2) on device file descriptor
 * @write: Perform write(2) on device file descriptor
 * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_*
 *         operations documented below
 * @mmap: Perform mmap(2) on a region of the device file descriptor
 */
struct vfio_device_ops {
	char	*name;
	int	(*open)(void *device_data);
	void	(*release)(void *device_data);
	ssize_t	(*read)(void *device_data, char __user *buf,
			size_t count, loff_t *ppos);
	ssize_t	(*write)(void *device_data, const char __user *buf,
			 size_t count, loff_t *ppos);
	long	(*ioctl)(void *device_data, unsigned int cmd,
			 unsigned long arg);
	int	(*mmap)(void *device_data, struct vm_area_struct *vma);
};

extern int vfio_add_group_dev(struct device *dev,
			      const struct vfio_device_ops *ops,
			      void *device_data);

extern void *vfio_del_group_dev(struct device *dev);

/**
 * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
 */
struct vfio_iommu_driver_ops {
	char		*name;
	struct module	*owner;
	void		*(*open)(unsigned long arg);
	void		(*release)(void *iommu_data);
	ssize_t		(*read)(void *iommu_data, char __user *buf,
				size_t count, loff_t *ppos);
	ssize_t		(*write)(void *iommu_data, const char __user *buf,
				 size_t count, loff_t *ppos);
	long		(*ioctl)(void *iommu_data, unsigned int cmd,
				 unsigned long arg);
	int		(*mmap)(void *iommu_data, struct vm_area_struct *vma);
	int		(*attach_group)(void *iommu_data,
					struct iommu_group *group);
	void		(*detach_group)(void *iommu_data,
					struct iommu_group *group);
};

extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);

extern void vfio_unregister_iommu_driver(
				const struct vfio_iommu_driver_ops *ops);

/**
 * offsetofend(TYPE, MEMBER)
 *
 * @TYPE: The type of the structure
 * @MEMBER: The member within the structure to get the end offset of
 *
 * Simple helper macro for dealing with variable sized structures passed
 * from user space.  This allows us to easily determine if the provided
 * structure is sized to include various fields.
 *
 * Written as a plain expression (no temporary object of TYPE, no statement
 * expression) so it is also usable where a constant expression is required.
 */
#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

#endif /* __KERNEL__ */

/* Kernel & User level defines for VFIO IOCTLs. */

/* Extensions */

/* None yet */

/*
 * The IOCTL interface is designed for extensibility by embedding the
 * structure length (argsz) and flags into structures passed between
 * kernel and userspace.  We therefore use the _IO() macro for these
 * defines to avoid implicitly embedding a size into the ioctl request.
 * As structure fields are added, argsz will increase to match and flag
 * bits will be defined to indicate additional fields with valid data.
 * It's *always* the caller's responsibility to indicate the size of
 * the structure passed by setting argsz appropriately.
 */

#define VFIO_TYPE	(';')
#define VFIO_BASE	100

/* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */

/**
 * VFIO_GET_API_VERSION - _IO(VFIO_TYPE, VFIO_BASE + 0)
 *
 * Report the version of the VFIO API.  This allows us to bump the entire
 * API version should we later need to add or change features in incompatible
 * ways.
 * Return: VFIO_API_VERSION
 * Availability: Always
 */
#define VFIO_GET_API_VERSION		_IO(VFIO_TYPE, VFIO_BASE + 0)

/**
 * VFIO_CHECK_EXTENSION - _IOW(VFIO_TYPE, VFIO_BASE + 1, __u32)
 *
 * Check whether an extension is supported.
 * Return: 0 if not supported, 1 (or some other positive integer) if supported.
 * Availability: Always
 */
#define VFIO_CHECK_EXTENSION		_IO(VFIO_TYPE, VFIO_BASE + 1)

/**
 * VFIO_SET_IOMMU - _IOW(VFIO_TYPE, VFIO_BASE + 2, __s32)
 *
 * Set the iommu to the given type.  The type must be supported by an
 * iommu driver as verified by calling CHECK_EXTENSION using the same
 * type.  A group must be set to this file descriptor before this
 * ioctl is available.  The IOMMU interfaces enabled by this call are
 * specific to the value set.
 * Return: 0 on success, -errno on failure
 * Availability: When VFIO group attached
 */
#define VFIO_SET_IOMMU			_IO(VFIO_TYPE, VFIO_BASE + 2)

/* -------- IOCTLs for GROUP file descriptors (/dev/vfio/$GROUP) -------- */

/**
 * VFIO_GROUP_GET_STATUS - _IOR(VFIO_TYPE, VFIO_BASE + 3,
 *						struct vfio_group_status)
 *
 * Retrieve information about the group.  Fills in provided
 * struct vfio_group_status.  Caller sets argsz.
 * Return: 0 on success, -errno on failure.
 * Availability: Always
 */
struct vfio_group_status {
	__u32	argsz;
	__u32	flags;
#define VFIO_GROUP_FLAGS_VIABLE		(1 << 0)
#define VFIO_GROUP_FLAGS_CONTAINER_SET	(1 << 1)
};
#define VFIO_GROUP_GET_STATUS		_IO(VFIO_TYPE, VFIO_BASE + 3)

/**
 * VFIO_GROUP_SET_CONTAINER - _IOW(VFIO_TYPE, VFIO_BASE + 4, __s32)
 *
 * Set the container for the VFIO group to the open VFIO file
 * descriptor provided.  Groups may only belong to a single
 * container.  Containers may, at their discretion, support multiple
 * groups.  Only when a container is set are all of the interfaces
 * of the VFIO file descriptor and the VFIO group file descriptor
 * available to the user.
 * Return: 0 on success, -errno on failure.
 * Availability: Always
 */
#define VFIO_GROUP_SET_CONTAINER	_IO(VFIO_TYPE, VFIO_BASE + 4)

/**
 * VFIO_GROUP_UNSET_CONTAINER - _IO(VFIO_TYPE, VFIO_BASE + 5)
 *
 * Remove the group from the attached container.  This is the
 * opposite of the SET_CONTAINER call and returns the group to
 * an initial state.  All device file descriptors must be released
 * prior to calling this interface.  When removing the last group
 * from a container, the IOMMU will be disabled and all state lost,
 * effectively also returning the VFIO file descriptor to an initial
 * state.
 * Return: 0 on success, -errno on failure.
 * Availability: When attached to container
 */
#define VFIO_GROUP_UNSET_CONTAINER	_IO(VFIO_TYPE, VFIO_BASE + 5)

/**
 * VFIO_GROUP_GET_DEVICE_FD - _IOW(VFIO_TYPE, VFIO_BASE + 6, char)
 *
 * Return a new file descriptor for the device object described by
 * the provided string.  The string should match a device listed in
 * the devices subdirectory of the IOMMU group sysfs entry.  The
 * group containing the device must already be added to this context.
 * Return: new file descriptor on success, -errno on failure.
 * Availability: When attached to container
 */
#define VFIO_GROUP_GET_DEVICE_FD	_IO(VFIO_TYPE, VFIO_BASE + 6)

/* --------------- IOCTLs for DEVICE file descriptors --------------- */

/**
 * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
 *						struct vfio_device_info)
 *
 * Retrieve information about the device.  Fills in provided
 * struct vfio_device_info.  Caller sets argsz.
 * Return: 0 on success, -errno on failure.
 */
struct vfio_device_info {
	__u32	argsz;
	__u32	flags;
#define VFIO_DEVICE_FLAGS_RESET	(1 << 0)	/* Device supports reset */
	__u32	num_regions;	/* Max region index + 1 */
	__u32	num_irqs;	/* Max IRQ index + 1 */
};
#define VFIO_DEVICE_GET_INFO		_IO(VFIO_TYPE, VFIO_BASE + 7)

/**
 * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
 *				       struct vfio_region_info)
 *
 * Retrieve information about a device region.  Caller provides
 * struct vfio_region_info with index value set.  Caller sets argsz.
 * Implementation of region mapping is bus driver specific.  This is
 * intended to describe MMIO, I/O port, as well as bus specific
 * regions (ex. PCI config space).  Zero sized regions may be used
 * to describe unimplemented regions (ex. unimplemented PCI BARs).
 * Return: 0 on success, -errno on failure.
 */
struct vfio_region_info {
	__u32	argsz;
	__u32	flags;
#define VFIO_REGION_INFO_FLAG_READ	(1 << 0) /* Region supports read */
#define VFIO_REGION_INFO_FLAG_WRITE	(1 << 1) /* Region supports write */
#define VFIO_REGION_INFO_FLAG_MMAP	(1 << 2) /* Region supports mmap */
	__u32	index;		/* Region index */
	__u32	resv;		/* Reserved for alignment */
	__u64	size;		/* Region size (bytes) */
	__u64	offset;		/* Region offset from start of device fd */
};
#define VFIO_DEVICE_GET_REGION_INFO	_IO(VFIO_TYPE, VFIO_BASE + 8)

/**
 * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
 *				    struct vfio_irq_info)
 *
 * Retrieve information about a device IRQ.  Caller provides
 * struct vfio_irq_info with index value set.  Caller sets argsz.
 * Implementation of IRQ mapping is bus driver specific.  Indexes
 * using multiple IRQs are primarily intended to support MSI-like
 * interrupt blocks.  Zero count irq blocks may be used to describe
 * unimplemented interrupt types.
 *
 * The EVENTFD flag indicates the interrupt index supports eventfd based
 * signaling.
 *
 * The MASKABLE flag indicates the index supports MASK and UNMASK
 * actions described below.
 *
 * AUTOMASKED indicates that after signaling, the interrupt line is
 * automatically masked by VFIO and the user needs to unmask the line
 * to receive new interrupts.  This is primarily intended to distinguish
 * level triggered interrupts.
 *
 * The NORESIZE flag indicates that the interrupt lines within the index
 * are setup as a set and new subindexes cannot be enabled without first
 * disabling the entire index.  This is used for interrupts like PCI MSI
 * and MSI-X where the driver may only use a subset of the available
 * indexes, but VFIO needs to enable a specific number of vectors
 * upfront.  In the case of MSI-X, where the user can enable MSI-X and
 * then add and unmask vectors, it's up to userspace to make the decision
 * whether to allocate the maximum supported number of vectors or tear
 * down setup and incrementally increase the vectors as each is enabled.
 */
struct vfio_irq_info {
	__u32	argsz;
	__u32	flags;
#define VFIO_IRQ_INFO_EVENTFD		(1 << 0)
#define VFIO_IRQ_INFO_MASKABLE		(1 << 1)
#define VFIO_IRQ_INFO_AUTOMASKED	(1 << 2)
#define VFIO_IRQ_INFO_NORESIZE		(1 << 3)
	__u32	index;		/* IRQ index */
	__u32	count;		/* Number of IRQs within this index */
};
#define VFIO_DEVICE_GET_IRQ_INFO	_IO(VFIO_TYPE, VFIO_BASE + 9)

/**
 * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set)
 *
 * Set signaling, masking, and unmasking of interrupts.  Caller provides
 * struct vfio_irq_set with all fields set.  'start' and 'count' indicate
 * the range of subindexes being specified.
 *
 * The DATA flags specify the type of data provided.  If DATA_NONE, the
 * operation performs the specified action immediately on the specified
 * interrupt(s).  For example, to unmask AUTOMASKED interrupt [0,0]:
 * flags = (DATA_NONE|ACTION_UNMASK), index = 0, start = 0, count = 1.
 *
 * DATA_BOOL allows sparse support for the same on arrays of interrupts.
 * For example, to mask interrupts [0,1] and [0,3] (but not [0,2]):
 * flags = (DATA_BOOL|ACTION_MASK), index = 0, start = 1, count = 3,
 * data = {1,0,1}
 *
 * DATA_EVENTFD binds the specified ACTION to the provided __s32 eventfd.
 * A value of -1 can be used to either de-assign interrupts if already
 * assigned or skip un-assigned interrupts.  For example, to set an eventfd
 * to be trigger for interrupts [0,0] and [0,2]:
 * flags = (DATA_EVENTFD|ACTION_TRIGGER), index = 0, start = 0, count = 3,
 * data = {fd1, -1, fd2}
 * If index [0,1] is previously set, two count = 1 ioctl calls would be
 * required to set [0,0] and [0,2] without changing [0,1].
 *
 * Once a signaling mechanism is set, DATA_BOOL or DATA_NONE can be used
 * with ACTION_TRIGGER to perform kernel level interrupt loopback testing
 * from userspace (ie. simulate hardware triggering).
 *
 * Setting of an event triggering mechanism to userspace for ACTION_TRIGGER
 * enables the interrupt index for the device.  Individual subindex interrupts
 * can be disabled using the -1 value for DATA_EVENTFD or the index can be
 * disabled as a whole with: flags = (DATA_NONE|ACTION_TRIGGER), count = 0.
 *
 * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while
 * ACTION_TRIGGER specifies kernel->user signaling.
 */
struct vfio_irq_set {
	__u32	argsz;
	__u32	flags;
#define VFIO_IRQ_SET_DATA_NONE		(1 << 0) /* Data not present */
#define VFIO_IRQ_SET_DATA_BOOL		(1 << 1) /* Data is bool (u8) */
#define VFIO_IRQ_SET_DATA_EVENTFD	(1 << 2) /* Data is eventfd (s32) */
#define VFIO_IRQ_SET_ACTION_MASK	(1 << 3) /* Mask interrupt */
#define VFIO_IRQ_SET_ACTION_UNMASK	(1 << 4) /* Unmask interrupt */
#define VFIO_IRQ_SET_ACTION_TRIGGER	(1 << 5) /* Trigger interrupt */
	__u32	index;
	__u32	start;
	__u32	count;
	__u8	data[];
};
#define VFIO_DEVICE_SET_IRQS		_IO(VFIO_TYPE, VFIO_BASE + 10)

#define VFIO_IRQ_SET_DATA_TYPE_MASK	(VFIO_IRQ_SET_DATA_NONE | \
					 VFIO_IRQ_SET_DATA_BOOL | \
					 VFIO_IRQ_SET_DATA_EVENTFD)
#define VFIO_IRQ_SET_ACTION_TYPE_MASK	(VFIO_IRQ_SET_ACTION_MASK | \
					 VFIO_IRQ_SET_ACTION_UNMASK | \
					 VFIO_IRQ_SET_ACTION_TRIGGER)
/**
 * VFIO_DEVICE_RESET - _IO(VFIO_TYPE, VFIO_BASE + 11)
 *
 * Reset a device.
 */
#define VFIO_DEVICE_RESET		_IO(VFIO_TYPE, VFIO_BASE + 11)

#endif /* VFIO_H */