Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
Tags: kernel, os, linux

Merge tag 'vfio-v3.20-rc1' of git://github.com/awilliam/linux-vfio

Pull VFIO updates from Alex Williamson:

- IOMMU updates based on trace analysis
- VFIO device request interface

* tag 'vfio-v3.20-rc1' of git://github.com/awilliam/linux-vfio:
vfio-pci: Add device request interface
vfio-pci: Generalize setup of simple eventfds
vfio: Add and use device request op for vfio bus drivers
vfio: Tie IOMMU group reference to vfio group
vfio: Add device tracking during unbind
vfio/type1: Add conditional rescheduling
vfio/type1: Chunk contiguous reserved/invalid page mappings
vfio/type1: DMA unmap chunking

+242 -42
+20 -1
drivers/vfio/pci/vfio_pci.c
··· 239 239 240 240 return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; 241 241 } 242 - } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) 242 + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { 243 243 if (pci_is_pcie(vdev->pdev)) 244 244 return 1; 245 + } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { 246 + return 1; 247 + } 245 248 246 249 return 0; 247 250 } ··· 467 464 468 465 switch (info.index) { 469 466 case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: 467 + case VFIO_PCI_REQ_IRQ_INDEX: 470 468 break; 471 469 case VFIO_PCI_ERR_IRQ_INDEX: 472 470 if (pci_is_pcie(vdev->pdev)) ··· 832 828 req_len, vma->vm_page_prot); 833 829 } 834 830 831 + static void vfio_pci_request(void *device_data, unsigned int count) 832 + { 833 + struct vfio_pci_device *vdev = device_data; 834 + 835 + mutex_lock(&vdev->igate); 836 + 837 + if (vdev->req_trigger) { 838 + dev_dbg(&vdev->pdev->dev, "Requesting device from user\n"); 839 + eventfd_signal(vdev->req_trigger, 1); 840 + } 841 + 842 + mutex_unlock(&vdev->igate); 843 + } 844 + 835 845 static const struct vfio_device_ops vfio_pci_ops = { 836 846 .name = "vfio-pci", 837 847 .open = vfio_pci_open, ··· 854 836 .read = vfio_pci_read, 855 837 .write = vfio_pci_write, 856 838 .mmap = vfio_pci_mmap, 839 + .request = vfio_pci_request, 857 840 }; 858 841 859 842 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+45 -15
drivers/vfio/pci/vfio_pci_intrs.c
··· 763 763 return 0; 764 764 } 765 765 766 - static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, 767 - unsigned index, unsigned start, 768 - unsigned count, uint32_t flags, void *data) 766 + static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, 767 + uint32_t flags, void *data) 769 768 { 770 769 int32_t fd = *(int32_t *)data; 771 770 772 - if ((index != VFIO_PCI_ERR_IRQ_INDEX) || 773 - !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) 771 + if (!(flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) 774 772 return -EINVAL; 775 773 776 774 /* DATA_NONE/DATA_BOOL enables loopback testing */ 777 775 if (flags & VFIO_IRQ_SET_DATA_NONE) { 778 - if (vdev->err_trigger) 779 - eventfd_signal(vdev->err_trigger, 1); 776 + if (*ctx) 777 + eventfd_signal(*ctx, 1); 780 778 return 0; 781 779 } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { 782 780 uint8_t trigger = *(uint8_t *)data; 783 - if (trigger && vdev->err_trigger) 784 - eventfd_signal(vdev->err_trigger, 1); 781 + if (trigger && *ctx) 782 + eventfd_signal(*ctx, 1); 785 783 return 0; 786 784 } 787 785 788 786 /* Handle SET_DATA_EVENTFD */ 789 787 if (fd == -1) { 790 - if (vdev->err_trigger) 791 - eventfd_ctx_put(vdev->err_trigger); 792 - vdev->err_trigger = NULL; 788 + if (*ctx) 789 + eventfd_ctx_put(*ctx); 790 + *ctx = NULL; 793 791 return 0; 794 792 } else if (fd >= 0) { 795 793 struct eventfd_ctx *efdctx; 796 794 efdctx = eventfd_ctx_fdget(fd); 797 795 if (IS_ERR(efdctx)) 798 796 return PTR_ERR(efdctx); 799 - if (vdev->err_trigger) 800 - eventfd_ctx_put(vdev->err_trigger); 801 - vdev->err_trigger = efdctx; 797 + if (*ctx) 798 + eventfd_ctx_put(*ctx); 799 + *ctx = efdctx; 802 800 return 0; 803 801 } else 804 802 return -EINVAL; 805 803 } 804 + 805 + static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, 806 + unsigned index, unsigned start, 807 + unsigned count, uint32_t flags, void *data) 808 + { 809 + if (index != VFIO_PCI_ERR_IRQ_INDEX) 810 + return -EINVAL; 811 + 812 + /* 813 + * We should sanitize start & 
count, but that wasn't caught 814 + * originally, so this IRQ index must forever ignore them :-( 815 + */ 816 + 817 + return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data); 818 + } 819 + 820 + static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, 821 + unsigned index, unsigned start, 822 + unsigned count, uint32_t flags, void *data) 823 + { 824 + if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count != 1) 825 + return -EINVAL; 826 + 827 + return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger, flags, data); 828 + } 829 + 806 830 int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, 807 831 unsigned index, unsigned start, unsigned count, 808 832 void *data) ··· 866 842 case VFIO_IRQ_SET_ACTION_TRIGGER: 867 843 if (pci_is_pcie(vdev->pdev)) 868 844 func = vfio_pci_set_err_trigger; 845 + break; 846 + } 847 + case VFIO_PCI_REQ_IRQ_INDEX: 848 + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { 849 + case VFIO_IRQ_SET_ACTION_TRIGGER: 850 + func = vfio_pci_set_req_trigger; 869 851 break; 870 852 } 871 853 }
+1
drivers/vfio/pci/vfio_pci_private.h
··· 58 58 struct pci_saved_state *pci_saved_state; 59 59 int refcnt; 60 60 struct eventfd_ctx *err_trigger; 61 + struct eventfd_ctx *req_trigger; 61 62 }; 62 63 63 64 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
+104 -15
drivers/vfio/vfio.c
··· 63 63 void *iommu_data; 64 64 }; 65 65 66 + struct vfio_unbound_dev { 67 + struct device *dev; 68 + struct list_head unbound_next; 69 + }; 70 + 66 71 struct vfio_group { 67 72 struct kref kref; 68 73 int minor; ··· 80 75 struct notifier_block nb; 81 76 struct list_head vfio_next; 82 77 struct list_head container_next; 78 + struct list_head unbound_list; 79 + struct mutex unbound_lock; 83 80 atomic_t opened; 84 81 }; 85 82 ··· 211 204 kref_init(&group->kref); 212 205 INIT_LIST_HEAD(&group->device_list); 213 206 mutex_init(&group->device_lock); 207 + INIT_LIST_HEAD(&group->unbound_list); 208 + mutex_init(&group->unbound_lock); 214 209 atomic_set(&group->container_users, 0); 215 210 atomic_set(&group->opened, 0); 216 211 group->iommu_group = iommu_group; ··· 273 264 static void vfio_group_release(struct kref *kref) 274 265 { 275 266 struct vfio_group *group = container_of(kref, struct vfio_group, kref); 267 + struct vfio_unbound_dev *unbound, *tmp; 268 + struct iommu_group *iommu_group = group->iommu_group; 276 269 277 270 WARN_ON(!list_empty(&group->device_list)); 271 + 272 + list_for_each_entry_safe(unbound, tmp, 273 + &group->unbound_list, unbound_next) { 274 + list_del(&unbound->unbound_next); 275 + kfree(unbound); 276 + } 278 277 279 278 device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor)); 280 279 list_del(&group->vfio_next); 281 280 vfio_free_group_minor(group->minor); 282 281 vfio_group_unlock_and_free(group); 282 + iommu_group_put(iommu_group); 283 283 } 284 284 285 285 static void vfio_group_put(struct vfio_group *group) ··· 458 440 } 459 441 460 442 /* 461 - * A vfio group is viable for use by userspace if all devices are either 462 - * driver-less or bound to a vfio or whitelisted driver. We test the 463 - * latter by the existence of a struct vfio_device matching the dev. 
443 + * A vfio group is viable for use by userspace if all devices are in 444 + * one of the following states: 445 + * - driver-less 446 + * - bound to a vfio driver 447 + * - bound to a whitelisted driver 448 + * 449 + * We use two methods to determine whether a device is bound to a vfio 450 + * driver. The first is to test whether the device exists in the vfio 451 + * group. The second is to test if the device exists on the group 452 + * unbound_list, indicating it's in the middle of transitioning from 453 + * a vfio driver to driver-less. 464 454 */ 465 455 static int vfio_dev_viable(struct device *dev, void *data) 466 456 { 467 457 struct vfio_group *group = data; 468 458 struct vfio_device *device; 469 459 struct device_driver *drv = ACCESS_ONCE(dev->driver); 460 + struct vfio_unbound_dev *unbound; 461 + int ret = -EINVAL; 470 462 471 - if (!drv || vfio_whitelisted_driver(drv)) 463 + mutex_lock(&group->unbound_lock); 464 + list_for_each_entry(unbound, &group->unbound_list, unbound_next) { 465 + if (dev == unbound->dev) { 466 + ret = 0; 467 + break; 468 + } 469 + } 470 + mutex_unlock(&group->unbound_lock); 471 + 472 + if (!ret || !drv || vfio_whitelisted_driver(drv)) 472 473 return 0; 473 474 474 475 device = vfio_group_get_device(group, dev); ··· 496 459 return 0; 497 460 } 498 461 499 - return -EINVAL; 462 + return ret; 500 463 } 501 464 502 465 /** ··· 538 501 { 539 502 struct vfio_group *group = container_of(nb, struct vfio_group, nb); 540 503 struct device *dev = data; 504 + struct vfio_unbound_dev *unbound; 541 505 542 506 /* 543 507 * Need to go through a group_lock lookup to get a reference or we ··· 588 550 * stop the system to maintain isolation. At a minimum, we'd 589 551 * want a toggle to disable driver auto probe for this device. 
590 552 */ 553 + 554 + mutex_lock(&group->unbound_lock); 555 + list_for_each_entry(unbound, 556 + &group->unbound_list, unbound_next) { 557 + if (dev == unbound->dev) { 558 + list_del(&unbound->unbound_next); 559 + kfree(unbound); 560 + break; 561 + } 562 + } 563 + mutex_unlock(&group->unbound_lock); 591 564 break; 592 565 } 593 566 ··· 627 578 iommu_group_put(iommu_group); 628 579 return PTR_ERR(group); 629 580 } 581 + } else { 582 + /* 583 + * A found vfio_group already holds a reference to the 584 + * iommu_group. A created vfio_group keeps the reference. 585 + */ 586 + iommu_group_put(iommu_group); 630 587 } 631 588 632 589 device = vfio_group_get_device(group, dev); ··· 641 586 dev_name(dev), iommu_group_id(iommu_group)); 642 587 vfio_device_put(device); 643 588 vfio_group_put(group); 644 - iommu_group_put(iommu_group); 645 589 return -EBUSY; 646 590 } 647 591 648 592 device = vfio_group_create_device(group, dev, ops, device_data); 649 593 if (IS_ERR(device)) { 650 594 vfio_group_put(group); 651 - iommu_group_put(iommu_group); 652 595 return PTR_ERR(device); 653 596 } 654 597 655 598 /* 656 - * Added device holds reference to iommu_group and vfio_device 657 - * (which in turn holds reference to vfio_group). Drop extra 658 - * group reference used while acquiring device. 599 + * Drop all but the vfio_device reference. The vfio_device holds 600 + * a reference to the vfio_group, which holds a reference to the 601 + * iommu_group. 659 602 */ 660 603 vfio_group_put(group); 661 604 ··· 708 655 { 709 656 struct vfio_device *device = dev_get_drvdata(dev); 710 657 struct vfio_group *group = device->group; 711 - struct iommu_group *iommu_group = group->iommu_group; 712 658 void *device_data = device->device_data; 659 + struct vfio_unbound_dev *unbound; 660 + unsigned int i = 0; 713 661 714 662 /* 715 663 * The group exists so long as we have a device reference. 
Get ··· 718 664 */ 719 665 vfio_group_get(group); 720 666 667 + /* 668 + * When the device is removed from the group, the group suddenly 669 + * becomes non-viable; the device has a driver (until the unbind 670 + * completes), but it's not present in the group. This is bad news 671 + * for any external users that need to re-acquire a group reference 672 + * in order to match and release their existing reference. To 673 + * solve this, we track such devices on the unbound_list to bridge 674 + * the gap until they're fully unbound. 675 + */ 676 + unbound = kzalloc(sizeof(*unbound), GFP_KERNEL); 677 + if (unbound) { 678 + unbound->dev = dev; 679 + mutex_lock(&group->unbound_lock); 680 + list_add(&unbound->unbound_next, &group->unbound_list); 681 + mutex_unlock(&group->unbound_lock); 682 + } 683 + WARN_ON(!unbound); 684 + 721 685 vfio_device_put(device); 722 686 723 - /* TODO send a signal to encourage this to be released */ 724 - wait_event(vfio.release_q, !vfio_dev_present(group, dev)); 687 + /* 688 + * If the device is still present in the group after the above 689 + * 'put', then it is in use and we need to request it from the 690 + * bus driver. The driver may in turn need to request the 691 + * device from the user. We send the request on an arbitrary 692 + * interval with counter to allow the driver to take escalating 693 + * measures to release the device if it has the ability to do so. 694 + */ 695 + do { 696 + device = vfio_group_get_device(group, dev); 697 + if (!device) 698 + break; 699 + 700 + if (device->ops->request) 701 + device->ops->request(device_data, i++); 702 + 703 + vfio_device_put(device); 704 + 705 + } while (wait_event_interruptible_timeout(vfio.release_q, 706 + !vfio_dev_present(group, dev), 707 + HZ * 10) <= 0); 725 708 726 709 vfio_group_put(group); 727 - 728 - iommu_group_put(iommu_group); 729 710 730 711 return device_data; 731 712 }
+69 -11
drivers/vfio/vfio_iommu_type1.c
··· 66 66 struct list_head next; 67 67 struct list_head group_list; 68 68 int prot; /* IOMMU_CACHE */ 69 + bool fgsp; /* Fine-grained super pages */ 69 70 }; 70 71 71 72 struct vfio_dma { ··· 265 264 unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 266 265 bool lock_cap = capable(CAP_IPC_LOCK); 267 266 long ret, i; 267 + bool rsvd; 268 268 269 269 if (!current->mm) 270 270 return -ENODEV; ··· 274 272 if (ret) 275 273 return ret; 276 274 277 - if (is_invalid_reserved_pfn(*pfn_base)) 278 - return 1; 275 + rsvd = is_invalid_reserved_pfn(*pfn_base); 279 276 280 - if (!lock_cap && current->mm->locked_vm + 1 > limit) { 277 + if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) { 281 278 put_pfn(*pfn_base, prot); 282 279 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, 283 280 limit << PAGE_SHIFT); ··· 284 283 } 285 284 286 285 if (unlikely(disable_hugepages)) { 287 - vfio_lock_acct(1); 286 + if (!rsvd) 287 + vfio_lock_acct(1); 288 288 return 1; 289 289 } 290 290 ··· 297 295 if (ret) 298 296 break; 299 297 300 - if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) { 298 + if (pfn != *pfn_base + i || 299 + rsvd != is_invalid_reserved_pfn(pfn)) { 301 300 put_pfn(pfn, prot); 302 301 break; 303 302 } 304 303 305 - if (!lock_cap && current->mm->locked_vm + i + 1 > limit) { 304 + if (!rsvd && !lock_cap && 305 + current->mm->locked_vm + i + 1 > limit) { 306 306 put_pfn(pfn, prot); 307 307 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", 308 308 __func__, limit << PAGE_SHIFT); ··· 312 308 } 313 309 } 314 310 315 - vfio_lock_acct(i); 311 + if (!rsvd) 312 + vfio_lock_acct(i); 316 313 317 314 return i; 318 315 } ··· 351 346 domain = d = list_first_entry(&iommu->domain_list, 352 347 struct vfio_domain, next); 353 348 354 - list_for_each_entry_continue(d, &iommu->domain_list, next) 349 + list_for_each_entry_continue(d, &iommu->domain_list, next) { 355 350 iommu_unmap(d->domain, dma->iova, dma->size); 351 + cond_resched(); 352 + } 356 353 357 354 while 
(iova < end) { 358 - size_t unmapped; 359 - phys_addr_t phys; 355 + size_t unmapped, len; 356 + phys_addr_t phys, next; 360 357 361 358 phys = iommu_iova_to_phys(domain->domain, iova); 362 359 if (WARN_ON(!phys)) { ··· 366 359 continue; 367 360 } 368 361 369 - unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE); 362 + /* 363 + * To optimize for fewer iommu_unmap() calls, each of which 364 + * may require hardware cache flushing, try to find the 365 + * largest contiguous physical memory chunk to unmap. 366 + */ 367 + for (len = PAGE_SIZE; 368 + !domain->fgsp && iova + len < end; len += PAGE_SIZE) { 369 + next = iommu_iova_to_phys(domain->domain, iova + len); 370 + if (next != phys + len) 371 + break; 372 + } 373 + 374 + unmapped = iommu_unmap(domain->domain, iova, len); 370 375 if (WARN_ON(!unmapped)) 371 376 break; 372 377 ··· 386 367 unmapped >> PAGE_SHIFT, 387 368 dma->prot, false); 388 369 iova += unmapped; 370 + 371 + cond_resched(); 389 372 } 390 373 391 374 vfio_lock_acct(-unlocked); ··· 532 511 map_try_harder(d, iova, pfn, npage, prot)) 533 512 goto unwind; 534 513 } 514 + 515 + cond_resched(); 535 516 } 536 517 537 518 return 0; ··· 688 665 return 0; 689 666 } 690 667 668 + /* 669 + * We change our unmap behavior slightly depending on whether the IOMMU 670 + * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage 671 + * for practically any contiguous power-of-two mapping we give it. This means 672 + * we don't need to look for contiguous chunks ourselves to make unmapping 673 + * more efficient. On IOMMUs with coarse-grained super pages, like Intel VT-d 674 + * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks 675 + * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when 676 + * hugetlbfs is in use. 
677 + */ 678 + static void vfio_test_domain_fgsp(struct vfio_domain *domain) 679 + { 680 + struct page *pages; 681 + int ret, order = get_order(PAGE_SIZE * 2); 682 + 683 + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); 684 + if (!pages) 685 + return; 686 + 687 + ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2, 688 + IOMMU_READ | IOMMU_WRITE | domain->prot); 689 + if (!ret) { 690 + size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE); 691 + 692 + if (unmapped == PAGE_SIZE) 693 + iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE); 694 + else 695 + domain->fgsp = true; 696 + } 697 + 698 + __free_pages(pages, order); 699 + } 700 + 691 701 static int vfio_iommu_type1_attach_group(void *iommu_data, 692 702 struct iommu_group *iommu_group) 693 703 { ··· 813 757 goto out_domain; 814 758 } 815 759 } 760 + 761 + vfio_test_domain_fgsp(domain); 816 762 817 763 /* replay mappings on new domains */ 818 764 ret = vfio_iommu_replay(iommu, domain);
+2
include/linux/vfio.h
··· 26 26 * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_* 27 27 * operations documented below 28 28 * @mmap: Perform mmap(2) on a region of the device file descriptor 29 + * @request: Request for the bus driver to release the device 29 30 */ 30 31 struct vfio_device_ops { 31 32 char *name; ··· 39 38 long (*ioctl)(void *device_data, unsigned int cmd, 40 39 unsigned long arg); 41 40 int (*mmap)(void *device_data, struct vm_area_struct *vma); 41 + void (*request)(void *device_data, unsigned int count); 42 42 }; 43 43 44 44 extern int vfio_add_group_dev(struct device *dev,
+1
include/uapi/linux/vfio.h
··· 333 333 VFIO_PCI_MSI_IRQ_INDEX, 334 334 VFIO_PCI_MSIX_IRQ_INDEX, 335 335 VFIO_PCI_ERR_IRQ_INDEX, 336 + VFIO_PCI_REQ_IRQ_INDEX, 336 337 VFIO_PCI_NUM_IRQS 337 338 }; 338 339