Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

vfio-iommufd: Support iommufd for emulated VFIO devices

Emulated VFIO devices are calling vfio_register_emulated_iommu_dev() and
consist of all the mdev drivers.

Like the physical drivers, support for iommufd is provided by the driver
supplying the correct standard ops. Provide ops from the core that
duplicate what vfio_register_emulated_iommu_dev() does.

Emulated drivers are where it is more likely to see variation in the
iommufd support ops. For instance IDXD will probably need to set up both an
iommufd_device context linked to a PASID and an iommufd_access context to
support all their mdev operations.

Link: https://lore.kernel.org/r/7-v4-42cd2eb0e3eb+335a-vfio_iommufd_jgg@nvidia.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Tested-by: Alex Williamson <alex.williamson@redhat.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Yi Liu <yi.l.liu@intel.com>
Tested-by: Lixiao Yang <lixiao.yang@intel.com>
Tested-by: Matthew Rosato <mjrosato@linux.ibm.com>
Tested-by: Yu He <yu.he@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

+229 -94
+3
drivers/gpu/drm/i915/gvt/kvmgt.c
··· 1484 1484 .mmap = intel_vgpu_mmap, 1485 1485 .ioctl = intel_vgpu_ioctl, 1486 1486 .dma_unmap = intel_vgpu_dma_unmap, 1487 + .bind_iommufd = vfio_iommufd_emulated_bind, 1488 + .unbind_iommufd = vfio_iommufd_emulated_unbind, 1489 + .attach_ioas = vfio_iommufd_emulated_attach_ioas, 1487 1490 }; 1488 1491 1489 1492 static int intel_vgpu_probe(struct mdev_device *mdev)
+3
drivers/s390/cio/vfio_ccw_ops.c
··· 588 588 .ioctl = vfio_ccw_mdev_ioctl, 589 589 .request = vfio_ccw_mdev_request, 590 590 .dma_unmap = vfio_ccw_dma_unmap, 591 + .bind_iommufd = vfio_iommufd_emulated_bind, 592 + .unbind_iommufd = vfio_iommufd_emulated_unbind, 593 + .attach_ioas = vfio_iommufd_emulated_attach_ioas, 591 594 }; 592 595 593 596 struct mdev_driver vfio_ccw_mdev_driver = {
+3
drivers/s390/crypto/vfio_ap_ops.c
··· 1805 1805 .close_device = vfio_ap_mdev_close_device, 1806 1806 .ioctl = vfio_ap_mdev_ioctl, 1807 1807 .dma_unmap = vfio_ap_mdev_dma_unmap, 1808 + .bind_iommufd = vfio_iommufd_emulated_bind, 1809 + .unbind_iommufd = vfio_iommufd_emulated_unbind, 1810 + .attach_ioas = vfio_iommufd_emulated_attach_ioas, 1808 1811 }; 1809 1812 1810 1813 static struct mdev_driver vfio_ap_matrix_driver = {
+19 -91
drivers/vfio/container.c
··· 540 540 fput(group->opened_file); 541 541 } 542 542 543 - /* 544 - * Pin contiguous user pages and return their associated host pages for local 545 - * domain only. 546 - * @device [in] : device 547 - * @iova [in] : starting IOVA of user pages to be pinned. 548 - * @npage [in] : count of pages to be pinned. This count should not 549 - * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. 550 - * @prot [in] : protection flags 551 - * @pages[out] : array of host pages 552 - * Return error or number of pages pinned. 553 - * 554 - * A driver may only call this function if the vfio_device was created 555 - * by vfio_register_emulated_iommu_dev(). 556 - */ 557 - int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, 558 - int npage, int prot, struct page **pages) 543 + int vfio_container_pin_pages(struct vfio_container *container, 544 + struct iommu_group *iommu_group, dma_addr_t iova, 545 + int npage, int prot, struct page **pages) 559 546 { 560 - struct vfio_container *container; 561 - struct vfio_group *group = device->group; 562 - struct vfio_iommu_driver *driver; 563 - int ret; 564 - 565 - if (!pages || !npage || !vfio_assert_device_open(device)) 566 - return -EINVAL; 547 + struct vfio_iommu_driver *driver = container->iommu_driver; 567 548 568 549 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) 569 550 return -E2BIG; 570 551 571 - /* group->container cannot change while a vfio device is open */ 572 - container = group->container; 573 - driver = container->iommu_driver; 574 - if (likely(driver && driver->ops->pin_pages)) 575 - ret = driver->ops->pin_pages(container->iommu_data, 576 - group->iommu_group, iova, 577 - npage, prot, pages); 578 - else 579 - ret = -ENOTTY; 580 - 581 - return ret; 552 + if (unlikely(!driver || !driver->ops->pin_pages)) 553 + return -ENOTTY; 554 + return driver->ops->pin_pages(container->iommu_data, iommu_group, iova, 555 + npage, prot, pages); 582 556 } 583 - EXPORT_SYMBOL(vfio_pin_pages); 584 557 585 - /* 586 - * Unpin contiguous host pages 
for local domain only. 587 - * @device [in] : device 588 - * @iova [in] : starting address of user pages to be unpinned. 589 - * @npage [in] : count of pages to be unpinned. This count should not 590 - * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. 591 - */ 592 - void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) 558 + void vfio_container_unpin_pages(struct vfio_container *container, 559 + dma_addr_t iova, int npage) 593 560 { 594 - struct vfio_container *container; 595 - struct vfio_iommu_driver *driver; 596 - 597 561 if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES)) 598 562 return; 599 563 600 - if (WARN_ON(!vfio_assert_device_open(device))) 601 - return; 602 - 603 - /* group->container cannot change while a vfio device is open */ 604 - container = device->group->container; 605 - driver = container->iommu_driver; 606 - 607 - driver->ops->unpin_pages(container->iommu_data, iova, npage); 564 + container->iommu_driver->ops->unpin_pages(container->iommu_data, iova, 565 + npage); 608 566 } 609 - EXPORT_SYMBOL(vfio_unpin_pages); 610 567 611 - /* 612 - * This interface allows the CPUs to perform some sort of virtual DMA on 613 - * behalf of the device. 614 - * 615 - * CPUs read/write from/into a range of IOVAs pointing to user space memory 616 - * into/from a kernel buffer. 617 - * 618 - * As the read/write of user space memory is conducted via the CPUs and is 619 - * not a real device DMA, it is not necessary to pin the user space memory. 620 - * 621 - * @device [in] : VFIO device 622 - * @iova [in] : base IOVA of a user space buffer 623 - * @data [in] : pointer to kernel buffer 624 - * @len [in] : kernel buffer length 625 - * @write : indicate read or write 626 - * Return error code on failure or 0 on success. 
627 - */ 628 - int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, 629 - size_t len, bool write) 568 + int vfio_container_dma_rw(struct vfio_container *container, dma_addr_t iova, 569 + void *data, size_t len, bool write) 630 570 { 631 - struct vfio_container *container; 632 - struct vfio_iommu_driver *driver; 633 - int ret = 0; 571 + struct vfio_iommu_driver *driver = container->iommu_driver; 634 572 635 - if (!data || len <= 0 || !vfio_assert_device_open(device)) 636 - return -EINVAL; 637 - 638 - /* group->container cannot change while a vfio device is open */ 639 - container = device->group->container; 640 - driver = container->iommu_driver; 641 - 642 - if (likely(driver && driver->ops->dma_rw)) 643 - ret = driver->ops->dma_rw(container->iommu_data, 644 - iova, data, len, write); 645 - else 646 - ret = -ENOTTY; 647 - return ret; 573 + if (unlikely(!driver || !driver->ops->dma_rw)) 574 + return -ENOTTY; 575 + return driver->ops->dma_rw(container->iommu_data, iova, data, len, 576 + write); 648 577 } 649 - EXPORT_SYMBOL(vfio_dma_rw); 650 578 651 579 int __init vfio_container_init(void) 652 580 {
+58
drivers/vfio/iommufd.c
··· 98 98 return 0; 99 99 } 100 100 EXPORT_SYMBOL_GPL(vfio_iommufd_physical_attach_ioas); 101 + 102 + /* 103 + * The emulated standard ops mean that vfio_device is going to use the 104 + * "mdev path" and will call vfio_pin_pages()/vfio_dma_rw(). Drivers using this 105 + * ops set should call vfio_register_emulated_iommu_dev(). 106 + */ 107 + 108 + static void vfio_emulated_unmap(void *data, unsigned long iova, 109 + unsigned long length) 110 + { 111 + struct vfio_device *vdev = data; 112 + 113 + vdev->ops->dma_unmap(vdev, iova, length); 114 + } 115 + 116 + static const struct iommufd_access_ops vfio_user_ops = { 117 + .needs_pin_pages = 1, 118 + .unmap = vfio_emulated_unmap, 119 + }; 120 + 121 + int vfio_iommufd_emulated_bind(struct vfio_device *vdev, 122 + struct iommufd_ctx *ictx, u32 *out_device_id) 123 + { 124 + lockdep_assert_held(&vdev->dev_set->lock); 125 + 126 + vdev->iommufd_ictx = ictx; 127 + iommufd_ctx_get(ictx); 128 + return 0; 129 + } 130 + EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_bind); 131 + 132 + void vfio_iommufd_emulated_unbind(struct vfio_device *vdev) 133 + { 134 + lockdep_assert_held(&vdev->dev_set->lock); 135 + 136 + if (vdev->iommufd_access) { 137 + iommufd_access_destroy(vdev->iommufd_access); 138 + vdev->iommufd_access = NULL; 139 + } 140 + iommufd_ctx_put(vdev->iommufd_ictx); 141 + vdev->iommufd_ictx = NULL; 142 + } 143 + EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_unbind); 144 + 145 + int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id) 146 + { 147 + struct iommufd_access *user; 148 + 149 + lockdep_assert_held(&vdev->dev_set->lock); 150 + 151 + user = iommufd_access_create(vdev->iommufd_ictx, *pt_id, &vfio_user_ops, 152 + vdev); 153 + if (IS_ERR(user)) 154 + return PTR_ERR(user); 155 + vdev->iommufd_access = user; 156 + return 0; 157 + } 158 + EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_attach_ioas);
+8 -2
drivers/vfio/vfio.h
··· 111 111 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); 112 112 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops); 113 113 114 - bool vfio_assert_device_open(struct vfio_device *device); 115 - 116 114 struct vfio_container *vfio_container_from_file(struct file *filep); 117 115 int vfio_group_use_container(struct vfio_group *group); 118 116 void vfio_group_unuse_container(struct vfio_group *group); ··· 119 121 void vfio_group_detach_container(struct vfio_group *group); 120 122 void vfio_device_container_register(struct vfio_device *device); 121 123 void vfio_device_container_unregister(struct vfio_device *device); 124 + int vfio_container_pin_pages(struct vfio_container *container, 125 + struct iommu_group *iommu_group, dma_addr_t iova, 126 + int npage, int prot, struct page **pages); 127 + void vfio_container_unpin_pages(struct vfio_container *container, 128 + dma_addr_t iova, int npage); 129 + int vfio_container_dma_rw(struct vfio_container *container, dma_addr_t iova, 130 + void *data, size_t len, bool write); 131 + 122 132 int __init vfio_container_init(void); 123 133 void vfio_container_cleanup(void); 124 134
+121 -1
drivers/vfio/vfio_main.c
··· 770 770 static const struct file_operations vfio_device_fops; 771 771 772 772 /* true if the vfio_device has open_device() called but not close_device() */ 773 - bool vfio_assert_device_open(struct vfio_device *device) 773 + static bool vfio_assert_device_open(struct vfio_device *device) 774 774 { 775 775 return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); 776 776 } ··· 1875 1875 return 0; 1876 1876 } 1877 1877 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); 1878 + 1879 + /* 1880 + * Pin contiguous user pages and return their associated host pages for local 1881 + * domain only. 1882 + * @device [in] : device 1883 + * @iova [in] : starting IOVA of user pages to be pinned. 1884 + * @npage [in] : count of pages to be pinned. This count should not 1885 + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. 1886 + * @prot [in] : protection flags 1887 + * @pages[out] : array of host pages 1888 + * Return error or number of pages pinned. 1889 + * 1890 + * A driver may only call this function if the vfio_device was created 1891 + * by vfio_register_emulated_iommu_dev() due to vfio_container_pin_pages(). 1892 + */ 1893 + int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, 1894 + int npage, int prot, struct page **pages) 1895 + { 1896 + /* group->container cannot change while a vfio device is open */ 1897 + if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device))) 1898 + return -EINVAL; 1899 + if (device->group->container) 1900 + return vfio_container_pin_pages(device->group->container, 1901 + device->group->iommu_group, 1902 + iova, npage, prot, pages); 1903 + if (device->iommufd_access) { 1904 + int ret; 1905 + 1906 + if (iova > ULONG_MAX) 1907 + return -EINVAL; 1908 + /* 1909 + * VFIO ignores the sub page offset, npages is from the start of 1910 + * a PAGE_SIZE chunk of IOVA. 
The caller is expected to recover 1911 + * the sub page offset by doing: 1912 + * pages[0] + (iova % PAGE_SIZE) 1913 + */ 1914 + ret = iommufd_access_pin_pages( 1915 + device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE), 1916 + npage * PAGE_SIZE, pages, 1917 + (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0); 1918 + if (ret) 1919 + return ret; 1920 + return npage; 1921 + } 1922 + return -EINVAL; 1923 + } 1924 + EXPORT_SYMBOL(vfio_pin_pages); 1925 + 1926 + /* 1927 + * Unpin contiguous host pages for local domain only. 1928 + * @device [in] : device 1929 + * @iova [in] : starting address of user pages to be unpinned. 1930 + * @npage [in] : count of pages to be unpinned. This count should not 1931 + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. 1932 + */ 1933 + void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) 1934 + { 1935 + if (WARN_ON(!vfio_assert_device_open(device))) 1936 + return; 1937 + 1938 + if (device->group->container) { 1939 + vfio_container_unpin_pages(device->group->container, iova, 1940 + npage); 1941 + return; 1942 + } 1943 + if (device->iommufd_access) { 1944 + if (WARN_ON(iova > ULONG_MAX)) 1945 + return; 1946 + iommufd_access_unpin_pages(device->iommufd_access, 1947 + ALIGN_DOWN(iova, PAGE_SIZE), 1948 + npage * PAGE_SIZE); 1949 + return; 1950 + } 1951 + } 1952 + EXPORT_SYMBOL(vfio_unpin_pages); 1953 + 1954 + /* 1955 + * This interface allows the CPUs to perform some sort of virtual DMA on 1956 + * behalf of the device. 1957 + * 1958 + * CPUs read/write from/into a range of IOVAs pointing to user space memory 1959 + * into/from a kernel buffer. 1960 + * 1961 + * As the read/write of user space memory is conducted via the CPUs and is 1962 + * not a real device DMA, it is not necessary to pin the user space memory. 
1963 + * 1964 + * @device [in] : VFIO device 1965 + * @iova [in] : base IOVA of a user space buffer 1966 + * @data [in] : pointer to kernel buffer 1967 + * @len [in] : kernel buffer length 1968 + * @write : indicate read or write 1969 + * Return error code on failure or 0 on success. 1970 + */ 1971 + int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, 1972 + size_t len, bool write) 1973 + { 1974 + if (!data || len <= 0 || !vfio_assert_device_open(device)) 1975 + return -EINVAL; 1976 + 1977 + if (device->group->container) 1978 + return vfio_container_dma_rw(device->group->container, iova, 1979 + data, len, write); 1980 + 1981 + if (device->iommufd_access) { 1982 + unsigned int flags = 0; 1983 + 1984 + if (iova > ULONG_MAX) 1985 + return -EINVAL; 1986 + 1987 + /* VFIO historically tries to auto-detect a kthread */ 1988 + if (!current->mm) 1989 + flags |= IOMMUFD_ACCESS_RW_KTHREAD; 1990 + if (write) 1991 + flags |= IOMMUFD_ACCESS_RW_WRITE; 1992 + return iommufd_access_rw(device->iommufd_access, iova, data, 1993 + len, flags); 1994 + } 1995 + return -EINVAL; 1996 + } 1997 + EXPORT_SYMBOL(vfio_dma_rw); 1878 1998 1879 1999 /* 1880 2000 * Module/class support
+14
include/linux/vfio.h
··· 19 19 struct kvm; 20 20 struct iommufd_ctx; 21 21 struct iommufd_device; 22 + struct iommufd_access; 22 23 23 24 /* 24 25 * VFIO devices can be placed in a set, this allows all devices to share this ··· 57 56 struct completion comp; 58 57 struct list_head group_next; 59 58 struct list_head iommu_entry; 59 + struct iommufd_access *iommufd_access; 60 60 #if IS_ENABLED(CONFIG_IOMMUFD) 61 61 struct iommufd_device *iommufd_device; 62 + struct iommufd_ctx *iommufd_ictx; 62 63 bool iommufd_attached; 63 64 #endif 64 65 }; ··· 114 111 struct iommufd_ctx *ictx, u32 *out_device_id); 115 112 void vfio_iommufd_physical_unbind(struct vfio_device *vdev); 116 113 int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); 114 + int vfio_iommufd_emulated_bind(struct vfio_device *vdev, 115 + struct iommufd_ctx *ictx, u32 *out_device_id); 116 + void vfio_iommufd_emulated_unbind(struct vfio_device *vdev); 117 + int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id); 117 118 #else 118 119 #define vfio_iommufd_physical_bind \ 119 120 ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ ··· 125 118 #define vfio_iommufd_physical_unbind \ 126 119 ((void (*)(struct vfio_device *vdev)) NULL) 127 120 #define vfio_iommufd_physical_attach_ioas \ 121 + ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) 122 + #define vfio_iommufd_emulated_bind \ 123 + ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ 124 + u32 *out_device_id)) NULL) 125 + #define vfio_iommufd_emulated_unbind \ 126 + ((void (*)(struct vfio_device *vdev)) NULL) 127 + #define vfio_iommufd_emulated_attach_ioas \ 128 128 ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) 129 129 #endif 130 130