Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'vfio-v6.6-rc1' of https://github.com/awilliam/linux-vfio

Pull VFIO updates from Alex Williamson:

- VFIO direct character device (cdev) interface support. This extracts
the vfio device fd from the container and group model, and is
intended to be the native uAPI for use with IOMMUFD (Yi Liu)

- Enhancements to the PCI hot reset interface in support of cdev usage
(Yi Liu)

- Fix a potential race between registering and unregistering vfio files
in the kvm-vfio interface and extend use of a lock to avoid extra
drops and acquires (Dmitry Torokhov)

- A new vfio-pci variant driver for the AMD/Pensando Distributed
Services Card (PDS) Ethernet device, supporting live migration (Brett
Creeley)

- Cleanups to remove redundant owner setup in cdx and fsl bus drivers,
and simplify driver init/exit in fsl code (Li Zetao)

- Fix an uninitialized hole in a data structure and pad capability
structures for alignment (Stefan Hajnoczi)

* tag 'vfio-v6.6-rc1' of https://github.com/awilliam/linux-vfio: (53 commits)
vfio/pds: Send type for SUSPEND_STATUS command
vfio/pds: fix return value in pds_vfio_get_lm_file()
pds_core: Fix function header descriptions
vfio: align capability structures
vfio/type1: fix cap_migration information leak
vfio/fsl-mc: Use module_fsl_mc_driver macro to simplify the code
vfio/cdx: Remove redundant initialization owner in vfio_cdx_driver
vfio/pds: Add Kconfig and documentation
vfio/pds: Add support for firmware recovery
vfio/pds: Add support for dirty page tracking
vfio/pds: Add VFIO live migration support
vfio/pds: register with the pds_core PF
pds_core: Require callers of register/unregister to pass PF drvdata
vfio/pds: Initial support for pds VFIO driver
vfio: Commonize combine_ranges for use in other VFIO drivers
kvm/vfio: avoid bouncing the mutex when adding and deleting groups
kvm/vfio: ensure kvg instance stays around in kvm_vfio_group_add()
docs: vfio: Add vfio device cdev description
vfio: Compile vfio_group infrastructure optionally
vfio: Move the IOMMU_CAP_CACHE_COHERENCY check in __vfio_register_dev()
...

+4351 -459
+144 -3
Documentation/driver-api/vfio.rst
··· 239 239 /* Gratuitous device reset and go... */ 240 240 ioctl(device, VFIO_DEVICE_RESET); 241 241 242 + IOMMUFD and vfio_iommu_type1 243 + ---------------------------- 244 + 245 + IOMMUFD is the new user API to manage I/O page tables from userspace. 246 + It is intended to be the portal for delivering advanced userspace DMA 247 + features (nested translation [5]_, PASID [6]_, etc.) while also providing 248 + a backwards compatibility interface for existing VFIO_TYPE1v2_IOMMU use 249 + cases. Eventually the vfio_iommu_type1 driver, as well as the legacy 250 + vfio container and group model, is intended to be deprecated. 251 + 252 + The IOMMUFD backwards compatibility interface can be enabled in two ways. 253 + In the first method, the kernel can be configured with 254 + CONFIG_IOMMUFD_VFIO_CONTAINER, in which case the IOMMUFD subsystem 255 + transparently provides the entire infrastructure for the VFIO 256 + container and IOMMU backend interfaces. The compatibility mode can 257 + also be accessed if the VFIO container interface, i.e. /dev/vfio/vfio, is 258 + simply symlinked to /dev/iommu. Note that at the time of writing, the 259 + compatibility mode is not entirely feature complete relative to 260 + VFIO_TYPE1v2_IOMMU (e.g. DMA mapping MMIO) and does not attempt to 261 + provide compatibility to the VFIO_SPAPR_TCE_IOMMU interface. Therefore 262 + it is not generally advisable at this time to switch from native VFIO 263 + implementations to the IOMMUFD compatibility interfaces. 264 + 265 + Long term, VFIO users should migrate to device access through the cdev 266 + interface described below, and native access through the IOMMUFD 267 + provided interfaces. 268 + 269 + VFIO Device cdev 270 + ---------------- 271 + 272 + Traditionally a user acquires a device fd via VFIO_GROUP_GET_DEVICE_FD 273 + in a VFIO group. 
274 + 275 + With CONFIG_VFIO_DEVICE_CDEV=y the user can now acquire a device fd 276 + by directly opening a character device /dev/vfio/devices/vfioX where 277 + "X" is the number allocated uniquely by VFIO for registered devices. 278 + The cdev interface does not support noiommu devices, so the user should 279 + use the legacy group interface if noiommu is wanted. 280 + 281 + The cdev only works with IOMMUFD. Both VFIO drivers and applications 282 + must adapt to the new cdev security model, which requires using 283 + VFIO_DEVICE_BIND_IOMMUFD to claim DMA ownership before starting to 284 + actually use the device. Once BIND succeeds, the VFIO device can 285 + be fully accessed by the user. 286 + 287 + VFIO device cdev doesn't rely on VFIO group/container/iommu drivers. 288 + Hence those modules can be fully compiled out in an environment 289 + where no legacy VFIO application exists. 290 + 291 + So far SPAPR does not support IOMMUFD, so it cannot support device 292 + cdev either. 293 + 294 + vfio device cdev access is still bound by IOMMU group semantics, i.e. there 295 + can be only one DMA owner for the group. Devices belonging to the same 296 + group cannot be bound to multiple iommufd_ctx or shared between a native 297 + kernel driver and a vfio bus driver or another driver supporting the 298 + driver_managed_dma flag. A violation of this ownership requirement will 299 + fail at the VFIO_DEVICE_BIND_IOMMUFD ioctl, which gates full device access. 300 + 301 + Device cdev Example 302 + ------------------- 303 + 304 + Assume the user wants to access PCI device 0000:6a:01.0:: 305 + 306 + $ ls /sys/bus/pci/devices/0000:6a:01.0/vfio-dev/ 307 + vfio0 308 + 309 + This device is therefore represented as vfio0. 
The user can verify 310 + its existence:: 311 + 312 + $ ls -l /dev/vfio/devices/vfio0 313 + crw------- 1 root root 511, 0 Feb 16 01:22 /dev/vfio/devices/vfio0 314 + $ cat /sys/bus/pci/devices/0000:6a:01.0/vfio-dev/vfio0/dev 315 + 511:0 316 + $ ls -l /dev/char/511\:0 317 + lrwxrwxrwx 1 root root 21 Feb 16 01:22 /dev/char/511:0 -> ../vfio/devices/vfio0 318 + 319 + Then provide the user with access to the device if unprivileged 320 + operation is desired:: 321 + 322 + $ chown user:user /dev/vfio/devices/vfio0 323 + 324 + Finally the user can get the cdev fd by:: 325 + 326 + cdev_fd = open("/dev/vfio/devices/vfio0", O_RDWR); 327 + 328 + An opened cdev_fd doesn't give the user any permission to access 329 + the device other than binding the cdev_fd to an iommufd. After that 330 + point the device is fully accessible, including attaching it to an 331 + IOMMUFD IOAS/HWPT to enable userspace DMA:: 332 + 333 + struct vfio_device_bind_iommufd bind = { 334 + .argsz = sizeof(bind), 335 + .flags = 0, 336 + }; 337 + struct iommu_ioas_alloc alloc_data = { 338 + .size = sizeof(alloc_data), 339 + .flags = 0, 340 + }; 341 + struct vfio_device_attach_iommufd_pt attach_data = { 342 + .argsz = sizeof(attach_data), 343 + .flags = 0, 344 + }; 345 + struct iommu_ioas_map map = { 346 + .size = sizeof(map), 347 + .flags = IOMMU_IOAS_MAP_READABLE | 348 + IOMMU_IOAS_MAP_WRITEABLE | 349 + IOMMU_IOAS_MAP_FIXED_IOVA, 350 + .__reserved = 0, 351 + }; 352 + 353 + iommufd = open("/dev/iommu", O_RDWR); 354 + 355 + bind.iommufd = iommufd; 356 + ioctl(cdev_fd, VFIO_DEVICE_BIND_IOMMUFD, &bind); 357 + 358 + ioctl(iommufd, IOMMU_IOAS_ALLOC, &alloc_data); 359 + attach_data.pt_id = alloc_data.out_ioas_id; 360 + ioctl(cdev_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data); 361 + 362 + /* Allocate some space and setup a DMA mapping */ 363 + map.user_va = (int64_t)mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE, 364 + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); 365 + map.iova = 0; /* 1MB starting at 0x0 from device view 
*/ 366 + map.length = 1024 * 1024; 367 + map.ioas_id = alloc_data.out_ioas_id; 368 + 369 + ioctl(iommufd, IOMMU_IOAS_MAP, &map); 370 + 371 + /* Other device operations as stated in "VFIO Usage Example" */ 372 + 242 373 VFIO User API 243 374 ------------------------------------------------------------------------------- 244 375 ··· 410 279 struct iommufd_ctx *ictx, u32 *out_device_id); 411 280 void (*unbind_iommufd)(struct vfio_device *vdev); 412 281 int (*attach_ioas)(struct vfio_device *vdev, u32 *pt_id); 282 + void (*detach_ioas)(struct vfio_device *vdev); 413 283 int (*open_device)(struct vfio_device *vdev); 414 284 void (*close_device)(struct vfio_device *vdev); 415 285 ssize_t (*read)(struct vfio_device *vdev, char __user *buf, ··· 447 315 - The [un]bind_iommufd callbacks are issued when the device is bound to 448 316 and unbound from iommufd. 449 317 450 - - The attach_ioas callback is issued when the device is attached to an 451 - IOAS managed by the bound iommufd. The attached IOAS is automatically 452 - detached when the device is unbound from iommufd. 318 + - The [de]attach_ioas callback is issued when the device is attached to 319 + and detached from an IOAS managed by the bound iommufd. However, the 320 + attached IOAS can also be automatically detached when the device is 321 + unbound from iommufd. 453 322 454 323 - The read/write/mmap callbacks implement the device region access defined 455 324 by the device's own VFIO_DEVICE_GET_REGION_INFO ioctl. ··· 697 564 \-0d.1 698 565 699 566 00:1e.0 PCI bridge: Intel Corporation 82801 PCI Bridge (rev 90) 567 + 568 + .. [5] Nested translation is an IOMMU feature which supports two-stage 569 + address translation. This improves the address translation efficiency 570 + in IOMMU virtualization. 571 + 572 + .. [6] PASID stands for Process Address Space ID, introduced by PCI 573 + Express. It is a prerequisite for Shared Virtual Addressing (SVA) 574 + and Scalable I/O Virtualization (Scalable IOV).
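The cdev ioctls above all follow VFIO's extensible-struct convention: userspace fills argsz with the size of the struct it knows about, and the kernel validates that against offsetofend() of the last field it consumes (device_cdev.c below rejects `bind.argsz < minsz || bind.flags || bind.iommufd < 0`). A minimal userspace sketch of that check, using a simplified local copy of the vfio_device_bind_iommufd layout for illustration (the authoritative definition lives in <linux/vfio.h>):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Simplified local copy of the UAPI struct, for illustration only;
 * the authoritative layout is in <linux/vfio.h>. */
struct vfio_device_bind_iommufd {
	uint32_t argsz;
	uint32_t flags;
	int32_t iommufd;
	uint32_t out_devid;
};

/* offsetofend() as the kernel defines it: offset of the first byte
 * past the named member */
#define offsetofend(type, member) \
	(offsetof(type, member) + sizeof(((type *)0)->member))

/* Userspace mirror of the kernel-side sanity check: argsz must cover
 * every field the kernel consumes, no flags are defined yet, and the
 * iommufd must be a plausible descriptor. */
static int bind_args_valid(const struct vfio_device_bind_iommufd *bind)
{
	size_t minsz = offsetofend(struct vfio_device_bind_iommufd,
				   out_devid);

	return bind->argsz >= minsz && !bind->flags && bind->iommufd >= 0;
}
```

Because the check uses minsz rather than sizeof(), an older binary built against a smaller struct keeps working when new trailing fields are added to the UAPI.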
+79
Documentation/networking/device_drivers/ethernet/amd/pds_vfio_pci.rst
··· 1 + .. SPDX-License-Identifier: GPL-2.0+ 2 + .. note: can be edited and viewed with /usr/bin/formiko-vim 3 + 4 + ========================================================== 5 + PCI VFIO driver for the AMD/Pensando(R) DSC adapter family 6 + ========================================================== 7 + 8 + AMD/Pensando Linux VFIO PCI Device Driver 9 + Copyright(c) 2023 Advanced Micro Devices, Inc. 10 + 11 + Overview 12 + ======== 13 + 14 + The ``pds-vfio-pci`` module is a PCI driver that supports Live Migration 15 + capable Virtual Function (VF) devices in the DSC hardware. 16 + 17 + Using the device 18 + ================ 19 + 20 + The pds-vfio-pci device is enabled via multiple configuration steps and 21 + depends on the ``pds_core`` driver to create and enable SR-IOV Virtual 22 + Function devices. 23 + 24 + Shown below are the steps to bind the driver to a VF and also to the 25 + associated auxiliary device created by the ``pds_core`` driver. This 26 + example assumes the pds_core and pds-vfio-pci modules are already 27 + loaded. 28 + 29 + .. code-block:: bash 30 + :name: example-setup-script 31 + 32 + #!/bin/bash 33 + 34 + PF_BUS="0000:60" 35 + PF_BDF="0000:60:00.0" 36 + VF_BDF="0000:60:00.1" 37 + 38 + # Prevent non-vfio VF driver from probing the VF device 39 + echo 0 > /sys/class/pci_bus/$PF_BUS/device/$PF_BDF/sriov_drivers_autoprobe 40 + 41 + # Create single VF for Live Migration via pds_core 42 + echo 1 > /sys/bus/pci/drivers/pds_core/$PF_BDF/sriov_numvfs 43 + 44 + # Allow the VF to be bound to the pds-vfio-pci driver 45 + echo "pds-vfio-pci" > /sys/class/pci_bus/$PF_BUS/device/$VF_BDF/driver_override 46 + 47 + # Bind the VF to the pds-vfio-pci driver 48 + echo "$VF_BDF" > /sys/bus/pci/drivers/pds-vfio-pci/bind 49 + 50 + After performing the steps above, a file in /dev/vfio/<iommu_group> 51 + should have been created. 
52 + 53 + 54 + Enabling the driver 55 + =================== 56 + 57 + The driver is enabled via the standard kernel configuration system, 58 + using the make command:: 59 + 60 + make oldconfig/menuconfig/etc. 61 + 62 + The driver is located in the menu structure at: 63 + 64 + -> Device Drivers 65 + -> VFIO Non-Privileged userspace driver framework 66 + -> VFIO support for PDS PCI devices 67 + 68 + Support 69 + ======= 70 + 71 + For general Linux networking support, please use the netdev mailing 72 + list, which is monitored by Pensando personnel:: 73 + 74 + netdev@vger.kernel.org 75 + 76 + For more specific support needs, please use the Pensando driver support 77 + email:: 78 + 79 + drivers@pensando.io
+1
Documentation/networking/device_drivers/ethernet/index.rst
··· 16 16 altera/altera_tse 17 17 amd/pds_core 18 18 amd/pds_vdpa 19 + amd/pds_vfio_pci 19 20 aquantia/atlantic 20 21 chelsio/cxgb 21 22 cirrus/cs89x0
+30 -15
Documentation/virt/kvm/devices/vfio.rst
··· 9 9 - KVM_DEV_TYPE_VFIO 10 10 11 11 Only one VFIO instance may be created per VM. The created device 12 - tracks VFIO groups in use by the VM and features of those groups 13 - important to the correctness and acceleration of the VM. As groups 14 - are enabled and disabled for use by the VM, KVM should be updated 15 - about their presence. When registered with KVM, a reference to the 16 - VFIO-group is held by KVM. 12 + tracks VFIO files (group or device) in use by the VM and features 13 + of those groups/devices important to the correctness and acceleration 14 + of the VM. As groups/devices are enabled and disabled for use by the 15 + VM, KVM should be updated about their presence. When registered with 16 + KVM, a reference to the VFIO file is held by KVM. 17 17 18 18 Groups: 19 - KVM_DEV_VFIO_GROUP 19 + KVM_DEV_VFIO_FILE 20 + alias: KVM_DEV_VFIO_GROUP 20 21 21 - KVM_DEV_VFIO_GROUP attributes: 22 - KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking 23 - kvm_device_attr.addr points to an int32_t file descriptor 24 - for the VFIO group. 25 - KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking 26 - kvm_device_attr.addr points to an int32_t file descriptor 27 - for the VFIO group. 22 + KVM_DEV_VFIO_FILE attributes: 23 + KVM_DEV_VFIO_FILE_ADD: Add a VFIO file (group/device) to VFIO-KVM device 24 + tracking 25 + 26 + kvm_device_attr.addr points to an int32_t file descriptor for the 27 + VFIO file. 28 + 29 + KVM_DEV_VFIO_FILE_DEL: Remove a VFIO file (group/device) from VFIO-KVM 30 + device tracking 31 + 32 + kvm_device_attr.addr points to an int32_t file descriptor for the 33 + VFIO file. 
34 + 35 + KVM_DEV_VFIO_GROUP (legacy kvm device group restricted to the handling of VFIO group fd): 36 + KVM_DEV_VFIO_GROUP_ADD: same as KVM_DEV_VFIO_FILE_ADD for group fd only 37 + 38 + KVM_DEV_VFIO_GROUP_DEL: same as KVM_DEV_VFIO_FILE_DEL for group fd only 39 + 28 40 KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: attaches a guest visible TCE table 29 41 allocated by sPAPR KVM. 30 42 kvm_device_attr.addr points to a struct:: ··· 52 40 - @tablefd is a file descriptor for a TCE table allocated via 53 41 KVM_CREATE_SPAPR_TCE. 54 42 55 - The GROUP_ADD operation above should be invoked prior to accessing the 43 + The FILE/GROUP_ADD operation above should be invoked prior to accessing the 56 44 device file descriptor via VFIO_GROUP_GET_DEVICE_FD in order to support 57 45 drivers which require a kvm pointer to be set in their .open_device() 58 - callback. 46 + callback. The same applies to a device file descriptor obtained via 47 + character device open, which gets device access via VFIO_DEVICE_BIND_IOMMUFD. 48 + For such file descriptors, FILE_ADD should be invoked before 49 + VFIO_DEVICE_BIND_IOMMUFD to support the drivers mentioned above.
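The FILE_ADD/DEL attributes are set through the generic KVM_SET_DEVICE_ATTR ioctl on the kvm-vfio device fd, with kvm_device_attr.addr pointing at an int32_t file descriptor as described above. A hedged sketch of building that attribute, using simplified local copies of the struct and constants (the real definitions are in <linux/kvm.h>; the numeric values here assume KVM_DEV_VFIO_FILE keeps the legacy KVM_DEV_VFIO_GROUP values, as the alias suggests):

```c
#include <assert.h>
#include <stdint.h>

/* Simplified local copies for illustration only; authoritative
 * definitions are in <linux/kvm.h>. The values assume the FILE
 * names alias the legacy GROUP ones. */
#define KVM_DEV_VFIO_FILE	1
#define KVM_DEV_VFIO_FILE_ADD	1

struct kvm_device_attr {
	uint32_t flags;		/* no flags currently defined */
	uint32_t group;		/* KVM_DEV_VFIO_FILE */
	uint64_t attr;		/* KVM_DEV_VFIO_FILE_ADD/_DEL */
	uint64_t addr;		/* userspace address of an int32_t fd */
};

/* Build the attribute that registers a VFIO group or cdev fd with
 * the kvm-vfio device; the caller would then issue
 * ioctl(kvm_vfio_dev_fd, KVM_SET_DEVICE_ATTR, &attr). */
static struct kvm_device_attr vfio_file_add_attr(const int32_t *fdp)
{
	struct kvm_device_attr attr = {
		.flags = 0,
		.group = KVM_DEV_VFIO_FILE,
		.attr = KVM_DEV_VFIO_FILE_ADD,
		.addr = (uint64_t)(uintptr_t)fdp,
	};

	return attr;
}
```

Per the ordering requirement above, this attribute should be set before VFIO_GROUP_GET_DEVICE_FD or VFIO_DEVICE_BIND_IOMMUFD on the corresponding fd.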
+7
MAINTAINERS
··· 22482 22482 P: Documentation/driver-api/vfio-pci-device-specific-driver-acceptance.rst 22483 22483 F: drivers/vfio/pci/*/ 22484 22484 22485 + VFIO PDS PCI DRIVER 22486 + M: Brett Creeley <brett.creeley@amd.com> 22487 + L: kvm@vger.kernel.org 22488 + S: Maintained 22489 + F: Documentation/networking/device_drivers/ethernet/amd/pds_vfio_pci.rst 22490 + F: drivers/vfio/pci/pds/ 22491 + 22485 22492 VFIO PLATFORM DRIVER 22486 22493 M: Eric Auger <eric.auger@redhat.com> 22487 22494 L: kvm@vger.kernel.org
+1
drivers/gpu/drm/i915/gvt/kvmgt.c
··· 1474 1474 .bind_iommufd = vfio_iommufd_emulated_bind, 1475 1475 .unbind_iommufd = vfio_iommufd_emulated_unbind, 1476 1476 .attach_ioas = vfio_iommufd_emulated_attach_ioas, 1477 + .detach_ioas = vfio_iommufd_emulated_detach_ioas, 1477 1478 }; 1478 1479 1479 1480 static int intel_vgpu_probe(struct mdev_device *mdev)
+2 -2
drivers/iommu/iommufd/Kconfig
··· 14 14 if IOMMUFD 15 15 config IOMMUFD_VFIO_CONTAINER 16 16 bool "IOMMUFD provides the VFIO container /dev/vfio/vfio" 17 - depends on VFIO && !VFIO_CONTAINER 18 - default VFIO && !VFIO_CONTAINER 17 + depends on VFIO_GROUP && !VFIO_CONTAINER 18 + default VFIO_GROUP && !VFIO_CONTAINER 19 19 help 20 20 IOMMUFD will provide /dev/vfio/vfio instead of VFIO. This relies on 21 21 IOMMUFD providing compatibility emulation to give the same ioctls.
+111 -5
drivers/iommu/iommufd/device.c
··· 99 99 EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD); 100 100 101 101 /** 102 + * iommufd_ctx_has_group - True if any device within the group is bound 103 + * to the ictx 104 + * @ictx: iommufd file descriptor 105 + * @group: Pointer to a physical iommu_group struct 106 + * 107 + * True if any device within the group has been bound to this ictx, ex. via 108 + * iommufd_device_bind(), therefore implying ictx ownership of the group. 109 + */ 110 + bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group) 111 + { 112 + struct iommufd_object *obj; 113 + unsigned long index; 114 + 115 + if (!ictx || !group) 116 + return false; 117 + 118 + xa_lock(&ictx->objects); 119 + xa_for_each(&ictx->objects, index, obj) { 120 + if (obj->type == IOMMUFD_OBJ_DEVICE && 121 + container_of(obj, struct iommufd_device, obj)->group == group) { 122 + xa_unlock(&ictx->objects); 123 + return true; 124 + } 125 + } 126 + xa_unlock(&ictx->objects); 127 + return false; 128 + } 129 + EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, IOMMUFD); 130 + 131 + /** 102 132 * iommufd_device_unbind - Undo iommufd_device_bind() 103 133 * @idev: Device returned by iommufd_device_bind() 104 134 * ··· 142 112 iommufd_object_destroy_user(idev->ictx, &idev->obj); 143 113 } 144 114 EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD); 115 + 116 + struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev) 117 + { 118 + return idev->ictx; 119 + } 120 + EXPORT_SYMBOL_NS_GPL(iommufd_device_to_ictx, IOMMUFD); 121 + 122 + u32 iommufd_device_to_id(struct iommufd_device *idev) 123 + { 124 + return idev->obj.id; 125 + } 126 + EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD); 145 127 146 128 static int iommufd_device_setup_msi(struct iommufd_device *idev, 147 129 struct iommufd_hw_pagetable *hwpt, ··· 483 441 iommufd_ctx_get(ictx); 484 442 iommufd_object_finalize(ictx, &access->obj); 485 443 *id = access->obj.id; 444 + mutex_init(&access->ioas_lock); 486 445 return access; 487 446 } 
488 447 EXPORT_SYMBOL_NS_GPL(iommufd_access_create, IOMMUFD); ··· 500 457 } 501 458 EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD); 502 459 460 + void iommufd_access_detach(struct iommufd_access *access) 461 + { 462 + struct iommufd_ioas *cur_ioas = access->ioas; 463 + 464 + mutex_lock(&access->ioas_lock); 465 + if (WARN_ON(!access->ioas)) 466 + goto out; 467 + /* 468 + * Set ioas to NULL to block any further iommufd_access_pin_pages(). 469 + * iommufd_access_unpin_pages() can continue using access->ioas_unpin. 470 + */ 471 + access->ioas = NULL; 472 + 473 + if (access->ops->unmap) { 474 + mutex_unlock(&access->ioas_lock); 475 + access->ops->unmap(access->data, 0, ULONG_MAX); 476 + mutex_lock(&access->ioas_lock); 477 + } 478 + iopt_remove_access(&cur_ioas->iopt, access); 479 + refcount_dec(&cur_ioas->obj.users); 480 + out: 481 + access->ioas_unpin = NULL; 482 + mutex_unlock(&access->ioas_lock); 483 + } 484 + EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, IOMMUFD); 485 + 503 486 int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id) 504 487 { 505 488 struct iommufd_ioas *new_ioas; 506 489 int rc = 0; 507 490 508 - if (access->ioas) 491 + mutex_lock(&access->ioas_lock); 492 + if (WARN_ON(access->ioas || access->ioas_unpin)) { 493 + mutex_unlock(&access->ioas_lock); 509 494 return -EINVAL; 495 + } 510 496 511 497 new_ioas = iommufd_get_ioas(access->ictx, ioas_id); 512 - if (IS_ERR(new_ioas)) 498 + if (IS_ERR(new_ioas)) { 499 + mutex_unlock(&access->ioas_lock); 513 500 return PTR_ERR(new_ioas); 501 + } 514 502 515 503 rc = iopt_add_access(&new_ioas->iopt, access); 516 504 if (rc) { 505 + mutex_unlock(&access->ioas_lock); 517 506 iommufd_put_object(&new_ioas->obj); 518 507 return rc; 519 508 } 520 509 iommufd_ref_to_users(&new_ioas->obj); 521 510 522 511 access->ioas = new_ioas; 512 + access->ioas_unpin = new_ioas; 513 + mutex_unlock(&access->ioas_lock); 523 514 return 0; 524 515 } 525 516 EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, IOMMUFD); ··· 608 
531 void iommufd_access_unpin_pages(struct iommufd_access *access, 609 532 unsigned long iova, unsigned long length) 610 533 { 611 - struct io_pagetable *iopt = &access->ioas->iopt; 612 534 struct iopt_area_contig_iter iter; 535 + struct io_pagetable *iopt; 613 536 unsigned long last_iova; 614 537 struct iopt_area *area; 615 538 616 539 if (WARN_ON(!length) || 617 540 WARN_ON(check_add_overflow(iova, length - 1, &last_iova))) 618 541 return; 542 + 543 + mutex_lock(&access->ioas_lock); 544 + /* 545 + * The driver must be doing something wrong if it calls this before an 546 + * iommufd_access_attach() or after an iommufd_access_detach(). 547 + */ 548 + if (WARN_ON(!access->ioas_unpin)) { 549 + mutex_unlock(&access->ioas_lock); 550 + return; 551 + } 552 + iopt = &access->ioas_unpin->iopt; 619 553 620 554 down_read(&iopt->iova_rwsem); 621 555 iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) ··· 637 549 min(last_iova, iopt_area_last_iova(area)))); 638 550 WARN_ON(!iopt_area_contig_done(&iter)); 639 551 up_read(&iopt->iova_rwsem); 552 + mutex_unlock(&access->ioas_lock); 640 553 } 641 554 EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, IOMMUFD); 642 555 ··· 683 594 unsigned long length, struct page **out_pages, 684 595 unsigned int flags) 685 596 { 686 - struct io_pagetable *iopt = &access->ioas->iopt; 687 597 struct iopt_area_contig_iter iter; 598 + struct io_pagetable *iopt; 688 599 unsigned long last_iova; 689 600 struct iopt_area *area; 690 601 int rc; ··· 698 609 return -EINVAL; 699 610 if (check_add_overflow(iova, length - 1, &last_iova)) 700 611 return -EOVERFLOW; 612 + 613 + mutex_lock(&access->ioas_lock); 614 + if (!access->ioas) { 615 + mutex_unlock(&access->ioas_lock); 616 + return -ENOENT; 617 + } 618 + iopt = &access->ioas->iopt; 701 619 702 620 down_read(&iopt->iova_rwsem); 703 621 iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { ··· 736 640 } 737 641 738 642 up_read(&iopt->iova_rwsem); 643 + mutex_unlock(&access->ioas_lock); 739 
644 return 0; 740 645 741 646 err_remove: ··· 751 654 iopt_area_last_iova(area)))); 752 655 } 753 656 up_read(&iopt->iova_rwsem); 657 + mutex_unlock(&access->ioas_lock); 754 658 return rc; 755 659 } 756 660 EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, IOMMUFD); ··· 771 673 int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, 772 674 void *data, size_t length, unsigned int flags) 773 675 { 774 - struct io_pagetable *iopt = &access->ioas->iopt; 775 676 struct iopt_area_contig_iter iter; 677 + struct io_pagetable *iopt; 776 678 struct iopt_area *area; 777 679 unsigned long last_iova; 778 680 int rc; ··· 781 683 return -EINVAL; 782 684 if (check_add_overflow(iova, length - 1, &last_iova)) 783 685 return -EOVERFLOW; 686 + 687 + mutex_lock(&access->ioas_lock); 688 + if (!access->ioas) { 689 + mutex_unlock(&access->ioas_lock); 690 + return -ENOENT; 691 + } 692 + iopt = &access->ioas->iopt; 784 693 785 694 down_read(&iopt->iova_rwsem); 786 695 iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { ··· 815 710 rc = -ENOENT; 816 711 err_out: 817 712 up_read(&iopt->iova_rwsem); 713 + mutex_unlock(&access->ioas_lock); 818 714 return rc; 819 715 } 820 716 EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, IOMMUFD);
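The detach race fix above hinges on one pattern: iommufd_access_detach() clears access->ioas under ioas_lock so that new pin/rw calls fail fast with -ENOENT, while access->ioas_unpin stays valid so in-flight unpins can still complete, and is only dropped at the end of detach. A minimal userspace sketch of the idea (names are illustrative, not kernel API; the mutex plays the role of access->ioas_lock):

```c
#include <assert.h>
#include <pthread.h>
#include <stddef.h>

/* Illustrative model of the two-pointer scheme: the primary pointer
 * gates new pin operations, the secondary one keeps in-flight undo
 * (unpin) working until detach finishes. */
struct access_sketch {
	pthread_mutex_t lock;	/* stands in for access->ioas_lock */
	void *ioas;		/* NULL => new pins fail (-ENOENT) */
	void *ioas_unpin;	/* still valid for undo during detach */
};

/* Mirrors the iommufd_access_pin_pages() entry check: fail fast
 * once the access has been detached */
static int sketch_pin(struct access_sketch *a)
{
	int ok;

	pthread_mutex_lock(&a->lock);
	ok = a->ioas != NULL;
	pthread_mutex_unlock(&a->lock);
	return ok;
}

/* Mirrors iommufd_access_detach(): block new pins first, and drop
 * the unpin path only after the unmap callback would have run */
static void sketch_detach(struct access_sketch *a)
{
	pthread_mutex_lock(&a->lock);
	a->ioas = NULL;
	/* ...the kernel drops the lock here to run ops->unmap(),
	 * and a concurrent unpin still finds ioas_unpin valid... */
	a->ioas_unpin = NULL;
	pthread_mutex_unlock(&a->lock);
}
```

This is why iommufd_access_unpin_pages() reads access->ioas_unpin rather than access->ioas: the unmap callback issued during detach must still be able to unpin pages after new pins are already blocked.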
+2
drivers/iommu/iommufd/iommufd_private.h
··· 296 296 struct iommufd_object obj; 297 297 struct iommufd_ctx *ictx; 298 298 struct iommufd_ioas *ioas; 299 + struct iommufd_ioas *ioas_unpin; 300 + struct mutex ioas_lock; 299 301 const struct iommufd_access_ops *ops; 300 302 void *data; 301 303 unsigned long iova_alignment;
+25 -1
drivers/iommu/iommufd/main.c
··· 50 50 * before calling iommufd_object_finalize(). 51 51 */ 52 52 rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, 53 - xa_limit_32b, GFP_KERNEL_ACCOUNT); 53 + xa_limit_31b, GFP_KERNEL_ACCOUNT); 54 54 if (rc) 55 55 goto out_free; 56 56 return obj; ··· 416 416 return ictx; 417 417 } 418 418 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD); 419 + 420 + /** 421 + * iommufd_ctx_from_fd - Acquires a reference to the iommufd context 422 + * @fd: File descriptor to obtain the reference from 423 + * 424 + * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. On success 425 + * the caller is responsible to call iommufd_ctx_put(). 426 + */ 427 + struct iommufd_ctx *iommufd_ctx_from_fd(int fd) 428 + { 429 + struct file *file; 430 + 431 + file = fget(fd); 432 + if (!file) 433 + return ERR_PTR(-EBADF); 434 + 435 + if (file->f_op != &iommufd_fops) { 436 + fput(file); 437 + return ERR_PTR(-EBADFD); 438 + } 439 + /* fget is the same as iommufd_ctx_get() */ 440 + return file->private_data; 441 + } 442 + EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, IOMMUFD); 419 443 420 444 /** 421 445 * iommufd_ctx_put - Put back a reference
+2
drivers/iommu/iommufd/vfio_compat.c
··· 483 483 rc = cap_size; 484 484 goto out_put; 485 485 } 486 + cap_size = ALIGN(cap_size, sizeof(u64)); 487 + 486 488 if (last_cap && info.argsz >= total_cap_size && 487 489 put_user(total_cap_size, &last_cap->next)) { 488 490 rc = -EFAULT;
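The one-line fix pads each capability to an 8-byte boundary before chaining the next one, so every capability header in the compat info chain lands u64-aligned, matching what native VFIO produces. The rounding is the standard kernel ALIGN() for power-of-two alignments; a self-contained copy for illustration, with a hypothetical helper showing where the next capability would start:

```c
#include <assert.h>
#include <stdint.h>

/* Kernel-style ALIGN(): round x up to the next multiple of a,
 * where a is a power of two */
#define ALIGN(x, a) \
	(((uintmax_t)(x) + ((a) - 1)) & ~((uintmax_t)(a) - 1))

/* Hypothetical helper (not kernel API): offset of the capability
 * following one of cap_size bytes placed at cur_offset, after the
 * u64 padding applied by the vfio_compat fix */
static uintmax_t next_cap_offset(uintmax_t cur_offset, uintmax_t cap_size)
{
	return cur_offset + ALIGN(cap_size, sizeof(uint64_t));
}
```

Without the padding, a capability whose size is not a multiple of 8 would leave the following header misaligned, which is the bug the "vfio: align capability structures" commit addresses.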
+7 -17
drivers/net/ethernet/amd/pds_core/auxbus.c
··· 8 8 9 9 /** 10 10 * pds_client_register - Link the client to the firmware 11 - * @pf_pdev: ptr to the PF driver struct 11 + * @pf: ptr to the PF driver's private data struct 12 12 * @devname: name that includes service info, e.g. pds_core.vDPA 13 13 * 14 14 * Return: positive client ID (ci) on success, or 15 15 * negative for error 16 16 */ 17 - int pds_client_register(struct pci_dev *pf_pdev, char *devname) 17 + int pds_client_register(struct pdsc *pf, char *devname) 18 18 { 19 19 union pds_core_adminq_comp comp = {}; 20 20 union pds_core_adminq_cmd cmd = {}; 21 - struct pdsc *pf; 22 21 int err; 23 22 u16 ci; 24 - 25 - pf = pci_get_drvdata(pf_pdev); 26 - if (pf->state) 27 - return -ENXIO; 28 23 29 24 cmd.client_reg.opcode = PDS_AQ_CMD_CLIENT_REG; 30 25 strscpy(cmd.client_reg.devname, devname, ··· 48 53 49 54 /** 50 55 * pds_client_unregister - Unlink the client from the firmware 51 - * @pf_pdev: ptr to the PF driver struct 56 + * @pf: ptr to the PF driver's private data struct 52 57 * @client_id: id returned from pds_client_register() 53 58 * 54 59 * Return: 0 on success, or 55 60 * negative for error 56 61 */ 57 - int pds_client_unregister(struct pci_dev *pf_pdev, u16 client_id) 62 + int pds_client_unregister(struct pdsc *pf, u16 client_id) 58 63 { 59 64 union pds_core_adminq_comp comp = {}; 60 65 union pds_core_adminq_cmd cmd = {}; 61 - struct pdsc *pf; 62 66 int err; 63 - 64 - pf = pci_get_drvdata(pf_pdev); 65 - if (pf->state) 66 - return -ENXIO; 67 67 68 68 cmd.client_unreg.opcode = PDS_AQ_CMD_CLIENT_UNREG; 69 69 cmd.client_unreg.client_id = cpu_to_le16(client_id); ··· 188 198 189 199 padev = pf->vfs[cf->vf_id].padev; 190 200 if (padev) { 191 - pds_client_unregister(pf->pdev, padev->client_id); 201 + pds_client_unregister(pf, padev->client_id); 192 202 auxiliary_device_delete(&padev->aux_dev); 193 203 auxiliary_device_uninit(&padev->aux_dev); 194 204 padev->client_id = 0; ··· 233 243 */ 234 244 snprintf(devname, sizeof(devname), "%s.%s.%d", 235 245
PDS_CORE_DRV_NAME, pf->viftype_status[vt].name, cf->uid); 236 - client_id = pds_client_register(pf->pdev, devname); 246 + client_id = pds_client_register(pf, devname); 237 247 if (client_id < 0) { 238 248 err = client_id; 239 249 goto out_unlock; ··· 242 252 padev = pdsc_auxbus_dev_register(cf, pf, client_id, 243 253 pf->viftype_status[vt].name); 244 254 if (IS_ERR(padev)) { 245 - pds_client_unregister(pf->pdev, client_id); 255 + pds_client_unregister(pf, client_id); 246 256 err = PTR_ERR(padev); 247 257 goto out_unlock; 248 258 }
+1
drivers/s390/cio/vfio_ccw_ops.c
··· 632 632 .bind_iommufd = vfio_iommufd_emulated_bind, 633 633 .unbind_iommufd = vfio_iommufd_emulated_unbind, 634 634 .attach_ioas = vfio_iommufd_emulated_attach_ioas, 635 + .detach_ioas = vfio_iommufd_emulated_detach_ioas, 635 636 }; 636 637 637 638 struct mdev_driver vfio_ccw_mdev_driver = {
+1
drivers/s390/crypto/vfio_ap_ops.c
··· 2020 2020 .bind_iommufd = vfio_iommufd_emulated_bind, 2021 2021 .unbind_iommufd = vfio_iommufd_emulated_unbind, 2022 2022 .attach_ioas = vfio_iommufd_emulated_attach_ioas, 2023 + .detach_ioas = vfio_iommufd_emulated_detach_ioas, 2023 2024 .request = vfio_ap_mdev_request 2024 2025 }; 2025 2026
+27
drivers/vfio/Kconfig
··· 4 4 select IOMMU_API 5 5 depends on IOMMUFD || !IOMMUFD 6 6 select INTERVAL_TREE 7 + select VFIO_GROUP if SPAPR_TCE_IOMMU || IOMMUFD=n 8 + select VFIO_DEVICE_CDEV if !VFIO_GROUP 7 9 select VFIO_CONTAINER if IOMMUFD=n 8 10 help 9 11 VFIO provides a framework for secure userspace device drivers. ··· 14 12 If you don't know what to do here, say N. 15 13 16 14 if VFIO 15 + config VFIO_DEVICE_CDEV 16 + bool "Support for the VFIO cdev /dev/vfio/devices/vfioX" 17 + depends on IOMMUFD && !SPAPR_TCE_IOMMU 18 + default !VFIO_GROUP 19 + help 20 + The VFIO device cdev is another way for userspace to get device 21 + access. Userspace gets device fd by opening device cdev under 22 + /dev/vfio/devices/vfioX, and then bind the device fd with an iommufd 23 + to set up secure DMA context for device access. This interface does 24 + not support noiommu. 25 + 26 + If you don't know what to do here, say N. 27 + 28 + config VFIO_GROUP 29 + bool "Support for the VFIO group /dev/vfio/$group_id" 30 + default y 31 + help 32 + VFIO group support provides the traditional model for accessing 33 + devices through VFIO and is used by the majority of userspace 34 + applications and drivers making use of VFIO. 35 + 36 + If you don't know what to do here, say Y. 37 + 17 38 config VFIO_CONTAINER 18 39 bool "Support for the VFIO container /dev/vfio/vfio" 19 40 select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) 41 + depends on VFIO_GROUP 20 42 default y 21 43 help 22 44 The VFIO container is the classic interface to VFIO for establishing ··· 62 36 63 37 config VFIO_NOIOMMU 64 38 bool "VFIO No-IOMMU support" 39 + depends on VFIO_GROUP 65 40 help 66 41 VFIO is built on the ability to isolate devices using the IOMMU. 67 42 Only with an IOMMU can userspace access to DMA capable devices be
+2 -1
drivers/vfio/Makefile
··· 2 2 obj-$(CONFIG_VFIO) += vfio.o 3 3 4 4 vfio-y += vfio_main.o \ 5 - group.o \ 6 5 iova_bitmap.o 6 + vfio-$(CONFIG_VFIO_DEVICE_CDEV) += device_cdev.o 7 + vfio-$(CONFIG_VFIO_GROUP) += group.o 7 8 vfio-$(CONFIG_IOMMUFD) += iommufd.o 8 9 vfio-$(CONFIG_VFIO_CONTAINER) += container.o 9 10 vfio-$(CONFIG_VFIO_VIRQFD) += virqfd.o
-1
drivers/vfio/cdx/main.c
··· 223 223 .match_id_table = vfio_cdx_table, 224 224 .driver = { 225 225 .name = "vfio-cdx", 226 - .owner = THIS_MODULE, 227 226 }, 228 227 .driver_managed_dma = true, 229 228 };
+228
drivers/vfio/device_cdev.c
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023 Intel Corporation.
+ */
+#include <linux/vfio.h>
+#include <linux/iommufd.h>
+
+#include "vfio.h"
+
+static dev_t device_devt;
+
+void vfio_init_device_cdev(struct vfio_device *device)
+{
+	device->device.devt = MKDEV(MAJOR(device_devt), device->index);
+	cdev_init(&device->cdev, &vfio_device_fops);
+	device->cdev.owner = THIS_MODULE;
+}
+
+/*
+ * device access via the fd opened by this function is blocked until
+ * .open_device() is called successfully during BIND_IOMMUFD.
+ */
+int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
+{
+	struct vfio_device *device = container_of(inode->i_cdev,
+						  struct vfio_device, cdev);
+	struct vfio_device_file *df;
+	int ret;
+
+	/* Paired with the put in vfio_device_fops_release() */
+	if (!vfio_device_try_get_registration(device))
+		return -ENODEV;
+
+	df = vfio_allocate_device_file(device);
+	if (IS_ERR(df)) {
+		ret = PTR_ERR(df);
+		goto err_put_registration;
+	}
+
+	filep->private_data = df;
+
+	return 0;
+
+err_put_registration:
+	vfio_device_put_registration(device);
+	return ret;
+}
+
+static void vfio_df_get_kvm_safe(struct vfio_device_file *df)
+{
+	spin_lock(&df->kvm_ref_lock);
+	vfio_device_get_kvm_safe(df->device, df->kvm);
+	spin_unlock(&df->kvm_ref_lock);
+}
+
+long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df,
+				struct vfio_device_bind_iommufd __user *arg)
+{
+	struct vfio_device *device = df->device;
+	struct vfio_device_bind_iommufd bind;
+	unsigned long minsz;
+	int ret;
+
+	static_assert(__same_type(arg->out_devid, df->devid));
+
+	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
+
+	if (copy_from_user(&bind, arg, minsz))
+		return -EFAULT;
+
+	if (bind.argsz < minsz || bind.flags || bind.iommufd < 0)
+		return -EINVAL;
+
+	/* BIND_IOMMUFD only allowed for cdev fds */
+	if (df->group)
+		return -EINVAL;
+
+	ret = vfio_device_block_group(device);
+	if (ret)
+		return ret;
+
+	mutex_lock(&device->dev_set->lock);
+	/* one device cannot be bound twice */
+	if (df->access_granted) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	df->iommufd = iommufd_ctx_from_fd(bind.iommufd);
+	if (IS_ERR(df->iommufd)) {
+		ret = PTR_ERR(df->iommufd);
+		df->iommufd = NULL;
+		goto out_unlock;
+	}
+
+	/*
+	 * Before the device open, get the KVM pointer currently
+	 * associated with the device file (if there is) and obtain
+	 * a reference. This reference is held until device closed.
+	 * Save the pointer in the device for use by drivers.
+	 */
+	vfio_df_get_kvm_safe(df);
+
+	ret = vfio_df_open(df);
+	if (ret)
+		goto out_put_kvm;
+
+	ret = copy_to_user(&arg->out_devid, &df->devid,
+			   sizeof(df->devid)) ? -EFAULT : 0;
+	if (ret)
+		goto out_close_device;
+
+	device->cdev_opened = true;
+	/*
+	 * Paired with smp_load_acquire() in vfio_device_fops::ioctl/
+	 * read/write/mmap
+	 */
+	smp_store_release(&df->access_granted, true);
+	mutex_unlock(&device->dev_set->lock);
+	return 0;
+
+out_close_device:
+	vfio_df_close(df);
+out_put_kvm:
+	vfio_device_put_kvm(device);
+	iommufd_ctx_put(df->iommufd);
+	df->iommufd = NULL;
+out_unlock:
+	mutex_unlock(&device->dev_set->lock);
+	vfio_device_unblock_group(device);
+	return ret;
+}
+
+void vfio_df_unbind_iommufd(struct vfio_device_file *df)
+{
+	struct vfio_device *device = df->device;
+
+	/*
+	 * In the time of close, there is no contention with another one
+	 * changing this flag. So read df->access_granted without lock
+	 * and no smp_load_acquire() is ok.
+	 */
+	if (!df->access_granted)
+		return;
+
+	mutex_lock(&device->dev_set->lock);
+	vfio_df_close(df);
+	vfio_device_put_kvm(device);
+	iommufd_ctx_put(df->iommufd);
+	device->cdev_opened = false;
+	mutex_unlock(&device->dev_set->lock);
+	vfio_device_unblock_group(device);
+}
+
+int vfio_df_ioctl_attach_pt(struct vfio_device_file *df,
+			    struct vfio_device_attach_iommufd_pt __user *arg)
+{
+	struct vfio_device *device = df->device;
+	struct vfio_device_attach_iommufd_pt attach;
+	unsigned long minsz;
+	int ret;
+
+	minsz = offsetofend(struct vfio_device_attach_iommufd_pt, pt_id);
+
+	if (copy_from_user(&attach, arg, minsz))
+		return -EFAULT;
+
+	if (attach.argsz < minsz || attach.flags)
+		return -EINVAL;
+
+	mutex_lock(&device->dev_set->lock);
+	ret = device->ops->attach_ioas(device, &attach.pt_id);
+	if (ret)
+		goto out_unlock;
+
+	if (copy_to_user(&arg->pt_id, &attach.pt_id, sizeof(attach.pt_id))) {
+		ret = -EFAULT;
+		goto out_detach;
+	}
+	mutex_unlock(&device->dev_set->lock);
+
+	return 0;
+
+out_detach:
+	device->ops->detach_ioas(device);
+out_unlock:
+	mutex_unlock(&device->dev_set->lock);
+	return ret;
+}
+
+int vfio_df_ioctl_detach_pt(struct vfio_device_file *df,
+			    struct vfio_device_detach_iommufd_pt __user *arg)
+{
+	struct vfio_device *device = df->device;
+	struct vfio_device_detach_iommufd_pt detach;
+	unsigned long minsz;
+
+	minsz = offsetofend(struct vfio_device_detach_iommufd_pt, flags);
+
+	if (copy_from_user(&detach, arg, minsz))
+		return -EFAULT;
+
+	if (detach.argsz < minsz || detach.flags)
+		return -EINVAL;
+
+	mutex_lock(&device->dev_set->lock);
+	device->ops->detach_ioas(device);
+	mutex_unlock(&device->dev_set->lock);
+
+	return 0;
+}
+
+static char *vfio_device_devnode(const struct device *dev, umode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
+}
+
+int vfio_cdev_init(struct class *device_class)
+{
+	device_class->devnode = vfio_device_devnode;
+	return alloc_chrdev_region(&device_devt, 0,
+				   MINORMASK + 1, "vfio-dev");
+}
+
+void vfio_cdev_cleanup(void)
+{
+	unregister_chrdev_region(device_devt, MINORMASK + 1);
+}
+2 -13
drivers/vfio/fsl-mc/vfio_fsl_mc.c
···
 	.bind_iommufd = vfio_iommufd_physical_bind,
 	.unbind_iommufd = vfio_iommufd_physical_unbind,
 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
+	.detach_ioas = vfio_iommufd_physical_detach_ioas,
 };
 
 static struct fsl_mc_driver vfio_fsl_mc_driver = {
···
 	.remove = vfio_fsl_mc_remove,
 	.driver = {
 		.name = "vfio-fsl-mc",
-		.owner = THIS_MODULE,
 	},
 	.driver_managed_dma = true,
 };
 
-static int __init vfio_fsl_mc_driver_init(void)
-{
-	return fsl_mc_driver_register(&vfio_fsl_mc_driver);
-}
-
-static void __exit vfio_fsl_mc_driver_exit(void)
-{
-	fsl_mc_driver_unregister(&vfio_fsl_mc_driver);
-}
-
-module_init(vfio_fsl_mc_driver_init);
-module_exit(vfio_fsl_mc_driver_exit);
+module_fsl_mc_driver(vfio_fsl_mc_driver);
 
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_DESCRIPTION("VFIO for FSL-MC devices - User Level meta-driver");
+111 -62
drivers/vfio/group.c
···
 static void vfio_device_group_get_kvm_safe(struct vfio_device *device)
 {
 	spin_lock(&device->group->kvm_ref_lock);
-	if (!device->group->kvm)
-		goto unlock;
-
-	_vfio_device_get_kvm_safe(device, device->group->kvm);
-
-unlock:
+	vfio_device_get_kvm_safe(device, device->group->kvm);
 	spin_unlock(&device->group->kvm_ref_lock);
 }
 
-static int vfio_device_group_open(struct vfio_device *device)
+static int vfio_df_group_open(struct vfio_device_file *df)
 {
+	struct vfio_device *device = df->device;
 	int ret;
 
 	mutex_lock(&device->group->group_lock);
···
 	if (device->open_count == 0)
 		vfio_device_group_get_kvm_safe(device);
 
-	ret = vfio_device_open(device, device->group->iommufd);
+	df->iommufd = device->group->iommufd;
+	if (df->iommufd && vfio_device_is_noiommu(device) && device->open_count == 0) {
+		/*
+		 * Require no compat ioas to be assigned to proceed. The basic
+		 * statement is that the user cannot have done something that
+		 * implies they expected translation to exist
+		 */
+		if (!capable(CAP_SYS_RAWIO) ||
+		    vfio_iommufd_device_has_compat_ioas(device, df->iommufd))
+			ret = -EPERM;
+		else
+			ret = 0;
+		goto out_put_kvm;
+	}
 
-	if (device->open_count == 0)
-		vfio_device_put_kvm(device);
+	ret = vfio_df_open(df);
+	if (ret)
+		goto out_put_kvm;
+
+	if (df->iommufd && device->open_count == 1) {
+		ret = vfio_iommufd_compat_attach_ioas(device, df->iommufd);
+		if (ret)
+			goto out_close_device;
+	}
+
+	/*
+	 * Paired with smp_load_acquire() in vfio_device_fops::ioctl/
+	 * read/write/mmap and vfio_file_has_device_access()
+	 */
+	smp_store_release(&df->access_granted, true);
 
 	mutex_unlock(&device->dev_set->lock);
+	mutex_unlock(&device->group->group_lock);
+	return 0;
 
+out_close_device:
+	vfio_df_close(df);
+out_put_kvm:
+	df->iommufd = NULL;
+	if (device->open_count == 0)
+		vfio_device_put_kvm(device);
+	mutex_unlock(&device->dev_set->lock);
 out_unlock:
 	mutex_unlock(&device->group->group_lock);
 	return ret;
 }
 
-void vfio_device_group_close(struct vfio_device *device)
+void vfio_df_group_close(struct vfio_device_file *df)
 {
+	struct vfio_device *device = df->device;
+
 	mutex_lock(&device->group->group_lock);
 	mutex_lock(&device->dev_set->lock);
 
-	vfio_device_close(device, device->group->iommufd);
+	vfio_df_close(df);
+	df->iommufd = NULL;
 
 	if (device->open_count == 0)
 		vfio_device_put_kvm(device);
···
 
 static struct file *vfio_device_open_file(struct vfio_device *device)
 {
+	struct vfio_device_file *df;
 	struct file *filep;
 	int ret;
 
-	ret = vfio_device_group_open(device);
-	if (ret)
+	df = vfio_allocate_device_file(device);
+	if (IS_ERR(df)) {
+		ret = PTR_ERR(df);
 		goto err_out;
+	}
+
+	df->group = device->group;
+
+	ret = vfio_df_group_open(df);
+	if (ret)
+		goto err_free;
 
 	/*
 	 * We can't use anon_inode_getfd() because we need to modify
 	 * the f_mode flags directly to allow more than just ioctls
 	 */
 	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
-				   device, O_RDWR);
+				   df, O_RDWR);
 	if (IS_ERR(filep)) {
 		ret = PTR_ERR(filep);
 		goto err_close_device;
···
 	return filep;
 
 err_close_device:
-	vfio_device_group_close(device);
+	vfio_df_group_close(df);
+err_free:
+	kfree(df);
 err_out:
 	return ERR_PTR(ret);
 }
···
 	}
 }
 
+int vfio_device_block_group(struct vfio_device *device)
+{
+	struct vfio_group *group = device->group;
+	int ret = 0;
+
+	mutex_lock(&group->group_lock);
+	if (group->opened_file) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	group->cdev_device_open_cnt++;
+
+out_unlock:
+	mutex_unlock(&group->group_lock);
+	return ret;
+}
+
+void vfio_device_unblock_group(struct vfio_device *device)
+{
+	struct vfio_group *group = device->group;
+
+	mutex_lock(&group->group_lock);
+	group->cdev_device_open_cnt--;
+	mutex_unlock(&group->group_lock);
+}
+
 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
 {
 	struct vfio_group *group =
···
 
 	if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
 		ret = -EPERM;
+		goto out_unlock;
+	}
+
+	if (group->cdev_device_open_cnt) {
+		ret = -EBUSY;
 		goto out_unlock;
 	}
 
···
 	mutex_destroy(&group->device_lock);
 	mutex_destroy(&group->group_lock);
 	WARN_ON(group->iommu_group);
+	WARN_ON(group->cdev_device_open_cnt);
 	ida_free(&vfio.group_ida, MINOR(group->dev.devt));
 	kfree(group);
 }
···
 	if (!iommu_group)
 		return ERR_PTR(-EINVAL);
 
-	/*
-	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
-	 * restore cache coherency. It has to be checked here because it is only
-	 * valid for cases where we are using iommu groups.
-	 */
-	if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) {
-		iommu_group_put(iommu_group);
-		return ERR_PTR(-EINVAL);
-	}
-
 	mutex_lock(&vfio.group_lock);
 	group = vfio_group_find_from_iommu(iommu_group);
 	if (group) {
···
 	return device->group->container;
 }
 
+struct vfio_group *vfio_group_from_file(struct file *file)
+{
+	struct vfio_group *group = file->private_data;
+
+	if (file->f_op != &vfio_group_fops)
+		return NULL;
+	return group;
+}
+
 /**
  * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file
  * @file: VFIO group file
···
  */
 struct iommu_group *vfio_file_iommu_group(struct file *file)
 {
-	struct vfio_group *group = file->private_data;
+	struct vfio_group *group = vfio_group_from_file(file);
 	struct iommu_group *iommu_group = NULL;
 
 	if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU))
 		return NULL;
 
-	if (!vfio_file_is_group(file))
+	if (!group)
 		return NULL;
 
 	mutex_lock(&group->group_lock);
···
 EXPORT_SYMBOL_GPL(vfio_file_iommu_group);
 
 /**
- * vfio_file_is_group - True if the file is usable with VFIO aPIS
+ * vfio_file_is_group - True if the file is a vfio group file
  * @file: VFIO group file
  */
 bool vfio_file_is_group(struct file *file)
 {
-	return file->f_op == &vfio_group_fops;
+	return vfio_group_from_file(file);
 }
 EXPORT_SYMBOL_GPL(vfio_file_is_group);
 
-/**
- * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
- *                               is always CPU cache coherent
- * @file: VFIO group file
- *
- * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
- * bit in DMA transactions. A return of false indicates that the user has
- * rights to access additional instructions such as wbinvd on x86.
- */
-bool vfio_file_enforced_coherent(struct file *file)
+bool vfio_group_enforced_coherent(struct vfio_group *group)
 {
-	struct vfio_group *group = file->private_data;
 	struct vfio_device *device;
 	bool ret = true;
-
-	if (!vfio_file_is_group(file))
-		return true;
 
 	/*
 	 * If the device does not have IOMMU_CAP_ENFORCE_CACHE_COHERENCY then
···
 	mutex_unlock(&group->device_lock);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
 
-/**
- * vfio_file_set_kvm - Link a kvm with VFIO drivers
- * @file: VFIO group file
- * @kvm: KVM to link
- *
- * When a VFIO device is first opened the KVM will be available in
- * device->kvm if one was associated with the group.
- */
-void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
+void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
 {
-	struct vfio_group *group = file->private_data;
-
-	if (!vfio_file_is_group(file))
-		return;
-
 	spin_lock(&group->kvm_ref_lock);
 	group->kvm = kvm;
 	spin_unlock(&group->kvm_ref_lock);
 }
-EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
 
 /**
  * vfio_file_has_dev - True if the VFIO file is a handle for device
···
  */
 bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
 {
-	struct vfio_group *group = file->private_data;
+	struct vfio_group *group = vfio_group_from_file(file);
 
-	if (!vfio_file_is_group(file))
+	if (!group)
 		return false;
 
 	return group == device->group;
+105 -33
drivers/vfio/iommufd.c
···
 MODULE_IMPORT_NS(IOMMUFD);
 MODULE_IMPORT_NS(IOMMUFD_VFIO);
 
-int vfio_iommufd_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx)
+bool vfio_iommufd_device_has_compat_ioas(struct vfio_device *vdev,
+					 struct iommufd_ctx *ictx)
 {
 	u32 ioas_id;
-	u32 device_id;
+
+	return !iommufd_vfio_compat_ioas_get_id(ictx, &ioas_id);
+}
+
+int vfio_df_iommufd_bind(struct vfio_device_file *df)
+{
+	struct vfio_device *vdev = df->device;
+	struct iommufd_ctx *ictx = df->iommufd;
+
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	return vdev->ops->bind_iommufd(vdev, ictx, &df->devid);
+}
+
+int vfio_iommufd_compat_attach_ioas(struct vfio_device *vdev,
+				    struct iommufd_ctx *ictx)
+{
+	u32 ioas_id;
 	int ret;
 
 	lockdep_assert_held(&vdev->dev_set->lock);
 
-	if (vfio_device_is_noiommu(vdev)) {
-		if (!capable(CAP_SYS_RAWIO))
-			return -EPERM;
-
-		/*
-		 * Require no compat ioas to be assigned to proceed. The basic
-		 * statement is that the user cannot have done something that
-		 * implies they expected translation to exist
-		 */
-		if (!iommufd_vfio_compat_ioas_get_id(ictx, &ioas_id))
-			return -EPERM;
+	/* compat noiommu does not need to do ioas attach */
+	if (vfio_device_is_noiommu(vdev))
 		return 0;
-	}
-
-	ret = vdev->ops->bind_iommufd(vdev, ictx, &device_id);
-	if (ret)
-		return ret;
 
 	ret = iommufd_vfio_compat_ioas_get_id(ictx, &ioas_id);
 	if (ret)
-		goto err_unbind;
-	ret = vdev->ops->attach_ioas(vdev, &ioas_id);
-	if (ret)
-		goto err_unbind;
+		return ret;
 
-	/*
-	 * The legacy path has no way to return the device id or the selected
-	 * pt_id
-	 */
-	return 0;
-
-err_unbind:
-	if (vdev->ops->unbind_iommufd)
-		vdev->ops->unbind_iommufd(vdev);
-	return ret;
+	/* The legacy path has no way to return the selected pt_id */
+	return vdev->ops->attach_ioas(vdev, &ioas_id);
 }
 
-void vfio_iommufd_unbind(struct vfio_device *vdev)
+void vfio_df_iommufd_unbind(struct vfio_device_file *df)
 {
+	struct vfio_device *vdev = df->device;
+
 	lockdep_assert_held(&vdev->dev_set->lock);
 
 	if (vfio_device_is_noiommu(vdev))
···
 	if (vdev->ops->unbind_iommufd)
 		vdev->ops->unbind_iommufd(vdev);
 }
+
+struct iommufd_ctx *vfio_iommufd_device_ictx(struct vfio_device *vdev)
+{
+	if (vdev->iommufd_device)
+		return iommufd_device_to_ictx(vdev->iommufd_device);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_device_ictx);
+
+static int vfio_iommufd_device_id(struct vfio_device *vdev)
+{
+	if (vdev->iommufd_device)
+		return iommufd_device_to_id(vdev->iommufd_device);
+	return -EINVAL;
+}
+
+/*
+ * Return devid for a device.
+ *  valid ID for the device that is owned by the ictx
+ *  -ENOENT = device is owned but there is no ID
+ *  -ENODEV or other error = device is not owned
+ */
+int vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx)
+{
+	struct iommu_group *group;
+	int devid;
+
+	if (vfio_iommufd_device_ictx(vdev) == ictx)
+		return vfio_iommufd_device_id(vdev);
+
+	group = iommu_group_get(vdev->dev);
+	if (!group)
+		return -ENODEV;
+
+	if (iommufd_ctx_has_group(ictx, group))
+		devid = -ENOENT;
+	else
+		devid = -ENODEV;
+
+	iommu_group_put(group);
+
+	return devid;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_get_dev_id);
 
 /*
  * The physical standard ops mean that the iommufd_device is bound to the
···
 {
 	int rc;
 
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (WARN_ON(!vdev->iommufd_device))
+		return -EINVAL;
+
+	if (vdev->iommufd_attached)
+		return -EBUSY;
+
 	rc = iommufd_device_attach(vdev->iommufd_device, pt_id);
 	if (rc)
 		return rc;
···
 	return 0;
 }
 EXPORT_SYMBOL_GPL(vfio_iommufd_physical_attach_ioas);
+
+void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev)
+{
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (WARN_ON(!vdev->iommufd_device) || !vdev->iommufd_attached)
+		return;
+
+	iommufd_device_detach(vdev->iommufd_device);
+	vdev->iommufd_attached = false;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_physical_detach_ioas);
 
 /*
  * The emulated standard ops mean that vfio_device is going to use the
···
 	return 0;
 }
 EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_attach_ioas);
+
+void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev)
+{
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (WARN_ON(!vdev->iommufd_access) ||
+	    !vdev->iommufd_attached)
+		return;
+
+	iommufd_access_detach(vdev->iommufd_access);
+	vdev->iommufd_attached = false;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_detach_ioas);
+2
drivers/vfio/pci/Kconfig
···
 
 source "drivers/vfio/pci/hisilicon/Kconfig"
 
+source "drivers/vfio/pci/pds/Kconfig"
+
 endmenu
+2
drivers/vfio/pci/Makefile
···
 obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5/
 
 obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
+
+obj-$(CONFIG_PDS_VFIO_PCI) += pds/
+2
drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
···
 	.bind_iommufd = vfio_iommufd_physical_bind,
 	.unbind_iommufd = vfio_iommufd_physical_unbind,
 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
+	.detach_ioas = vfio_iommufd_physical_detach_ioas,
 };
 
 static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
···
 	.bind_iommufd = vfio_iommufd_physical_bind,
 	.unbind_iommufd = vfio_iommufd_physical_unbind,
 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
+	.detach_ioas = vfio_iommufd_physical_detach_ioas,
 };
 
 static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+1 -47
drivers/vfio/pci/mlx5/cmd.c
···
 	mlx5vf_cmd_dealloc_pd(migf);
 }
 
-static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes,
-			   u32 req_nodes)
-{
-	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
-	unsigned long min_gap;
-	unsigned long curr_gap;
-
-	/* Special shortcut when a single range is required */
-	if (req_nodes == 1) {
-		unsigned long last;
-
-		curr = comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
-		while (curr) {
-			last = curr->last;
-			prev = curr;
-			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
-			if (prev != comb_start)
-				interval_tree_remove(prev, root);
-		}
-		comb_start->last = last;
-		return;
-	}
-
-	/* Combine ranges which have the smallest gap */
-	while (cur_nodes > req_nodes) {
-		prev = NULL;
-		min_gap = ULONG_MAX;
-		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
-		while (curr) {
-			if (prev) {
-				curr_gap = curr->start - prev->last;
-				if (curr_gap < min_gap) {
-					min_gap = curr_gap;
-					comb_start = prev;
-					comb_end = curr;
-				}
-			}
-			prev = curr;
-			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
-		}
-		comb_start->last = comb_end->last;
-		interval_tree_remove(comb_end, root);
-		cur_nodes--;
-	}
-}
-
 static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
 				 struct mlx5vf_pci_core_device *mvdev,
 				 struct rb_root_cached *ranges, u32 nnodes)
···
 	int i;
 
 	if (num_ranges > max_num_range) {
-		combine_ranges(ranges, nnodes, max_num_range);
+		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
 		num_ranges = max_num_range;
 	}
 
+1
drivers/vfio/pci/mlx5/main.c
···
 	.bind_iommufd = vfio_iommufd_physical_bind,
 	.unbind_iommufd = vfio_iommufd_physical_unbind,
 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
+	.detach_ioas = vfio_iommufd_physical_detach_ioas,
 };
 
 static int mlx5vf_pci_probe(struct pci_dev *pdev,
+19
drivers/vfio/pci/pds/Kconfig
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2023 Advanced Micro Devices, Inc.
+
+config PDS_VFIO_PCI
+	tristate "VFIO support for PDS PCI devices"
+	depends on PDS_CORE
+	select VFIO_PCI_CORE
+	help
+	  This provides generic PCI support for PDS devices using the VFIO
+	  framework.
+
+	  More specific information on this driver can be
+	  found in
+	  <file:Documentation/networking/device_drivers/ethernet/amd/pds_vfio_pci.rst>.
+
+	  To compile this driver as a module, choose M here. The module
+	  will be called pds-vfio-pci.
+
+	  If you don't know what to do here, say N.
+11
drivers/vfio/pci/pds/Makefile
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2023 Advanced Micro Devices, Inc.
+
+obj-$(CONFIG_PDS_VFIO_PCI) += pds-vfio-pci.o
+
+pds-vfio-pci-y := \
+	cmds.o \
+	dirty.o \
+	lm.o \
+	pci_drv.o \
+	vfio_dev.o
+510
drivers/vfio/pci/pds/cmds.c
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#include <linux/io.h>
+#include <linux/types.h>
+#include <linux/delay.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_adminq.h>
+
+#include "vfio_dev.h"
+#include "cmds.h"
+
+#define SUSPEND_TIMEOUT_S		5
+#define SUSPEND_CHECK_INTERVAL_MS	1
+
+static int pds_vfio_client_adminq_cmd(struct pds_vfio_pci_device *pds_vfio,
+				      union pds_core_adminq_cmd *req,
+				      union pds_core_adminq_comp *resp,
+				      bool fast_poll)
+{
+	struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+	union pds_core_adminq_cmd cmd = {};
+	struct pdsc *pdsc;
+	int err;
+
+	/* Wrap the client request */
+	cmd.client_request.opcode = PDS_AQ_CMD_CLIENT_CMD;
+	cmd.client_request.client_id = cpu_to_le16(pds_vfio->client_id);
+	memcpy(cmd.client_request.client_cmd, req,
+	       sizeof(cmd.client_request.client_cmd));
+
+	pdsc = pdsc_get_pf_struct(pdev);
+	if (IS_ERR(pdsc))
+		return PTR_ERR(pdsc);
+
+	err = pdsc_adminq_post(pdsc, &cmd, resp, fast_poll);
+	if (err && err != -EAGAIN)
+		dev_err(pds_vfio_to_dev(pds_vfio),
+			"client admin cmd failed: %pe\n", ERR_PTR(err));
+
+	return err;
+}
+
+int pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+	struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+	char devname[PDS_DEVNAME_LEN];
+	struct pdsc *pdsc;
+	int ci;
+
+	snprintf(devname, sizeof(devname), "%s.%d-%u", PDS_VFIO_LM_DEV_NAME,
+		 pci_domain_nr(pdev->bus),
+		 PCI_DEVID(pdev->bus->number, pdev->devfn));
+
+	pdsc = pdsc_get_pf_struct(pdev);
+	if (IS_ERR(pdsc))
+		return PTR_ERR(pdsc);
+
+	ci = pds_client_register(pdsc, devname);
+	if (ci < 0)
+		return ci;
+
+	pds_vfio->client_id = ci;
+
+	return 0;
+}
+
+void pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+	struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+	struct pdsc *pdsc;
+	int err;
+
+	pdsc = pdsc_get_pf_struct(pdev);
+	if (IS_ERR(pdsc))
+		return;
+
+	err = pds_client_unregister(pdsc, pds_vfio->client_id);
+	if (err)
+		dev_err(&pdev->dev, "unregister from DSC failed: %pe\n",
+			ERR_PTR(err));
+
+	pds_vfio->client_id = 0;
+}
+
+static int
+pds_vfio_suspend_wait_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_suspend_status = {
+			.opcode = PDS_LM_CMD_SUSPEND_STATUS,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+			.type = type,
+		},
+	};
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	union pds_core_adminq_comp comp = {};
+	unsigned long time_limit;
+	unsigned long time_start;
+	unsigned long time_done;
+	int err;
+
+	time_start = jiffies;
+	time_limit = time_start + HZ * SUSPEND_TIMEOUT_S;
+	do {
+		err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true);
+		if (err != -EAGAIN)
+			break;
+
+		msleep(SUSPEND_CHECK_INTERVAL_MS);
+	} while (time_before(jiffies, time_limit));
+
+	time_done = jiffies;
+	dev_dbg(dev, "%s: vf%u: Suspend comp received in %d msecs\n", __func__,
+		pds_vfio->vf_id, jiffies_to_msecs(time_done - time_start));
+
+	/* Check the results */
+	if (time_after_eq(time_done, time_limit)) {
+		dev_err(dev, "%s: vf%u: Suspend comp timeout\n", __func__,
+			pds_vfio->vf_id);
+		err = -ETIMEDOUT;
+	}
+
+	return err;
+}
+
+int pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_suspend = {
+			.opcode = PDS_LM_CMD_SUSPEND,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+			.type = type,
+		},
+	};
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	union pds_core_adminq_comp comp = {};
+	int err;
+
+	dev_dbg(dev, "vf%u: Suspend device\n", pds_vfio->vf_id);
+
+	/*
+	 * The initial suspend request to the firmware starts the device suspend
+	 * operation and the firmware returns success if it's started
+	 * successfully.
+	 */
+	err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true);
+	if (err) {
+		dev_err(dev, "vf%u: Suspend failed: %pe\n", pds_vfio->vf_id,
+			ERR_PTR(err));
+		return err;
+	}
+
+	/*
+	 * The subsequent suspend status request(s) check if the firmware has
+	 * completed the device suspend process.
+	 */
+	return pds_vfio_suspend_wait_device_cmd(pds_vfio, type);
+}
+
+int pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_resume = {
+			.opcode = PDS_LM_CMD_RESUME,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+			.type = type,
+		},
+	};
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	union pds_core_adminq_comp comp = {};
+
+	dev_dbg(dev, "vf%u: Resume device\n", pds_vfio->vf_id);
+
+	return pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true);
+}
+
+int pds_vfio_get_lm_state_size_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_state_size = {
+			.opcode = PDS_LM_CMD_STATE_SIZE,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+		},
+	};
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	union pds_core_adminq_comp comp = {};
+	int err;
+
+	dev_dbg(dev, "vf%u: Get migration status\n", pds_vfio->vf_id);
+
+	err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+	if (err)
+		return err;
+
+	*size = le64_to_cpu(comp.lm_state_size.size);
+	return 0;
+}
+
+static int pds_vfio_dma_map_lm_file(struct device *dev,
+				    enum dma_data_direction dir,
+				    struct pds_vfio_lm_file *lm_file)
+{
+	struct pds_lm_sg_elem *sgl, *sge;
+	struct scatterlist *sg;
+	dma_addr_t sgl_addr;
+	size_t sgl_size;
+	int err;
+	int i;
+
+	if (!lm_file)
+		return -EINVAL;
+
+	/* dma map file pages */
+	err = dma_map_sgtable(dev, &lm_file->sg_table, dir, 0);
+	if (err)
+		return err;
+
+	lm_file->num_sge = lm_file->sg_table.nents;
+
+	/* alloc sgl */
+	sgl_size = lm_file->num_sge * sizeof(struct pds_lm_sg_elem);
+	sgl = kzalloc(sgl_size, GFP_KERNEL);
+	if (!sgl) {
+		err = -ENOMEM;
+		goto out_unmap_sgtable;
+	}
+
+	/* fill sgl */
+	sge = sgl;
+	for_each_sgtable_dma_sg(&lm_file->sg_table, sg, i) {
+		sge->addr = cpu_to_le64(sg_dma_address(sg));
+		sge->len = cpu_to_le32(sg_dma_len(sg));
+		dev_dbg(dev, "addr = %llx, len = %u\n", sge->addr, sge->len);
+		sge++;
+	}
+
+	sgl_addr = dma_map_single(dev, sgl, sgl_size, DMA_TO_DEVICE);
+	if (dma_mapping_error(dev, sgl_addr)) {
+		err = -EIO;
+		goto out_free_sgl;
+	}
+
+	lm_file->sgl = sgl;
+	lm_file->sgl_addr = sgl_addr;
+
+	return 0;
+
+out_free_sgl:
+	kfree(sgl);
+out_unmap_sgtable:
+	lm_file->num_sge = 0;
+	dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0);
+	return err;
+}
+
+static void pds_vfio_dma_unmap_lm_file(struct device *dev,
+				       enum dma_data_direction dir,
+				       struct pds_vfio_lm_file *lm_file)
+{
+	if (!lm_file)
+		return;
+
+	/* free sgl */
+	if (lm_file->sgl) {
+		dma_unmap_single(dev, lm_file->sgl_addr,
+				 lm_file->num_sge * sizeof(*lm_file->sgl),
+				 DMA_TO_DEVICE);
+		kfree(lm_file->sgl);
+		lm_file->sgl = NULL;
+		lm_file->sgl_addr = DMA_MAPPING_ERROR;
+		lm_file->num_sge = 0;
+	}
+
+	/* dma unmap file pages */
+	dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0);
+}
+
+int pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_save = {
+			.opcode = PDS_LM_CMD_SAVE,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+		},
+	};
+	struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+	union pds_core_adminq_comp comp = {};
+	struct pds_vfio_lm_file *lm_file;
+	int err;
+
+	dev_dbg(&pdev->dev, "vf%u: Get migration state\n", pds_vfio->vf_id);
+
+	lm_file = pds_vfio->save_file;
+
+	err = pds_vfio_dma_map_lm_file(pdsc_dev, DMA_FROM_DEVICE, lm_file);
+	if (err) {
+		dev_err(&pdev->dev, "failed to map save migration file: %pe\n",
+			ERR_PTR(err));
+		return err;
+	}
+
+	cmd.lm_save.sgl_addr = cpu_to_le64(lm_file->sgl_addr);
+	cmd.lm_save.num_sge = cpu_to_le32(lm_file->num_sge);
+
+	err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+	if (err)
+		dev_err(&pdev->dev, "failed to get migration state: %pe\n",
+			ERR_PTR(err));
+
+	pds_vfio_dma_unmap_lm_file(pdsc_dev, DMA_FROM_DEVICE, lm_file);
+
+	return err;
+}
+
+int pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_restore = {
+			.opcode = PDS_LM_CMD_RESTORE,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+		},
+	};
+	struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+	union pds_core_adminq_comp comp = {};
+	struct pds_vfio_lm_file *lm_file;
+	int err;
+
+	dev_dbg(&pdev->dev, "vf%u: Set migration state\n", pds_vfio->vf_id);
+
+	lm_file = pds_vfio->restore_file;
+
+	err = pds_vfio_dma_map_lm_file(pdsc_dev, DMA_TO_DEVICE, lm_file);
+	if (err) {
+		dev_err(&pdev->dev,
+			"failed to map restore migration file: %pe\n",
+			ERR_PTR(err));
+		return err;
+	}
+
+	cmd.lm_restore.sgl_addr = cpu_to_le64(lm_file->sgl_addr);
+	cmd.lm_restore.num_sge = cpu_to_le32(lm_file->num_sge);
+
+	err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+	if (err)
+		dev_err(&pdev->dev, "failed to set migration state: %pe\n",
+			ERR_PTR(err));
+
+	pds_vfio_dma_unmap_lm_file(pdsc_dev, DMA_TO_DEVICE, lm_file);
+
+	return err;
+}
+
+void pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+					 enum pds_lm_host_vf_status vf_status)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_host_vf_status = {
+			.opcode = PDS_LM_CMD_HOST_VF_STATUS,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+			.status = vf_status,
+		},
+	};
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	union pds_core_adminq_comp comp = {};
+	int err;
+
+	dev_dbg(dev, "vf%u: Set host VF LM status: %u", pds_vfio->vf_id,
+		vf_status);
+	if (vf_status != PDS_LM_STA_IN_PROGRESS &&
+	    vf_status != PDS_LM_STA_NONE) {
+		dev_warn(dev, "Invalid host VF migration status, %d\n",
+			 vf_status);
+		return;
+	}
+
+	err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+	if (err)
+		dev_warn(dev, "failed to send host VF migration status: %pe\n",
+			 ERR_PTR(err));
+}
+
+int pds_vfio_dirty_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+			      u64 regions_dma, u8 *max_regions, u8 *num_regions)
+{
+	union pds_core_adminq_cmd cmd = {
+		.lm_dirty_status = {
+			.opcode = PDS_LM_CMD_DIRTY_STATUS,
+			.vf_id = cpu_to_le16(pds_vfio->vf_id),
+		},
+	};
+	struct device *dev = pds_vfio_to_dev(pds_vfio);
+	union pds_core_adminq_comp comp = {};
+	int err;
+
+	dev_dbg(dev, "vf%u: Dirty status\n", pds_vfio->vf_id);
+
+	cmd.lm_dirty_status.regions_dma = cpu_to_le64(regions_dma);
cmd.lm_dirty_status.max_regions = *max_regions; 404 + 405 + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); 406 + if (err) { 407 + dev_err(dev, "failed to get dirty status: %pe\n", ERR_PTR(err)); 408 + return err; 409 + } 410 + 411 + /* only support seq_ack approach for now */ 412 + if (!(le32_to_cpu(comp.lm_dirty_status.bmp_type_mask) & 413 + BIT(PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK))) { 414 + dev_err(dev, "Dirty bitmap tracking SEQ_ACK not supported\n"); 415 + return -EOPNOTSUPP; 416 + } 417 + 418 + *num_regions = comp.lm_dirty_status.num_regions; 419 + *max_regions = comp.lm_dirty_status.max_regions; 420 + 421 + dev_dbg(dev, 422 + "Page Tracking Status command successful, max_regions: %d, num_regions: %d, bmp_type: %s\n", 423 + *max_regions, *num_regions, "PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK"); 424 + 425 + return 0; 426 + } 427 + 428 + int pds_vfio_dirty_enable_cmd(struct pds_vfio_pci_device *pds_vfio, 429 + u64 regions_dma, u8 num_regions) 430 + { 431 + union pds_core_adminq_cmd cmd = { 432 + .lm_dirty_enable = { 433 + .opcode = PDS_LM_CMD_DIRTY_ENABLE, 434 + .vf_id = cpu_to_le16(pds_vfio->vf_id), 435 + .regions_dma = cpu_to_le64(regions_dma), 436 + .bmp_type = PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK, 437 + .num_regions = num_regions, 438 + }, 439 + }; 440 + struct device *dev = pds_vfio_to_dev(pds_vfio); 441 + union pds_core_adminq_comp comp = {}; 442 + int err; 443 + 444 + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); 445 + if (err) { 446 + dev_err(dev, "failed dirty tracking enable: %pe\n", 447 + ERR_PTR(err)); 448 + return err; 449 + } 450 + 451 + return 0; 452 + } 453 + 454 + int pds_vfio_dirty_disable_cmd(struct pds_vfio_pci_device *pds_vfio) 455 + { 456 + union pds_core_adminq_cmd cmd = { 457 + .lm_dirty_disable = { 458 + .opcode = PDS_LM_CMD_DIRTY_DISABLE, 459 + .vf_id = cpu_to_le16(pds_vfio->vf_id), 460 + }, 461 + }; 462 + struct device *dev = pds_vfio_to_dev(pds_vfio); 463 + union pds_core_adminq_comp comp = {}; 464 + int err; 465 + 466 + 
err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); 467 + if (err || comp.lm_dirty_status.num_regions != 0) { 468 + /* in case num_regions is still non-zero after disable */ 469 + err = err ? err : -EIO; 470 + dev_err(dev, 471 + "failed dirty tracking disable: %pe, num_regions %d\n", 472 + ERR_PTR(err), comp.lm_dirty_status.num_regions); 473 + return err; 474 + } 475 + 476 + return 0; 477 + } 478 + 479 + int pds_vfio_dirty_seq_ack_cmd(struct pds_vfio_pci_device *pds_vfio, 480 + u64 sgl_dma, u16 num_sge, u32 offset, 481 + u32 total_len, bool read_seq) 482 + { 483 + const char *cmd_type_str = read_seq ? "read_seq" : "write_ack"; 484 + union pds_core_adminq_cmd cmd = { 485 + .lm_dirty_seq_ack = { 486 + .vf_id = cpu_to_le16(pds_vfio->vf_id), 487 + .len_bytes = cpu_to_le32(total_len), 488 + .off_bytes = cpu_to_le32(offset), 489 + .sgl_addr = cpu_to_le64(sgl_dma), 490 + .num_sge = cpu_to_le16(num_sge), 491 + }, 492 + }; 493 + struct device *dev = pds_vfio_to_dev(pds_vfio); 494 + union pds_core_adminq_comp comp = {}; 495 + int err; 496 + 497 + if (read_seq) 498 + cmd.lm_dirty_seq_ack.opcode = PDS_LM_CMD_DIRTY_READ_SEQ; 499 + else 500 + cmd.lm_dirty_seq_ack.opcode = PDS_LM_CMD_DIRTY_WRITE_ACK; 501 + 502 + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); 503 + if (err) { 504 + dev_err(dev, "failed cmd Page Tracking %s: %pe\n", cmd_type_str, 505 + ERR_PTR(err)); 506 + return err; 507 + } 508 + 509 + return 0; 510 + }
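The SGL fill loop in pds_vfio_dma_map_lm_file above converts each DMA-mapped scatterlist entry into the device's little-endian element format before handing the array to firmware. A minimal userspace sketch of that fill pattern, outside the kernel DMA API — `struct sg_elem` and `fill_sgl` are hypothetical stand-ins for `struct pds_lm_sg_elem` and the kernel loop:

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical mirror of a scatter-gather element: DMA address + length.
 * The kernel stores these with cpu_to_le64()/cpu_to_le32(); plain stores
 * are used here for illustration. */
struct sg_elem {
	uint64_t addr;
	uint32_t len;
};

/* Fill an SGL from (address, length) pairs, as the driver does from
 * for_each_sgtable_dma_sg(); returns the SGL size in bytes, which is
 * what gets dma_map_single()'d for the device. */
size_t fill_sgl(struct sg_elem *sgl, const uint64_t *addrs,
		const uint32_t *lens, size_t n)
{
	for (size_t i = 0; i < n; i++) {
		sgl[i].addr = addrs[i];	/* kernel: cpu_to_le64(sg_dma_address(sg)) */
		sgl[i].len = lens[i];	/* kernel: cpu_to_le32(sg_dma_len(sg)) */
	}
	return n * sizeof(struct sg_elem);
}
```

The driver sizes the SGL from `sg_table.nents` after mapping, since the IOMMU may coalesce entries, so the element count seen by firmware can be smaller than the page count.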
drivers/vfio/pci/pds/cmds.h (+25 lines)
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright(c) 2023 Advanced Micro Devices, Inc. */

#ifndef _CMDS_H_
#define _CMDS_H_

int pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio);
void pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio);
int pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type);
int pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type);
int pds_vfio_get_lm_state_size_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size);
int pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio);
int pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio);
void pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio,
					 enum pds_lm_host_vf_status vf_status);
int pds_vfio_dirty_status_cmd(struct pds_vfio_pci_device *pds_vfio,
			      u64 regions_dma, u8 *max_regions,
			      u8 *num_regions);
int pds_vfio_dirty_enable_cmd(struct pds_vfio_pci_device *pds_vfio,
			      u64 regions_dma, u8 num_regions);
int pds_vfio_dirty_disable_cmd(struct pds_vfio_pci_device *pds_vfio);
int pds_vfio_dirty_seq_ack_cmd(struct pds_vfio_pci_device *pds_vfio,
			       u64 sgl_dma, u16 num_sge, u32 offset,
			       u32 total_len, bool read_seq);
#endif /* _CMDS_H_ */
drivers/vfio/pci/pds/dirty.c (+564 lines)
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2023 Advanced Micro Devices, Inc. */

#include <linux/interval_tree.h>
#include <linux/vfio.h>

#include <linux/pds/pds_common.h>
#include <linux/pds/pds_core_if.h>
#include <linux/pds/pds_adminq.h>

#include "vfio_dev.h"
#include "cmds.h"
#include "dirty.h"

#define READ_SEQ true
#define WRITE_ACK false

bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio)
{
	return pds_vfio->dirty.is_enabled;
}

void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio)
{
	pds_vfio->dirty.is_enabled = true;
}

void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio)
{
	pds_vfio->dirty.is_enabled = false;
}

static void
pds_vfio_print_guest_region_info(struct pds_vfio_pci_device *pds_vfio,
				 u8 max_regions)
{
	int len = max_regions * sizeof(struct pds_lm_dirty_region_info);
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
	struct pds_lm_dirty_region_info *region_info;
	dma_addr_t regions_dma;
	u8 num_regions;
	int err;

	region_info = kcalloc(max_regions,
			      sizeof(struct pds_lm_dirty_region_info),
			      GFP_KERNEL);
	if (!region_info)
		return;

	regions_dma =
		dma_map_single(pdsc_dev, region_info, len, DMA_FROM_DEVICE);
	if (dma_mapping_error(pdsc_dev, regions_dma))
		goto out_free_region_info;

	err = pds_vfio_dirty_status_cmd(pds_vfio, regions_dma, &max_regions,
					&num_regions);
	dma_unmap_single(pdsc_dev, regions_dma, len, DMA_FROM_DEVICE);
	if (err)
		goto out_free_region_info;

	for (unsigned int i = 0; i < num_regions; i++)
		dev_dbg(&pdev->dev,
			"region_info[%d]: dma_base 0x%llx page_count %u page_size_log2 %u\n",
			i, le64_to_cpu(region_info[i].dma_base),
			le32_to_cpu(region_info[i].page_count),
			region_info[i].page_size_log2);

out_free_region_info:
	kfree(region_info);
}

static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty,
					unsigned long bytes)
{
	unsigned long *host_seq_bmp, *host_ack_bmp;

	host_seq_bmp = vzalloc(bytes);
	if (!host_seq_bmp)
		return -ENOMEM;

	host_ack_bmp = vzalloc(bytes);
	if (!host_ack_bmp) {
		bitmap_free(host_seq_bmp);
		return -ENOMEM;
	}

	dirty->host_seq.bmp = host_seq_bmp;
	dirty->host_ack.bmp = host_ack_bmp;

	return 0;
}

static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty)
{
	vfree(dirty->host_seq.bmp);
	vfree(dirty->host_ack.bmp);
	dirty->host_seq.bmp = NULL;
	dirty->host_ack.bmp = NULL;
}

static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio,
				      struct pds_vfio_bmp_info *bmp_info)
{
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;

	dma_unmap_single(pdsc_dev, bmp_info->sgl_addr,
			 bmp_info->num_sge * sizeof(struct pds_lm_sg_elem),
			 DMA_BIDIRECTIONAL);
	kfree(bmp_info->sgl);

	bmp_info->num_sge = 0;
	bmp_info->sgl = NULL;
	bmp_info->sgl_addr = 0;
}

static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio)
{
	if (pds_vfio->dirty.host_seq.sgl)
		__pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_seq);
	if (pds_vfio->dirty.host_ack.sgl)
		__pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_ack);
}

static int __pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
				      struct pds_vfio_bmp_info *bmp_info,
				      u32 page_count)
{
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
	struct pds_lm_sg_elem *sgl;
	dma_addr_t sgl_addr;
	size_t sgl_size;
	u32 max_sge;

	max_sge = DIV_ROUND_UP(page_count, PAGE_SIZE * 8);
	sgl_size = max_sge * sizeof(struct pds_lm_sg_elem);

	sgl = kzalloc(sgl_size, GFP_KERNEL);
	if (!sgl)
		return -ENOMEM;

	sgl_addr = dma_map_single(pdsc_dev, sgl, sgl_size, DMA_BIDIRECTIONAL);
	if (dma_mapping_error(pdsc_dev, sgl_addr)) {
		kfree(sgl);
		return -EIO;
	}

	bmp_info->sgl = sgl;
	bmp_info->num_sge = max_sge;
	bmp_info->sgl_addr = sgl_addr;

	return 0;
}

static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
				    u32 page_count)
{
	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
	int err;

	err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_seq,
					 page_count);
	if (err)
		return err;

	err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_ack,
					 page_count);
	if (err) {
		__pds_vfio_dirty_free_sgl(pds_vfio, &dirty->host_seq);
		return err;
	}

	return 0;
}

static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
				 struct rb_root_cached *ranges, u32 nnodes,
				 u64 *page_size)
{
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
	u64 region_start, region_size, region_page_size;
	struct pds_lm_dirty_region_info *region_info;
	struct interval_tree_node *node = NULL;
	u8 max_regions = 0, num_regions;
	dma_addr_t regions_dma = 0;
	u32 num_ranges = nnodes;
	u32 page_count;
	u16 len;
	int err;

	dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n",
		pds_vfio->vf_id);

	if (pds_vfio_dirty_is_enabled(pds_vfio))
		return -EINVAL;

	/* find if dirty tracking is disabled, i.e. num_regions == 0 */
	err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions,
					&num_regions);
	if (err < 0) {
		dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n",
			ERR_PTR(err));
		return err;
	} else if (num_regions) {
		dev_err(&pdev->dev,
			"Dirty tracking already enabled for %d regions\n",
			num_regions);
		return -EEXIST;
	} else if (!max_regions) {
		dev_err(&pdev->dev,
			"Device doesn't support dirty tracking, max_regions %d\n",
			max_regions);
		return -EOPNOTSUPP;
	}

	/*
	 * Only support 1 region for now. If there are any large gaps in the
	 * VM's address regions, then this would be a waste of memory as we are
	 * generating 2 bitmaps (ack/seq) from the min address to the max
	 * address of the VM's address regions. In the future, if we support
	 * more than one region in the device/driver we can split the bitmaps
	 * on the largest address region gaps. We can do this split up to the
	 * max_regions times returned from the dirty_status command.
	 */
	max_regions = 1;
	if (num_ranges > max_regions) {
		vfio_combine_iova_ranges(ranges, nnodes, max_regions);
		num_ranges = max_regions;
	}

	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
	if (!node)
		return -EINVAL;

	region_size = node->last - node->start + 1;
	region_start = node->start;
	region_page_size = *page_size;

	len = sizeof(*region_info);
	region_info = kzalloc(len, GFP_KERNEL);
	if (!region_info)
		return -ENOMEM;

	page_count = DIV_ROUND_UP(region_size, region_page_size);

	region_info->dma_base = cpu_to_le64(region_start);
	region_info->page_count = cpu_to_le32(page_count);
	region_info->page_size_log2 = ilog2(region_page_size);

	regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len,
				     DMA_BIDIRECTIONAL);
	if (dma_mapping_error(pdsc_dev, regions_dma)) {
		err = -ENOMEM;
		goto out_free_region_info;
	}

	err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions);
	dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL);
	if (err)
		goto out_free_region_info;

	/*
	 * page_count might be adjusted by the device,
	 * update it before freeing region_info DMA
	 */
	page_count = le32_to_cpu(region_info->page_count);

	dev_dbg(&pdev->dev,
		"region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u page_size_log2 %u\n",
		regions_dma, region_start, page_count,
		(u8)ilog2(region_page_size));

	err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE);
	if (err) {
		dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
			ERR_PTR(err));
		goto out_free_region_info;
	}

	err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count);
	if (err) {
		dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
			ERR_PTR(err));
		goto out_free_bitmaps;
	}

	dirty->region_start = region_start;
	dirty->region_size = region_size;
	dirty->region_page_size = region_page_size;
	pds_vfio_dirty_set_enabled(pds_vfio);

	pds_vfio_print_guest_region_info(pds_vfio, max_regions);

	kfree(region_info);

	return 0;

out_free_bitmaps:
	pds_vfio_dirty_free_bitmaps(dirty);
out_free_region_info:
	kfree(region_info);
	return err;
}

void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd)
{
	if (pds_vfio_dirty_is_enabled(pds_vfio)) {
		pds_vfio_dirty_set_disabled(pds_vfio);
		if (send_cmd)
			pds_vfio_dirty_disable_cmd(pds_vfio);
		pds_vfio_dirty_free_sgl(pds_vfio);
		pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty);
	}

	if (send_cmd)
		pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
}

static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio,
				  struct pds_vfio_bmp_info *bmp_info,
				  u32 offset, u32 bmp_bytes, bool read_seq)
{
	const char *bmp_type_str = read_seq ? "read_seq" : "write_ack";
	u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
	struct device *pdsc_dev = &pci_physfn(pdev)->dev;
	unsigned long long npages;
	struct sg_table sg_table;
	struct scatterlist *sg;
	struct page **pages;
	u32 page_offset;
	const void *bmp;
	size_t size;
	u16 num_sge;
	int err;
	int i;

	bmp = (void *)((u64)bmp_info->bmp + offset);
	page_offset = offset_in_page(bmp);
	bmp -= page_offset;

	/*
	 * Start and end of bitmap section to seq/ack might not be page
	 * aligned, so use the page_offset to account for that so there
	 * will be enough pages to represent the bmp_bytes
	 */
	npages = DIV_ROUND_UP_ULL(bmp_bytes + page_offset, PAGE_SIZE);
	pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	for (unsigned long long i = 0; i < npages; i++) {
		struct page *page = vmalloc_to_page(bmp);

		if (!page) {
			err = -EFAULT;
			goto out_free_pages;
		}

		pages[i] = page;
		bmp += PAGE_SIZE;
	}

	err = sg_alloc_table_from_pages(&sg_table, pages, npages, page_offset,
					bmp_bytes, GFP_KERNEL);
	if (err)
		goto out_free_pages;

	err = dma_map_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
	if (err)
		goto out_free_sg_table;

	for_each_sgtable_dma_sg(&sg_table, sg, i) {
		struct pds_lm_sg_elem *sg_elem = &bmp_info->sgl[i];

		sg_elem->addr = cpu_to_le64(sg_dma_address(sg));
		sg_elem->len = cpu_to_le32(sg_dma_len(sg));
	}

	num_sge = sg_table.nents;
	size = num_sge * sizeof(struct pds_lm_sg_elem);
	dma_sync_single_for_device(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);
	err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, bmp_info->sgl_addr, num_sge,
					 offset, bmp_bytes, read_seq);
	if (err)
		dev_err(&pdev->dev,
			"Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u DMA 0x%llx: %pe\n",
			bmp_type_str, offset, bmp_bytes,
			num_sge, bmp_info->sgl_addr, ERR_PTR(err));
	dma_sync_single_for_cpu(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);

	dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
out_free_sg_table:
	sg_free_table(&sg_table);
out_free_pages:
	kfree(pages);

	return err;
}

static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio,
				    u32 offset, u32 len)
{
	return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_ack,
				      offset, len, WRITE_ACK);
}

static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio,
				   u32 offset, u32 len)
{
	return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_seq,
				      offset, len, READ_SEQ);
}

static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio,
					  struct iova_bitmap *dirty_bitmap,
					  u32 bmp_offset, u32 len_bytes)
{
	u64 page_size = pds_vfio->dirty.region_page_size;
	u64 region_start = pds_vfio->dirty.region_start;
	u32 bmp_offset_bit;
	__le64 *seq, *ack;
	int dword_count;

	dword_count = len_bytes / sizeof(u64);
	seq = (__le64 *)((u64)pds_vfio->dirty.host_seq.bmp + bmp_offset);
	ack = (__le64 *)((u64)pds_vfio->dirty.host_ack.bmp + bmp_offset);
	bmp_offset_bit = bmp_offset * 8;

	for (int i = 0; i < dword_count; i++) {
		u64 xor = le64_to_cpu(seq[i]) ^ le64_to_cpu(ack[i]);

		/* prepare for next write_ack call */
		ack[i] = seq[i];

		for (u8 bit_i = 0; bit_i < BITS_PER_TYPE(u64); ++bit_i) {
			if (xor & BIT(bit_i)) {
				u64 abs_bit_i = bmp_offset_bit +
						i * BITS_PER_TYPE(u64) + bit_i;
				u64 addr = abs_bit_i * page_size + region_start;

				iova_bitmap_set(dirty_bitmap, addr, page_size);
			}
		}
	}

	return 0;
}

static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
			       struct iova_bitmap *dirty_bitmap,
			       unsigned long iova, unsigned long length)
{
	struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
	struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
	u64 bmp_offset, bmp_bytes;
	u64 bitmap_size, pages;
	int err;

	dev_dbg(dev, "vf%u: Get dirty page bitmap\n", pds_vfio->vf_id);

	if (!pds_vfio_dirty_is_enabled(pds_vfio)) {
		dev_err(dev, "vf%u: Sync failed, dirty tracking is disabled\n",
			pds_vfio->vf_id);
		return -EINVAL;
	}

	pages = DIV_ROUND_UP(length, pds_vfio->dirty.region_page_size);
	bitmap_size =
		round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE;

	dev_dbg(dev,
		"vf%u: iova 0x%lx length %lu page_size %llu pages %llu bitmap_size %llu\n",
		pds_vfio->vf_id, iova, length, pds_vfio->dirty.region_page_size,
		pages, bitmap_size);

	if (!length || ((dirty->region_start + iova + length) >
			(dirty->region_start + dirty->region_size))) {
		dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n",
			iova, length);
		return -EINVAL;
	}

	/* bitmap is modified in 64 bit chunks */
	bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region_page_size,
				       sizeof(u64)),
			  sizeof(u64));
	if (bmp_bytes != bitmap_size) {
		dev_err(dev,
			"Calculated bitmap bytes %llu not equal to bitmap size %llu\n",
			bmp_bytes, bitmap_size);
		return -EINVAL;
	}

	bmp_offset = DIV_ROUND_UP(iova / dirty->region_page_size, sizeof(u64));

	dev_dbg(dev,
		"Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n",
		iova, length, bmp_offset, bmp_bytes);

	err = pds_vfio_dirty_read_seq(pds_vfio, bmp_offset, bmp_bytes);
	if (err)
		return err;

	err = pds_vfio_dirty_process_bitmaps(pds_vfio, dirty_bitmap, bmp_offset,
					     bmp_bytes);
	if (err)
		return err;

	err = pds_vfio_dirty_write_ack(pds_vfio, bmp_offset, bmp_bytes);
	if (err)
		return err;

	return 0;
}

int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
				unsigned long length, struct iova_bitmap *dirty)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);
	int err;

	mutex_lock(&pds_vfio->state_mutex);
	err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length);
	pds_vfio_state_mutex_unlock(pds_vfio);

	return err;
}

int pds_vfio_dma_logging_start(struct vfio_device *vdev,
			       struct rb_root_cached *ranges, u32 nnodes,
			       u64 *page_size)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);
	int err;

	mutex_lock(&pds_vfio->state_mutex);
	pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
	err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size);
	pds_vfio_state_mutex_unlock(pds_vfio);

	return err;
}

int pds_vfio_dma_logging_stop(struct vfio_device *vdev)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);

	mutex_lock(&pds_vfio->state_mutex);
	pds_vfio_dirty_disable(pds_vfio, true);
	pds_vfio_state_mutex_unlock(pds_vfio);

	return 0;
}
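The core of pds_vfio_dirty_process_bitmaps is the seq/ack scheme: the device fills a "seq" bitmap, the host XORs it against its last-acknowledged "ack" copy to find newly dirtied pages, reports each set bit as an IOVA, and then copies seq into ack for the next round. A minimal userspace sketch of that loop (`process_seq_ack` is a hypothetical name; endianness conversion and `iova_bitmap_set` are elided, with dirty IOVAs returned in a plain array instead):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* XOR device-written "seq" words against host-held "ack" words, record
 * the IOVA of each changed bit, then copy seq into ack so the next sync
 * only reports pages dirtied since this one. Returns the number of
 * dirty pages found; "out" receives their IOVAs. */
size_t process_seq_ack(const uint64_t *seq, uint64_t *ack, size_t nwords,
		       uint64_t region_start, uint64_t page_size,
		       uint64_t *out)
{
	size_t ndirty = 0;

	for (size_t i = 0; i < nwords; i++) {
		uint64_t xor = seq[i] ^ ack[i];

		ack[i] = seq[i]; /* prepare for the next write_ack */
		for (unsigned int bit = 0; bit < 64; bit++)
			if (xor & (1ULL << bit))
				out[ndirty++] = region_start +
						(i * 64 + bit) * page_size;
	}
	return ndirty;
}
```

Because the ack copy is updated locally and then written back to the device via the write_ack command, a page stays "dirty" in the report exactly once per transition, rather than on every sync.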
drivers/vfio/pci/pds/dirty.h (+39 lines)
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright(c) 2023 Advanced Micro Devices, Inc. */

#ifndef _DIRTY_H_
#define _DIRTY_H_

struct pds_vfio_bmp_info {
	unsigned long *bmp;
	u32 bmp_bytes;
	struct pds_lm_sg_elem *sgl;
	dma_addr_t sgl_addr;
	u16 num_sge;
};

struct pds_vfio_dirty {
	struct pds_vfio_bmp_info host_seq;
	struct pds_vfio_bmp_info host_ack;
	u64 region_size;
	u64 region_start;
	u64 region_page_size;
	bool is_enabled;
};

struct pds_vfio_pci_device;

bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio);
void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio);
void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio);
void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio,
			    bool send_cmd);

int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
				unsigned long length,
				struct iova_bitmap *dirty);
int pds_vfio_dma_logging_start(struct vfio_device *vdev,
			       struct rb_root_cached *ranges, u32 nnodes,
			       u64 *page_size);
int pds_vfio_dma_logging_stop(struct vfio_device *vdev);
#endif /* _DIRTY_H_ */
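The sizing arithmetic used by pds_vfio_dirty_sync falls out of one bit per tracked page, consumed in whole 64-bit words: bytes of bitmap are pages/8 rounded up and then aligned to 8, and the byte offset into the bitmap for a given IOVA is (iova / page_size) / 8 rounded up. A small standalone sketch of that math (`dirty_bmp_bytes` and `dirty_bmp_offset` are hypothetical helper names; the kernel computes these inline):

```c
#include <assert.h>
#include <stdint.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define ALIGN_UP(x, a)		(DIV_ROUND_UP(x, a) * (a))

/* Bytes of dirty bitmap covering `length` bytes of IOVA space at one
 * bit per page, padded so the bitmap is processed in whole u64 words. */
uint64_t dirty_bmp_bytes(uint64_t length, uint64_t page_size)
{
	uint64_t pages = length / page_size; /* iova/length page-aligned */

	return ALIGN_UP(DIV_ROUND_UP(pages, 8), 8);
}

/* Byte offset into the bitmap where the word covering `iova` starts. */
uint64_t dirty_bmp_offset(uint64_t iova, uint64_t page_size)
{
	return DIV_ROUND_UP(iova / page_size, 8);
}
```

For a 4 KiB page size, 1 MiB of IOVA space is 256 pages, i.e. 32 bitmap bytes, and an IOVA of 2 MiB lands 64 bytes into the bitmap; the driver rejects syncs where this calculation and the caller-derived bitmap size disagree.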
drivers/vfio/pci/pds/lm.c (+434 lines)
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2023 Advanced Micro Devices, Inc. */

#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/vfio.h>
#include <linux/vfio_pci_core.h>

#include "vfio_dev.h"
#include "cmds.h"

static struct pds_vfio_lm_file *
pds_vfio_get_lm_file(const struct file_operations *fops, int flags, u64 size)
{
	struct pds_vfio_lm_file *lm_file = NULL;
	unsigned long long npages;
	struct page **pages;
	void *page_mem;
	const void *p;

	if (!size)
		return NULL;

	/* Alloc file structure */
	lm_file = kzalloc(sizeof(*lm_file), GFP_KERNEL);
	if (!lm_file)
		return NULL;

	/* Create file */
	lm_file->filep =
		anon_inode_getfile("pds_vfio_lm", fops, lm_file, flags);
	if (IS_ERR(lm_file->filep))
		goto out_free_file;

	stream_open(lm_file->filep->f_inode, lm_file->filep);
	mutex_init(&lm_file->lock);

	/* prevent file from being released before we are done with it */
	get_file(lm_file->filep);

	/* Allocate memory for file pages */
	npages = DIV_ROUND_UP_ULL(size, PAGE_SIZE);
	pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		goto out_put_file;

	page_mem = kvzalloc(ALIGN(size, PAGE_SIZE), GFP_KERNEL);
	if (!page_mem)
		goto out_free_pages_array;

	p = page_mem - offset_in_page(page_mem);
	for (unsigned long long i = 0; i < npages; i++) {
		if (is_vmalloc_addr(p))
			pages[i] = vmalloc_to_page(p);
		else
			pages[i] = kmap_to_page((void *)p);
		if (!pages[i])
			goto out_free_page_mem;

		p += PAGE_SIZE;
	}

	/* Create scatterlist of file pages to use for DMA mapping later */
	if (sg_alloc_table_from_pages(&lm_file->sg_table, pages, npages, 0,
				      size, GFP_KERNEL))
		goto out_free_page_mem;

	lm_file->size = size;
	lm_file->pages = pages;
	lm_file->npages = npages;
	lm_file->page_mem = page_mem;
	lm_file->alloc_size = npages * PAGE_SIZE;

	return lm_file;

out_free_page_mem:
	kvfree(page_mem);
out_free_pages_array:
	kfree(pages);
out_put_file:
	fput(lm_file->filep);
	mutex_destroy(&lm_file->lock);
out_free_file:
	kfree(lm_file);

	return NULL;
}

static void pds_vfio_put_lm_file(struct pds_vfio_lm_file *lm_file)
{
	mutex_lock(&lm_file->lock);

	lm_file->size = 0;
	lm_file->alloc_size = 0;

	/* Free scatter list of file pages */
	sg_free_table(&lm_file->sg_table);

	kvfree(lm_file->page_mem);
	lm_file->page_mem = NULL;
	kfree(lm_file->pages);
	lm_file->pages = NULL;

	mutex_unlock(&lm_file->lock);

	/* allow file to be released since we are done with it */
	fput(lm_file->filep);
}

void pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio)
{
	if (!pds_vfio->save_file)
		return;

	pds_vfio_put_lm_file(pds_vfio->save_file);
	pds_vfio->save_file = NULL;
}

void pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio)
{
	if (!pds_vfio->restore_file)
		return;

	pds_vfio_put_lm_file(pds_vfio->restore_file);
	pds_vfio->restore_file = NULL;
}

static struct page *pds_vfio_get_file_page(struct pds_vfio_lm_file *lm_file,
					   unsigned long offset)
{
	unsigned long cur_offset = 0;
	struct scatterlist *sg;
	unsigned int i;

	/* All accesses are sequential */
	if (offset < lm_file->last_offset || !lm_file->last_offset_sg) {
		lm_file->last_offset = 0;
		lm_file->last_offset_sg = lm_file->sg_table.sgl;
		lm_file->sg_last_entry = 0;
	}

	cur_offset = lm_file->last_offset;

	for_each_sg(lm_file->last_offset_sg, sg,
		    lm_file->sg_table.orig_nents - lm_file->sg_last_entry, i) {
		if (offset < sg->length + cur_offset) {
			lm_file->last_offset_sg = sg;
			lm_file->sg_last_entry += i;
			lm_file->last_offset = cur_offset;
			return nth_page(sg_page(sg),
					(offset - cur_offset) / PAGE_SIZE);
		}
		cur_offset += sg->length;
	}

	return NULL;
}

static int pds_vfio_release_file(struct inode *inode, struct file *filp)
{
	struct pds_vfio_lm_file *lm_file = filp->private_data;

	mutex_lock(&lm_file->lock);
	lm_file->filep->f_pos = 0;
	lm_file->size = 0;
	mutex_unlock(&lm_file->lock);
	mutex_destroy(&lm_file->lock);
	kfree(lm_file);

	return 0;
}

static ssize_t pds_vfio_save_read(struct file *filp, char __user *buf,
				  size_t len, loff_t *pos)
{
	struct pds_vfio_lm_file *lm_file = filp->private_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	mutex_lock(&lm_file->lock);
	if (*pos > lm_file->size) {
		done = -EINVAL;
		goto out_unlock;
	}

	len = min_t(size_t, lm_file->size - *pos, len);
	while (len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int err;

		page_offset = (*pos) % PAGE_SIZE;
		page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
		if (!page) {
			if (done == 0)
				done = -EINVAL;
			goto out_unlock;
		}

		page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		err = copy_to_user(buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (err) {
			done = -EFAULT;
			goto out_unlock;
		}
		*pos += page_len;
		len -= page_len;
		done += page_len;
		buf += page_len;
	}

out_unlock:
222 + mutex_unlock(&lm_file->lock); 223 + return done; 224 + } 225 + 226 + static const struct file_operations pds_vfio_save_fops = { 227 + .owner = THIS_MODULE, 228 + .read = pds_vfio_save_read, 229 + .release = pds_vfio_release_file, 230 + .llseek = no_llseek, 231 + }; 232 + 233 + static int pds_vfio_get_save_file(struct pds_vfio_pci_device *pds_vfio) 234 + { 235 + struct device *dev = &pds_vfio->vfio_coredev.pdev->dev; 236 + struct pds_vfio_lm_file *lm_file; 237 + u64 size; 238 + int err; 239 + 240 + /* Get live migration state size in this state */ 241 + err = pds_vfio_get_lm_state_size_cmd(pds_vfio, &size); 242 + if (err) { 243 + dev_err(dev, "failed to get save status: %pe\n", ERR_PTR(err)); 244 + return err; 245 + } 246 + 247 + dev_dbg(dev, "save status, size = %lld\n", size); 248 + 249 + if (!size) { 250 + dev_err(dev, "invalid state size\n"); 251 + return -EIO; 252 + } 253 + 254 + lm_file = pds_vfio_get_lm_file(&pds_vfio_save_fops, O_RDONLY, size); 255 + if (!lm_file) { 256 + dev_err(dev, "failed to create save file\n"); 257 + return -ENOENT; 258 + } 259 + 260 + dev_dbg(dev, "size = %lld, alloc_size = %lld, npages = %lld\n", 261 + lm_file->size, lm_file->alloc_size, lm_file->npages); 262 + 263 + pds_vfio->save_file = lm_file; 264 + 265 + return 0; 266 + } 267 + 268 + static ssize_t pds_vfio_restore_write(struct file *filp, const char __user *buf, 269 + size_t len, loff_t *pos) 270 + { 271 + struct pds_vfio_lm_file *lm_file = filp->private_data; 272 + loff_t requested_length; 273 + ssize_t done = 0; 274 + 275 + if (pos) 276 + return -ESPIPE; 277 + 278 + pos = &filp->f_pos; 279 + 280 + if (*pos < 0 || 281 + check_add_overflow((loff_t)len, *pos, &requested_length)) 282 + return -EINVAL; 283 + 284 + mutex_lock(&lm_file->lock); 285 + 286 + while (len) { 287 + size_t page_offset; 288 + struct page *page; 289 + size_t page_len; 290 + u8 *to_buff; 291 + int err; 292 + 293 + page_offset = (*pos) % PAGE_SIZE; 294 + page = pds_vfio_get_file_page(lm_file, *pos - 
page_offset); 295 + if (!page) { 296 + if (done == 0) 297 + done = -EINVAL; 298 + goto out_unlock; 299 + } 300 + 301 + page_len = min_t(size_t, len, PAGE_SIZE - page_offset); 302 + to_buff = kmap_local_page(page); 303 + err = copy_from_user(to_buff + page_offset, buf, page_len); 304 + kunmap_local(to_buff); 305 + if (err) { 306 + done = -EFAULT; 307 + goto out_unlock; 308 + } 309 + *pos += page_len; 310 + len -= page_len; 311 + done += page_len; 312 + buf += page_len; 313 + lm_file->size += page_len; 314 + } 315 + out_unlock: 316 + mutex_unlock(&lm_file->lock); 317 + return done; 318 + } 319 + 320 + static const struct file_operations pds_vfio_restore_fops = { 321 + .owner = THIS_MODULE, 322 + .write = pds_vfio_restore_write, 323 + .release = pds_vfio_release_file, 324 + .llseek = no_llseek, 325 + }; 326 + 327 + static int pds_vfio_get_restore_file(struct pds_vfio_pci_device *pds_vfio) 328 + { 329 + struct device *dev = &pds_vfio->vfio_coredev.pdev->dev; 330 + struct pds_vfio_lm_file *lm_file; 331 + u64 size; 332 + 333 + size = sizeof(union pds_lm_dev_state); 334 + dev_dbg(dev, "restore status, size = %lld\n", size); 335 + 336 + if (!size) { 337 + dev_err(dev, "invalid state size"); 338 + return -EIO; 339 + } 340 + 341 + lm_file = pds_vfio_get_lm_file(&pds_vfio_restore_fops, O_WRONLY, size); 342 + if (!lm_file) { 343 + dev_err(dev, "failed to create restore file"); 344 + return -ENOENT; 345 + } 346 + pds_vfio->restore_file = lm_file; 347 + 348 + return 0; 349 + } 350 + 351 + struct file * 352 + pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio, 353 + enum vfio_device_mig_state next) 354 + { 355 + enum vfio_device_mig_state cur = pds_vfio->state; 356 + int err; 357 + 358 + if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_STOP_COPY) { 359 + err = pds_vfio_get_save_file(pds_vfio); 360 + if (err) 361 + return ERR_PTR(err); 362 + 363 + err = pds_vfio_get_lm_state_cmd(pds_vfio); 364 + if (err) { 365 + pds_vfio_put_save_file(pds_vfio); 
366 + return ERR_PTR(err); 367 + } 368 + 369 + return pds_vfio->save_file->filep; 370 + } 371 + 372 + if (cur == VFIO_DEVICE_STATE_STOP_COPY && next == VFIO_DEVICE_STATE_STOP) { 373 + pds_vfio_put_save_file(pds_vfio); 374 + pds_vfio_dirty_disable(pds_vfio, true); 375 + return NULL; 376 + } 377 + 378 + if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RESUMING) { 379 + err = pds_vfio_get_restore_file(pds_vfio); 380 + if (err) 381 + return ERR_PTR(err); 382 + 383 + return pds_vfio->restore_file->filep; 384 + } 385 + 386 + if (cur == VFIO_DEVICE_STATE_RESUMING && next == VFIO_DEVICE_STATE_STOP) { 387 + err = pds_vfio_set_lm_state_cmd(pds_vfio); 388 + if (err) 389 + return ERR_PTR(err); 390 + 391 + pds_vfio_put_restore_file(pds_vfio); 392 + return NULL; 393 + } 394 + 395 + if (cur == VFIO_DEVICE_STATE_RUNNING && next == VFIO_DEVICE_STATE_RUNNING_P2P) { 396 + pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, 397 + PDS_LM_STA_IN_PROGRESS); 398 + err = pds_vfio_suspend_device_cmd(pds_vfio, 399 + PDS_LM_SUSPEND_RESUME_TYPE_P2P); 400 + if (err) 401 + return ERR_PTR(err); 402 + 403 + return NULL; 404 + } 405 + 406 + if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_RUNNING) { 407 + err = pds_vfio_resume_device_cmd(pds_vfio, 408 + PDS_LM_SUSPEND_RESUME_TYPE_FULL); 409 + if (err) 410 + return ERR_PTR(err); 411 + 412 + pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE); 413 + return NULL; 414 + } 415 + 416 + if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RUNNING_P2P) { 417 + err = pds_vfio_resume_device_cmd(pds_vfio, 418 + PDS_LM_SUSPEND_RESUME_TYPE_P2P); 419 + if (err) 420 + return ERR_PTR(err); 421 + 422 + return NULL; 423 + } 424 + 425 + if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_STOP) { 426 + err = pds_vfio_suspend_device_cmd(pds_vfio, 427 + PDS_LM_SUSPEND_RESUME_TYPE_FULL); 428 + if (err) 429 + return ERR_PTR(err); 430 + return NULL; 431 + } 432 + 433 + return ERR_PTR(-EINVAL); 434 + }
drivers/vfio/pci/pds/lm.h (new file, +41)
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright(c) 2023 Advanced Micro Devices, Inc. */

#ifndef _LM_H_
#define _LM_H_

#include <linux/fs.h>
#include <linux/mutex.h>
#include <linux/scatterlist.h>
#include <linux/types.h>

#include <linux/pds/pds_common.h>
#include <linux/pds/pds_adminq.h>

struct pds_vfio_lm_file {
	struct file *filep;
	struct mutex lock;	/* protect live migration data file */
	u64 size;		/* Size with valid data */
	u64 alloc_size;		/* Total allocated size. Always >= len */
	void *page_mem;		/* memory allocated for pages */
	struct page **pages;	/* Backing pages for file */
	unsigned long long npages;
	struct sg_table sg_table;	/* SG table for backing pages */
	struct pds_lm_sg_elem *sgl;	/* DMA mapping */
	dma_addr_t sgl_addr;
	u16 num_sge;
	struct scatterlist *last_offset_sg;	/* Iterator */
	unsigned int sg_last_entry;
	unsigned long last_offset;
};

struct pds_vfio_pci_device;

struct file *
pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
				  enum vfio_device_mig_state next);

void pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio);
void pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio);

#endif /* _LM_H_ */
drivers/vfio/pci/pds/pci_drv.c (new file, +209)
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2023 Advanced Micro Devices, Inc. */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/types.h>
#include <linux/vfio.h>

#include <linux/pds/pds_common.h>
#include <linux/pds/pds_core_if.h>
#include <linux/pds/pds_adminq.h>

#include "vfio_dev.h"
#include "pci_drv.h"
#include "cmds.h"

#define PDS_VFIO_DRV_DESCRIPTION	"AMD/Pensando VFIO Device Driver"
#define PCI_VENDOR_ID_PENSANDO		0x1dd8

static void pds_vfio_recovery(struct pds_vfio_pci_device *pds_vfio)
{
	bool deferred_reset_needed = false;

	/*
	 * Documentation states that the kernel migration driver must not
	 * generate asynchronous device state transitions outside of
	 * manipulation by the user or the VFIO_DEVICE_RESET ioctl.
	 *
	 * Since recovery is an asynchronous event received from the device,
	 * initiate a deferred reset. Issue a deferred reset in the following
	 * situations:
	 * 1. Migration is in progress, which will cause the next step of
	 *    the migration to fail.
	 * 2. If the device is in a state that will be set to
	 *    VFIO_DEVICE_STATE_RUNNING on the next action (i.e. VM is
	 *    shutdown and device is in VFIO_DEVICE_STATE_STOP).
	 */
	mutex_lock(&pds_vfio->state_mutex);
	if ((pds_vfio->state != VFIO_DEVICE_STATE_RUNNING &&
	     pds_vfio->state != VFIO_DEVICE_STATE_ERROR) ||
	    (pds_vfio->state == VFIO_DEVICE_STATE_RUNNING &&
	     pds_vfio_dirty_is_enabled(pds_vfio)))
		deferred_reset_needed = true;
	mutex_unlock(&pds_vfio->state_mutex);

	/*
	 * On the next user initiated state transition, the device will
	 * transition to the VFIO_DEVICE_STATE_ERROR. At this point it's the user's
	 * responsibility to reset the device.
	 *
	 * If a VFIO_DEVICE_RESET is requested post recovery and before the next
	 * state transition, then the deferred reset state will be set to
	 * VFIO_DEVICE_STATE_RUNNING.
	 */
	if (deferred_reset_needed) {
		spin_lock(&pds_vfio->reset_lock);
		pds_vfio->deferred_reset = true;
		pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_ERROR;
		spin_unlock(&pds_vfio->reset_lock);
	}
}

static int pds_vfio_pci_notify_handler(struct notifier_block *nb,
				       unsigned long ecode, void *data)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(nb, struct pds_vfio_pci_device, nb);
	struct device *dev = pds_vfio_to_dev(pds_vfio);
	union pds_core_notifyq_comp *event = data;

	dev_dbg(dev, "%s: event code %lu\n", __func__, ecode);

	/*
	 * We don't need to do anything for RESET state==0 as there is no notify
	 * or feedback mechanism available, and it is possible that we won't
	 * even see a state==0 event since the pds_core recovery is pending.
	 *
	 * Any requests from VFIO while state==0 will fail, which will return
	 * error and may cause migration to fail.
	 */
	if (ecode == PDS_EVENT_RESET) {
		dev_info(dev, "%s: PDS_EVENT_RESET event received, state==%d\n",
			 __func__, event->reset.state);
		/*
		 * pds_core device finished recovery and sent us the
		 * notification (state == 1) to allow us to recover
		 */
		if (event->reset.state == 1)
			pds_vfio_recovery(pds_vfio);
	}

	return 0;
}

static int
pds_vfio_pci_register_event_handler(struct pds_vfio_pci_device *pds_vfio)
{
	struct device *dev = pds_vfio_to_dev(pds_vfio);
	struct notifier_block *nb = &pds_vfio->nb;
	int err;

	if (!nb->notifier_call) {
		nb->notifier_call = pds_vfio_pci_notify_handler;
		err = pdsc_register_notify(nb);
		if (err) {
			nb->notifier_call = NULL;
			dev_err(dev,
				"failed to register pds event handler: %pe\n",
				ERR_PTR(err));
			return -EINVAL;
		}
		dev_dbg(dev, "pds event handler registered\n");
	}

	return 0;
}

static void
pds_vfio_pci_unregister_event_handler(struct pds_vfio_pci_device *pds_vfio)
{
	if (pds_vfio->nb.notifier_call) {
		pdsc_unregister_notify(&pds_vfio->nb);
		pds_vfio->nb.notifier_call = NULL;
	}
}

static int pds_vfio_pci_probe(struct pci_dev *pdev,
			      const struct pci_device_id *id)
{
	struct pds_vfio_pci_device *pds_vfio;
	int err;

	pds_vfio = vfio_alloc_device(pds_vfio_pci_device, vfio_coredev.vdev,
				     &pdev->dev, pds_vfio_ops_info());
	if (IS_ERR(pds_vfio))
		return PTR_ERR(pds_vfio);

	dev_set_drvdata(&pdev->dev, &pds_vfio->vfio_coredev);

	err = vfio_pci_core_register_device(&pds_vfio->vfio_coredev);
	if (err)
		goto out_put_vdev;

	err = pds_vfio_register_client_cmd(pds_vfio);
	if (err) {
		dev_err(&pdev->dev, "failed to register as client: %pe\n",
			ERR_PTR(err));
		goto out_unregister_coredev;
	}

	err = pds_vfio_pci_register_event_handler(pds_vfio);
	if (err)
		goto out_unregister_client;

	return 0;

out_unregister_client:
	pds_vfio_unregister_client_cmd(pds_vfio);
out_unregister_coredev:
	vfio_pci_core_unregister_device(&pds_vfio->vfio_coredev);
out_put_vdev:
	vfio_put_device(&pds_vfio->vfio_coredev.vdev);
	return err;
}

static void pds_vfio_pci_remove(struct pci_dev *pdev)
{
	struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev);

	pds_vfio_pci_unregister_event_handler(pds_vfio);
	pds_vfio_unregister_client_cmd(pds_vfio);
	vfio_pci_core_unregister_device(&pds_vfio->vfio_coredev);
	vfio_put_device(&pds_vfio->vfio_coredev.vdev);
}

static const struct pci_device_id pds_vfio_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_PENSANDO, 0x1003) }, /* Ethernet VF */
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, pds_vfio_pci_table);

static void pds_vfio_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev);

	pds_vfio_reset(pds_vfio);
}

static const struct pci_error_handlers pds_vfio_pci_err_handlers = {
	.reset_done = pds_vfio_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver pds_vfio_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = pds_vfio_pci_table,
	.probe = pds_vfio_pci_probe,
	.remove = pds_vfio_pci_remove,
	.err_handler = &pds_vfio_pci_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(pds_vfio_pci_driver);

MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION);
MODULE_AUTHOR("Brett Creeley <brett.creeley@amd.com>");
MODULE_LICENSE("GPL");
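Note how pds_vfio_pci_register_event_handler() above uses nb->notifier_call itself as the "already registered" flag, making both register and unregister idempotent. A minimal userspace sketch of that guard (illustrative names only — struct notifier, reg_count, and stub_cb are not kernel symbols):

```c
#include <assert.h>
#include <stddef.h>

/* Counts live registrations, standing in for pdsc_register_notify(). */
static int reg_count;

/* The callback pointer doubles as the "registered" flag. */
struct notifier { int (*call)(void); };

static int stub_cb(void) { return 0; }

/* Registers at most once, no matter how often it is called. */
static void register_handler(struct notifier *nb)
{
	if (!nb->call) {
		nb->call = stub_cb;
		reg_count++;
	}
}

/* Unregisters at most once; safe to call when never registered. */
static void unregister_handler(struct notifier *nb)
{
	if (nb->call) {
		nb->call = NULL;
		reg_count--;
	}
}
```

Storing the state in the callback pointer avoids a separate bool that could drift out of sync with the registration itself.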
drivers/vfio/pci/pds/pci_drv.h (new file, +9)
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright(c) 2023 Advanced Micro Devices, Inc. */

#ifndef _PCI_DRV_H
#define _PCI_DRV_H

#include <linux/pci.h>

#endif /* _PCI_DRV_H */
drivers/vfio/pci/pds/vfio_dev.c (new file, +227)
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2023 Advanced Micro Devices, Inc. */

#include <linux/vfio.h>
#include <linux/vfio_pci_core.h>

#include "lm.h"
#include "dirty.h"
#include "vfio_dev.h"

struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio)
{
	return pds_vfio->vfio_coredev.pdev;
}

struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio)
{
	return &pds_vfio_to_pci_dev(pds_vfio)->dev;
}

struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	return container_of(core_device, struct pds_vfio_pci_device,
			    vfio_coredev);
}

void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
{
again:
	spin_lock(&pds_vfio->reset_lock);
	if (pds_vfio->deferred_reset) {
		pds_vfio->deferred_reset = false;
		if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) {
			pds_vfio_put_restore_file(pds_vfio);
			pds_vfio_put_save_file(pds_vfio);
			pds_vfio_dirty_disable(pds_vfio, false);
		}
		pds_vfio->state = pds_vfio->deferred_reset_state;
		pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
		spin_unlock(&pds_vfio->reset_lock);
		goto again;
	}
	mutex_unlock(&pds_vfio->state_mutex);
	spin_unlock(&pds_vfio->reset_lock);
}

void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio)
{
	spin_lock(&pds_vfio->reset_lock);
	pds_vfio->deferred_reset = true;
	pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
	if (!mutex_trylock(&pds_vfio->state_mutex)) {
		spin_unlock(&pds_vfio->reset_lock);
		return;
	}
	spin_unlock(&pds_vfio->reset_lock);
	pds_vfio_state_mutex_unlock(pds_vfio);
}

static struct file *
pds_vfio_set_device_state(struct vfio_device *vdev,
			  enum vfio_device_mig_state new_state)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);
	struct file *res = NULL;

	mutex_lock(&pds_vfio->state_mutex);
	/*
	 * only way to transition out of VFIO_DEVICE_STATE_ERROR is via
	 * VFIO_DEVICE_RESET, so prevent the state machine from running since
	 * vfio_mig_get_next_state() will throw a WARN_ON() when transitioning
	 * from VFIO_DEVICE_STATE_ERROR to any other state
	 */
	while (pds_vfio->state != VFIO_DEVICE_STATE_ERROR &&
	       new_state != pds_vfio->state) {
		enum vfio_device_mig_state next_state;

		int err = vfio_mig_get_next_state(vdev, pds_vfio->state,
						  new_state, &next_state);
		if (err) {
			res = ERR_PTR(err);
			break;
		}

		res = pds_vfio_step_device_state_locked(pds_vfio, next_state);
		if (IS_ERR(res))
			break;

		pds_vfio->state = next_state;

		if (WARN_ON(res && new_state != pds_vfio->state)) {
			res = ERR_PTR(-EINVAL);
			break;
		}
	}
	pds_vfio_state_mutex_unlock(pds_vfio);
	/* still waiting on a deferred_reset */
	if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR)
		res = ERR_PTR(-EIO);

	return res;
}

static int pds_vfio_get_device_state(struct vfio_device *vdev,
				     enum vfio_device_mig_state *current_state)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);

	mutex_lock(&pds_vfio->state_mutex);
	*current_state = pds_vfio->state;
	pds_vfio_state_mutex_unlock(pds_vfio);
	return 0;
}

static int pds_vfio_get_device_state_size(struct vfio_device *vdev,
					  unsigned long *stop_copy_length)
{
	*stop_copy_length = PDS_LM_DEVICE_STATE_LENGTH;
	return 0;
}

static const struct vfio_migration_ops pds_vfio_lm_ops = {
	.migration_set_state = pds_vfio_set_device_state,
	.migration_get_state = pds_vfio_get_device_state,
	.migration_get_data_size = pds_vfio_get_device_state_size
};

static const struct vfio_log_ops pds_vfio_log_ops = {
	.log_start = pds_vfio_dma_logging_start,
	.log_stop = pds_vfio_dma_logging_stop,
	.log_read_and_clear = pds_vfio_dma_logging_report,
};

static int pds_vfio_init_device(struct vfio_device *vdev)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);
	struct pci_dev *pdev = to_pci_dev(vdev->dev);
	int err, vf_id, pci_id;

	vf_id = pci_iov_vf_id(pdev);
	if (vf_id < 0)
		return vf_id;

	err = vfio_pci_core_init_dev(vdev);
	if (err)
		return err;

	pds_vfio->vf_id = vf_id;

	vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
	vdev->mig_ops = &pds_vfio_lm_ops;
	vdev->log_ops = &pds_vfio_log_ops;

	pci_id = PCI_DEVID(pdev->bus->number, pdev->devfn);
	dev_dbg(&pdev->dev,
		"%s: PF %#04x VF %#04x vf_id %d domain %d pds_vfio %p\n",
		__func__, pci_dev_id(pdev->physfn), pci_id, vf_id,
		pci_domain_nr(pdev->bus), pds_vfio);

	return 0;
}

static int pds_vfio_open_device(struct vfio_device *vdev)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);
	int err;

	err = vfio_pci_core_enable(&pds_vfio->vfio_coredev);
	if (err)
		return err;

	mutex_init(&pds_vfio->state_mutex);
	pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
	pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;

	vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev);

	return 0;
}

static void pds_vfio_close_device(struct vfio_device *vdev)
{
	struct pds_vfio_pci_device *pds_vfio =
		container_of(vdev, struct pds_vfio_pci_device,
			     vfio_coredev.vdev);

	mutex_lock(&pds_vfio->state_mutex);
	pds_vfio_put_restore_file(pds_vfio);
	pds_vfio_put_save_file(pds_vfio);
	pds_vfio_dirty_disable(pds_vfio, true);
	mutex_unlock(&pds_vfio->state_mutex);
	mutex_destroy(&pds_vfio->state_mutex);
	vfio_pci_core_close_device(vdev);
}

static const struct vfio_device_ops pds_vfio_ops = {
	.name = "pds-vfio",
	.init = pds_vfio_init_device,
	.release = vfio_pci_core_release_dev,
	.open_device = pds_vfio_open_device,
	.close_device = pds_vfio_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
};

const struct vfio_device_ops *pds_vfio_ops_info(void)
{
	return &pds_vfio_ops;
}
drivers/vfio/pci/pds/vfio_dev.h (new file, +39)
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright(c) 2023 Advanced Micro Devices, Inc. */

#ifndef _VFIO_DEV_H_
#define _VFIO_DEV_H_

#include <linux/pci.h>
#include <linux/vfio_pci_core.h>

#include "dirty.h"
#include "lm.h"

struct pds_vfio_pci_device {
	struct vfio_pci_core_device vfio_coredev;

	struct pds_vfio_lm_file *save_file;
	struct pds_vfio_lm_file *restore_file;
	struct pds_vfio_dirty dirty;
	struct mutex state_mutex;	/* protect migration state */
	enum vfio_device_mig_state state;
	spinlock_t reset_lock;		/* protect reset_done flow */
	u8 deferred_reset;
	enum vfio_device_mig_state deferred_reset_state;
	struct notifier_block nb;

	int vf_id;
	u16 client_id;
};

void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio);

const struct vfio_device_ops *pds_vfio_ops_info(void);
struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev);
void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio);

struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio);
struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio);

#endif /* _VFIO_DEV_H_ */
drivers/vfio/pci/vfio_pci.c (+1)
···
 	.bind_iommufd = vfio_iommufd_physical_bind,
 	.unbind_iommufd = vfio_iommufd_physical_unbind,
 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
+	.detach_ioas = vfio_iommufd_physical_detach_ioas,
 };
 
 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
drivers/vfio/pci/vfio_pci_core.c (+157 -108)
···
 #include <linux/vgaarb.h>
 #include <linux/nospec.h>
 #include <linux/sched/mm.h>
+#include <linux/iommufd.h>
 #if IS_ENABLED(CONFIG_EEH)
 #include <asm/eeh.h>
 #endif
···
 struct vfio_pci_group_info;
 static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
 static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
-				      struct vfio_pci_group_info *groups);
+				      struct vfio_pci_group_info *groups,
+				      struct iommufd_ctx *iommufd_ctx);
 
 /*
  * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
···
 }
 
 struct vfio_pci_fill_info {
-	int max;
-	int cur;
-	struct vfio_pci_dependent_device *devices;
+	struct vfio_pci_dependent_device __user *devices;
+	struct vfio_pci_dependent_device __user *devices_end;
+	struct vfio_device *vdev;
+	u32 count;
+	u32 flags;
 };
 
 static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
 {
+	struct vfio_pci_dependent_device info = {
+		.segment = pci_domain_nr(pdev->bus),
+		.bus = pdev->bus->number,
+		.devfn = pdev->devfn,
+	};
 	struct vfio_pci_fill_info *fill = data;
-	struct iommu_group *iommu_group;
 
-	if (fill->cur == fill->max)
-		return -EAGAIN; /* Something changed, try again */
+	fill->count++;
+	if (fill->devices >= fill->devices_end)
+		return 0;
 
-	iommu_group = iommu_group_get(&pdev->dev);
-	if (!iommu_group)
-		return -EPERM; /* Cannot reset non-isolated devices */
+	if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) {
+		struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev);
+		struct vfio_device_set *dev_set = fill->vdev->dev_set;
+		struct vfio_device *vdev;
 
-	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
-	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
-	fill->devices[fill->cur].bus = pdev->bus->number;
-	fill->devices[fill->cur].devfn = pdev->devfn;
-	fill->cur++;
-	iommu_group_put(iommu_group);
+		/*
+		 * hot-reset requires all affected devices be represented in
+		 * the dev_set.
+		 */
+		vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
+		if (!vdev) {
+			info.devid = VFIO_PCI_DEVID_NOT_OWNED;
+		} else {
+			int id = vfio_iommufd_get_dev_id(vdev, iommufd);
+
+			if (id > 0)
+				info.devid = id;
+			else if (id == -ENOENT)
+				info.devid = VFIO_PCI_DEVID_OWNED;
+			else
+				info.devid = VFIO_PCI_DEVID_NOT_OWNED;
+		}
+		/* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */
+		if (info.devid == VFIO_PCI_DEVID_NOT_OWNED)
+			fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
+	} else {
+		struct iommu_group *iommu_group;
+
+		iommu_group = iommu_group_get(&pdev->dev);
+		if (!iommu_group)
+			return -EPERM; /* Cannot reset non-isolated devices */
+
+		info.group_id = iommu_group_id(iommu_group);
+		iommu_group_put(iommu_group);
+	}
+
+	if (copy_to_user(fill->devices, &info, sizeof(info)))
+		return -EFAULT;
+	fill->devices++;
 	return 0;
 }
···
 					  struct vfio_device_info __user *arg)
 {
 	unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs);
-	struct vfio_device_info info;
+	struct vfio_device_info info = {};
 	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
-	unsigned long capsz;
 	int ret;
-
-	/* For backward compatibility, cannot require this */
-	capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
 
 	if (copy_from_user(&info, arg, minsz))
 		return -EFAULT;
···
 	if (info.argsz < minsz)
 		return -EINVAL;
 
-	if (info.argsz >= capsz) {
-		minsz = capsz;
-		info.cap_offset = 0;
-	}
+	minsz = min_t(size_t, info.argsz, sizeof(info));
 
 	info.flags = VFIO_DEVICE_FLAGS_PCI;
···
 	unsigned long minsz =
 		offsetofend(struct vfio_pci_hot_reset_info, count);
 	struct vfio_pci_hot_reset_info hdr;
-	struct vfio_pci_fill_info fill = { 0 };
-	struct vfio_pci_dependent_device *devices = NULL;
+	struct vfio_pci_fill_info fill = {};
 	bool slot = false;
 	int ret = 0;
···
 	else if (pci_probe_reset_bus(vdev->pdev->bus))
 		return -ENODEV;
 
-	/* How many devices are affected? */
-	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
-					    &fill.max, slot);
+	fill.devices = arg->devices;
+	fill.devices_end = arg->devices +
+			   (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]);
+	fill.vdev = &vdev->vdev;
+
+	if (vfio_device_cdev_opened(&vdev->vdev))
+		fill.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID |
+			      VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
+
+	mutex_lock(&vdev->vdev.dev_set->lock);
+	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
+					    &fill, slot);
+	mutex_unlock(&vdev->vdev.dev_set->lock);
 	if (ret)
 		return ret;
 
-	WARN_ON(!fill.max); /* Should always be at least one */
-
-	/*
-	 * If there's enough space, fill it now, otherwise return -ENOSPC and
-	 * the number of devices affected.
-	 */
-	if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
-		ret = -ENOSPC;
-		hdr.count = fill.max;
-		goto reset_info_exit;
-	}
-
-	devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
-	if (!devices)
-		return -ENOMEM;
-
-	fill.devices = devices;
-
-	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
-					    &fill, slot);
-
-	/*
-	 * If a device was removed between counting and filling, we may come up
-	 * short of fill.max.  If a device was added, we'll have a return of
-	 * -EAGAIN above.
-	 */
-	if (!ret)
-		hdr.count = fill.cur;
-
-reset_info_exit:
+	hdr.count = fill.count;
+	hdr.flags = fill.flags;
 	if (copy_to_user(arg, &hdr, minsz))
-		ret = -EFAULT;
+		return -EFAULT;
 
-	if (!ret) {
-		if (copy_to_user(&arg->devices, devices,
-				 hdr.count * sizeof(*devices)))
-			ret = -EFAULT;
-	}
-
-	kfree(devices);
-	return ret;
+	if (fill.count > fill.devices - arg->devices)
+		return -ENOSPC;
+	return 0;
 }
 
-static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
-					struct vfio_pci_hot_reset __user *arg)
+static int
+vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
+				    int array_count, bool slot,
+				    struct vfio_pci_hot_reset __user *arg)
 {
-	unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
-	struct vfio_pci_hot_reset hdr;
 	int32_t *group_fds;
 	struct file **files;
 	struct vfio_pci_group_info info;
-	bool slot = false;
 	int file_idx, count = 0, ret = 0;
-
-	if (copy_from_user(&hdr, arg, minsz))
-		return -EFAULT;
-
-	if (hdr.argsz < minsz || hdr.flags)
-		return -EINVAL;
-
-	/* Can we do a slot or bus reset or neither? */
-	if (!pci_probe_reset_slot(vdev->pdev->slot))
-		slot = true;
-	else if (pci_probe_reset_bus(vdev->pdev->bus))
-		return -ENODEV;
 
 	/*
 	 * We can't let userspace give us an arbitrarily large buffer to copy,
···
 	if (ret)
 		return ret;
 
-	/* Somewhere between 1 and count is OK */
-	if (!hdr.count || hdr.count > count)
+	if (array_count > count)
 		return -EINVAL;
 
-	group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
-	files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL);
+	group_fds = kcalloc(array_count, sizeof(*group_fds), GFP_KERNEL);
+	files = kcalloc(array_count, sizeof(*files), GFP_KERNEL);
 	if (!group_fds || !files) {
 		kfree(group_fds);
 		kfree(files);
···
 	}
 
 	if (copy_from_user(group_fds, arg->group_fds,
-			   hdr.count * sizeof(*group_fds))) {
+			   array_count * sizeof(*group_fds))) {
 		kfree(group_fds);
 		kfree(files);
 		return -EFAULT;
 	}
 
 	/*
-	 * For each group_fd, get the group through the vfio external user
-	 * interface and store the group and iommu ID. This ensures the group
-	 * is held across the reset.
+	 * Get the group file for each fd to ensure the group is held across
+	 * the reset
 	 */
-	for (file_idx = 0; file_idx < hdr.count; file_idx++) {
+	for (file_idx = 0; file_idx < array_count; file_idx++) {
 		struct file *file = fget(group_fds[file_idx]);
 
 		if (!file) {
···
 	if (ret)
 		goto hot_reset_release;
 
-	info.count = hdr.count;
+	info.count = array_count;
 	info.files = files;
 
-	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
+	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);
 
 hot_reset_release:
 	for (file_idx--; file_idx >= 0; file_idx--)
···
 
 	kfree(files);
 	return ret;
+}
+
+static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
+					struct vfio_pci_hot_reset __user *arg)
+{
+	unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
+	struct vfio_pci_hot_reset hdr;
+	bool slot = false;
+
+	if (copy_from_user(&hdr, arg, minsz))
+		return -EFAULT;
+
+	if (hdr.argsz < minsz || hdr.flags)
+		return -EINVAL;
+
+	/* zero-length array is only for cdev opened devices */
+	if (!!hdr.count == vfio_device_cdev_opened(&vdev->vdev))
+		return -EINVAL;
+
+	/* Can we do a slot or bus reset or neither?
*/ 1411 + if (!pci_probe_reset_slot(vdev->pdev->slot)) 1412 + slot = true; 1413 + else if (pci_probe_reset_bus(vdev->pdev->bus)) 1414 + return -ENODEV; 1415 + 1416 + if (hdr.count) 1417 + return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg); 1418 + 1419 + return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL, 1420 + vfio_iommufd_device_ictx(&vdev->vdev)); 1383 1421 } 1384 1422 1385 1423 static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev, ··· 2377 2355 }; 2378 2356 EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers); 2379 2357 2380 - static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev, 2358 + static bool vfio_dev_in_groups(struct vfio_device *vdev, 2381 2359 struct vfio_pci_group_info *groups) 2382 2360 { 2383 2361 unsigned int i; 2384 2362 2363 + if (!groups) 2364 + return false; 2365 + 2385 2366 for (i = 0; i < groups->count; i++) 2386 - if (vfio_file_has_dev(groups->files[i], &vdev->vdev)) 2367 + if (vfio_file_has_dev(groups->files[i], vdev)) 2387 2368 return true; 2388 2369 return false; 2389 2370 } ··· 2394 2369 static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data) 2395 2370 { 2396 2371 struct vfio_device_set *dev_set = data; 2397 - struct vfio_device *cur; 2398 2372 2399 - list_for_each_entry(cur, &dev_set->device_list, dev_set_list) 2400 - if (cur->dev == &pdev->dev) 2401 - return 0; 2402 - return -EBUSY; 2373 + return vfio_find_device_in_devset(dev_set, &pdev->dev) ? 0 : -ENODEV; 2403 2374 } 2404 2375 2405 2376 /* ··· 2462 2441 * get each memory_lock. 
2463 2442 */ 2464 2443 static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, 2465 - struct vfio_pci_group_info *groups) 2444 + struct vfio_pci_group_info *groups, 2445 + struct iommufd_ctx *iommufd_ctx) 2466 2446 { 2467 2447 struct vfio_pci_core_device *cur_mem; 2468 2448 struct vfio_pci_core_device *cur_vma; ··· 2493 2471 goto err_unlock; 2494 2472 2495 2473 list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { 2474 + bool owned; 2475 + 2496 2476 /* 2497 - * Test whether all the affected devices are contained by the 2498 - * set of groups provided by the user. 2477 + * Test whether all the affected devices can be reset by the 2478 + * user. 2479 + * 2480 + * If called from a group opened device and the user provides 2481 + * a set of groups, all the devices in the dev_set should be 2482 + * contained by the set of groups provided by the user. 2483 + * 2484 + * If called from a cdev opened device and the user provides 2485 + * a zero-length array, all the devices in the dev_set must 2486 + * be bound to the same iommufd_ctx as the input iommufd_ctx. 2487 + * If there is any device that has not been bound to any 2488 + * iommufd_ctx yet, check if its iommu_group has any device 2489 + * bound to the input iommufd_ctx. Such devices can be 2490 + * considered owned by the input iommufd_ctx as the device 2491 + * cannot be owned by another iommufd_ctx when its iommu_group 2492 + * is owned. 2493 + * 2494 + * Otherwise, reset is not allowed. 2499 2495 */ 2500 - if (!vfio_dev_in_groups(cur_vma, groups)) { 2496 + if (iommufd_ctx) { 2497 + int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev, 2498 + iommufd_ctx); 2499 + 2500 + owned = (devid > 0 || devid == -ENOENT); 2501 + } else { 2502 + owned = vfio_dev_in_groups(&cur_vma->vdev, groups); 2503 + } 2504 + 2505 + if (!owned) { 2501 2506 ret = -EINVAL; 2502 2507 goto err_undo; 2503 2508 }
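The ownership test added to vfio_pci_dev_set_hot_reset() above has two modes: the legacy group path checks every affected device against the group fds the user supplied, while the cdev path checks each device's binding against the caller's iommufd context, where a devid of -ENOENT still counts as owned because the device's iommu_group is owned. A runnable sketch of just that decision logic follows; all names and data shapes are hypothetical, it models control flow only, not the kernel API:

```python
# Sketch of the hot-reset ownership decision in vfio_pci_dev_set_hot_reset().
# All names are hypothetical; this models only the decision logic.

ENOENT = 2  # devid == -ENOENT: not directly bound, but its iommu_group is owned

def device_owned(dev, groups, iommufd_ctx):
    """Return True if the user may reset `dev`.

    groups: set of device ids reachable through the group fds the user
            passed in (None on the cdev path, which passes a zero-length
            group array).
    iommufd_ctx: the caller's iommufd context (None on the group path).
    """
    if iommufd_ctx is not None:
        # cdev path: device must be bound to the same iommufd context, or
        # unbound while its iommu_group is owned by it (devid == -ENOENT).
        devid = dev["devid_in"].get(iommufd_ctx)
        return devid is not None and (devid > 0 or devid == -ENOENT)
    # group path: every affected device must be covered by the user's groups
    return groups is not None and dev["id"] in groups

def dev_set_resettable(dev_set, groups, iommufd_ctx):
    # Reset proceeds only if *every* device in the set is owned.
    return all(device_owned(d, groups, iommufd_ctx) for d in dev_set)

# Group path: one device missing from the provided groups blocks the reset.
devs = [{"id": 1, "devid_in": {}}, {"id": 2, "devid_in": {}}]
print(dev_set_resettable(devs, groups={1, 2}, iommufd_ctx=None))  # True
print(dev_set_resettable(devs, groups={1}, iommufd_ctx=None))     # False

# cdev path: ids > 0 are bound; -ENOENT counts as owned via the group.
devs = [{"id": 1, "devid_in": {"ictx0": 7}},
        {"id": 2, "devid_in": {"ictx0": -ENOENT}}]
print(dev_set_resettable(devs, groups=None, iommufd_ctx="ictx0"))  # True
```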
+1
drivers/vfio/platform/vfio_amba.c
··· 119 119 .bind_iommufd = vfio_iommufd_physical_bind, 120 120 .unbind_iommufd = vfio_iommufd_physical_unbind, 121 121 .attach_ioas = vfio_iommufd_physical_attach_ioas, 122 + .detach_ioas = vfio_iommufd_physical_detach_ioas, 122 123 }; 123 124 124 125 static const struct amba_id pl330_ids[] = {
+1
drivers/vfio/platform/vfio_platform.c
··· 108 108 .bind_iommufd = vfio_iommufd_physical_bind, 109 109 .unbind_iommufd = vfio_iommufd_physical_unbind, 110 110 .attach_ioas = vfio_iommufd_physical_attach_ioas, 111 + .detach_ioas = vfio_iommufd_physical_detach_ioas, 111 112 }; 112 113 113 114 static struct platform_driver vfio_platform_driver = {
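Both platform variant drivers (vfio_amba.c and vfio_platform.c) gain the new .detach_ioas callback, and vfio_register_group_dev() in the vfio_main.c hunk now refuses registration unless all four iommufd callbacks are wired up. A minimal sketch of that all-or-nothing vtable check, with hypothetical names:

```python
# Hypothetical miniature of the iommufd callback set a vfio driver must now
# fill in completely: bind, unbind, attach_ioas and the new detach_ioas.
EINVAL = 22

REQUIRED_OPS = ("bind_iommufd", "unbind_iommufd", "attach_ioas", "detach_ioas")

def check_iommufd_ops(ops):
    """Mirror of the registration-time WARN_ON check: if any of the four
    callbacks is missing, registration fails with -EINVAL."""
    if any(ops.get(name) is None for name in REQUIRED_OPS):
        return -EINVAL
    return 0

complete = {name: (lambda: 0) for name in REQUIRED_OPS}
missing = dict(complete, detach_ioas=None)

print(check_iommufd_ops(complete))  # 0
print(check_iommufd_ops(missing))   # -22
```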
+201 -17
drivers/vfio/vfio.h
··· 16 16 struct iommu_group; 17 17 struct vfio_container; 18 18 19 + struct vfio_device_file { 20 + struct vfio_device *device; 21 + struct vfio_group *group; 22 + 23 + u8 access_granted; 24 + u32 devid; /* only valid when iommufd is valid */ 25 + spinlock_t kvm_ref_lock; /* protect kvm field */ 26 + struct kvm *kvm; 27 + struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */ 28 + }; 29 + 19 30 void vfio_device_put_registration(struct vfio_device *device); 20 31 bool vfio_device_try_get_registration(struct vfio_device *device); 21 - int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd); 22 - void vfio_device_close(struct vfio_device *device, 23 - struct iommufd_ctx *iommufd); 32 + int vfio_df_open(struct vfio_device_file *df); 33 + void vfio_df_close(struct vfio_device_file *df); 34 + struct vfio_device_file * 35 + vfio_allocate_device_file(struct vfio_device *device); 24 36 25 37 extern const struct file_operations vfio_device_fops; 38 + 39 + #ifdef CONFIG_VFIO_NOIOMMU 40 + extern bool vfio_noiommu __read_mostly; 41 + #else 42 + enum { vfio_noiommu = false }; 43 + #endif 26 44 27 45 enum vfio_group_type { 28 46 /* ··· 66 48 VFIO_NO_IOMMU, 67 49 }; 68 50 51 + #if IS_ENABLED(CONFIG_VFIO_GROUP) 69 52 struct vfio_group { 70 53 struct device dev; 71 54 struct cdev cdev; ··· 93 74 struct blocking_notifier_head notifier; 94 75 struct iommufd_ctx *iommufd; 95 76 spinlock_t kvm_ref_lock; 77 + unsigned int cdev_device_open_cnt; 96 78 }; 97 79 80 + int vfio_device_block_group(struct vfio_device *device); 81 + void vfio_device_unblock_group(struct vfio_device *device); 98 82 int vfio_device_set_group(struct vfio_device *device, 99 83 enum vfio_group_type type); 100 84 void vfio_device_remove_group(struct vfio_device *device); ··· 105 83 void vfio_device_group_unregister(struct vfio_device *device); 106 84 int vfio_device_group_use_iommu(struct vfio_device *device); 107 85 void vfio_device_group_unuse_iommu(struct vfio_device 
*device); 108 - void vfio_device_group_close(struct vfio_device *device); 86 + void vfio_df_group_close(struct vfio_device_file *df); 87 + struct vfio_group *vfio_group_from_file(struct file *file); 88 + bool vfio_group_enforced_coherent(struct vfio_group *group); 89 + void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm); 109 90 bool vfio_device_has_container(struct vfio_device *device); 110 91 int __init vfio_group_init(void); 111 92 void vfio_group_cleanup(void); ··· 118 93 return IS_ENABLED(CONFIG_VFIO_NOIOMMU) && 119 94 vdev->group->type == VFIO_NO_IOMMU; 120 95 } 96 + #else 97 + struct vfio_group; 98 + 99 + static inline int vfio_device_block_group(struct vfio_device *device) 100 + { 101 + return 0; 102 + } 103 + 104 + static inline void vfio_device_unblock_group(struct vfio_device *device) 105 + { 106 + } 107 + 108 + static inline int vfio_device_set_group(struct vfio_device *device, 109 + enum vfio_group_type type) 110 + { 111 + return 0; 112 + } 113 + 114 + static inline void vfio_device_remove_group(struct vfio_device *device) 115 + { 116 + } 117 + 118 + static inline void vfio_device_group_register(struct vfio_device *device) 119 + { 120 + } 121 + 122 + static inline void vfio_device_group_unregister(struct vfio_device *device) 123 + { 124 + } 125 + 126 + static inline int vfio_device_group_use_iommu(struct vfio_device *device) 127 + { 128 + return -EOPNOTSUPP; 129 + } 130 + 131 + static inline void vfio_device_group_unuse_iommu(struct vfio_device *device) 132 + { 133 + } 134 + 135 + static inline void vfio_df_group_close(struct vfio_device_file *df) 136 + { 137 + } 138 + 139 + static inline struct vfio_group *vfio_group_from_file(struct file *file) 140 + { 141 + return NULL; 142 + } 143 + 144 + static inline bool vfio_group_enforced_coherent(struct vfio_group *group) 145 + { 146 + return true; 147 + } 148 + 149 + static inline void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm) 150 + { 151 + } 152 + 153 + static inline bool 
vfio_device_has_container(struct vfio_device *device) 154 + { 155 + return false; 156 + } 157 + 158 + static inline int __init vfio_group_init(void) 159 + { 160 + return 0; 161 + } 162 + 163 + static inline void vfio_group_cleanup(void) 164 + { 165 + } 166 + 167 + static inline bool vfio_device_is_noiommu(struct vfio_device *vdev) 168 + { 169 + return false; 170 + } 171 + #endif /* CONFIG_VFIO_GROUP */ 121 172 122 173 #if IS_ENABLED(CONFIG_VFIO_CONTAINER) 123 174 /** ··· 318 217 #endif 319 218 320 219 #if IS_ENABLED(CONFIG_IOMMUFD) 321 - int vfio_iommufd_bind(struct vfio_device *device, struct iommufd_ctx *ictx); 322 - void vfio_iommufd_unbind(struct vfio_device *device); 220 + bool vfio_iommufd_device_has_compat_ioas(struct vfio_device *vdev, 221 + struct iommufd_ctx *ictx); 222 + int vfio_df_iommufd_bind(struct vfio_device_file *df); 223 + void vfio_df_iommufd_unbind(struct vfio_device_file *df); 224 + int vfio_iommufd_compat_attach_ioas(struct vfio_device *device, 225 + struct iommufd_ctx *ictx); 323 226 #else 324 - static inline int vfio_iommufd_bind(struct vfio_device *device, 227 + static inline bool 228 + vfio_iommufd_device_has_compat_ioas(struct vfio_device *vdev, 325 229 struct iommufd_ctx *ictx) 230 + { 231 + return false; 232 + } 233 + 234 + static inline int vfio_df_iommufd_bind(struct vfio_device_file *fd) 326 235 { 327 236 return -EOPNOTSUPP; 328 237 } 329 238 330 - static inline void vfio_iommufd_unbind(struct vfio_device *device) 239 + static inline void vfio_df_iommufd_unbind(struct vfio_device_file *df) 331 240 { 332 241 } 242 + 243 + static inline int 244 + vfio_iommufd_compat_attach_ioas(struct vfio_device *device, 245 + struct iommufd_ctx *ictx) 246 + { 247 + return -EOPNOTSUPP; 248 + } 333 249 #endif 250 + 251 + int vfio_df_ioctl_attach_pt(struct vfio_device_file *df, 252 + struct vfio_device_attach_iommufd_pt __user *arg); 253 + int vfio_df_ioctl_detach_pt(struct vfio_device_file *df, 254 + struct vfio_device_detach_iommufd_pt __user *arg); 
255 + 256 + #if IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) 257 + void vfio_init_device_cdev(struct vfio_device *device); 258 + 259 + static inline int vfio_device_add(struct vfio_device *device) 260 + { 261 + /* cdev does not support noiommu device */ 262 + if (vfio_device_is_noiommu(device)) 263 + return device_add(&device->device); 264 + vfio_init_device_cdev(device); 265 + return cdev_device_add(&device->cdev, &device->device); 266 + } 267 + 268 + static inline void vfio_device_del(struct vfio_device *device) 269 + { 270 + if (vfio_device_is_noiommu(device)) 271 + device_del(&device->device); 272 + else 273 + cdev_device_del(&device->cdev, &device->device); 274 + } 275 + 276 + int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep); 277 + long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, 278 + struct vfio_device_bind_iommufd __user *arg); 279 + void vfio_df_unbind_iommufd(struct vfio_device_file *df); 280 + int vfio_cdev_init(struct class *device_class); 281 + void vfio_cdev_cleanup(void); 282 + #else 283 + static inline void vfio_init_device_cdev(struct vfio_device *device) 284 + { 285 + } 286 + 287 + static inline int vfio_device_add(struct vfio_device *device) 288 + { 289 + return device_add(&device->device); 290 + } 291 + 292 + static inline void vfio_device_del(struct vfio_device *device) 293 + { 294 + device_del(&device->device); 295 + } 296 + 297 + static inline int vfio_device_fops_cdev_open(struct inode *inode, 298 + struct file *filep) 299 + { 300 + return 0; 301 + } 302 + 303 + static inline long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, 304 + struct vfio_device_bind_iommufd __user *arg) 305 + { 306 + return -ENOTTY; 307 + } 308 + 309 + static inline void vfio_df_unbind_iommufd(struct vfio_device_file *df) 310 + { 311 + } 312 + 313 + static inline int vfio_cdev_init(struct class *device_class) 314 + { 315 + return 0; 316 + } 317 + 318 + static inline void vfio_cdev_cleanup(void) 319 + { 320 + } 321 + #endif /* 
CONFIG_VFIO_DEVICE_CDEV */ 334 322 335 323 #if IS_ENABLED(CONFIG_VFIO_VIRQFD) 336 324 int __init vfio_virqfd_init(void); ··· 434 244 } 435 245 #endif 436 246 437 - #ifdef CONFIG_VFIO_NOIOMMU 438 - extern bool vfio_noiommu __read_mostly; 439 - #else 440 - enum { vfio_noiommu = false }; 441 - #endif 442 - 443 247 #ifdef CONFIG_HAVE_KVM 444 - void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm); 248 + void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm); 445 249 void vfio_device_put_kvm(struct vfio_device *device); 446 250 #else 447 - static inline void _vfio_device_get_kvm_safe(struct vfio_device *device, 448 - struct kvm *kvm) 251 + static inline void vfio_device_get_kvm_safe(struct vfio_device *device, 252 + struct kvm *kvm) 449 253 { 450 254 } 451 255
+3 -10
drivers/vfio/vfio_iommu_type1.c
··· 2732 2732 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, 2733 2733 struct vfio_info_cap *caps) 2734 2734 { 2735 - struct vfio_iommu_type1_info_cap_migration cap_mig; 2735 + struct vfio_iommu_type1_info_cap_migration cap_mig = {}; 2736 2736 2737 2737 cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION; 2738 2738 cap_mig.header.version = 1; ··· 2762 2762 static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, 2763 2763 unsigned long arg) 2764 2764 { 2765 - struct vfio_iommu_type1_info info; 2765 + struct vfio_iommu_type1_info info = {}; 2766 2766 unsigned long minsz; 2767 2767 struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 2768 - unsigned long capsz; 2769 2768 int ret; 2770 2769 2771 2770 minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); 2772 - 2773 - /* For backward compatibility, cannot require this */ 2774 - capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); 2775 2771 2776 2772 if (copy_from_user(&info, (void __user *)arg, minsz)) 2777 2773 return -EFAULT; ··· 2775 2779 if (info.argsz < minsz) 2776 2780 return -EINVAL; 2777 2781 2778 - if (info.argsz >= capsz) { 2779 - minsz = capsz; 2780 - info.cap_offset = 0; /* output, no-recopy necessary */ 2781 - } 2782 + minsz = min_t(size_t, info.argsz, sizeof(info)); 2782 2783 2783 2784 mutex_lock(&iommu->lock); 2784 2785 info.flags = VFIO_IOMMU_INFO_PGSIZES;
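The vfio_iommu_type1_get_info() rework above replaces the special-cased cap_offset handling with a zero-initialised struct and a min_t() bound on the copy-out size, which is what closes the cap_migration information leak noted in the merge log. The argsz contract can be sketched as follows; the field layout and values here are illustrative, not the real struct:

```python
# Sketch of the VFIO "argsz" contract used by VFIO_IOMMU_GET_INFO: userspace
# states how large its buffer is, the kernel copies back
# min(argsz, sizeof(struct)) bytes of a fully zero-initialised struct
# (the zero-init is what prevents padding holes leaking stale data).
import struct

# Hypothetical layout: argsz u32, flags u32, iova_pgsizes u64, cap_offset u32, pad
INFO_FMT = "<IIQI4x"
INFO_SIZE = struct.calcsize(INFO_FMT)    # 24 bytes

def get_info(user_argsz, minsz=8):
    if user_argsz < minsz:
        return -22, b""                  # -EINVAL: buffer below required minimum
    # Build the reply from a fully initialised buffer; no uninitialised holes.
    reply = struct.pack(INFO_FMT, user_argsz, 0x1, 4096, 0)
    ncopy = min(user_argsz, INFO_SIZE)   # mirrors min_t(size_t, info.argsz, sizeof(info))
    return 0, reply[:ncopy]

err, buf = get_info(user_argsz=16)       # old userspace with a short struct
print(err, len(buf))                     # 0 16
err, buf = get_info(user_argsz=64)       # newer userspace, larger buffer
print(err, len(buf))                     # 0 24  (never more than sizeof(info))
```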
+279 -28
drivers/vfio/vfio_main.c
··· 141 141 } 142 142 EXPORT_SYMBOL_GPL(vfio_device_set_open_count); 143 143 144 + struct vfio_device * 145 + vfio_find_device_in_devset(struct vfio_device_set *dev_set, 146 + struct device *dev) 147 + { 148 + struct vfio_device *cur; 149 + 150 + lockdep_assert_held(&dev_set->lock); 151 + 152 + list_for_each_entry(cur, &dev_set->device_list, dev_set_list) 153 + if (cur->dev == dev) 154 + return cur; 155 + return NULL; 156 + } 157 + EXPORT_SYMBOL_GPL(vfio_find_device_in_devset); 158 + 144 159 /* 145 160 * Device objects - create, release, get, put, search 146 161 */ ··· 273 258 if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) && 274 259 (!device->ops->bind_iommufd || 275 260 !device->ops->unbind_iommufd || 276 - !device->ops->attach_ioas))) 261 + !device->ops->attach_ioas || 262 + !device->ops->detach_ioas))) 277 263 return -EINVAL; 278 264 279 265 /* ··· 292 276 if (ret) 293 277 return ret; 294 278 295 - ret = device_add(&device->device); 279 + /* 280 + * VFIO always sets IOMMU_CACHE because we offer no way for userspace to 281 + * restore cache coherency. It has to be checked here because it is only 282 + * valid for cases where we are using iommu groups. 283 + */ 284 + if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) && 285 + !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) { 286 + ret = -EINVAL; 287 + goto err_out; 288 + } 289 + 290 + ret = vfio_device_add(device); 296 291 if (ret) 297 292 goto err_out; 298 293 ··· 343 316 bool interrupted = false; 344 317 long rc; 345 318 319 + /* 320 + * Prevent new device opened by userspace via the 321 + * VFIO_GROUP_GET_DEVICE_FD in the group path. 322 + */ 323 + vfio_device_group_unregister(device); 324 + 325 + /* 326 + * Balances vfio_device_add() in register path, also prevents 327 + * new device opened by userspace in the cdev path. 
328 + */ 329 + vfio_device_del(device); 330 + 346 331 vfio_device_put_registration(device); 347 332 rc = try_wait_for_completion(&device->comp); 348 333 while (rc <= 0) { ··· 378 339 } 379 340 } 380 341 381 - vfio_device_group_unregister(device); 382 - 383 - /* Balances device_add in register path */ 384 - device_del(&device->device); 385 - 386 342 /* Balances vfio_device_set_group in register path */ 387 343 vfio_device_remove_group(device); 388 344 } 389 345 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); 390 346 391 347 #ifdef CONFIG_HAVE_KVM 392 - void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm) 348 + void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm) 393 349 { 394 350 void (*pfn)(struct kvm *kvm); 395 351 bool (*fn)(struct kvm *kvm); 396 352 bool ret; 397 353 398 354 lockdep_assert_held(&device->dev_set->lock); 355 + 356 + if (!kvm) 357 + return; 399 358 400 359 pfn = symbol_get(kvm_put_kvm); 401 360 if (WARN_ON(!pfn)) ··· 441 404 return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); 442 405 } 443 406 444 - static int vfio_device_first_open(struct vfio_device *device, 445 - struct iommufd_ctx *iommufd) 407 + struct vfio_device_file * 408 + vfio_allocate_device_file(struct vfio_device *device) 446 409 { 410 + struct vfio_device_file *df; 411 + 412 + df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT); 413 + if (!df) 414 + return ERR_PTR(-ENOMEM); 415 + 416 + df->device = device; 417 + spin_lock_init(&df->kvm_ref_lock); 418 + 419 + return df; 420 + } 421 + 422 + static int vfio_df_device_first_open(struct vfio_device_file *df) 423 + { 424 + struct vfio_device *device = df->device; 425 + struct iommufd_ctx *iommufd = df->iommufd; 447 426 int ret; 448 427 449 428 lockdep_assert_held(&device->dev_set->lock); ··· 468 415 return -ENODEV; 469 416 470 417 if (iommufd) 471 - ret = vfio_iommufd_bind(device, iommufd); 418 + ret = vfio_df_iommufd_bind(df); 472 419 else 473 420 ret = vfio_device_group_use_iommu(device); 474 421 if 
(ret) ··· 483 430 484 431 err_unuse_iommu: 485 432 if (iommufd) 486 - vfio_iommufd_unbind(device); 433 + vfio_df_iommufd_unbind(df); 487 434 else 488 435 vfio_device_group_unuse_iommu(device); 489 436 err_module_put: ··· 491 438 return ret; 492 439 } 493 440 494 - static void vfio_device_last_close(struct vfio_device *device, 495 - struct iommufd_ctx *iommufd) 441 + static void vfio_df_device_last_close(struct vfio_device_file *df) 496 442 { 443 + struct vfio_device *device = df->device; 444 + struct iommufd_ctx *iommufd = df->iommufd; 445 + 497 446 lockdep_assert_held(&device->dev_set->lock); 498 447 499 448 if (device->ops->close_device) 500 449 device->ops->close_device(device); 501 450 if (iommufd) 502 - vfio_iommufd_unbind(device); 451 + vfio_df_iommufd_unbind(df); 503 452 else 504 453 vfio_device_group_unuse_iommu(device); 505 454 module_put(device->dev->driver->owner); 506 455 } 507 456 508 - int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd) 457 + int vfio_df_open(struct vfio_device_file *df) 509 458 { 459 + struct vfio_device *device = df->device; 510 460 int ret = 0; 511 461 512 462 lockdep_assert_held(&device->dev_set->lock); 513 463 464 + /* 465 + * Only the group path allows the device to be opened multiple 466 + * times. The device cdev path doesn't have a secure way for it. 
467 + */ 468 + if (device->open_count != 0 && !df->group) 469 + return -EINVAL; 470 + 514 471 device->open_count++; 515 472 if (device->open_count == 1) { 516 - ret = vfio_device_first_open(device, iommufd); 473 + ret = vfio_df_device_first_open(df); 517 474 if (ret) 518 475 device->open_count--; 519 476 } ··· 531 468 return ret; 532 469 } 533 470 534 - void vfio_device_close(struct vfio_device *device, 535 - struct iommufd_ctx *iommufd) 471 + void vfio_df_close(struct vfio_device_file *df) 536 472 { 473 + struct vfio_device *device = df->device; 474 + 537 475 lockdep_assert_held(&device->dev_set->lock); 538 476 539 477 vfio_assert_device_open(device); 540 478 if (device->open_count == 1) 541 - vfio_device_last_close(device, iommufd); 479 + vfio_df_device_last_close(df); 542 480 device->open_count--; 543 481 } 544 482 ··· 581 517 */ 582 518 static int vfio_device_fops_release(struct inode *inode, struct file *filep) 583 519 { 584 - struct vfio_device *device = filep->private_data; 520 + struct vfio_device_file *df = filep->private_data; 521 + struct vfio_device *device = df->device; 585 522 586 - vfio_device_group_close(device); 523 + if (df->group) 524 + vfio_df_group_close(df); 525 + else 526 + vfio_df_unbind_iommufd(df); 587 527 588 528 vfio_device_put_registration(device); 529 + 530 + kfree(df); 589 531 590 532 return 0; 591 533 } ··· 935 865 return 0; 936 866 } 937 867 868 + void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes, 869 + u32 req_nodes) 870 + { 871 + struct interval_tree_node *prev, *curr, *comb_start, *comb_end; 872 + unsigned long min_gap, curr_gap; 873 + 874 + /* Special shortcut when a single range is required */ 875 + if (req_nodes == 1) { 876 + unsigned long last; 877 + 878 + comb_start = interval_tree_iter_first(root, 0, ULONG_MAX); 879 + curr = comb_start; 880 + while (curr) { 881 + last = curr->last; 882 + prev = curr; 883 + curr = interval_tree_iter_next(curr, 0, ULONG_MAX); 884 + if (prev != comb_start) 885 + 
interval_tree_remove(prev, root); 886 + } 887 + comb_start->last = last; 888 + return; 889 + } 890 + 891 + /* Combine ranges which have the smallest gap */ 892 + while (cur_nodes > req_nodes) { 893 + prev = NULL; 894 + min_gap = ULONG_MAX; 895 + curr = interval_tree_iter_first(root, 0, ULONG_MAX); 896 + while (curr) { 897 + if (prev) { 898 + curr_gap = curr->start - prev->last; 899 + if (curr_gap < min_gap) { 900 + min_gap = curr_gap; 901 + comb_start = prev; 902 + comb_end = curr; 903 + } 904 + } 905 + prev = curr; 906 + curr = interval_tree_iter_next(curr, 0, ULONG_MAX); 907 + } 908 + comb_start->last = comb_end->last; 909 + interval_tree_remove(comb_end, root); 910 + cur_nodes--; 911 + } 912 + } 913 + EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges); 914 + 938 915 /* Ranges should fit into a single kernel page */ 939 916 #define LOG_MAX_RANGES \ 940 917 (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range)) ··· 1204 1087 static long vfio_device_fops_unl_ioctl(struct file *filep, 1205 1088 unsigned int cmd, unsigned long arg) 1206 1089 { 1207 - struct vfio_device *device = filep->private_data; 1090 + struct vfio_device_file *df = filep->private_data; 1091 + struct vfio_device *device = df->device; 1092 + void __user *uptr = (void __user *)arg; 1208 1093 int ret; 1094 + 1095 + if (cmd == VFIO_DEVICE_BIND_IOMMUFD) 1096 + return vfio_df_ioctl_bind_iommufd(df, uptr); 1097 + 1098 + /* Paired with smp_store_release() following vfio_df_open() */ 1099 + if (!smp_load_acquire(&df->access_granted)) 1100 + return -EINVAL; 1209 1101 1210 1102 ret = vfio_device_pm_runtime_get(device); 1211 1103 if (ret) 1212 1104 return ret; 1213 1105 1106 + /* cdev only ioctls */ 1107 + if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) { 1108 + switch (cmd) { 1109 + case VFIO_DEVICE_ATTACH_IOMMUFD_PT: 1110 + ret = vfio_df_ioctl_attach_pt(df, uptr); 1111 + goto out; 1112 + 1113 + case VFIO_DEVICE_DETACH_IOMMUFD_PT: 1114 + ret = vfio_df_ioctl_detach_pt(df, uptr); 1115 + goto out; 
1116 + } 1117 + } 1118 + 1214 1119 switch (cmd) { 1215 1120 case VFIO_DEVICE_FEATURE: 1216 - ret = vfio_ioctl_device_feature(device, (void __user *)arg); 1121 + ret = vfio_ioctl_device_feature(device, uptr); 1217 1122 break; 1218 1123 1219 1124 default: ··· 1245 1106 ret = device->ops->ioctl(device, cmd, arg); 1246 1107 break; 1247 1108 } 1248 - 1109 + out: 1249 1110 vfio_device_pm_runtime_put(device); 1250 1111 return ret; 1251 1112 } ··· 1253 1114 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, 1254 1115 size_t count, loff_t *ppos) 1255 1116 { 1256 - struct vfio_device *device = filep->private_data; 1117 + struct vfio_device_file *df = filep->private_data; 1118 + struct vfio_device *device = df->device; 1119 + 1120 + /* Paired with smp_store_release() following vfio_df_open() */ 1121 + if (!smp_load_acquire(&df->access_granted)) 1122 + return -EINVAL; 1257 1123 1258 1124 if (unlikely(!device->ops->read)) 1259 1125 return -EINVAL; ··· 1270 1126 const char __user *buf, 1271 1127 size_t count, loff_t *ppos) 1272 1128 { 1273 - struct vfio_device *device = filep->private_data; 1129 + struct vfio_device_file *df = filep->private_data; 1130 + struct vfio_device *device = df->device; 1131 + 1132 + /* Paired with smp_store_release() following vfio_df_open() */ 1133 + if (!smp_load_acquire(&df->access_granted)) 1134 + return -EINVAL; 1274 1135 1275 1136 if (unlikely(!device->ops->write)) 1276 1137 return -EINVAL; ··· 1285 1136 1286 1137 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) 1287 1138 { 1288 - struct vfio_device *device = filep->private_data; 1139 + struct vfio_device_file *df = filep->private_data; 1140 + struct vfio_device *device = df->device; 1141 + 1142 + /* Paired with smp_store_release() following vfio_df_open() */ 1143 + if (!smp_load_acquire(&df->access_granted)) 1144 + return -EINVAL; 1289 1145 1290 1146 if (unlikely(!device->ops->mmap)) 1291 1147 return -EINVAL; ··· 1300 1146 1301 1147 const 
struct file_operations vfio_device_fops = { 1302 1148 .owner = THIS_MODULE, 1149 + .open = vfio_device_fops_cdev_open, 1303 1150 .release = vfio_device_fops_release, 1304 1151 .read = vfio_device_fops_read, 1305 1152 .write = vfio_device_fops_write, ··· 1308 1153 .compat_ioctl = compat_ptr_ioctl, 1309 1154 .mmap = vfio_device_fops_mmap, 1310 1155 }; 1156 + 1157 + static struct vfio_device *vfio_device_from_file(struct file *file) 1158 + { 1159 + struct vfio_device_file *df = file->private_data; 1160 + 1161 + if (file->f_op != &vfio_device_fops) 1162 + return NULL; 1163 + return df->device; 1164 + } 1165 + 1166 + /** 1167 + * vfio_file_is_valid - True if the file is valid vfio file 1168 + * @file: VFIO group file or VFIO device file 1169 + */ 1170 + bool vfio_file_is_valid(struct file *file) 1171 + { 1172 + return vfio_group_from_file(file) || 1173 + vfio_device_from_file(file); 1174 + } 1175 + EXPORT_SYMBOL_GPL(vfio_file_is_valid); 1176 + 1177 + /** 1178 + * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file 1179 + * is always CPU cache coherent 1180 + * @file: VFIO group file or VFIO device file 1181 + * 1182 + * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop 1183 + * bit in DMA transactions. A return of false indicates that the user has 1184 + * rights to access additional instructions such as wbinvd on x86. 
1185 + */ 1186 + bool vfio_file_enforced_coherent(struct file *file) 1187 + { 1188 + struct vfio_device *device; 1189 + struct vfio_group *group; 1190 + 1191 + group = vfio_group_from_file(file); 1192 + if (group) 1193 + return vfio_group_enforced_coherent(group); 1194 + 1195 + device = vfio_device_from_file(file); 1196 + if (device) 1197 + return device_iommu_capable(device->dev, 1198 + IOMMU_CAP_ENFORCE_CACHE_COHERENCY); 1199 + 1200 + return true; 1201 + } 1202 + EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); 1203 + 1204 + static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm) 1205 + { 1206 + struct vfio_device_file *df = file->private_data; 1207 + 1208 + /* 1209 + * The kvm is first recorded in the vfio_device_file, and will 1210 + * be propagated to vfio_device::kvm when the file is bound to 1211 + * iommufd successfully in the vfio device cdev path. 1212 + */ 1213 + spin_lock(&df->kvm_ref_lock); 1214 + df->kvm = kvm; 1215 + spin_unlock(&df->kvm_ref_lock); 1216 + } 1217 + 1218 + /** 1219 + * vfio_file_set_kvm - Link a kvm with VFIO drivers 1220 + * @file: VFIO group file or VFIO device file 1221 + * @kvm: KVM to link 1222 + * 1223 + * When a VFIO device is first opened the KVM will be available in 1224 + * device->kvm if one was associated with the file. 
1225 + */ 1226 + void vfio_file_set_kvm(struct file *file, struct kvm *kvm) 1227 + { 1228 + struct vfio_group *group; 1229 + 1230 + group = vfio_group_from_file(file); 1231 + if (group) 1232 + vfio_group_set_kvm(group, kvm); 1233 + 1234 + if (vfio_device_from_file(file)) 1235 + vfio_device_file_set_kvm(file, kvm); 1236 + } 1237 + EXPORT_SYMBOL_GPL(vfio_file_set_kvm); 1311 1238 1312 1239 /* 1313 1240 * Sub-module support ··· 1408 1171 { 1409 1172 void *buf; 1410 1173 struct vfio_info_cap_header *header, *tmp; 1174 + 1175 + /* Ensure that the next capability struct will be aligned */ 1176 + size = ALIGN(size, sizeof(u64)); 1411 1177 1412 1178 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL); 1413 1179 if (!buf) { ··· 1444 1204 { 1445 1205 struct vfio_info_cap_header *tmp; 1446 1206 void *buf = (void *)caps->buf; 1207 + 1208 + /* Capability structs should start with proper alignment */ 1209 + WARN_ON(!IS_ALIGNED(offset, sizeof(u64))); 1447 1210 1448 1211 for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset) 1449 1212 tmp->next += offset; ··· 1658 1415 goto err_dev_class; 1659 1416 } 1660 1417 1418 + ret = vfio_cdev_init(vfio.device_class); 1419 + if (ret) 1420 + goto err_alloc_dev_chrdev; 1421 + 1661 1422 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); 1662 1423 return 0; 1663 1424 1425 + err_alloc_dev_chrdev: 1426 + class_destroy(vfio.device_class); 1427 + vfio.device_class = NULL; 1664 1428 err_dev_class: 1665 1429 vfio_virqfd_exit(); 1666 1430 err_virqfd: ··· 1678 1428 static void __exit vfio_cleanup(void) 1679 1429 { 1680 1430 ida_destroy(&vfio.device_ida); 1431 + vfio_cdev_cleanup(); 1681 1432 class_destroy(vfio.device_class); 1682 1433 vfio.device_class = NULL; 1683 1434 vfio_virqfd_exit();
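The newly exported vfio_combine_iova_ranges() above reduces an interval tree to req_nodes ranges by repeatedly merging the adjacent pair separated by the smallest gap, with a shortcut when a single range is requested. The same algorithm over a plain sorted list, as a userspace sketch:

```python
# Sketch of vfio_combine_iova_ranges(): given sorted, non-overlapping
# [start, last] ranges, merge across the smallest gap until only
# req_nodes ranges remain.
def combine_ranges(ranges, req_nodes):
    ranges = sorted(ranges)
    if req_nodes == 1:
        # Shortcut: one range from the first start to the last end.
        return [(ranges[0][0], ranges[-1][1])]
    while len(ranges) > req_nodes:
        # Find the adjacent pair with the smallest gap (curr.start - prev.last).
        gaps = [ranges[i + 1][0] - ranges[i][1] for i in range(len(ranges) - 1)]
        i = gaps.index(min(gaps))
        ranges[i:i + 2] = [(ranges[i][0], ranges[i + 1][1])]
    return ranges

r = [(0, 0xFFF), (0x2000, 0x2FFF), (0x10000, 0x1FFFF)]
print(combine_ranges(r, 2))  # smallest gap is between the first two ranges
print(combine_ranges(r, 1))  # single range covering everything
```

Like the kernel version, this never splits a range; it only widens one range to swallow the gap to its neighbour, so the result still covers every original IOVA.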
+7
include/linux/iommufd.h
··· 16 16 struct iommufd_ctx; 17 17 struct iommufd_access; 18 18 struct file; 19 + struct iommu_group; 19 20 20 21 struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, 21 22 struct device *dev, u32 *id); ··· 24 23 25 24 int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id); 26 25 void iommufd_device_detach(struct iommufd_device *idev); 26 + 27 + struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev); 28 + u32 iommufd_device_to_id(struct iommufd_device *idev); 27 29 28 30 struct iommufd_access_ops { 29 31 u8 needs_pin_pages : 1; ··· 48 44 const struct iommufd_access_ops *ops, void *data, u32 *id); 49 45 void iommufd_access_destroy(struct iommufd_access *access); 50 46 int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id); 47 + void iommufd_access_detach(struct iommufd_access *access); 51 48 52 49 void iommufd_ctx_get(struct iommufd_ctx *ictx); 53 50 54 51 #if IS_ENABLED(CONFIG_IOMMUFD) 55 52 struct iommufd_ctx *iommufd_ctx_from_file(struct file *file); 53 + struct iommufd_ctx *iommufd_ctx_from_fd(int fd); 56 54 void iommufd_ctx_put(struct iommufd_ctx *ictx); 55 + bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group); 57 56 58 57 int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, 59 58 unsigned long length, struct page **out_pages,
+375
include/linux/pds/pds_adminq.h
··· 818 818 __le64 features; 819 819 }; 820 820 821 + #define PDS_LM_DEVICE_STATE_LENGTH 65536 822 + #define PDS_LM_CHECK_DEVICE_STATE_LENGTH(X) \ 823 + PDS_CORE_SIZE_CHECK(union, PDS_LM_DEVICE_STATE_LENGTH, X) 824 + 825 + /* 826 + * enum pds_lm_cmd_opcode - Live Migration Device commands 827 + */ 828 + enum pds_lm_cmd_opcode { 829 + PDS_LM_CMD_HOST_VF_STATUS = 1, 830 + 831 + /* Device state commands */ 832 + PDS_LM_CMD_STATE_SIZE = 16, 833 + PDS_LM_CMD_SUSPEND = 18, 834 + PDS_LM_CMD_SUSPEND_STATUS = 19, 835 + PDS_LM_CMD_RESUME = 20, 836 + PDS_LM_CMD_SAVE = 21, 837 + PDS_LM_CMD_RESTORE = 22, 838 + 839 + /* Dirty page tracking commands */ 840 + PDS_LM_CMD_DIRTY_STATUS = 32, 841 + PDS_LM_CMD_DIRTY_ENABLE = 33, 842 + PDS_LM_CMD_DIRTY_DISABLE = 34, 843 + PDS_LM_CMD_DIRTY_READ_SEQ = 35, 844 + PDS_LM_CMD_DIRTY_WRITE_ACK = 36, 845 + }; 846 + 847 + /** 848 + * struct pds_lm_cmd - generic command 849 + * @opcode: Opcode 850 + * @rsvd: Word boundary padding 851 + * @vf_id: VF id 852 + * @rsvd2: Structure padding to 60 Bytes 853 + */ 854 + struct pds_lm_cmd { 855 + u8 opcode; 856 + u8 rsvd; 857 + __le16 vf_id; 858 + u8 rsvd2[56]; 859 + }; 860 + 861 + /** 862 + * struct pds_lm_state_size_cmd - STATE_SIZE command 863 + * @opcode: Opcode 864 + * @rsvd: Word boundary padding 865 + * @vf_id: VF id 866 + */ 867 + struct pds_lm_state_size_cmd { 868 + u8 opcode; 869 + u8 rsvd; 870 + __le16 vf_id; 871 + }; 872 + 873 + /** 874 + * struct pds_lm_state_size_comp - STATE_SIZE command completion 875 + * @status: Status of the command (enum pds_core_status_code) 876 + * @rsvd: Word boundary padding 877 + * @comp_index: Index in the desc ring for which this is the completion 878 + * @size: Size of the device state 879 + * @rsvd2: Word boundary padding 880 + * @color: Color bit 881 + */ 882 + struct pds_lm_state_size_comp { 883 + u8 status; 884 + u8 rsvd; 885 + __le16 comp_index; 886 + union { 887 + __le64 size; 888 + u8 rsvd2[11]; 889 + } __packed; 890 + u8 color; 891 + }; 892 + 893 + enum 
pds_lm_suspend_resume_type { 894 + PDS_LM_SUSPEND_RESUME_TYPE_FULL = 0, 895 + PDS_LM_SUSPEND_RESUME_TYPE_P2P = 1, 896 + }; 897 + 898 + /** 899 + * struct pds_lm_suspend_cmd - SUSPEND command 900 + * @opcode: Opcode PDS_LM_CMD_SUSPEND 901 + * @rsvd: Word boundary padding 902 + * @vf_id: VF id 903 + * @type: Type of suspend (enum pds_lm_suspend_resume_type) 904 + */ 905 + struct pds_lm_suspend_cmd { 906 + u8 opcode; 907 + u8 rsvd; 908 + __le16 vf_id; 909 + u8 type; 910 + }; 911 + 912 + /** 913 + * struct pds_lm_suspend_status_cmd - SUSPEND status command 914 + * @opcode: Opcode PDS_AQ_CMD_LM_SUSPEND_STATUS 915 + * @rsvd: Word boundary padding 916 + * @vf_id: VF id 917 + * @type: Type of suspend (enum pds_lm_suspend_resume_type) 918 + */ 919 + struct pds_lm_suspend_status_cmd { 920 + u8 opcode; 921 + u8 rsvd; 922 + __le16 vf_id; 923 + u8 type; 924 + }; 925 + 926 + /** 927 + * struct pds_lm_resume_cmd - RESUME command 928 + * @opcode: Opcode PDS_LM_CMD_RESUME 929 + * @rsvd: Word boundary padding 930 + * @vf_id: VF id 931 + * @type: Type of resume (enum pds_lm_suspend_resume_type) 932 + */ 933 + struct pds_lm_resume_cmd { 934 + u8 opcode; 935 + u8 rsvd; 936 + __le16 vf_id; 937 + u8 type; 938 + }; 939 + 940 + /** 941 + * struct pds_lm_sg_elem - Transmit scatter-gather (SG) descriptor element 942 + * @addr: DMA address of SG element data buffer 943 + * @len: Length of SG element data buffer, in bytes 944 + * @rsvd: Word boundary padding 945 + */ 946 + struct pds_lm_sg_elem { 947 + __le64 addr; 948 + __le32 len; 949 + __le16 rsvd[2]; 950 + }; 951 + 952 + /** 953 + * struct pds_lm_save_cmd - SAVE command 954 + * @opcode: Opcode PDS_LM_CMD_SAVE 955 + * @rsvd: Word boundary padding 956 + * @vf_id: VF id 957 + * @rsvd2: Word boundary padding 958 + * @sgl_addr: IOVA address of the SGL to dma the device state 959 + * @num_sge: Total number of SG elements 960 + */ 961 + struct pds_lm_save_cmd { 962 + u8 opcode; 963 + u8 rsvd; 964 + __le16 vf_id; 965 + u8 rsvd2[4]; 966 + __le64 
sgl_addr; 967 + __le32 num_sge; 968 + } __packed; 969 + 970 + /** 971 + * struct pds_lm_restore_cmd - RESTORE command 972 + * @opcode: Opcode PDS_LM_CMD_RESTORE 973 + * @rsvd: Word boundary padding 974 + * @vf_id: VF id 975 + * @rsvd2: Word boundary padding 976 + * @sgl_addr: IOVA address of the SGL to dma the device state 977 + * @num_sge: Total number of SG elements 978 + */ 979 + struct pds_lm_restore_cmd { 980 + u8 opcode; 981 + u8 rsvd; 982 + __le16 vf_id; 983 + u8 rsvd2[4]; 984 + __le64 sgl_addr; 985 + __le32 num_sge; 986 + } __packed; 987 + 988 + /** 989 + * union pds_lm_dev_state - device state information 990 + * @words: Device state words 991 + */ 992 + union pds_lm_dev_state { 993 + __le32 words[PDS_LM_DEVICE_STATE_LENGTH / sizeof(__le32)]; 994 + }; 995 + 996 + enum pds_lm_host_vf_status { 997 + PDS_LM_STA_NONE = 0, 998 + PDS_LM_STA_IN_PROGRESS, 999 + PDS_LM_STA_MAX, 1000 + }; 1001 + 1002 + /** 1003 + * struct pds_lm_dirty_region_info - Memory region info for STATUS and ENABLE 1004 + * @dma_base: Base address of the DMA-contiguous memory region 1005 + * @page_count: Number of pages in the memory region 1006 + * @page_size_log2: Log2 page size in the memory region 1007 + * @rsvd: Word boundary padding 1008 + */ 1009 + struct pds_lm_dirty_region_info { 1010 + __le64 dma_base; 1011 + __le32 page_count; 1012 + u8 page_size_log2; 1013 + u8 rsvd[3]; 1014 + }; 1015 + 1016 + /** 1017 + * struct pds_lm_dirty_status_cmd - DIRTY_STATUS command 1018 + * @opcode: Opcode PDS_LM_CMD_DIRTY_STATUS 1019 + * @rsvd: Word boundary padding 1020 + * @vf_id: VF id 1021 + * @max_regions: Capacity of the region info buffer 1022 + * @rsvd2: Word boundary padding 1023 + * @regions_dma: DMA address of the region info buffer 1024 + * 1025 + * The minimum of max_regions (from the command) and num_regions (from the 1026 + * completion) of struct pds_lm_dirty_region_info will be written to 1027 + * regions_dma. 
1028 + * 1029 + * The max_regions may be zero, in which case regions_dma is ignored. In that 1030 + * case, the completion will only report the maximum number of regions 1031 + * supported by the device, and the number of regions currently enabled. 1032 + */ 1033 + struct pds_lm_dirty_status_cmd { 1034 + u8 opcode; 1035 + u8 rsvd; 1036 + __le16 vf_id; 1037 + u8 max_regions; 1038 + u8 rsvd2[3]; 1039 + __le64 regions_dma; 1040 + } __packed; 1041 + 1042 + /** 1043 + * enum pds_lm_dirty_bmp_type - Type of dirty page bitmap 1044 + * @PDS_LM_DIRTY_BMP_TYPE_NONE: No bitmap / disabled 1045 + * @PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK: Seq/Ack bitmap representation 1046 + */ 1047 + enum pds_lm_dirty_bmp_type { 1048 + PDS_LM_DIRTY_BMP_TYPE_NONE = 0, 1049 + PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK = 1, 1050 + }; 1051 + 1052 + /** 1053 + * struct pds_lm_dirty_status_comp - STATUS command completion 1054 + * @status: Status of the command (enum pds_core_status_code) 1055 + * @rsvd: Word boundary padding 1056 + * @comp_index: Index in the desc ring for which this is the completion 1057 + * @max_regions: Maximum number of regions supported by the device 1058 + * @num_regions: Number of regions currently enabled 1059 + * @bmp_type: Type of dirty bitmap representation 1060 + * @rsvd2: Word boundary padding 1061 + * @bmp_type_mask: Mask of supported bitmap types, bit index per type 1062 + * @rsvd3: Word boundary padding 1063 + * @color: Color bit 1064 + * 1065 + * This completion descriptor is used for STATUS, ENABLE, and DISABLE. 
1066 + */ 1067 + struct pds_lm_dirty_status_comp { 1068 + u8 status; 1069 + u8 rsvd; 1070 + __le16 comp_index; 1071 + u8 max_regions; 1072 + u8 num_regions; 1073 + u8 bmp_type; 1074 + u8 rsvd2; 1075 + __le32 bmp_type_mask; 1076 + u8 rsvd3[3]; 1077 + u8 color; 1078 + }; 1079 + 1080 + /** 1081 + * struct pds_lm_dirty_enable_cmd - DIRTY_ENABLE command 1082 + * @opcode: Opcode PDS_LM_CMD_DIRTY_ENABLE 1083 + * @rsvd: Word boundary padding 1084 + * @vf_id: VF id 1085 + * @bmp_type: Type of dirty bitmap representation 1086 + * @num_regions: Number of entries in the region info buffer 1087 + * @rsvd2: Word boundary padding 1088 + * @regions_dma: DMA address of the region info buffer 1089 + * 1090 + * The num_regions must be nonzero, and less than or equal to the maximum 1091 + * number of regions supported by the device. 1092 + * 1093 + * The memory regions should not overlap. 1094 + * 1095 + * The information should be initialized by the driver. The device may modify 1096 + * the information on successful completion, such as by size-aligning the 1097 + * number of pages in a region. 1098 + * 1099 + * The modified number of pages will be greater than or equal to the page count 1100 + * given in the enable command, and at least as coarsely aligned as the given 1101 + * value. For example, the count might be aligned to a multiple of 64, but 1102 + * if the value is already a multiple of 128 or higher, it will not change. 1103 + * If the driver requires its own minimum alignment of the number of pages, the 1104 + * driver should account for that already in the region info of this command. 1105 + * 1106 + * This command uses struct pds_lm_dirty_status_comp for its completion. 
1107 + */ 1108 + struct pds_lm_dirty_enable_cmd { 1109 + u8 opcode; 1110 + u8 rsvd; 1111 + __le16 vf_id; 1112 + u8 bmp_type; 1113 + u8 num_regions; 1114 + u8 rsvd2[2]; 1115 + __le64 regions_dma; 1116 + } __packed; 1117 + 1118 + /** 1119 + * struct pds_lm_dirty_disable_cmd - DIRTY_DISABLE command 1120 + * @opcode: Opcode PDS_LM_CMD_DIRTY_DISABLE 1121 + * @rsvd: Word boundary padding 1122 + * @vf_id: VF id 1123 + * 1124 + * Dirty page tracking will be disabled. This may be called in any state, as 1125 + * long as dirty page tracking is supported by the device, to ensure that dirty 1126 + * page tracking is disabled. 1127 + * 1128 + * This command uses struct pds_lm_dirty_status_comp for its completion. On 1129 + * success, num_regions will be zero. 1130 + */ 1131 + struct pds_lm_dirty_disable_cmd { 1132 + u8 opcode; 1133 + u8 rsvd; 1134 + __le16 vf_id; 1135 + }; 1136 + 1137 + /** 1138 + * struct pds_lm_dirty_seq_ack_cmd - DIRTY_READ_SEQ or _WRITE_ACK command 1139 + * @opcode: Opcode PDS_LM_CMD_DIRTY_[READ_SEQ|WRITE_ACK] 1140 + * @rsvd: Word boundary padding 1141 + * @vf_id: VF id 1142 + * @off_bytes: Byte offset in the bitmap 1143 + * @len_bytes: Number of bytes to transfer 1144 + * @num_sge: Number of DMA scatter gather elements 1145 + * @rsvd2: Word boundary padding 1146 + * @sgl_addr: DMA address of scatter gather list 1147 + * 1148 + * Read bytes from the SEQ bitmap, or write bytes into the ACK bitmap. 1149 + * 1150 + * This command treats the entire bitmap as a byte buffer. It does not 1151 + * distinguish between guest memory regions. The driver should refer to the 1152 + * number of pages in each region, according to PDS_LM_CMD_DIRTY_STATUS, to 1153 + * determine the region boundaries in the bitmap. Each region will be 1154 + * represented by exactly the number of bits as the page count for that region, 1155 + * immediately following the last bit of the previous region. 
1156 + */ 1157 + struct pds_lm_dirty_seq_ack_cmd { 1158 + u8 opcode; 1159 + u8 rsvd; 1160 + __le16 vf_id; 1161 + __le32 off_bytes; 1162 + __le32 len_bytes; 1163 + __le16 num_sge; 1164 + u8 rsvd2[2]; 1165 + __le64 sgl_addr; 1166 + } __packed; 1167 + 1168 + /** 1169 + * struct pds_lm_host_vf_status_cmd - HOST_VF_STATUS command 1170 + * @opcode: Opcode PDS_LM_CMD_HOST_VF_STATUS 1171 + * @rsvd: Word boundary padding 1172 + * @vf_id: VF id 1173 + * @status: Current LM status of host VF driver (enum pds_lm_host_status) 1174 + */ 1175 + struct pds_lm_host_vf_status_cmd { 1176 + u8 opcode; 1177 + u8 rsvd; 1178 + __le16 vf_id; 1179 + u8 status; 1180 + }; 1181 + 821 1182 union pds_core_adminq_cmd { 822 1183 u8 opcode; 823 1184 u8 bytes[64]; ··· 1205 844 struct pds_vdpa_vq_init_cmd vdpa_vq_init; 1206 845 struct pds_vdpa_vq_reset_cmd vdpa_vq_reset; 1207 846 847 + struct pds_lm_suspend_cmd lm_suspend; 848 + struct pds_lm_suspend_status_cmd lm_suspend_status; 849 + struct pds_lm_resume_cmd lm_resume; 850 + struct pds_lm_state_size_cmd lm_state_size; 851 + struct pds_lm_save_cmd lm_save; 852 + struct pds_lm_restore_cmd lm_restore; 853 + struct pds_lm_host_vf_status_cmd lm_host_vf_status; 854 + struct pds_lm_dirty_status_cmd lm_dirty_status; 855 + struct pds_lm_dirty_enable_cmd lm_dirty_enable; 856 + struct pds_lm_dirty_disable_cmd lm_dirty_disable; 857 + struct pds_lm_dirty_seq_ack_cmd lm_dirty_seq_ack; 1208 858 }; 1209 859 1210 860 union pds_core_adminq_comp { ··· 1240 868 1241 869 struct pds_vdpa_vq_init_comp vdpa_vq_init; 1242 870 struct pds_vdpa_vq_reset_comp vdpa_vq_reset; 871 + 872 + struct pds_lm_state_size_comp lm_state_size; 873 + struct pds_lm_dirty_status_comp lm_dirty_status; 1243 874 }; 1244 875 1245 876 #ifndef __CHECKER__
+6 -3
include/linux/pds/pds_common.h
··· 34 34 35 35 #define PDS_DEV_TYPE_CORE_STR "Core" 36 36 #define PDS_DEV_TYPE_VDPA_STR "vDPA" 37 - #define PDS_DEV_TYPE_VFIO_STR "VFio" 37 + #define PDS_DEV_TYPE_VFIO_STR "vfio" 38 38 #define PDS_DEV_TYPE_ETH_STR "Eth" 39 39 #define PDS_DEV_TYPE_RDMA_STR "RDMA" 40 40 #define PDS_DEV_TYPE_LM_STR "LM" 41 41 42 42 #define PDS_VDPA_DEV_NAME PDS_CORE_DRV_NAME "." PDS_DEV_TYPE_VDPA_STR 43 + #define PDS_VFIO_LM_DEV_NAME PDS_CORE_DRV_NAME "." PDS_DEV_TYPE_LM_STR "." PDS_DEV_TYPE_VFIO_STR 44 + 45 + struct pdsc; 43 46 44 47 int pdsc_register_notify(struct notifier_block *nb); 45 48 void pdsc_unregister_notify(struct notifier_block *nb); 46 49 void *pdsc_get_pf_struct(struct pci_dev *vf_pdev); 47 - int pds_client_register(struct pci_dev *pf_pdev, char *devname); 48 - int pds_client_unregister(struct pci_dev *pf_pdev, u16 client_id); 50 + int pds_client_register(struct pdsc *pf, char *devname); 51 + int pds_client_unregister(struct pdsc *pf, u16 client_id); 49 52 #endif /* _PDS_COMMON_H_ */
+64 -5
include/linux/vfio.h
··· 13 13 #include <linux/mm.h> 14 14 #include <linux/workqueue.h> 15 15 #include <linux/poll.h> 16 + #include <linux/cdev.h> 16 17 #include <uapi/linux/vfio.h> 17 18 #include <linux/iova_bitmap.h> 18 19 ··· 43 42 */ 44 43 const struct vfio_migration_ops *mig_ops; 45 44 const struct vfio_log_ops *log_ops; 45 + #if IS_ENABLED(CONFIG_VFIO_GROUP) 46 46 struct vfio_group *group; 47 + struct list_head group_next; 48 + struct list_head iommu_entry; 49 + #endif 47 50 struct vfio_device_set *dev_set; 48 51 struct list_head dev_set_list; 49 52 unsigned int migration_flags; ··· 56 51 /* Members below here are private, not for driver use */ 57 52 unsigned int index; 58 53 struct device device; /* device.kref covers object life circle */ 54 + #if IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) 55 + struct cdev cdev; 56 + #endif 59 57 refcount_t refcount; /* user count on registered device*/ 60 58 unsigned int open_count; 61 59 struct completion comp; 62 - struct list_head group_next; 63 - struct list_head iommu_entry; 64 60 struct iommufd_access *iommufd_access; 65 61 void (*put_kvm)(struct kvm *kvm); 66 62 #if IS_ENABLED(CONFIG_IOMMUFD) 67 63 struct iommufd_device *iommufd_device; 68 - bool iommufd_attached; 64 + u8 iommufd_attached:1; 69 65 #endif 66 + u8 cdev_opened:1; 70 67 }; 71 68 72 69 /** ··· 80 73 * @bind_iommufd: Called when binding the device to an iommufd 81 74 * @unbind_iommufd: Opposite of bind_iommufd 82 75 * @attach_ioas: Called when attaching device to an IOAS/HWPT managed by the 83 - * bound iommufd. Undo in unbind_iommufd. 76 + * bound iommufd. Undo in unbind_iommufd if @detach_ioas is not 77 + * called. 
78 + * @detach_ioas: Opposite of attach_ioas 84 79 * @open_device: Called when the first file descriptor is opened for this device 85 80 * @close_device: Opposite of open_device 86 81 * @read: Perform read(2) on device file descriptor ··· 106 97 struct iommufd_ctx *ictx, u32 *out_device_id); 107 98 void (*unbind_iommufd)(struct vfio_device *vdev); 108 99 int (*attach_ioas)(struct vfio_device *vdev, u32 *pt_id); 100 + void (*detach_ioas)(struct vfio_device *vdev); 109 101 int (*open_device)(struct vfio_device *vdev); 110 102 void (*close_device)(struct vfio_device *vdev); 111 103 ssize_t (*read)(struct vfio_device *vdev, char __user *buf, ··· 124 114 }; 125 115 126 116 #if IS_ENABLED(CONFIG_IOMMUFD) 117 + struct iommufd_ctx *vfio_iommufd_device_ictx(struct vfio_device *vdev); 118 + int vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx); 127 119 int vfio_iommufd_physical_bind(struct vfio_device *vdev, 128 120 struct iommufd_ctx *ictx, u32 *out_device_id); 129 121 void vfio_iommufd_physical_unbind(struct vfio_device *vdev); 130 122 int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); 123 + void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev); 131 124 int vfio_iommufd_emulated_bind(struct vfio_device *vdev, 132 125 struct iommufd_ctx *ictx, u32 *out_device_id); 133 126 void vfio_iommufd_emulated_unbind(struct vfio_device *vdev); 134 127 int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id); 128 + void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev); 135 129 #else 130 + static inline struct iommufd_ctx * 131 + vfio_iommufd_device_ictx(struct vfio_device *vdev) 132 + { 133 + return NULL; 134 + } 135 + 136 + static inline int 137 + vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx) 138 + { 139 + return VFIO_PCI_DEVID_NOT_OWNED; 140 + } 141 + 136 142 #define vfio_iommufd_physical_bind \ 137 143 ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, 
\ 138 144 u32 *out_device_id)) NULL) ··· 156 130 ((void (*)(struct vfio_device *vdev)) NULL) 157 131 #define vfio_iommufd_physical_attach_ioas \ 158 132 ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) 133 + #define vfio_iommufd_physical_detach_ioas \ 134 + ((void (*)(struct vfio_device *vdev)) NULL) 159 135 #define vfio_iommufd_emulated_bind \ 160 136 ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ 161 137 u32 *out_device_id)) NULL) ··· 165 137 ((void (*)(struct vfio_device *vdev)) NULL) 166 138 #define vfio_iommufd_emulated_attach_ioas \ 167 139 ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) 140 + #define vfio_iommufd_emulated_detach_ioas \ 141 + ((void (*)(struct vfio_device *vdev)) NULL) 168 142 #endif 143 + 144 + static inline bool vfio_device_cdev_opened(struct vfio_device *device) 145 + { 146 + return device->cdev_opened; 147 + } 169 148 170 149 /** 171 150 * struct vfio_migration_ops - VFIO bus device driver migration callbacks ··· 274 239 275 240 int vfio_assign_device_set(struct vfio_device *device, void *set_id); 276 241 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set); 242 + struct vfio_device * 243 + vfio_find_device_in_devset(struct vfio_device_set *dev_set, 244 + struct device *dev); 277 245 278 246 int vfio_mig_get_next_state(struct vfio_device *device, 279 247 enum vfio_device_mig_state cur_fsm, 280 248 enum vfio_device_mig_state new_fsm, 281 249 enum vfio_device_mig_state *next_fsm); 282 250 251 + void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes, 252 + u32 req_nodes); 253 + 283 254 /* 284 255 * External user API 285 256 */ 257 + #if IS_ENABLED(CONFIG_VFIO_GROUP) 286 258 struct iommu_group *vfio_file_iommu_group(struct file *file); 287 259 bool vfio_file_is_group(struct file *file); 260 + bool vfio_file_has_dev(struct file *file, struct vfio_device *device); 261 + #else 262 + static inline struct iommu_group *vfio_file_iommu_group(struct file *file) 263 + { 264 + return 
NULL; 265 + } 266 + 267 + static inline bool vfio_file_is_group(struct file *file) 268 + { 269 + return false; 270 + } 271 + 272 + static inline bool vfio_file_has_dev(struct file *file, struct vfio_device *device) 273 + { 274 + return false; 275 + } 276 + #endif 277 + bool vfio_file_is_valid(struct file *file); 288 278 bool vfio_file_enforced_coherent(struct file *file); 289 279 void vfio_file_set_kvm(struct file *file, struct kvm *kvm); 290 - bool vfio_file_has_dev(struct file *file, struct vfio_device *device); 291 280 292 281 #define VFIO_PIN_PAGES_MAX_ENTRIES (PAGE_SIZE/sizeof(unsigned long)) 293 282
+10 -3
include/uapi/linux/kvm.h
··· 1418 1418 __u64 addr; /* userspace address of attr data */ 1419 1419 }; 1420 1420 1421 - #define KVM_DEV_VFIO_GROUP 1 1422 - #define KVM_DEV_VFIO_GROUP_ADD 1 1423 - #define KVM_DEV_VFIO_GROUP_DEL 2 1421 + #define KVM_DEV_VFIO_FILE 1 1422 + 1423 + #define KVM_DEV_VFIO_FILE_ADD 1 1424 + #define KVM_DEV_VFIO_FILE_DEL 2 1425 + 1426 + /* KVM_DEV_VFIO_GROUP aliases are for compile time uapi compatibility */ 1427 + #define KVM_DEV_VFIO_GROUP KVM_DEV_VFIO_FILE 1428 + 1429 + #define KVM_DEV_VFIO_GROUP_ADD KVM_DEV_VFIO_FILE_ADD 1430 + #define KVM_DEV_VFIO_GROUP_DEL KVM_DEV_VFIO_FILE_DEL 1424 1431 #define KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE 3 1425 1432 1426 1433 enum kvm_device_type {
+143 -1
include/uapi/linux/vfio.h
··· 217 217 __u32 num_regions; /* Max region index + 1 */ 218 218 __u32 num_irqs; /* Max IRQ index + 1 */ 219 219 __u32 cap_offset; /* Offset within info struct of first cap */ 220 + __u32 pad; 220 221 }; 221 222 #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7) 222 223 ··· 678 677 * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12, 679 678 * struct vfio_pci_hot_reset_info) 680 679 * 680 + * This command is used to query the affected devices in the hot reset for 681 + * a given device. 682 + * 683 + * This command always reports the segment, bus, and devfn information for 684 + * each affected device, and selectively reports the group_id or devid per 685 + * the way how the calling device is opened. 686 + * 687 + * - If the calling device is opened via the traditional group/container 688 + * API, group_id is reported. User should check if it has owned all 689 + * the affected devices and provides a set of group fds to prove the 690 + * ownership in VFIO_DEVICE_PCI_HOT_RESET ioctl. 691 + * 692 + * - If the calling device is opened as a cdev, devid is reported. 693 + * Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set to indicate this 694 + * data type. All the affected devices should be represented in 695 + * the dev_set, ex. bound to a vfio driver, and also be owned by 696 + * this interface which is determined by the following conditions: 697 + * 1) Has a valid devid within the iommufd_ctx of the calling device. 698 + * Ownership cannot be determined across separate iommufd_ctx and 699 + * the cdev calling conventions do not support a proof-of-ownership 700 + * model as provided in the legacy group interface. In this case 701 + * valid devid with value greater than zero is provided in the return 702 + * structure. 
703 + * 2) Does not have a valid devid within the iommufd_ctx of the calling 704 + * device, but belongs to the same IOMMU group as the calling device 705 + * or another opened device that has a valid devid within the 706 + * iommufd_ctx of the calling device. This provides implicit ownership 707 + * for devices within the same DMA isolation context. In this case 708 + * the devid value of VFIO_PCI_DEVID_OWNED is provided in the return 709 + * structure. 710 + * 711 + * A devid value of VFIO_PCI_DEVID_NOT_OWNED is provided in the return 712 + * structure for affected devices where device is NOT represented in the 713 + * dev_set or ownership is not available. Such devices prevent the use 714 + * of VFIO_DEVICE_PCI_HOT_RESET ioctl outside of the proof-of-ownership 715 + * calling conventions (ie. via legacy group accessed devices). Flag 716 + * VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED would be set when all the 717 + * affected devices are represented in the dev_set and also owned by 718 + * the user. This flag is available only when 719 + * flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set, otherwise reserved. 720 + * When set, user could invoke VFIO_DEVICE_PCI_HOT_RESET with a zero 721 + * length fd array on the calling device as the ownership is validated 722 + * by iommufd_ctx. 723 + * 681 724 * Return: 0 on success, -errno on failure: 682 725 * -enospc = insufficient buffer, -enodev = unsupported for device. 
683 726 */ 684 727 struct vfio_pci_dependent_device { 685 - __u32 group_id; 728 + union { 729 + __u32 group_id; 730 + __u32 devid; 731 + #define VFIO_PCI_DEVID_OWNED 0 732 + #define VFIO_PCI_DEVID_NOT_OWNED -1 733 + }; 686 734 __u16 segment; 687 735 __u8 bus; 688 736 __u8 devfn; /* Use PCI_SLOT/PCI_FUNC */ ··· 740 690 struct vfio_pci_hot_reset_info { 741 691 __u32 argsz; 742 692 __u32 flags; 693 + #define VFIO_PCI_HOT_RESET_FLAG_DEV_ID (1 << 0) 694 + #define VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED (1 << 1) 743 695 __u32 count; 744 696 struct vfio_pci_dependent_device devices[]; 745 697 }; ··· 751 699 /** 752 700 * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13, 753 701 * struct vfio_pci_hot_reset) 702 + * 703 + * A PCI hot reset results in either a bus or slot reset which may affect 704 + * other devices sharing the bus/slot. The calling user must have 705 + * ownership of the full set of affected devices as determined by the 706 + * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl. 707 + * 708 + * When called on a device file descriptor acquired through the vfio 709 + * group interface, the user is required to provide proof of ownership 710 + * of those affected devices via the group_fds array in struct 711 + * vfio_pci_hot_reset. 712 + * 713 + * When called on a direct cdev opened vfio device, the flags field of 714 + * struct vfio_pci_hot_reset_info reports the ownership status of the 715 + * affected devices and this ioctl must be called with an empty group_fds 716 + * array. See above INFO ioctl definition for ownership requirements. 717 + * 718 + * Mixed usage of legacy groups and cdevs across the set of affected 719 + * devices is not supported. 754 720 * 755 721 * Return: 0 on success, -errno on failure. 
756 722 */ ··· 897 827 }; 898 828 899 829 #define VFIO_DEVICE_FEATURE _IO(VFIO_TYPE, VFIO_BASE + 17) 830 + 831 + /* 832 + * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 18, 833 + * struct vfio_device_bind_iommufd) 834 + * @argsz: User filled size of this data. 835 + * @flags: Must be 0. 836 + * @iommufd: iommufd to bind. 837 + * @out_devid: The device id generated by this bind. devid is a handle for 838 + * this device/iommufd bond and can be used in IOMMUFD commands. 839 + * 840 + * Bind a vfio_device to the specified iommufd. 841 + * 842 + * User is restricted from accessing the device before the binding operation 843 + * is completed. Only allowed on cdev fds. 844 + * 845 + * Unbind is automatically conducted when device fd is closed. 846 + * 847 + * Return: 0 on success, -errno on failure. 848 + */ 849 + struct vfio_device_bind_iommufd { 850 + __u32 argsz; 851 + __u32 flags; 852 + __s32 iommufd; 853 + __u32 out_devid; 854 + }; 855 + 856 + #define VFIO_DEVICE_BIND_IOMMUFD _IO(VFIO_TYPE, VFIO_BASE + 18) 857 + 858 + /* 859 + * VFIO_DEVICE_ATTACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 19, 860 + * struct vfio_device_attach_iommufd_pt) 861 + * @argsz: User filled size of this data. 862 + * @flags: Must be 0. 863 + * @pt_id: Input the target id which can represent an ioas or a hwpt 864 + * allocated via iommufd subsystem. 865 + * Output the input ioas id or the attached hwpt id which could 866 + * be the specified hwpt itself or a hwpt automatically created 867 + * for the specified ioas by kernel during the attachment. 868 + * 869 + * Associate the device with an address space within the bound iommufd. 870 + * Undo by VFIO_DEVICE_DETACH_IOMMUFD_PT or device fd close. This is only 871 + * allowed on cdev fds. 872 + * 873 + * Return: 0 on success, -errno on failure. 
874 + */ 875 + struct vfio_device_attach_iommufd_pt { 876 + __u32 argsz; 877 + __u32 flags; 878 + __u32 pt_id; 879 + }; 880 + 881 + #define VFIO_DEVICE_ATTACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 19) 882 + 883 + /* 884 + * VFIO_DEVICE_DETACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 20, 885 + * struct vfio_device_detach_iommufd_pt) 886 + * @argsz: User filled size of this data. 887 + * @flags: Must be 0. 888 + * 889 + * Remove the association of the device and its current associated address 890 + * space. After it, the device should be in a blocking DMA state. This is only 891 + * allowed on cdev fds. 892 + * 893 + * Return: 0 on success, -errno on failure. 894 + */ 895 + struct vfio_device_detach_iommufd_pt { 896 + __u32 argsz; 897 + __u32 flags; 898 + }; 899 + 900 + #define VFIO_DEVICE_DETACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 20) 900 901 901 902 /* 902 903 * Provide support for setting a PCI VF Token, which is used as a shared ··· 1445 1304 #define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */ 1446 1305 __u64 iova_pgsizes; /* Bitmap of supported page sizes */ 1447 1306 __u32 cap_offset; /* Offset within info struct of first cap */ 1307 + __u32 pad; 1448 1308 }; 1449 1309 1450 1310 /*
+1
samples/vfio-mdev/mbochs.c
··· 1377 1377 .bind_iommufd = vfio_iommufd_emulated_bind, 1378 1378 .unbind_iommufd = vfio_iommufd_emulated_unbind, 1379 1379 .attach_ioas = vfio_iommufd_emulated_attach_ioas, 1380 + .detach_ioas = vfio_iommufd_emulated_detach_ioas, 1380 1381 }; 1381 1382 1382 1383 static struct mdev_driver mbochs_driver = {
+1
samples/vfio-mdev/mdpy.c
··· 666 666 .bind_iommufd = vfio_iommufd_emulated_bind, 667 667 .unbind_iommufd = vfio_iommufd_emulated_unbind, 668 668 .attach_ioas = vfio_iommufd_emulated_attach_ioas, 669 + .detach_ioas = vfio_iommufd_emulated_detach_ioas, 669 670 }; 670 671 671 672 static struct mdev_driver mdpy_driver = {
+1
samples/vfio-mdev/mtty.c
··· 1272 1272 .bind_iommufd = vfio_iommufd_emulated_bind, 1273 1273 .unbind_iommufd = vfio_iommufd_emulated_unbind, 1274 1274 .attach_ioas = vfio_iommufd_emulated_attach_ioas, 1275 + .detach_ioas = vfio_iommufd_emulated_detach_ioas, 1275 1276 }; 1276 1277 1277 1278 static struct mdev_driver mtty_driver = {
+77 -84
virt/kvm/vfio.c
···
 #include <asm/kvm_ppc.h>
 #endif
 
-struct kvm_vfio_group {
+struct kvm_vfio_file {
 	struct list_head node;
 	struct file *file;
 #ifdef CONFIG_SPAPR_TCE_IOMMU
···
 };
 
 struct kvm_vfio {
-	struct list_head group_list;
+	struct list_head file_list;
 	struct mutex lock;
 	bool noncoherent;
 };
···
 	return ret;
 }
 
-static bool kvm_vfio_file_is_group(struct file *file)
+static bool kvm_vfio_file_is_valid(struct file *file)
 {
 	bool (*fn)(struct file *file);
 	bool ret;
 
-	fn = symbol_get(vfio_file_is_group);
+	fn = symbol_get(vfio_file_is_valid);
 	if (!fn)
 		return false;
 
 	ret = fn(file);
 
-	symbol_put(vfio_file_is_group);
+	symbol_put(vfio_file_is_valid);
 
 	return ret;
 }
···
 }
 
 static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm,
-					     struct kvm_vfio_group *kvg)
+					     struct kvm_vfio_file *kvf)
 {
-	if (WARN_ON_ONCE(!kvg->iommu_group))
+	if (WARN_ON_ONCE(!kvf->iommu_group))
 		return;
 
-	kvm_spapr_tce_release_iommu_group(kvm, kvg->iommu_group);
-	iommu_group_put(kvg->iommu_group);
-	kvg->iommu_group = NULL;
+	kvm_spapr_tce_release_iommu_group(kvm, kvf->iommu_group);
+	iommu_group_put(kvf->iommu_group);
+	kvf->iommu_group = NULL;
 }
 #endif
 
 /*
- * Groups can use the same or different IOMMU domains. If the same then
- * adding a new group may change the coherency of groups we've previously
- * been told about. We don't want to care about any of that so we retest
- * each group and bail as soon as we find one that's noncoherent. This
- * means we only ever [un]register_noncoherent_dma once for the whole device.
+ * Groups/devices can use the same or different IOMMU domains. If the same
+ * then adding a new group/device may change the coherency of groups/devices
+ * we've previously been told about. We don't want to care about any of
+ * that so we retest each group/device and bail as soon as we find one that's
+ * noncoherent. This means we only ever [un]register_noncoherent_dma once
+ * for the whole device.
  */
 static void kvm_vfio_update_coherency(struct kvm_device *dev)
 {
 	struct kvm_vfio *kv = dev->private;
 	bool noncoherent = false;
-	struct kvm_vfio_group *kvg;
+	struct kvm_vfio_file *kvf;
 
-	mutex_lock(&kv->lock);
-
-	list_for_each_entry(kvg, &kv->group_list, node) {
-		if (!kvm_vfio_file_enforced_coherent(kvg->file)) {
+	list_for_each_entry(kvf, &kv->file_list, node) {
+		if (!kvm_vfio_file_enforced_coherent(kvf->file)) {
 			noncoherent = true;
 			break;
 		}
···
 		else
 			kvm_arch_unregister_noncoherent_dma(dev->kvm);
 	}
-
-	mutex_unlock(&kv->lock);
 }
 
-static int kvm_vfio_group_add(struct kvm_device *dev, unsigned int fd)
+static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd)
 {
 	struct kvm_vfio *kv = dev->private;
-	struct kvm_vfio_group *kvg;
+	struct kvm_vfio_file *kvf;
 	struct file *filp;
-	int ret;
+	int ret = 0;
 
 	filp = fget(fd);
 	if (!filp)
 		return -EBADF;
 
-	/* Ensure the FD is a vfio group FD.*/
-	if (!kvm_vfio_file_is_group(filp)) {
+	/* Ensure the FD is a vfio FD. */
+	if (!kvm_vfio_file_is_valid(filp)) {
 		ret = -EINVAL;
-		goto err_fput;
+		goto out_fput;
 	}
 
 	mutex_lock(&kv->lock);
 
-	list_for_each_entry(kvg, &kv->group_list, node) {
-		if (kvg->file == filp) {
+	list_for_each_entry(kvf, &kv->file_list, node) {
+		if (kvf->file == filp) {
 			ret = -EEXIST;
-			goto err_unlock;
+			goto out_unlock;
 		}
 	}
 
-	kvg = kzalloc(sizeof(*kvg), GFP_KERNEL_ACCOUNT);
-	if (!kvg) {
+	kvf = kzalloc(sizeof(*kvf), GFP_KERNEL_ACCOUNT);
+	if (!kvf) {
 		ret = -ENOMEM;
-		goto err_unlock;
+		goto out_unlock;
 	}
 
-	kvg->file = filp;
-	list_add_tail(&kvg->node, &kv->group_list);
+	kvf->file = get_file(filp);
+	list_add_tail(&kvf->node, &kv->file_list);
 
 	kvm_arch_start_assignment(dev->kvm);
-
-	mutex_unlock(&kv->lock);
-
-	kvm_vfio_file_set_kvm(kvg->file, dev->kvm);
+	kvm_vfio_file_set_kvm(kvf->file, dev->kvm);
 	kvm_vfio_update_coherency(dev);
 
-	return 0;
-err_unlock:
+out_unlock:
 	mutex_unlock(&kv->lock);
-err_fput:
+out_fput:
 	fput(filp);
 	return ret;
 }
 
-static int kvm_vfio_group_del(struct kvm_device *dev, unsigned int fd)
+static int kvm_vfio_file_del(struct kvm_device *dev, unsigned int fd)
 {
 	struct kvm_vfio *kv = dev->private;
-	struct kvm_vfio_group *kvg;
+	struct kvm_vfio_file *kvf;
 	struct fd f;
 	int ret;
 
···
 
 	mutex_lock(&kv->lock);
 
-	list_for_each_entry(kvg, &kv->group_list, node) {
-		if (kvg->file != f.file)
+	list_for_each_entry(kvf, &kv->file_list, node) {
+		if (kvf->file != f.file)
 			continue;
 
-		list_del(&kvg->node);
+		list_del(&kvf->node);
 		kvm_arch_end_assignment(dev->kvm);
 #ifdef CONFIG_SPAPR_TCE_IOMMU
-		kvm_spapr_tce_release_vfio_group(dev->kvm, kvg);
+		kvm_spapr_tce_release_vfio_group(dev->kvm, kvf);
 #endif
-		kvm_vfio_file_set_kvm(kvg->file, NULL);
-		fput(kvg->file);
-		kfree(kvg);
+		kvm_vfio_file_set_kvm(kvf->file, NULL);
+		fput(kvf->file);
+		kfree(kvf);
 		ret = 0;
 		break;
 	}
+
+	kvm_vfio_update_coherency(dev);
 
 	mutex_unlock(&kv->lock);
 
 	fdput(f);
 
-	kvm_vfio_update_coherency(dev);
-
 	return ret;
 }
 
 #ifdef CONFIG_SPAPR_TCE_IOMMU
-static int kvm_vfio_group_set_spapr_tce(struct kvm_device *dev,
-					void __user *arg)
+static int kvm_vfio_file_set_spapr_tce(struct kvm_device *dev,
+				       void __user *arg)
 {
 	struct kvm_vfio_spapr_tce param;
 	struct kvm_vfio *kv = dev->private;
-	struct kvm_vfio_group *kvg;
+	struct kvm_vfio_file *kvf;
 	struct fd f;
 	int ret;
 
···
 
 	mutex_lock(&kv->lock);
 
-	list_for_each_entry(kvg, &kv->group_list, node) {
-		if (kvg->file != f.file)
+	list_for_each_entry(kvf, &kv->file_list, node) {
+		if (kvf->file != f.file)
 			continue;
 
-		if (!kvg->iommu_group) {
-			kvg->iommu_group = kvm_vfio_file_iommu_group(kvg->file);
-			if (WARN_ON_ONCE(!kvg->iommu_group)) {
+		if (!kvf->iommu_group) {
+			kvf->iommu_group = kvm_vfio_file_iommu_group(kvf->file);
+			if (WARN_ON_ONCE(!kvf->iommu_group)) {
 				ret = -EIO;
 				goto err_fdput;
 			}
 		}
 
 		ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, param.tablefd,
-						       kvg->iommu_group);
+						       kvf->iommu_group);
 		break;
 	}
 
···
 }
 #endif
 
-static int kvm_vfio_set_group(struct kvm_device *dev, long attr,
-			      void __user *arg)
+static int kvm_vfio_set_file(struct kvm_device *dev, long attr,
+			     void __user *arg)
 {
 	int32_t __user *argp = arg;
 	int32_t fd;
 
 	switch (attr) {
-	case KVM_DEV_VFIO_GROUP_ADD:
+	case KVM_DEV_VFIO_FILE_ADD:
 		if (get_user(fd, argp))
 			return -EFAULT;
-		return kvm_vfio_group_add(dev, fd);
+		return kvm_vfio_file_add(dev, fd);
 
-	case KVM_DEV_VFIO_GROUP_DEL:
+	case KVM_DEV_VFIO_FILE_DEL:
 		if (get_user(fd, argp))
 			return -EFAULT;
-		return kvm_vfio_group_del(dev, fd);
+		return kvm_vfio_file_del(dev, fd);
 
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 	case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE:
-		return kvm_vfio_group_set_spapr_tce(dev, arg);
+		return kvm_vfio_file_set_spapr_tce(dev, arg);
 #endif
 	}
 
···
 				   struct kvm_device_attr *attr)
 {
 	switch (attr->group) {
-	case KVM_DEV_VFIO_GROUP:
-		return kvm_vfio_set_group(dev, attr->attr,
-					  u64_to_user_ptr(attr->addr));
+	case KVM_DEV_VFIO_FILE:
+		return kvm_vfio_set_file(dev, attr->attr,
+					 u64_to_user_ptr(attr->addr));
 	}
 
 	return -ENXIO;
···
 				   struct kvm_device_attr *attr)
 {
 	switch (attr->group) {
-	case KVM_DEV_VFIO_GROUP:
+	case KVM_DEV_VFIO_FILE:
 		switch (attr->attr) {
-		case KVM_DEV_VFIO_GROUP_ADD:
-		case KVM_DEV_VFIO_GROUP_DEL:
+		case KVM_DEV_VFIO_FILE_ADD:
+		case KVM_DEV_VFIO_FILE_DEL:
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 		case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE:
 #endif
···
 static void kvm_vfio_release(struct kvm_device *dev)
 {
 	struct kvm_vfio *kv = dev->private;
-	struct kvm_vfio_group *kvg, *tmp;
+	struct kvm_vfio_file *kvf, *tmp;
 
-	list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) {
+	list_for_each_entry_safe(kvf, tmp, &kv->file_list, node) {
 #ifdef CONFIG_SPAPR_TCE_IOMMU
-		kvm_spapr_tce_release_vfio_group(dev->kvm, kvg);
+		kvm_spapr_tce_release_vfio_group(dev->kvm, kvf);
 #endif
-		kvm_vfio_file_set_kvm(kvg->file, NULL);
-		fput(kvg->file);
-		list_del(&kvg->node);
-		kfree(kvg);
+		kvm_vfio_file_set_kvm(kvf->file, NULL);
+		fput(kvf->file);
+		list_del(&kvf->node);
+		kfree(kvf);
 		kvm_arch_end_assignment(dev->kvm);
 	}
···
 	if (!kv)
 		return -ENOMEM;
 
-	INIT_LIST_HEAD(&kv->group_list);
+	INIT_LIST_HEAD(&kv->file_list);
 	mutex_init(&kv->lock);
 
 	dev->private = kv;