Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'vfio-v5.15-rc1' of git://github.com/awilliam/linux-vfio

Pull VFIO updates from Alex Williamson:

- Fix dma-valid return WAITED implementation (Anthony Yznaga)

- SPDX license cleanups (Cai Huoqing)

- Split vfio-pci-core from vfio-pci and enhance PCI driver matching to
support future vendor-provided vfio-pci variants (Yishai Hadas, Max
Gurtovoy, Jason Gunthorpe)

- Replace duplicated reflck with core support for managing first open,
last close, and device sets (Jason Gunthorpe, Max Gurtovoy, Yishai
Hadas)

- Fix non-modular mdev support and don't nag about request callback
support (Christoph Hellwig)

- Add semaphore to protect instruction intercept handler and replace
open-coded locks in vfio-ap driver (Tony Krowiak)

- Convert vfio-ap to vfio_register_group_dev() API (Jason Gunthorpe)

* tag 'vfio-v5.15-rc1' of git://github.com/awilliam/linux-vfio: (37 commits)
vfio/pci: Introduce vfio_pci_core.ko
vfio: Use kconfig if XX/endif blocks instead of repeating 'depends on'
vfio: Use select for eventfd
PCI / VFIO: Add 'override_only' support for VFIO PCI sub system
PCI: Add 'override_only' field to struct pci_device_id
vfio/pci: Move module parameters to vfio_pci.c
vfio/pci: Move igd initialization to vfio_pci.c
vfio/pci: Split the pci_driver code out of vfio_pci_core.c
vfio/pci: Include vfio header in vfio_pci_core.h
vfio/pci: Rename ops functions to fit core namings
vfio/pci: Rename vfio_pci_device to vfio_pci_core_device
vfio/pci: Rename vfio_pci_private.h to vfio_pci_core.h
vfio/pci: Rename vfio_pci.c to vfio_pci_core.c
vfio/ap_ops: Convert to use vfio_register_group_dev()
s390/vfio-ap: replace open coded locks for VFIO_GROUP_NOTIFY_SET_KVM notification
s390/vfio-ap: r/w lock for PQAP interception handler function pointer
vfio/type1: Fix vfio_find_dma_valid return
vfio-pci/zdev: Remove repeated verbose license text
vfio: platform: reset: Convert to SPDX identifier
vfio: Remove struct vfio_device_ops open/release
...

+2956 -2915
+1
Documentation/PCI/pci.rst
··· 103 103 - subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF) 104 104 - class and classmask fields default to 0 105 105 - driver_data defaults to 0UL. 106 + - override_only field defaults to 0. 106 107 107 108 Note that driver_data must match the value used by any of the pci_device_id 108 109 entries defined in the driver. This makes the driver_data field mandatory
+3 -1
Documentation/driver-api/vfio.rst
··· 255 255 void vfio_init_group_dev(struct vfio_device *device, 256 256 struct device *dev, 257 257 const struct vfio_device_ops *ops); 258 + void vfio_uninit_group_dev(struct vfio_device *device); 258 259 int vfio_register_group_dev(struct vfio_device *device); 259 260 void vfio_unregister_group_dev(struct vfio_device *device); 260 261 261 262 The driver should embed the vfio_device in its own structure and call 262 - vfio_init_group_dev() to pre-configure it before going to registration. 263 + vfio_init_group_dev() to pre-configure it before going to registration 264 + and call vfio_uninit_group_dev() after completing the un-registration. 263 265 vfio_register_group_dev() indicates to the core to begin tracking the 264 266 iommu_group of the specified dev and register the dev as owned by a VFIO bus 265 267 driver. Once vfio_register_group_dev() returns it is possible for userspace to
+1
MAINTAINERS
··· 19607 19607 F: Documentation/driver-api/vfio.rst 19608 19608 F: drivers/vfio/ 19609 19609 F: include/linux/vfio.h 19610 + F: include/linux/vfio_pci_core.h 19610 19611 F: include/uapi/linux/vfio.h 19611 19612 19612 19613 VFIO FSL-MC DRIVER
+3 -5
arch/s390/include/asm/kvm_host.h
··· 798 798 unsigned short ibc; 799 799 }; 800 800 801 - struct kvm_s390_module_hook { 802 - int (*hook)(struct kvm_vcpu *vcpu); 803 - struct module *owner; 804 - }; 801 + typedef int (*crypto_hook)(struct kvm_vcpu *vcpu); 805 802 806 803 struct kvm_s390_crypto { 807 804 struct kvm_s390_crypto_cb *crycb; 808 - struct kvm_s390_module_hook *pqap_hook; 805 + struct rw_semaphore pqap_hook_rwsem; 806 + crypto_hook *pqap_hook; 809 807 __u32 crycbd; 810 808 __u8 aes_kw; 811 809 __u8 dea_kw;
+28 -4
arch/s390/kvm/kvm-s390.c
··· 2559 2559 kvm->arch.crypto.crycbd |= CRYCB_FORMAT1; 2560 2560 } 2561 2561 2562 + /* 2563 + * kvm_arch_crypto_set_masks 2564 + * 2565 + * @kvm: pointer to the target guest's KVM struct containing the crypto masks 2566 + * to be set. 2567 + * @apm: the mask identifying the accessible AP adapters 2568 + * @aqm: the mask identifying the accessible AP domains 2569 + * @adm: the mask identifying the accessible AP control domains 2570 + * 2571 + * Set the masks that identify the adapters, domains and control domains to 2572 + * which the KVM guest is granted access. 2573 + * 2574 + * Note: The kvm->lock mutex must be locked by the caller before invoking this 2575 + * function. 2576 + */ 2562 2577 void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, 2563 2578 unsigned long *aqm, unsigned long *adm) 2564 2579 { 2565 2580 struct kvm_s390_crypto_cb *crycb = kvm->arch.crypto.crycb; 2566 2581 2567 - mutex_lock(&kvm->lock); 2568 2582 kvm_s390_vcpu_block_all(kvm); 2569 2583 2570 2584 switch (kvm->arch.crypto.crycbd & CRYCB_FORMAT_MASK) { ··· 2609 2595 /* recreate the shadow crycb for each vcpu */ 2610 2596 kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); 2611 2597 kvm_s390_vcpu_unblock_all(kvm); 2612 - mutex_unlock(&kvm->lock); 2613 2598 } 2614 2599 EXPORT_SYMBOL_GPL(kvm_arch_crypto_set_masks); 2615 2600 2601 + /* 2602 + * kvm_arch_crypto_clear_masks 2603 + * 2604 + * @kvm: pointer to the target guest's KVM struct containing the crypto masks 2605 + * to be cleared. 2606 + * 2607 + * Clear the masks that identify the adapters, domains and control domains to 2608 + * which the KVM guest is granted access. 2609 + * 2610 + * Note: The kvm->lock mutex must be locked by the caller before invoking this 2611 + * function. 
2612 + */ 2616 2613 void kvm_arch_crypto_clear_masks(struct kvm *kvm) 2617 2614 { 2618 - mutex_lock(&kvm->lock); 2619 2615 kvm_s390_vcpu_block_all(kvm); 2620 2616 2621 2617 memset(&kvm->arch.crypto.crycb->apcb0, 0, ··· 2637 2613 /* recreate the shadow crycb for each vcpu */ 2638 2614 kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); 2639 2615 kvm_s390_vcpu_unblock_all(kvm); 2640 - mutex_unlock(&kvm->lock); 2641 2616 } 2642 2617 EXPORT_SYMBOL_GPL(kvm_arch_crypto_clear_masks); 2643 2618 ··· 2653 2630 { 2654 2631 kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb; 2655 2632 kvm_s390_set_crycb_format(kvm); 2633 + init_rwsem(&kvm->arch.crypto.pqap_hook_rwsem); 2656 2634 2657 2635 if (!test_kvm_facility(kvm, 76)) 2658 2636 return;
+9 -6
arch/s390/kvm/priv.c
··· 610 610 static int handle_pqap(struct kvm_vcpu *vcpu) 611 611 { 612 612 struct ap_queue_status status = {}; 613 + crypto_hook pqap_hook; 613 614 unsigned long reg0; 614 615 int ret; 615 616 uint8_t fc; ··· 655 654 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 656 655 657 656 /* 658 - * Verify that the hook callback is registered, lock the owner 659 - * and call the hook. 657 + * If the hook callback is registered, there will be a pointer to the 658 + * hook function pointer in the kvm_s390_crypto structure. Lock the 659 + * owner, retrieve the hook function pointer and call the hook. 660 660 */ 661 + down_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); 661 662 if (vcpu->kvm->arch.crypto.pqap_hook) { 662 - if (!try_module_get(vcpu->kvm->arch.crypto.pqap_hook->owner)) 663 - return -EOPNOTSUPP; 664 - ret = vcpu->kvm->arch.crypto.pqap_hook->hook(vcpu); 665 - module_put(vcpu->kvm->arch.crypto.pqap_hook->owner); 663 + pqap_hook = *vcpu->kvm->arch.crypto.pqap_hook; 664 + ret = pqap_hook(vcpu); 666 665 if (!ret && vcpu->run->s.regs.gprs[1] & 0x00ff0000) 667 666 kvm_s390_set_psw_cc(vcpu, 3); 667 + up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); 668 668 return ret; 669 669 } 670 + up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); 670 671 /* 671 672 * A vfio_driver must register a hook. 672 673 * No hook means no driver to enable the SIE CRYCB and no queues.
+4 -4
drivers/gpu/drm/i915/gvt/kvmgt.c
··· 885 885 return NOTIFY_OK; 886 886 } 887 887 888 - static int intel_vgpu_open(struct mdev_device *mdev) 888 + static int intel_vgpu_open_device(struct mdev_device *mdev) 889 889 { 890 890 struct intel_vgpu *vgpu = mdev_get_drvdata(mdev); 891 891 struct kvmgt_vdev *vdev = kvmgt_vdev(vgpu); ··· 1004 1004 vgpu->handle = 0; 1005 1005 } 1006 1006 1007 - static void intel_vgpu_release(struct mdev_device *mdev) 1007 + static void intel_vgpu_close_device(struct mdev_device *mdev) 1008 1008 { 1009 1009 struct intel_vgpu *vgpu = mdev_get_drvdata(mdev); 1010 1010 ··· 1753 1753 .create = intel_vgpu_create, 1754 1754 .remove = intel_vgpu_remove, 1755 1755 1756 - .open = intel_vgpu_open, 1757 - .release = intel_vgpu_release, 1756 + .open_device = intel_vgpu_open_device, 1757 + .close_device = intel_vgpu_close_device, 1758 1758 1759 1759 .read = intel_vgpu_read, 1760 1760 .write = intel_vgpu_write,
+21 -7
drivers/pci/pci-driver.c
··· 136 136 struct pci_dev *dev) 137 137 { 138 138 struct pci_dynid *dynid; 139 - const struct pci_device_id *found_id = NULL; 139 + const struct pci_device_id *found_id = NULL, *ids; 140 140 141 141 /* When driver_override is set, only bind to the matching driver */ 142 142 if (dev->driver_override && strcmp(dev->driver_override, drv->name)) ··· 152 152 } 153 153 spin_unlock(&drv->dynids.lock); 154 154 155 - if (!found_id) 156 - found_id = pci_match_id(drv->id_table, dev); 155 + if (found_id) 156 + return found_id; 157 + 158 + for (ids = drv->id_table; (found_id = pci_match_id(ids, dev)); 159 + ids = found_id + 1) { 160 + /* 161 + * The match table is split based on driver_override. 162 + * In case override_only was set, enforce driver_override 163 + * matching. 164 + */ 165 + if (found_id->override_only) { 166 + if (dev->driver_override) 167 + return found_id; 168 + } else { 169 + return found_id; 170 + } 171 + } 157 172 158 173 /* driver_override will always match, send a dummy id */ 159 - if (!found_id && dev->driver_override) 160 - found_id = &pci_device_id_any; 161 - 162 - return found_id; 174 + if (dev->driver_override) 175 + return &pci_device_id_any; 176 + return NULL; 163 177 } 164 178 165 179 /**
+4 -4
drivers/s390/cio/vfio_ccw_ops.c
··· 159 159 return 0; 160 160 } 161 161 162 - static int vfio_ccw_mdev_open(struct mdev_device *mdev) 162 + static int vfio_ccw_mdev_open_device(struct mdev_device *mdev) 163 163 { 164 164 struct vfio_ccw_private *private = 165 165 dev_get_drvdata(mdev_parent_dev(mdev)); ··· 194 194 return ret; 195 195 } 196 196 197 - static void vfio_ccw_mdev_release(struct mdev_device *mdev) 197 + static void vfio_ccw_mdev_close_device(struct mdev_device *mdev) 198 198 { 199 199 struct vfio_ccw_private *private = 200 200 dev_get_drvdata(mdev_parent_dev(mdev)); ··· 638 638 .supported_type_groups = mdev_type_groups, 639 639 .create = vfio_ccw_mdev_create, 640 640 .remove = vfio_ccw_mdev_remove, 641 - .open = vfio_ccw_mdev_open, 642 - .release = vfio_ccw_mdev_release, 641 + .open_device = vfio_ccw_mdev_open_device, 642 + .close_device = vfio_ccw_mdev_close_device, 643 643 .read = vfio_ccw_mdev_read, 644 644 .write = vfio_ccw_mdev_write, 645 645 .ioctl = vfio_ccw_mdev_ioctl,
+132 -150
drivers/s390/crypto/vfio_ap_ops.c
··· 24 24 #define VFIO_AP_MDEV_TYPE_HWVIRT "passthrough" 25 25 #define VFIO_AP_MDEV_NAME_HWVIRT "VFIO AP Passthrough Device" 26 26 27 - static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev); 27 + static int vfio_ap_mdev_reset_queues(struct ap_matrix_mdev *matrix_mdev); 28 28 static struct vfio_ap_queue *vfio_ap_find_queue(int apqn); 29 + static const struct vfio_device_ops vfio_ap_matrix_dev_ops; 29 30 30 31 static int match_apqn(struct device *dev, const void *data) 31 32 { ··· 296 295 matrix_mdev = container_of(vcpu->kvm->arch.crypto.pqap_hook, 297 296 struct ap_matrix_mdev, pqap_hook); 298 297 299 - /* 300 - * If the KVM pointer is in the process of being set, wait until the 301 - * process has completed. 302 - */ 303 - wait_event_cmd(matrix_mdev->wait_for_kvm, 304 - !matrix_mdev->kvm_busy, 305 - mutex_unlock(&matrix_dev->lock), 306 - mutex_lock(&matrix_dev->lock)); 307 - 308 298 /* If the there is no guest using the mdev, there is nothing to do */ 309 299 if (!matrix_mdev->kvm) 310 300 goto out_unlock; ··· 328 336 matrix->adm_max = info->apxa ? 
info->Nd : 15; 329 337 } 330 338 331 - static int vfio_ap_mdev_create(struct mdev_device *mdev) 339 + static int vfio_ap_mdev_probe(struct mdev_device *mdev) 332 340 { 333 341 struct ap_matrix_mdev *matrix_mdev; 342 + int ret; 334 343 335 344 if ((atomic_dec_if_positive(&matrix_dev->available_instances) < 0)) 336 345 return -EPERM; 337 346 338 347 matrix_mdev = kzalloc(sizeof(*matrix_mdev), GFP_KERNEL); 339 348 if (!matrix_mdev) { 340 - atomic_inc(&matrix_dev->available_instances); 341 - return -ENOMEM; 349 + ret = -ENOMEM; 350 + goto err_dec_available; 342 351 } 352 + vfio_init_group_dev(&matrix_mdev->vdev, &mdev->dev, 353 + &vfio_ap_matrix_dev_ops); 343 354 344 355 matrix_mdev->mdev = mdev; 345 356 vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->matrix); 346 - init_waitqueue_head(&matrix_mdev->wait_for_kvm); 347 - mdev_set_drvdata(mdev, matrix_mdev); 348 - matrix_mdev->pqap_hook.hook = handle_pqap; 349 - matrix_mdev->pqap_hook.owner = THIS_MODULE; 357 + matrix_mdev->pqap_hook = handle_pqap; 350 358 mutex_lock(&matrix_dev->lock); 351 359 list_add(&matrix_mdev->node, &matrix_dev->mdev_list); 352 360 mutex_unlock(&matrix_dev->lock); 353 361 362 + ret = vfio_register_group_dev(&matrix_mdev->vdev); 363 + if (ret) 364 + goto err_list; 365 + dev_set_drvdata(&mdev->dev, matrix_mdev); 354 366 return 0; 367 + 368 + err_list: 369 + mutex_lock(&matrix_dev->lock); 370 + list_del(&matrix_mdev->node); 371 + mutex_unlock(&matrix_dev->lock); 372 + kfree(matrix_mdev); 373 + err_dec_available: 374 + atomic_inc(&matrix_dev->available_instances); 375 + return ret; 355 376 } 356 377 357 - static int vfio_ap_mdev_remove(struct mdev_device *mdev) 378 + static void vfio_ap_mdev_remove(struct mdev_device *mdev) 358 379 { 359 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 380 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(&mdev->dev); 381 + 382 + vfio_unregister_group_dev(&matrix_mdev->vdev); 360 383 361 384 mutex_lock(&matrix_dev->lock); 362 - 
vfio_ap_mdev_reset_queues(mdev); 385 + vfio_ap_mdev_reset_queues(matrix_mdev); 363 386 list_del(&matrix_mdev->node); 364 387 kfree(matrix_mdev); 365 - mdev_set_drvdata(mdev, NULL); 366 388 atomic_inc(&matrix_dev->available_instances); 367 389 mutex_unlock(&matrix_dev->lock); 368 - 369 - return 0; 370 390 } 371 391 372 392 static ssize_t name_show(struct mdev_type *mtype, ··· 618 614 { 619 615 int ret; 620 616 unsigned long apid; 621 - struct mdev_device *mdev = mdev_from_dev(dev); 622 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 617 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); 623 618 624 619 mutex_lock(&matrix_dev->lock); 625 620 626 - /* 627 - * If the KVM pointer is in flux or the guest is running, disallow 628 - * un-assignment of adapter 629 - */ 630 - if (matrix_mdev->kvm_busy || matrix_mdev->kvm) { 621 + /* If the KVM guest is running, disallow assignment of adapter */ 622 + if (matrix_mdev->kvm) { 631 623 ret = -EBUSY; 632 624 goto done; 633 625 } ··· 685 685 { 686 686 int ret; 687 687 unsigned long apid; 688 - struct mdev_device *mdev = mdev_from_dev(dev); 689 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 688 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); 690 689 691 690 mutex_lock(&matrix_dev->lock); 692 691 693 - /* 694 - * If the KVM pointer is in flux or the guest is running, disallow 695 - * un-assignment of adapter 696 - */ 697 - if (matrix_mdev->kvm_busy || matrix_mdev->kvm) { 692 + /* If the KVM guest is running, disallow unassignment of adapter */ 693 + if (matrix_mdev->kvm) { 698 694 ret = -EBUSY; 699 695 goto done; 700 696 } ··· 769 773 { 770 774 int ret; 771 775 unsigned long apqi; 772 - struct mdev_device *mdev = mdev_from_dev(dev); 773 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 776 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); 774 777 unsigned long max_apqi = matrix_mdev->matrix.aqm_max; 775 778 776 779 mutex_lock(&matrix_dev->lock); 777 780 
778 - /* 779 - * If the KVM pointer is in flux or the guest is running, disallow 780 - * assignment of domain 781 - */ 782 - if (matrix_mdev->kvm_busy || matrix_mdev->kvm) { 781 + /* If the KVM guest is running, disallow assignment of domain */ 782 + if (matrix_mdev->kvm) { 783 783 ret = -EBUSY; 784 784 goto done; 785 785 } ··· 832 840 { 833 841 int ret; 834 842 unsigned long apqi; 835 - struct mdev_device *mdev = mdev_from_dev(dev); 836 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 843 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); 837 844 838 845 mutex_lock(&matrix_dev->lock); 839 846 840 - /* 841 - * If the KVM pointer is in flux or the guest is running, disallow 842 - * un-assignment of domain 843 - */ 844 - if (matrix_mdev->kvm_busy || matrix_mdev->kvm) { 847 + /* If the KVM guest is running, disallow unassignment of domain */ 848 + if (matrix_mdev->kvm) { 845 849 ret = -EBUSY; 846 850 goto done; 847 851 } ··· 881 893 { 882 894 int ret; 883 895 unsigned long id; 884 - struct mdev_device *mdev = mdev_from_dev(dev); 885 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 896 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); 886 897 887 898 mutex_lock(&matrix_dev->lock); 888 899 889 - /* 890 - * If the KVM pointer is in flux or the guest is running, disallow 891 - * assignment of control domain. 
892 - */ 893 - if (matrix_mdev->kvm_busy || matrix_mdev->kvm) { 900 + /* If the KVM guest is running, disallow assignment of control domain */ 901 + if (matrix_mdev->kvm) { 894 902 ret = -EBUSY; 895 903 goto done; 896 904 } ··· 933 949 { 934 950 int ret; 935 951 unsigned long domid; 936 - struct mdev_device *mdev = mdev_from_dev(dev); 937 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 952 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); 938 953 unsigned long max_domid = matrix_mdev->matrix.adm_max; 939 954 940 955 mutex_lock(&matrix_dev->lock); 941 956 942 - /* 943 - * If the KVM pointer is in flux or the guest is running, disallow 944 - * un-assignment of control domain. 945 - */ 946 - if (matrix_mdev->kvm_busy || matrix_mdev->kvm) { 957 + /* If a KVM guest is running, disallow unassignment of control domain */ 958 + if (matrix_mdev->kvm) { 947 959 ret = -EBUSY; 948 960 goto done; 949 961 } ··· 968 988 int nchars = 0; 969 989 int n; 970 990 char *bufpos = buf; 971 - struct mdev_device *mdev = mdev_from_dev(dev); 972 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 991 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); 973 992 unsigned long max_domid = matrix_mdev->matrix.adm_max; 974 993 975 994 mutex_lock(&matrix_dev->lock); ··· 986 1007 static ssize_t matrix_show(struct device *dev, struct device_attribute *attr, 987 1008 char *buf) 988 1009 { 989 - struct mdev_device *mdev = mdev_from_dev(dev); 990 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 1010 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); 991 1011 char *bufpos = buf; 992 1012 unsigned long apid; 993 1013 unsigned long apqi; ··· 1076 1098 struct ap_matrix_mdev *m; 1077 1099 1078 1100 if (kvm->arch.crypto.crycbd) { 1101 + down_write(&kvm->arch.crypto.pqap_hook_rwsem); 1102 + kvm->arch.crypto.pqap_hook = &matrix_mdev->pqap_hook; 1103 + up_write(&kvm->arch.crypto.pqap_hook_rwsem); 1104 + 1105 + mutex_lock(&kvm->lock); 1106 + 
mutex_lock(&matrix_dev->lock); 1107 + 1079 1108 list_for_each_entry(m, &matrix_dev->mdev_list, node) { 1080 - if (m != matrix_mdev && m->kvm == kvm) 1109 + if (m != matrix_mdev && m->kvm == kvm) { 1110 + mutex_unlock(&kvm->lock); 1111 + mutex_unlock(&matrix_dev->lock); 1081 1112 return -EPERM; 1113 + } 1082 1114 } 1083 1115 1084 1116 kvm_get_kvm(kvm); 1085 - matrix_mdev->kvm_busy = true; 1086 - mutex_unlock(&matrix_dev->lock); 1117 + matrix_mdev->kvm = kvm; 1087 1118 kvm_arch_crypto_set_masks(kvm, 1088 1119 matrix_mdev->matrix.apm, 1089 1120 matrix_mdev->matrix.aqm, 1090 1121 matrix_mdev->matrix.adm); 1091 - mutex_lock(&matrix_dev->lock); 1092 - kvm->arch.crypto.pqap_hook = &matrix_mdev->pqap_hook; 1093 - matrix_mdev->kvm = kvm; 1094 - matrix_mdev->kvm_busy = false; 1095 - wake_up_all(&matrix_mdev->wait_for_kvm); 1122 + 1123 + mutex_unlock(&kvm->lock); 1124 + mutex_unlock(&matrix_dev->lock); 1096 1125 } 1097 1126 1098 1127 return 0; ··· 1148 1163 * certain circumstances, will result in a circular lock dependency if this is 1149 1164 * done under the @matrix_mdev->lock. 1150 1165 */ 1151 - static void vfio_ap_mdev_unset_kvm(struct ap_matrix_mdev *matrix_mdev) 1166 + static void vfio_ap_mdev_unset_kvm(struct ap_matrix_mdev *matrix_mdev, 1167 + struct kvm *kvm) 1152 1168 { 1153 - /* 1154 - * If the KVM pointer is in the process of being set, wait until the 1155 - * process has completed. 
1156 - */ 1157 - wait_event_cmd(matrix_mdev->wait_for_kvm, 1158 - !matrix_mdev->kvm_busy, 1159 - mutex_unlock(&matrix_dev->lock), 1160 - mutex_lock(&matrix_dev->lock)); 1169 + if (kvm && kvm->arch.crypto.crycbd) { 1170 + down_write(&kvm->arch.crypto.pqap_hook_rwsem); 1171 + kvm->arch.crypto.pqap_hook = NULL; 1172 + up_write(&kvm->arch.crypto.pqap_hook_rwsem); 1161 1173 1162 - if (matrix_mdev->kvm) { 1163 - matrix_mdev->kvm_busy = true; 1164 - mutex_unlock(&matrix_dev->lock); 1165 - kvm_arch_crypto_clear_masks(matrix_mdev->kvm); 1174 + mutex_lock(&kvm->lock); 1166 1175 mutex_lock(&matrix_dev->lock); 1167 - vfio_ap_mdev_reset_queues(matrix_mdev->mdev); 1168 - matrix_mdev->kvm->arch.crypto.pqap_hook = NULL; 1169 - kvm_put_kvm(matrix_mdev->kvm); 1176 + 1177 + kvm_arch_crypto_clear_masks(kvm); 1178 + vfio_ap_mdev_reset_queues(matrix_mdev); 1179 + kvm_put_kvm(kvm); 1170 1180 matrix_mdev->kvm = NULL; 1171 - matrix_mdev->kvm_busy = false; 1172 - wake_up_all(&matrix_mdev->wait_for_kvm); 1181 + 1182 + mutex_unlock(&kvm->lock); 1183 + mutex_unlock(&matrix_dev->lock); 1173 1184 } 1174 1185 } 1175 1186 ··· 1178 1197 if (action != VFIO_GROUP_NOTIFY_SET_KVM) 1179 1198 return NOTIFY_OK; 1180 1199 1181 - mutex_lock(&matrix_dev->lock); 1182 1200 matrix_mdev = container_of(nb, struct ap_matrix_mdev, group_notifier); 1183 1201 1184 1202 if (!data) 1185 - vfio_ap_mdev_unset_kvm(matrix_mdev); 1203 + vfio_ap_mdev_unset_kvm(matrix_mdev, matrix_mdev->kvm); 1186 1204 else if (vfio_ap_mdev_set_kvm(matrix_mdev, data)) 1187 1205 notify_rc = NOTIFY_DONE; 1188 - 1189 - mutex_unlock(&matrix_dev->lock); 1190 1206 1191 1207 return notify_rc; 1192 1208 } ··· 1254 1276 return ret; 1255 1277 } 1256 1278 1257 - static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev) 1279 + static int vfio_ap_mdev_reset_queues(struct ap_matrix_mdev *matrix_mdev) 1258 1280 { 1259 1281 int ret; 1260 1282 int rc = 0; 1261 1283 unsigned long apid, apqi; 1262 1284 struct vfio_ap_queue *q; 1263 - struct ap_matrix_mdev 
*matrix_mdev = mdev_get_drvdata(mdev); 1264 1285 1265 1286 for_each_set_bit_inv(apid, matrix_mdev->matrix.apm, 1266 1287 matrix_mdev->matrix.apm_max + 1) { ··· 1280 1303 return rc; 1281 1304 } 1282 1305 1283 - static int vfio_ap_mdev_open(struct mdev_device *mdev) 1306 + static int vfio_ap_mdev_open_device(struct vfio_device *vdev) 1284 1307 { 1285 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 1308 + struct ap_matrix_mdev *matrix_mdev = 1309 + container_of(vdev, struct ap_matrix_mdev, vdev); 1286 1310 unsigned long events; 1287 1311 int ret; 1288 - 1289 - 1290 - if (!try_module_get(THIS_MODULE)) 1291 - return -ENODEV; 1292 1312 1293 1313 matrix_mdev->group_notifier.notifier_call = vfio_ap_mdev_group_notifier; 1294 1314 events = VFIO_GROUP_NOTIFY_SET_KVM; 1295 1315 1296 - ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, 1316 + ret = vfio_register_notifier(vdev->dev, VFIO_GROUP_NOTIFY, 1297 1317 &events, &matrix_mdev->group_notifier); 1298 - if (ret) { 1299 - module_put(THIS_MODULE); 1318 + if (ret) 1300 1319 return ret; 1301 - } 1302 1320 1303 1321 matrix_mdev->iommu_notifier.notifier_call = vfio_ap_mdev_iommu_notifier; 1304 1322 events = VFIO_IOMMU_NOTIFY_DMA_UNMAP; 1305 - ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, 1323 + ret = vfio_register_notifier(vdev->dev, VFIO_IOMMU_NOTIFY, 1306 1324 &events, &matrix_mdev->iommu_notifier); 1307 - if (!ret) 1308 - return ret; 1325 + if (ret) 1326 + goto out_unregister_group; 1327 + return 0; 1309 1328 1310 - vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, 1329 + out_unregister_group: 1330 + vfio_unregister_notifier(vdev->dev, VFIO_GROUP_NOTIFY, 1311 1331 &matrix_mdev->group_notifier); 1312 - module_put(THIS_MODULE); 1313 1332 return ret; 1314 1333 } 1315 1334 1316 - static void vfio_ap_mdev_release(struct mdev_device *mdev) 1335 + static void vfio_ap_mdev_close_device(struct vfio_device *vdev) 1317 1336 { 1318 - struct ap_matrix_mdev *matrix_mdev = 
mdev_get_drvdata(mdev); 1337 + struct ap_matrix_mdev *matrix_mdev = 1338 + container_of(vdev, struct ap_matrix_mdev, vdev); 1319 1339 1320 - mutex_lock(&matrix_dev->lock); 1321 - vfio_ap_mdev_unset_kvm(matrix_mdev); 1322 - mutex_unlock(&matrix_dev->lock); 1323 - 1324 - vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, 1340 + vfio_unregister_notifier(vdev->dev, VFIO_IOMMU_NOTIFY, 1325 1341 &matrix_mdev->iommu_notifier); 1326 - vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, 1342 + vfio_unregister_notifier(vdev->dev, VFIO_GROUP_NOTIFY, 1327 1343 &matrix_mdev->group_notifier); 1328 - module_put(THIS_MODULE); 1344 + vfio_ap_mdev_unset_kvm(matrix_mdev, matrix_mdev->kvm); 1329 1345 } 1330 1346 1331 1347 static int vfio_ap_mdev_get_device_info(unsigned long arg) ··· 1341 1371 return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; 1342 1372 } 1343 1373 1344 - static ssize_t vfio_ap_mdev_ioctl(struct mdev_device *mdev, 1374 + static ssize_t vfio_ap_mdev_ioctl(struct vfio_device *vdev, 1345 1375 unsigned int cmd, unsigned long arg) 1346 1376 { 1377 + struct ap_matrix_mdev *matrix_mdev = 1378 + container_of(vdev, struct ap_matrix_mdev, vdev); 1347 1379 int ret; 1348 - struct ap_matrix_mdev *matrix_mdev; 1349 1380 1350 1381 mutex_lock(&matrix_dev->lock); 1351 1382 switch (cmd) { ··· 1354 1383 ret = vfio_ap_mdev_get_device_info(arg); 1355 1384 break; 1356 1385 case VFIO_DEVICE_RESET: 1357 - matrix_mdev = mdev_get_drvdata(mdev); 1358 - if (WARN(!matrix_mdev, "Driver data missing from mdev!!")) { 1359 - ret = -EINVAL; 1360 - break; 1361 - } 1362 - 1363 - /* 1364 - * If the KVM pointer is in the process of being set, wait until 1365 - * the process has completed. 
1366 - */ 1367 - wait_event_cmd(matrix_mdev->wait_for_kvm, 1368 - !matrix_mdev->kvm_busy, 1369 - mutex_unlock(&matrix_dev->lock), 1370 - mutex_lock(&matrix_dev->lock)); 1371 - 1372 - ret = vfio_ap_mdev_reset_queues(mdev); 1386 + ret = vfio_ap_mdev_reset_queues(matrix_mdev); 1373 1387 break; 1374 1388 default: 1375 1389 ret = -EOPNOTSUPP; ··· 1365 1409 return ret; 1366 1410 } 1367 1411 1412 + static const struct vfio_device_ops vfio_ap_matrix_dev_ops = { 1413 + .open_device = vfio_ap_mdev_open_device, 1414 + .close_device = vfio_ap_mdev_close_device, 1415 + .ioctl = vfio_ap_mdev_ioctl, 1416 + }; 1417 + 1418 + static struct mdev_driver vfio_ap_matrix_driver = { 1419 + .driver = { 1420 + .name = "vfio_ap_mdev", 1421 + .owner = THIS_MODULE, 1422 + .mod_name = KBUILD_MODNAME, 1423 + .dev_groups = vfio_ap_mdev_attr_groups, 1424 + }, 1425 + .probe = vfio_ap_mdev_probe, 1426 + .remove = vfio_ap_mdev_remove, 1427 + }; 1428 + 1368 1429 static const struct mdev_parent_ops vfio_ap_matrix_ops = { 1369 1430 .owner = THIS_MODULE, 1431 + .device_driver = &vfio_ap_matrix_driver, 1370 1432 .supported_type_groups = vfio_ap_mdev_type_groups, 1371 - .mdev_attr_groups = vfio_ap_mdev_attr_groups, 1372 - .create = vfio_ap_mdev_create, 1373 - .remove = vfio_ap_mdev_remove, 1374 - .open = vfio_ap_mdev_open, 1375 - .release = vfio_ap_mdev_release, 1376 - .ioctl = vfio_ap_mdev_ioctl, 1377 1433 }; 1378 1434 1379 1435 int vfio_ap_mdev_register(void) 1380 1436 { 1437 + int ret; 1438 + 1381 1439 atomic_set(&matrix_dev->available_instances, MAX_ZDEV_ENTRIES_EXT); 1382 1440 1383 - return mdev_register_device(&matrix_dev->device, &vfio_ap_matrix_ops); 1441 + ret = mdev_register_driver(&vfio_ap_matrix_driver); 1442 + if (ret) 1443 + return ret; 1444 + 1445 + ret = mdev_register_device(&matrix_dev->device, &vfio_ap_matrix_ops); 1446 + if (ret) 1447 + goto err_driver; 1448 + return 0; 1449 + 1450 + err_driver: 1451 + mdev_unregister_driver(&vfio_ap_matrix_driver); 1452 + return ret; 1384 1453 } 1385 
1454 1386 1455 void vfio_ap_mdev_unregister(void) 1387 1456 { 1388 1457 mdev_unregister_device(&matrix_dev->device); 1458 + mdev_unregister_driver(&vfio_ap_matrix_driver); 1389 1459 }
+3 -3
drivers/s390/crypto/vfio_ap_private.h
··· 18 18 #include <linux/delay.h> 19 19 #include <linux/mutex.h> 20 20 #include <linux/kvm_host.h> 21 + #include <linux/vfio.h> 21 22 22 23 #include "ap_bus.h" 23 24 ··· 80 79 * @kvm: the struct holding guest's state 81 80 */ 82 81 struct ap_matrix_mdev { 82 + struct vfio_device vdev; 83 83 struct list_head node; 84 84 struct ap_matrix matrix; 85 85 struct notifier_block group_notifier; 86 86 struct notifier_block iommu_notifier; 87 - bool kvm_busy; 88 - wait_queue_head_t wait_for_kvm; 89 87 struct kvm *kvm; 90 - struct kvm_s390_module_hook pqap_hook; 88 + crypto_hook pqap_hook; 91 89 struct mdev_device *mdev; 92 90 }; 93 91
+23 -22
drivers/vfio/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 - config VFIO_IOMMU_TYPE1 3 - tristate 4 - depends on VFIO 5 - default n 6 - 7 - config VFIO_IOMMU_SPAPR_TCE 8 - tristate 9 - depends on VFIO && SPAPR_TCE_IOMMU 10 - default VFIO 11 - 12 - config VFIO_SPAPR_EEH 13 - tristate 14 - depends on EEH && VFIO_IOMMU_SPAPR_TCE 15 - default VFIO 16 - 17 - config VFIO_VIRQFD 18 - tristate 19 - depends on VFIO && EVENTFD 20 - default n 21 - 22 2 menuconfig VFIO 23 3 tristate "VFIO Non-Privileged userspace driver framework" 24 4 select IOMMU_API ··· 9 29 10 30 If you don't know what to do here, say N. 11 31 12 - menuconfig VFIO_NOIOMMU 32 + if VFIO 33 + config VFIO_IOMMU_TYPE1 34 + tristate 35 + default n 36 + 37 + config VFIO_IOMMU_SPAPR_TCE 38 + tristate 39 + depends on SPAPR_TCE_IOMMU 40 + default VFIO 41 + 42 + config VFIO_SPAPR_EEH 43 + tristate 44 + depends on EEH && VFIO_IOMMU_SPAPR_TCE 45 + default VFIO 46 + 47 + config VFIO_VIRQFD 48 + tristate 49 + select EVENTFD 50 + default n 51 + 52 + config VFIO_NOIOMMU 13 53 bool "VFIO No-IOMMU support" 14 - depends on VFIO 15 54 help 16 55 VFIO is built on the ability to isolate devices using the IOMMU. 17 56 Only with an IOMMU can userspace access to DMA capable devices be ··· 47 48 source "drivers/vfio/platform/Kconfig" 48 49 source "drivers/vfio/mdev/Kconfig" 49 50 source "drivers/vfio/fsl-mc/Kconfig" 51 + endif 52 + 50 53 source "virt/lib/Kconfig"
+2 -1
drivers/vfio/fsl-mc/Kconfig
··· 1 1 config VFIO_FSL_MC 2 2 tristate "VFIO support for QorIQ DPAA2 fsl-mc bus devices" 3 - depends on VFIO && FSL_MC_BUS && EVENTFD 3 + depends on FSL_MC_BUS 4 + select EVENTFD 4 5 help 5 6 Driver to enable support for the VFIO QorIQ DPAA2 fsl-mc 6 7 (Management Complex) devices. This is required to passthrough
+29 -132
drivers/vfio/fsl-mc/vfio_fsl_mc.c
··· 19 19 20 20 static struct fsl_mc_driver vfio_fsl_mc_driver; 21 21 22 - static DEFINE_MUTEX(reflck_lock); 23 - 24 - static void vfio_fsl_mc_reflck_get(struct vfio_fsl_mc_reflck *reflck) 22 + static int vfio_fsl_mc_open_device(struct vfio_device *core_vdev) 25 23 { 26 - kref_get(&reflck->kref); 27 - } 28 - 29 - static void vfio_fsl_mc_reflck_release(struct kref *kref) 30 - { 31 - struct vfio_fsl_mc_reflck *reflck = container_of(kref, 32 - struct vfio_fsl_mc_reflck, 33 - kref); 34 - 35 - mutex_destroy(&reflck->lock); 36 - kfree(reflck); 37 - mutex_unlock(&reflck_lock); 38 - } 39 - 40 - static void vfio_fsl_mc_reflck_put(struct vfio_fsl_mc_reflck *reflck) 41 - { 42 - kref_put_mutex(&reflck->kref, vfio_fsl_mc_reflck_release, &reflck_lock); 43 - } 44 - 45 - static struct vfio_fsl_mc_reflck *vfio_fsl_mc_reflck_alloc(void) 46 - { 47 - struct vfio_fsl_mc_reflck *reflck; 48 - 49 - reflck = kzalloc(sizeof(*reflck), GFP_KERNEL); 50 - if (!reflck) 51 - return ERR_PTR(-ENOMEM); 52 - 53 - kref_init(&reflck->kref); 54 - mutex_init(&reflck->lock); 55 - 56 - return reflck; 57 - } 58 - 59 - static int vfio_fsl_mc_reflck_attach(struct vfio_fsl_mc_device *vdev) 60 - { 61 - int ret = 0; 62 - 63 - mutex_lock(&reflck_lock); 64 - if (is_fsl_mc_bus_dprc(vdev->mc_dev)) { 65 - vdev->reflck = vfio_fsl_mc_reflck_alloc(); 66 - ret = PTR_ERR_OR_ZERO(vdev->reflck); 67 - } else { 68 - struct device *mc_cont_dev = vdev->mc_dev->dev.parent; 69 - struct vfio_device *device; 70 - struct vfio_fsl_mc_device *cont_vdev; 71 - 72 - device = vfio_device_get_from_dev(mc_cont_dev); 73 - if (!device) { 74 - ret = -ENODEV; 75 - goto unlock; 76 - } 77 - 78 - cont_vdev = 79 - container_of(device, struct vfio_fsl_mc_device, vdev); 80 - if (!cont_vdev || !cont_vdev->reflck) { 81 - vfio_device_put(device); 82 - ret = -ENODEV; 83 - goto unlock; 84 - } 85 - vfio_fsl_mc_reflck_get(cont_vdev->reflck); 86 - vdev->reflck = cont_vdev->reflck; 87 - vfio_device_put(device); 88 - } 89 - 90 - unlock: 91 - 
mutex_unlock(&reflck_lock); 92 - return ret; 93 - } 94 - 95 - static int vfio_fsl_mc_regions_init(struct vfio_fsl_mc_device *vdev) 96 - { 24 + struct vfio_fsl_mc_device *vdev = 25 + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); 97 26 struct fsl_mc_device *mc_dev = vdev->mc_dev; 98 27 int count = mc_dev->obj_desc.region_count; 99 28 int i; ··· 65 136 kfree(vdev->regions); 66 137 } 67 138 68 - static int vfio_fsl_mc_open(struct vfio_device *core_vdev) 139 + 140 + static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev) 69 141 { 70 142 struct vfio_fsl_mc_device *vdev = 71 143 container_of(core_vdev, struct vfio_fsl_mc_device, vdev); 72 - int ret = 0; 73 - 74 - mutex_lock(&vdev->reflck->lock); 75 - if (!vdev->refcnt) { 76 - ret = vfio_fsl_mc_regions_init(vdev); 77 - if (ret) 78 - goto out; 79 - } 80 - vdev->refcnt++; 81 - out: 82 - mutex_unlock(&vdev->reflck->lock); 83 - 84 - return ret; 85 - } 86 - 87 - static void vfio_fsl_mc_release(struct vfio_device *core_vdev) 88 - { 89 - struct vfio_fsl_mc_device *vdev = 90 - container_of(core_vdev, struct vfio_fsl_mc_device, vdev); 144 + struct fsl_mc_device *mc_dev = vdev->mc_dev; 145 + struct device *cont_dev = fsl_mc_cont_dev(&mc_dev->dev); 146 + struct fsl_mc_device *mc_cont = to_fsl_mc_device(cont_dev); 91 147 int ret; 92 148 93 - mutex_lock(&vdev->reflck->lock); 149 + vfio_fsl_mc_regions_cleanup(vdev); 94 150 95 - if (!(--vdev->refcnt)) { 96 - struct fsl_mc_device *mc_dev = vdev->mc_dev; 97 - struct device *cont_dev = fsl_mc_cont_dev(&mc_dev->dev); 98 - struct fsl_mc_device *mc_cont = to_fsl_mc_device(cont_dev); 151 + /* reset the device before cleaning up the interrupts */ 152 + ret = dprc_reset_container(mc_cont->mc_io, 0, mc_cont->mc_handle, 153 + mc_cont->obj_desc.id, 154 + DPRC_RESET_OPTION_NON_RECURSIVE); 99 155 100 - vfio_fsl_mc_regions_cleanup(vdev); 156 + if (WARN_ON(ret)) 157 + dev_warn(&mc_cont->dev, 158 + "VFIO_FLS_MC: reset device has failed (%d)\n", ret); 101 159 102 - /* reset the 
device before cleaning up the interrupts */ 103 - ret = dprc_reset_container(mc_cont->mc_io, 0, 104 - mc_cont->mc_handle, 105 - mc_cont->obj_desc.id, 106 - DPRC_RESET_OPTION_NON_RECURSIVE); 160 + vfio_fsl_mc_irqs_cleanup(vdev); 107 161 108 - if (ret) { 109 - dev_warn(&mc_cont->dev, "VFIO_FLS_MC: reset device has failed (%d)\n", 110 - ret); 111 - WARN_ON(1); 112 - } 113 - 114 - vfio_fsl_mc_irqs_cleanup(vdev); 115 - 116 - fsl_mc_cleanup_irq_pool(mc_cont); 117 - } 118 - 119 - mutex_unlock(&vdev->reflck->lock); 162 + fsl_mc_cleanup_irq_pool(mc_cont); 120 163 } 121 164 122 165 static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, ··· 405 504 406 505 static const struct vfio_device_ops vfio_fsl_mc_ops = { 407 506 .name = "vfio-fsl-mc", 408 - .open = vfio_fsl_mc_open, 409 - .release = vfio_fsl_mc_release, 507 + .open_device = vfio_fsl_mc_open_device, 508 + .close_device = vfio_fsl_mc_close_device, 410 509 .ioctl = vfio_fsl_mc_ioctl, 411 510 .read = vfio_fsl_mc_read, 412 511 .write = vfio_fsl_mc_write, ··· 526 625 vdev->mc_dev = mc_dev; 527 626 mutex_init(&vdev->igate); 528 627 529 - ret = vfio_fsl_mc_reflck_attach(vdev); 628 + if (is_fsl_mc_bus_dprc(mc_dev)) 629 + ret = vfio_assign_device_set(&vdev->vdev, &mc_dev->dev); 630 + else 631 + ret = vfio_assign_device_set(&vdev->vdev, mc_dev->dev.parent); 530 632 if (ret) 531 - goto out_kfree; 633 + goto out_uninit; 532 634 533 635 ret = vfio_fsl_mc_init_device(vdev); 534 636 if (ret) 535 - goto out_reflck; 637 + goto out_uninit; 536 638 537 639 ret = vfio_register_group_dev(&vdev->vdev); 538 640 if (ret) { ··· 543 639 goto out_device; 544 640 } 545 641 546 - /* 547 - * This triggers recursion into vfio_fsl_mc_probe() on another device 548 - * and the vfio_fsl_mc_reflck_attach() must succeed, which relies on the 549 - * vfio_add_group_dev() above. It has no impact on this vdev, so it is 550 - * safe to be after the vfio device is made live. 
551 - */ 552 642 ret = vfio_fsl_mc_scan_container(mc_dev); 553 643 if (ret) 554 644 goto out_group_dev; ··· 553 655 vfio_unregister_group_dev(&vdev->vdev); 554 656 out_device: 555 657 vfio_fsl_uninit_device(vdev); 556 - out_reflck: 557 - vfio_fsl_mc_reflck_put(vdev->reflck); 558 - out_kfree: 658 + out_uninit: 659 + vfio_uninit_group_dev(&vdev->vdev); 559 660 kfree(vdev); 560 661 out_group_put: 561 662 vfio_iommu_group_put(group, dev); ··· 571 674 572 675 dprc_remove_devices(mc_dev, NULL, 0); 573 676 vfio_fsl_uninit_device(vdev); 574 - vfio_fsl_mc_reflck_put(vdev->reflck); 575 677 678 + vfio_uninit_group_dev(&vdev->vdev); 576 679 kfree(vdev); 577 680 vfio_iommu_group_put(mc_dev->dev.iommu_group, dev); 578 681
+3 -3
drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
··· 120 120 if (start != 0 || count != 1) 121 121 return -EINVAL; 122 122 123 - mutex_lock(&vdev->reflck->lock); 123 + mutex_lock(&vdev->vdev.dev_set->lock); 124 124 ret = fsl_mc_populate_irq_pool(mc_cont, 125 125 FSL_MC_IRQ_POOL_MAX_TOTAL_IRQS); 126 126 if (ret) ··· 129 129 ret = vfio_fsl_mc_irqs_allocate(vdev); 130 130 if (ret) 131 131 goto unlock; 132 - mutex_unlock(&vdev->reflck->lock); 132 + mutex_unlock(&vdev->vdev.dev_set->lock); 133 133 134 134 if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { 135 135 s32 fd = *(s32 *)data; ··· 154 154 return 0; 155 155 156 156 unlock: 157 - mutex_unlock(&vdev->reflck->lock); 157 + mutex_unlock(&vdev->vdev.dev_set->lock); 158 158 return ret; 159 159 160 160 }
-7
drivers/vfio/fsl-mc/vfio_fsl_mc_private.h
··· 22 22 char *name; 23 23 }; 24 24 25 - struct vfio_fsl_mc_reflck { 26 - struct kref kref; 27 - struct mutex lock; 28 - }; 29 - 30 25 struct vfio_fsl_mc_region { 31 26 u32 flags; 32 27 u32 type; ··· 34 39 struct vfio_device vdev; 35 40 struct fsl_mc_device *mc_dev; 36 41 struct notifier_block nb; 37 - int refcnt; 38 42 struct vfio_fsl_mc_region *regions; 39 - struct vfio_fsl_mc_reflck *reflck; 40 43 struct mutex igate; 41 44 struct vfio_fsl_mc_irq *mc_irqs; 42 45 };
-1
drivers/vfio/mdev/Kconfig
··· 2 2 3 3 config VFIO_MDEV 4 4 tristate "Mediated device driver framework" 5 - depends on VFIO 6 5 default n 7 6 help 8 7 Provides a framework to virtualize devices.
+1 -5
drivers/vfio/mdev/mdev_core.c
··· 138 138 if (!dev) 139 139 return -EINVAL; 140 140 141 - /* Not mandatory, but its absence could be a problem */ 142 - if (!ops->request) 143 - dev_info(dev, "Driver cannot be asked to release device\n"); 144 - 145 141 mutex_lock(&parent_list_lock); 146 142 147 143 /* Check for duplicate */ ··· 394 398 mdev_bus_unregister(); 395 399 } 396 400 397 - module_init(mdev_init) 401 + subsys_initcall(mdev_init) 398 402 module_exit(mdev_exit) 399 403 400 404 MODULE_VERSION(DRIVER_VERSION);
+19 -14
drivers/vfio/mdev/vfio_mdev.c
··· 17 17 18 18 #include "mdev_private.h" 19 19 20 - static int vfio_mdev_open(struct vfio_device *core_vdev) 20 + static int vfio_mdev_open_device(struct vfio_device *core_vdev) 21 21 { 22 22 struct mdev_device *mdev = to_mdev_device(core_vdev->dev); 23 23 struct mdev_parent *parent = mdev->type->parent; 24 24 25 - if (unlikely(!parent->ops->open)) 26 - return -EINVAL; 25 + if (unlikely(!parent->ops->open_device)) 26 + return 0; 27 27 28 - return parent->ops->open(mdev); 28 + return parent->ops->open_device(mdev); 29 29 } 30 30 31 - static void vfio_mdev_release(struct vfio_device *core_vdev) 31 + static void vfio_mdev_close_device(struct vfio_device *core_vdev) 32 32 { 33 33 struct mdev_device *mdev = to_mdev_device(core_vdev->dev); 34 34 struct mdev_parent *parent = mdev->type->parent; 35 35 36 - if (likely(parent->ops->release)) 37 - parent->ops->release(mdev); 36 + if (likely(parent->ops->close_device)) 37 + parent->ops->close_device(mdev); 38 38 } 39 39 40 40 static long vfio_mdev_unlocked_ioctl(struct vfio_device *core_vdev, ··· 44 44 struct mdev_parent *parent = mdev->type->parent; 45 45 46 46 if (unlikely(!parent->ops->ioctl)) 47 - return -EINVAL; 47 + return 0; 48 48 49 49 return parent->ops->ioctl(mdev, cmd, arg); 50 50 } ··· 100 100 101 101 static const struct vfio_device_ops vfio_mdev_dev_ops = { 102 102 .name = "vfio-mdev", 103 - .open = vfio_mdev_open, 104 - .release = vfio_mdev_release, 103 + .open_device = vfio_mdev_open_device, 104 + .close_device = vfio_mdev_close_device, 105 105 .ioctl = vfio_mdev_unlocked_ioctl, 106 106 .read = vfio_mdev_read, 107 107 .write = vfio_mdev_write, ··· 120 120 121 121 vfio_init_group_dev(vdev, &mdev->dev, &vfio_mdev_dev_ops); 122 122 ret = vfio_register_group_dev(vdev); 123 - if (ret) { 124 - kfree(vdev); 125 - return ret; 126 - } 123 + if (ret) 124 + goto out_uninit; 125 + 127 126 dev_set_drvdata(&mdev->dev, vdev); 128 127 return 0; 128 + 129 + out_uninit: 130 + vfio_uninit_group_dev(vdev); 131 + kfree(vdev); 132 + 
return ret; 129 133 } 130 134 131 135 static void vfio_mdev_remove(struct mdev_device *mdev) ··· 137 133 struct vfio_device *vdev = dev_get_drvdata(&mdev->dev); 138 134 139 135 vfio_unregister_group_dev(vdev); 136 + vfio_uninit_group_dev(vdev); 140 137 kfree(vdev); 141 138 } 142 139
+22 -18
drivers/vfio/pci/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 - config VFIO_PCI 3 - tristate "VFIO support for PCI devices" 4 - depends on VFIO && PCI && EVENTFD 5 - depends on MMU 2 + if PCI && MMU 3 + config VFIO_PCI_CORE 4 + tristate 6 5 select VFIO_VIRQFD 7 6 select IRQ_BYPASS_MANAGER 7 + 8 + config VFIO_PCI_MMAP 9 + def_bool y if !S390 10 + 11 + config VFIO_PCI_INTX 12 + def_bool y if !S390 13 + 14 + config VFIO_PCI 15 + tristate "Generic VFIO support for any PCI device" 16 + select VFIO_PCI_CORE 8 17 help 9 - Support for the PCI VFIO bus driver. This is required to make 10 - use of PCI drivers using the VFIO framework. 18 + Support for the generic PCI VFIO bus driver which can connect any 19 + PCI device to the VFIO framework. 11 20 12 21 If you don't know what to do here, say N. 13 22 23 + if VFIO_PCI 14 24 config VFIO_PCI_VGA 15 - bool "VFIO PCI support for VGA devices" 16 - depends on VFIO_PCI && X86 && VGA_ARB 25 + bool "Generic VFIO PCI support for VGA devices" 26 + depends on X86 && VGA_ARB 17 27 help 18 28 Support for VGA extension to VFIO PCI. This exposes an additional 19 29 region on VGA devices for accessing legacy VGA addresses used by ··· 31 21 32 22 If you don't know what to do here, say N. 33 23 34 - config VFIO_PCI_MMAP 35 - depends on VFIO_PCI 36 - def_bool y if !S390 37 - 38 - config VFIO_PCI_INTX 39 - depends on VFIO_PCI 40 - def_bool y if !S390 41 - 42 24 config VFIO_PCI_IGD 43 - bool "VFIO PCI extensions for Intel graphics (GVT-d)" 44 - depends on VFIO_PCI && X86 25 + bool "Generic VFIO PCI extensions for Intel graphics (GVT-d)" 26 + depends on X86 45 27 default y 46 28 help 47 29 Support for Intel IGD specific extensions to enable direct ··· 42 40 and LPC bridge config space. 43 41 44 42 To enable Intel IGD assignment through vfio-pci, say Y. 43 + endif 44 + endif
+5 -3
drivers/vfio/pci/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 3 - vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o 4 - vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o 5 - vfio-pci-$(CONFIG_S390) += vfio_pci_zdev.o 3 + vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o 4 + vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o 5 + obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o 6 6 7 + vfio-pci-y := vfio_pci.o 8 + vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o 7 9 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
+46 -2224
drivers/vfio/pci/vfio_pci.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-only 2 2 /* 3 + * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved 4 + * 3 5 * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 4 6 * Author: Alex Williamson <alex.williamson@redhat.com> 5 7 * ··· 20 18 #include <linux/module.h> 21 19 #include <linux/mutex.h> 22 20 #include <linux/notifier.h> 23 - #include <linux/pci.h> 24 21 #include <linux/pm_runtime.h> 25 22 #include <linux/slab.h> 26 23 #include <linux/types.h> 27 24 #include <linux/uaccess.h> 28 - #include <linux/vfio.h> 29 - #include <linux/vgaarb.h> 30 - #include <linux/nospec.h> 31 - #include <linux/sched/mm.h> 32 25 33 - #include "vfio_pci_private.h" 26 + #include <linux/vfio_pci_core.h> 34 27 35 - #define DRIVER_VERSION "0.2" 36 28 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" 37 29 #define DRIVER_DESC "VFIO PCI - User Level meta-driver" 38 30 ··· 59 63 static bool disable_denylist; 60 64 module_param(disable_denylist, bool, 0444); 61 65 MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users."); 62 - 63 - static inline bool vfio_vga_disabled(void) 64 - { 65 - #ifdef CONFIG_VFIO_PCI_VGA 66 - return disable_vga; 67 - #else 68 - return true; 69 - #endif 70 - } 71 66 72 67 static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev) 73 68 { ··· 98 111 return true; 99 112 } 100 113 101 - /* 102 - * Our VGA arbiter participation is limited since we don't know anything 103 - * about the device itself. However, if the device is the only VGA device 104 - * downstream of a bridge and VFIO VGA support is disabled, then we can 105 - * safely return legacy VGA IO and memory as not decoded since the user 106 - * has no way to get to it and routing can be disabled externally at the 107 - * bridge. 
108 - */ 109 - static unsigned int vfio_pci_set_decode(struct pci_dev *pdev, bool single_vga) 114 + static int vfio_pci_open_device(struct vfio_device *core_vdev) 110 115 { 111 - struct pci_dev *tmp = NULL; 112 - unsigned char max_busnr; 113 - unsigned int decodes; 114 - 115 - if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus)) 116 - return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | 117 - VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; 118 - 119 - max_busnr = pci_bus_max_busnr(pdev->bus); 120 - decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 121 - 122 - while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) { 123 - if (tmp == pdev || 124 - pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) || 125 - pci_is_root_bus(tmp->bus)) 126 - continue; 127 - 128 - if (tmp->bus->number >= pdev->bus->number && 129 - tmp->bus->number <= max_busnr) { 130 - pci_dev_put(tmp); 131 - decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; 132 - break; 133 - } 134 - } 135 - 136 - return decodes; 137 - } 138 - 139 - static inline bool vfio_pci_is_vga(struct pci_dev *pdev) 140 - { 141 - return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; 142 - } 143 - 144 - static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev) 145 - { 146 - struct resource *res; 147 - int i; 148 - struct vfio_pci_dummy_resource *dummy_res; 149 - 150 - for (i = 0; i < PCI_STD_NUM_BARS; i++) { 151 - int bar = i + PCI_STD_RESOURCES; 152 - 153 - res = &vdev->pdev->resource[bar]; 154 - 155 - if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP)) 156 - goto no_mmap; 157 - 158 - if (!(res->flags & IORESOURCE_MEM)) 159 - goto no_mmap; 160 - 161 - /* 162 - * The PCI core shouldn't set up a resource with a 163 - * type but zero size. But there may be bugs that 164 - * cause us to do that. 
165 - */ 166 - if (!resource_size(res)) 167 - goto no_mmap; 168 - 169 - if (resource_size(res) >= PAGE_SIZE) { 170 - vdev->bar_mmap_supported[bar] = true; 171 - continue; 172 - } 173 - 174 - if (!(res->start & ~PAGE_MASK)) { 175 - /* 176 - * Add a dummy resource to reserve the remainder 177 - * of the exclusive page in case that hot-add 178 - * device's bar is assigned into it. 179 - */ 180 - dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL); 181 - if (dummy_res == NULL) 182 - goto no_mmap; 183 - 184 - dummy_res->resource.name = "vfio sub-page reserved"; 185 - dummy_res->resource.start = res->end + 1; 186 - dummy_res->resource.end = res->start + PAGE_SIZE - 1; 187 - dummy_res->resource.flags = res->flags; 188 - if (request_resource(res->parent, 189 - &dummy_res->resource)) { 190 - kfree(dummy_res); 191 - goto no_mmap; 192 - } 193 - dummy_res->index = bar; 194 - list_add(&dummy_res->res_next, 195 - &vdev->dummy_resources_list); 196 - vdev->bar_mmap_supported[bar] = true; 197 - continue; 198 - } 199 - /* 200 - * Here we don't handle the case when the BAR is not page 201 - * aligned because we can't expect the BAR will be 202 - * assigned into the same location in a page in guest 203 - * when we passthrough the BAR. And it's hard to access 204 - * this BAR in userspace because we have no way to get 205 - * the BAR's location in a page. 206 - */ 207 - no_mmap: 208 - vdev->bar_mmap_supported[bar] = false; 209 - } 210 - } 211 - 212 - static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev); 213 - static void vfio_pci_disable(struct vfio_pci_device *vdev); 214 - static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data); 215 - 216 - /* 217 - * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND 218 - * _and_ the ability detect when the device is asserting INTx via PCI_STATUS. 
219 - * If a device implements the former but not the latter we would typically 220 - * expect broken_intx_masking be set and require an exclusive interrupt. 221 - * However since we do have control of the device's ability to assert INTx, 222 - * we can instead pretend that the device does not implement INTx, virtualizing 223 - * the pin register to report zero and maintaining DisINTx set on the host. 224 - */ 225 - static bool vfio_pci_nointx(struct pci_dev *pdev) 226 - { 227 - switch (pdev->vendor) { 228 - case PCI_VENDOR_ID_INTEL: 229 - switch (pdev->device) { 230 - /* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */ 231 - case 0x1572: 232 - case 0x1574: 233 - case 0x1580 ... 0x1581: 234 - case 0x1583 ... 0x158b: 235 - case 0x37d0 ... 0x37d2: 236 - /* X550 */ 237 - case 0x1563: 238 - return true; 239 - default: 240 - return false; 241 - } 242 - } 243 - 244 - return false; 245 - } 246 - 247 - static void vfio_pci_probe_power_state(struct vfio_pci_device *vdev) 248 - { 249 - struct pci_dev *pdev = vdev->pdev; 250 - u16 pmcsr; 251 - 252 - if (!pdev->pm_cap) 253 - return; 254 - 255 - pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr); 256 - 257 - vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET); 258 - } 259 - 260 - /* 261 - * pci_set_power_state() wrapper handling devices which perform a soft reset on 262 - * D3->D0 transition. Save state prior to D0/1/2->D3, stash it on the vdev, 263 - * restore when returned to D0. Saved separately from pci_saved_state for use 264 - * by PM capability emulation and separately from pci_dev internal saved state 265 - * to avoid it being overwritten and consumed around other resets. 
266 - */ 267 - int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state) 268 - { 269 - struct pci_dev *pdev = vdev->pdev; 270 - bool needs_restore = false, needs_save = false; 271 - int ret; 272 - 273 - if (vdev->needs_pm_restore) { 274 - if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) { 275 - pci_save_state(pdev); 276 - needs_save = true; 277 - } 278 - 279 - if (pdev->current_state >= PCI_D3hot && state <= PCI_D0) 280 - needs_restore = true; 281 - } 282 - 283 - ret = pci_set_power_state(pdev, state); 284 - 285 - if (!ret) { 286 - /* D3 might be unsupported via quirk, skip unless in D3 */ 287 - if (needs_save && pdev->current_state >= PCI_D3hot) { 288 - vdev->pm_save = pci_store_saved_state(pdev); 289 - } else if (needs_restore) { 290 - pci_load_and_free_saved_state(pdev, &vdev->pm_save); 291 - pci_restore_state(pdev); 292 - } 293 - } 294 - 295 - return ret; 296 - } 297 - 298 - static int vfio_pci_enable(struct vfio_pci_device *vdev) 299 - { 116 + struct vfio_pci_core_device *vdev = 117 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 300 118 struct pci_dev *pdev = vdev->pdev; 301 119 int ret; 302 - u16 cmd; 303 - u8 msix_pos; 304 120 305 - vfio_pci_set_power_state(vdev, PCI_D0); 306 - 307 - /* Don't allow our initial saved state to include busmaster */ 308 - pci_clear_master(pdev); 309 - 310 - ret = pci_enable_device(pdev); 121 + ret = vfio_pci_core_enable(vdev); 311 122 if (ret) 312 123 return ret; 313 - 314 - /* If reset fails because of the device lock, fail this path entirely */ 315 - ret = pci_try_reset_function(pdev); 316 - if (ret == -EAGAIN) { 317 - pci_disable_device(pdev); 318 - return ret; 319 - } 320 - 321 - vdev->reset_works = !ret; 322 - pci_save_state(pdev); 323 - vdev->pci_saved_state = pci_store_saved_state(pdev); 324 - if (!vdev->pci_saved_state) 325 - pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__); 326 - 327 - if (likely(!nointxmask)) { 328 - if (vfio_pci_nointx(pdev)) { 329 - 
pci_info(pdev, "Masking broken INTx support\n"); 330 - vdev->nointx = true; 331 - pci_intx(pdev, 0); 332 - } else 333 - vdev->pci_2_3 = pci_intx_mask_supported(pdev); 334 - } 335 - 336 - pci_read_config_word(pdev, PCI_COMMAND, &cmd); 337 - if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) { 338 - cmd &= ~PCI_COMMAND_INTX_DISABLE; 339 - pci_write_config_word(pdev, PCI_COMMAND, cmd); 340 - } 341 - 342 - ret = vfio_config_init(vdev); 343 - if (ret) { 344 - kfree(vdev->pci_saved_state); 345 - vdev->pci_saved_state = NULL; 346 - pci_disable_device(pdev); 347 - return ret; 348 - } 349 - 350 - msix_pos = pdev->msix_cap; 351 - if (msix_pos) { 352 - u16 flags; 353 - u32 table; 354 - 355 - pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags); 356 - pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table); 357 - 358 - vdev->msix_bar = table & PCI_MSIX_TABLE_BIR; 359 - vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET; 360 - vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; 361 - } else 362 - vdev->msix_bar = 0xFF; 363 - 364 - if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) 365 - vdev->has_vga = true; 366 124 367 125 if (vfio_pci_is_vga(pdev) && 368 126 pdev->vendor == PCI_VENDOR_ID_INTEL && ··· 115 383 ret = vfio_pci_igd_init(vdev); 116 384 if (ret && ret != -ENODEV) { 117 385 pci_warn(pdev, "Failed to setup Intel IGD regions\n"); 118 - goto disable_exit; 119 - } 120 - } 121 - 122 - vfio_pci_probe_mmaps(vdev); 123 - 124 - return 0; 125 - 126 - disable_exit: 127 - vfio_pci_disable(vdev); 128 - return ret; 129 - } 130 - 131 - static void vfio_pci_disable(struct vfio_pci_device *vdev) 132 - { 133 - struct pci_dev *pdev = vdev->pdev; 134 - struct vfio_pci_dummy_resource *dummy_res, *tmp; 135 - struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; 136 - int i, bar; 137 - 138 - /* Stop the device from further DMA */ 139 - pci_clear_master(pdev); 140 - 141 - vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | 142 - VFIO_IRQ_SET_ACTION_TRIGGER, 
143 - vdev->irq_type, 0, 0, NULL); 144 - 145 - /* Device closed, don't need mutex here */ 146 - list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, 147 - &vdev->ioeventfds_list, next) { 148 - vfio_virqfd_disable(&ioeventfd->virqfd); 149 - list_del(&ioeventfd->next); 150 - kfree(ioeventfd); 151 - } 152 - vdev->ioeventfds_nr = 0; 153 - 154 - vdev->virq_disabled = false; 155 - 156 - for (i = 0; i < vdev->num_regions; i++) 157 - vdev->region[i].ops->release(vdev, &vdev->region[i]); 158 - 159 - vdev->num_regions = 0; 160 - kfree(vdev->region); 161 - vdev->region = NULL; /* don't krealloc a freed pointer */ 162 - 163 - vfio_config_free(vdev); 164 - 165 - for (i = 0; i < PCI_STD_NUM_BARS; i++) { 166 - bar = i + PCI_STD_RESOURCES; 167 - if (!vdev->barmap[bar]) 168 - continue; 169 - pci_iounmap(pdev, vdev->barmap[bar]); 170 - pci_release_selected_regions(pdev, 1 << bar); 171 - vdev->barmap[bar] = NULL; 172 - } 173 - 174 - list_for_each_entry_safe(dummy_res, tmp, 175 - &vdev->dummy_resources_list, res_next) { 176 - list_del(&dummy_res->res_next); 177 - release_resource(&dummy_res->resource); 178 - kfree(dummy_res); 179 - } 180 - 181 - vdev->needs_reset = true; 182 - 183 - /* 184 - * If we have saved state, restore it. If we can reset the device, 185 - * even better. Resetting with current state seems better than 186 - * nothing, but saving and restoring current state without reset 187 - * is just busy work. 188 - */ 189 - if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) { 190 - pci_info(pdev, "%s: Couldn't reload saved state\n", __func__); 191 - 192 - if (!vdev->reset_works) 193 - goto out; 194 - 195 - pci_save_state(pdev); 196 - } 197 - 198 - /* 199 - * Disable INTx and MSI, presumably to avoid spurious interrupts 200 - * during reset. Stolen from pci_reset_function() 201 - */ 202 - pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); 203 - 204 - /* 205 - * Try to get the locks ourselves to prevent a deadlock. 
The 206 - * success of this is dependent on being able to lock the device, 207 - * which is not always possible. 208 - * We can not use the "try" reset interface here, which will 209 - * overwrite the previously restored configuration information. 210 - */ 211 - if (vdev->reset_works && pci_dev_trylock(pdev)) { 212 - if (!__pci_reset_function_locked(pdev)) 213 - vdev->needs_reset = false; 214 - pci_dev_unlock(pdev); 215 - } 216 - 217 - pci_restore_state(pdev); 218 - out: 219 - pci_disable_device(pdev); 220 - 221 - vfio_pci_try_bus_reset(vdev); 222 - 223 - if (!disable_idle_d3) 224 - vfio_pci_set_power_state(vdev, PCI_D3hot); 225 - } 226 - 227 - static struct pci_driver vfio_pci_driver; 228 - 229 - static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev) 230 - { 231 - struct pci_dev *physfn = pci_physfn(vdev->pdev); 232 - struct vfio_device *pf_dev; 233 - 234 - if (!vdev->pdev->is_virtfn) 235 - return NULL; 236 - 237 - pf_dev = vfio_device_get_from_dev(&physfn->dev); 238 - if (!pf_dev) 239 - return NULL; 240 - 241 - if (pci_dev_driver(physfn) != &vfio_pci_driver) { 242 - vfio_device_put(pf_dev); 243 - return NULL; 244 - } 245 - 246 - return container_of(pf_dev, struct vfio_pci_device, vdev); 247 - } 248 - 249 - static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) 250 - { 251 - struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev); 252 - 253 - if (!pf_vdev) 254 - return; 255 - 256 - mutex_lock(&pf_vdev->vf_token->lock); 257 - pf_vdev->vf_token->users += val; 258 - WARN_ON(pf_vdev->vf_token->users < 0); 259 - mutex_unlock(&pf_vdev->vf_token->lock); 260 - 261 - vfio_device_put(&pf_vdev->vdev); 262 - } 263 - 264 - static void vfio_pci_release(struct vfio_device *core_vdev) 265 - { 266 - struct vfio_pci_device *vdev = 267 - container_of(core_vdev, struct vfio_pci_device, vdev); 268 - 269 - mutex_lock(&vdev->reflck->lock); 270 - 271 - if (!(--vdev->refcnt)) { 272 - vfio_pci_vf_token_user_add(vdev, -1); 273 - 
vfio_spapr_pci_eeh_release(vdev->pdev); 274 - vfio_pci_disable(vdev); 275 - 276 - mutex_lock(&vdev->igate); 277 - if (vdev->err_trigger) { 278 - eventfd_ctx_put(vdev->err_trigger); 279 - vdev->err_trigger = NULL; 280 - } 281 - if (vdev->req_trigger) { 282 - eventfd_ctx_put(vdev->req_trigger); 283 - vdev->req_trigger = NULL; 284 - } 285 - mutex_unlock(&vdev->igate); 286 - } 287 - 288 - mutex_unlock(&vdev->reflck->lock); 289 - } 290 - 291 - static int vfio_pci_open(struct vfio_device *core_vdev) 292 - { 293 - struct vfio_pci_device *vdev = 294 - container_of(core_vdev, struct vfio_pci_device, vdev); 295 - int ret = 0; 296 - 297 - mutex_lock(&vdev->reflck->lock); 298 - 299 - if (!vdev->refcnt) { 300 - ret = vfio_pci_enable(vdev); 301 - if (ret) 302 - goto error; 303 - 304 - vfio_spapr_pci_eeh_open(vdev->pdev); 305 - vfio_pci_vf_token_user_add(vdev, 1); 306 - } 307 - vdev->refcnt++; 308 - error: 309 - mutex_unlock(&vdev->reflck->lock); 310 - return ret; 311 - } 312 - 313 - static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) 314 - { 315 - if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { 316 - u8 pin; 317 - 318 - if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || 319 - vdev->nointx || vdev->pdev->is_virtfn) 320 - return 0; 321 - 322 - pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); 323 - 324 - return pin ? 
1 : 0; 325 - } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { 326 - u8 pos; 327 - u16 flags; 328 - 329 - pos = vdev->pdev->msi_cap; 330 - if (pos) { 331 - pci_read_config_word(vdev->pdev, 332 - pos + PCI_MSI_FLAGS, &flags); 333 - return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1); 334 - } 335 - } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { 336 - u8 pos; 337 - u16 flags; 338 - 339 - pos = vdev->pdev->msix_cap; 340 - if (pos) { 341 - pci_read_config_word(vdev->pdev, 342 - pos + PCI_MSIX_FLAGS, &flags); 343 - 344 - return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; 345 - } 346 - } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { 347 - if (pci_is_pcie(vdev->pdev)) 348 - return 1; 349 - } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { 350 - return 1; 351 - } 352 - 353 - return 0; 354 - } 355 - 356 - static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) 357 - { 358 - (*(int *)data)++; 359 - return 0; 360 - } 361 - 362 - struct vfio_pci_fill_info { 363 - int max; 364 - int cur; 365 - struct vfio_pci_dependent_device *devices; 366 - }; 367 - 368 - static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) 369 - { 370 - struct vfio_pci_fill_info *fill = data; 371 - struct iommu_group *iommu_group; 372 - 373 - if (fill->cur == fill->max) 374 - return -EAGAIN; /* Something changed, try again */ 375 - 376 - iommu_group = iommu_group_get(&pdev->dev); 377 - if (!iommu_group) 378 - return -EPERM; /* Cannot reset non-isolated devices */ 379 - 380 - fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); 381 - fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); 382 - fill->devices[fill->cur].bus = pdev->bus->number; 383 - fill->devices[fill->cur].devfn = pdev->devfn; 384 - fill->cur++; 385 - iommu_group_put(iommu_group); 386 - return 0; 387 - } 388 - 389 - struct vfio_pci_group_entry { 390 - struct vfio_group *group; 391 - int id; 392 - }; 393 - 394 - struct vfio_pci_group_info { 395 - int count; 396 - struct vfio_pci_group_entry *groups; 397 - }; 398 - 
399 - static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data) 400 - { 401 - struct vfio_pci_group_info *info = data; 402 - struct iommu_group *group; 403 - int id, i; 404 - 405 - group = iommu_group_get(&pdev->dev); 406 - if (!group) 407 - return -EPERM; 408 - 409 - id = iommu_group_id(group); 410 - 411 - for (i = 0; i < info->count; i++) 412 - if (info->groups[i].id == id) 413 - break; 414 - 415 - iommu_group_put(group); 416 - 417 - return (i == info->count) ? -EINVAL : 0; 418 - } 419 - 420 - static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) 421 - { 422 - for (; pdev; pdev = pdev->bus->self) 423 - if (pdev->bus == slot->bus) 424 - return (pdev->slot == slot); 425 - return false; 426 - } 427 - 428 - struct vfio_pci_walk_info { 429 - int (*fn)(struct pci_dev *, void *data); 430 - void *data; 431 - struct pci_dev *pdev; 432 - bool slot; 433 - int ret; 434 - }; 435 - 436 - static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data) 437 - { 438 - struct vfio_pci_walk_info *walk = data; 439 - 440 - if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot)) 441 - walk->ret = walk->fn(pdev, walk->data); 442 - 443 - return walk->ret; 444 - } 445 - 446 - static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, 447 - int (*fn)(struct pci_dev *, 448 - void *data), void *data, 449 - bool slot) 450 - { 451 - struct vfio_pci_walk_info walk = { 452 - .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0, 453 - }; 454 - 455 - pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk); 456 - 457 - return walk.ret; 458 - } 459 - 460 - static int msix_mmappable_cap(struct vfio_pci_device *vdev, 461 - struct vfio_info_cap *caps) 462 - { 463 - struct vfio_info_cap_header header = { 464 - .id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, 465 - .version = 1 466 - }; 467 - 468 - return vfio_info_add_capability(caps, &header, sizeof(header)); 469 - } 470 - 471 - int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, 472 - 
unsigned int type, unsigned int subtype, 473 - const struct vfio_pci_regops *ops, 474 - size_t size, u32 flags, void *data) 475 - { 476 - struct vfio_pci_region *region; 477 - 478 - region = krealloc(vdev->region, 479 - (vdev->num_regions + 1) * sizeof(*region), 480 - GFP_KERNEL); 481 - if (!region) 482 - return -ENOMEM; 483 - 484 - vdev->region = region; 485 - vdev->region[vdev->num_regions].type = type; 486 - vdev->region[vdev->num_regions].subtype = subtype; 487 - vdev->region[vdev->num_regions].ops = ops; 488 - vdev->region[vdev->num_regions].size = size; 489 - vdev->region[vdev->num_regions].flags = flags; 490 - vdev->region[vdev->num_regions].data = data; 491 - 492 - vdev->num_regions++; 493 - 494 - return 0; 495 - } 496 - 497 - struct vfio_devices { 498 - struct vfio_pci_device **devices; 499 - int cur_index; 500 - int max_index; 501 - }; 502 - 503 - static long vfio_pci_ioctl(struct vfio_device *core_vdev, 504 - unsigned int cmd, unsigned long arg) 505 - { 506 - struct vfio_pci_device *vdev = 507 - container_of(core_vdev, struct vfio_pci_device, vdev); 508 - unsigned long minsz; 509 - 510 - if (cmd == VFIO_DEVICE_GET_INFO) { 511 - struct vfio_device_info info; 512 - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 513 - unsigned long capsz; 514 - int ret; 515 - 516 - minsz = offsetofend(struct vfio_device_info, num_irqs); 517 - 518 - /* For backward compatibility, cannot require this */ 519 - capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); 520 - 521 - if (copy_from_user(&info, (void __user *)arg, minsz)) 522 - return -EFAULT; 523 - 524 - if (info.argsz < minsz) 525 - return -EINVAL; 526 - 527 - if (info.argsz >= capsz) { 528 - minsz = capsz; 529 - info.cap_offset = 0; 530 - } 531 - 532 - info.flags = VFIO_DEVICE_FLAGS_PCI; 533 - 534 - if (vdev->reset_works) 535 - info.flags |= VFIO_DEVICE_FLAGS_RESET; 536 - 537 - info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; 538 - info.num_irqs = VFIO_PCI_NUM_IRQS; 539 - 540 - ret = 
vfio_pci_info_zdev_add_caps(vdev, &caps); 541 - if (ret && ret != -ENODEV) { 542 - pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n"); 386 + vfio_pci_core_disable(vdev); 543 387 return ret; 544 388 } 545 - 546 - if (caps.size) { 547 - info.flags |= VFIO_DEVICE_FLAGS_CAPS; 548 - if (info.argsz < sizeof(info) + caps.size) { 549 - info.argsz = sizeof(info) + caps.size; 550 - } else { 551 - vfio_info_cap_shift(&caps, sizeof(info)); 552 - if (copy_to_user((void __user *)arg + 553 - sizeof(info), caps.buf, 554 - caps.size)) { 555 - kfree(caps.buf); 556 - return -EFAULT; 557 - } 558 - info.cap_offset = sizeof(info); 559 - } 560 - 561 - kfree(caps.buf); 562 - } 563 - 564 - return copy_to_user((void __user *)arg, &info, minsz) ? 565 - -EFAULT : 0; 566 - 567 - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { 568 - struct pci_dev *pdev = vdev->pdev; 569 - struct vfio_region_info info; 570 - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 571 - int i, ret; 572 - 573 - minsz = offsetofend(struct vfio_region_info, offset); 574 - 575 - if (copy_from_user(&info, (void __user *)arg, minsz)) 576 - return -EFAULT; 577 - 578 - if (info.argsz < minsz) 579 - return -EINVAL; 580 - 581 - switch (info.index) { 582 - case VFIO_PCI_CONFIG_REGION_INDEX: 583 - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 584 - info.size = pdev->cfg_size; 585 - info.flags = VFIO_REGION_INFO_FLAG_READ | 586 - VFIO_REGION_INFO_FLAG_WRITE; 587 - break; 588 - case VFIO_PCI_BAR0_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: 589 - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 590 - info.size = pci_resource_len(pdev, info.index); 591 - if (!info.size) { 592 - info.flags = 0; 593 - break; 594 - } 595 - 596 - info.flags = VFIO_REGION_INFO_FLAG_READ | 597 - VFIO_REGION_INFO_FLAG_WRITE; 598 - if (vdev->bar_mmap_supported[info.index]) { 599 - info.flags |= VFIO_REGION_INFO_FLAG_MMAP; 600 - if (info.index == vdev->msix_bar) { 601 - ret = msix_mmappable_cap(vdev, &caps); 602 - if (ret) 603 - return ret; 604 - } 605 - } 606 - 607 - break; 608 - case VFIO_PCI_ROM_REGION_INDEX: 609 - { 610 - void __iomem *io; 611 - size_t size; 612 - u16 cmd; 613 - 614 - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 615 - info.flags = 0; 616 - 617 - /* Report the BAR size, not the ROM size */ 618 - info.size = pci_resource_len(pdev, info.index); 619 - if (!info.size) { 620 - /* Shadow ROMs appear as PCI option ROMs */ 621 - if (pdev->resource[PCI_ROM_RESOURCE].flags & 622 - IORESOURCE_ROM_SHADOW) 623 - info.size = 0x20000; 624 - else 625 - break; 626 - } 627 - 628 - /* 629 - * Is it really there? Enable memory decode for 630 - * implicit access in pci_map_rom(). 
631 - */ 632 - cmd = vfio_pci_memory_lock_and_enable(vdev); 633 - io = pci_map_rom(pdev, &size); 634 - if (io) { 635 - info.flags = VFIO_REGION_INFO_FLAG_READ; 636 - pci_unmap_rom(pdev, io); 637 - } else { 638 - info.size = 0; 639 - } 640 - vfio_pci_memory_unlock_and_restore(vdev, cmd); 641 - 642 - break; 643 - } 644 - case VFIO_PCI_VGA_REGION_INDEX: 645 - if (!vdev->has_vga) 646 - return -EINVAL; 647 - 648 - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 649 - info.size = 0xc0000; 650 - info.flags = VFIO_REGION_INFO_FLAG_READ | 651 - VFIO_REGION_INFO_FLAG_WRITE; 652 - 653 - break; 654 - default: 655 - { 656 - struct vfio_region_info_cap_type cap_type = { 657 - .header.id = VFIO_REGION_INFO_CAP_TYPE, 658 - .header.version = 1 }; 659 - 660 - if (info.index >= 661 - VFIO_PCI_NUM_REGIONS + vdev->num_regions) 662 - return -EINVAL; 663 - info.index = array_index_nospec(info.index, 664 - VFIO_PCI_NUM_REGIONS + 665 - vdev->num_regions); 666 - 667 - i = info.index - VFIO_PCI_NUM_REGIONS; 668 - 669 - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 670 - info.size = vdev->region[i].size; 671 - info.flags = vdev->region[i].flags; 672 - 673 - cap_type.type = vdev->region[i].type; 674 - cap_type.subtype = vdev->region[i].subtype; 675 - 676 - ret = vfio_info_add_capability(&caps, &cap_type.header, 677 - sizeof(cap_type)); 678 - if (ret) 679 - return ret; 680 - 681 - if (vdev->region[i].ops->add_capability) { 682 - ret = vdev->region[i].ops->add_capability(vdev, 683 - &vdev->region[i], &caps); 684 - if (ret) 685 - return ret; 686 - } 687 - } 688 - } 689 - 690 - if (caps.size) { 691 - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; 692 - if (info.argsz < sizeof(info) + caps.size) { 693 - info.argsz = sizeof(info) + caps.size; 694 - info.cap_offset = 0; 695 - } else { 696 - vfio_info_cap_shift(&caps, sizeof(info)); 697 - if (copy_to_user((void __user *)arg + 698 - sizeof(info), caps.buf, 699 - caps.size)) { 700 - kfree(caps.buf); 701 - return -EFAULT; 702 - } 703 - 
info.cap_offset = sizeof(info); 704 - } 705 - 706 - kfree(caps.buf); 707 - } 708 - 709 - return copy_to_user((void __user *)arg, &info, minsz) ? 710 - -EFAULT : 0; 711 - 712 - } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { 713 - struct vfio_irq_info info; 714 - 715 - minsz = offsetofend(struct vfio_irq_info, count); 716 - 717 - if (copy_from_user(&info, (void __user *)arg, minsz)) 718 - return -EFAULT; 719 - 720 - if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) 721 - return -EINVAL; 722 - 723 - switch (info.index) { 724 - case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: 725 - case VFIO_PCI_REQ_IRQ_INDEX: 726 - break; 727 - case VFIO_PCI_ERR_IRQ_INDEX: 728 - if (pci_is_pcie(vdev->pdev)) 729 - break; 730 - fallthrough; 731 - default: 732 - return -EINVAL; 733 - } 734 - 735 - info.flags = VFIO_IRQ_INFO_EVENTFD; 736 - 737 - info.count = vfio_pci_get_irq_count(vdev, info.index); 738 - 739 - if (info.index == VFIO_PCI_INTX_IRQ_INDEX) 740 - info.flags |= (VFIO_IRQ_INFO_MASKABLE | 741 - VFIO_IRQ_INFO_AUTOMASKED); 742 - else 743 - info.flags |= VFIO_IRQ_INFO_NORESIZE; 744 - 745 - return copy_to_user((void __user *)arg, &info, minsz) ? 
746 - -EFAULT : 0; 747 - 748 - } else if (cmd == VFIO_DEVICE_SET_IRQS) { 749 - struct vfio_irq_set hdr; 750 - u8 *data = NULL; 751 - int max, ret = 0; 752 - size_t data_size = 0; 753 - 754 - minsz = offsetofend(struct vfio_irq_set, count); 755 - 756 - if (copy_from_user(&hdr, (void __user *)arg, minsz)) 757 - return -EFAULT; 758 - 759 - max = vfio_pci_get_irq_count(vdev, hdr.index); 760 - 761 - ret = vfio_set_irqs_validate_and_prepare(&hdr, max, 762 - VFIO_PCI_NUM_IRQS, &data_size); 763 - if (ret) 764 - return ret; 765 - 766 - if (data_size) { 767 - data = memdup_user((void __user *)(arg + minsz), 768 - data_size); 769 - if (IS_ERR(data)) 770 - return PTR_ERR(data); 771 - } 772 - 773 - mutex_lock(&vdev->igate); 774 - 775 - ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, 776 - hdr.start, hdr.count, data); 777 - 778 - mutex_unlock(&vdev->igate); 779 - kfree(data); 780 - 781 - return ret; 782 - 783 - } else if (cmd == VFIO_DEVICE_RESET) { 784 - int ret; 785 - 786 - if (!vdev->reset_works) 787 - return -EINVAL; 788 - 789 - vfio_pci_zap_and_down_write_memory_lock(vdev); 790 - ret = pci_try_reset_function(vdev->pdev); 791 - up_write(&vdev->memory_lock); 792 - 793 - return ret; 794 - 795 - } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { 796 - struct vfio_pci_hot_reset_info hdr; 797 - struct vfio_pci_fill_info fill = { 0 }; 798 - struct vfio_pci_dependent_device *devices = NULL; 799 - bool slot = false; 800 - int ret = 0; 801 - 802 - minsz = offsetofend(struct vfio_pci_hot_reset_info, count); 803 - 804 - if (copy_from_user(&hdr, (void __user *)arg, minsz)) 805 - return -EFAULT; 806 - 807 - if (hdr.argsz < minsz) 808 - return -EINVAL; 809 - 810 - hdr.flags = 0; 811 - 812 - /* Can we do a slot or bus reset or neither? */ 813 - if (!pci_probe_reset_slot(vdev->pdev->slot)) 814 - slot = true; 815 - else if (pci_probe_reset_bus(vdev->pdev->bus)) 816 - return -ENODEV; 817 - 818 - /* How many devices are affected? 
*/ 819 - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 820 - vfio_pci_count_devs, 821 - &fill.max, slot); 822 - if (ret) 823 - return ret; 824 - 825 - WARN_ON(!fill.max); /* Should always be at least one */ 826 - 827 - /* 828 - * If there's enough space, fill it now, otherwise return 829 - * -ENOSPC and the number of devices affected. 830 - */ 831 - if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { 832 - ret = -ENOSPC; 833 - hdr.count = fill.max; 834 - goto reset_info_exit; 835 - } 836 - 837 - devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); 838 - if (!devices) 839 - return -ENOMEM; 840 - 841 - fill.devices = devices; 842 - 843 - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 844 - vfio_pci_fill_devs, 845 - &fill, slot); 846 - 847 - /* 848 - * If a device was removed between counting and filling, 849 - * we may come up short of fill.max. If a device was 850 - * added, we'll have a return of -EAGAIN above. 851 - */ 852 - if (!ret) 853 - hdr.count = fill.cur; 854 - 855 - reset_info_exit: 856 - if (copy_to_user((void __user *)arg, &hdr, minsz)) 857 - ret = -EFAULT; 858 - 859 - if (!ret) { 860 - if (copy_to_user((void __user *)(arg + minsz), devices, 861 - hdr.count * sizeof(*devices))) 862 - ret = -EFAULT; 863 - } 864 - 865 - kfree(devices); 866 - return ret; 867 - 868 - } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { 869 - struct vfio_pci_hot_reset hdr; 870 - int32_t *group_fds; 871 - struct vfio_pci_group_entry *groups; 872 - struct vfio_pci_group_info info; 873 - struct vfio_devices devs = { .cur_index = 0 }; 874 - bool slot = false; 875 - int i, group_idx, mem_idx = 0, count = 0, ret = 0; 876 - 877 - minsz = offsetofend(struct vfio_pci_hot_reset, count); 878 - 879 - if (copy_from_user(&hdr, (void __user *)arg, minsz)) 880 - return -EFAULT; 881 - 882 - if (hdr.argsz < minsz || hdr.flags) 883 - return -EINVAL; 884 - 885 - /* Can we do a slot or bus reset or neither? 
*/ 886 - if (!pci_probe_reset_slot(vdev->pdev->slot)) 887 - slot = true; 888 - else if (pci_probe_reset_bus(vdev->pdev->bus)) 889 - return -ENODEV; 890 - 891 - /* 892 - * We can't let userspace give us an arbitrarily large 893 - * buffer to copy, so verify how many we think there 894 - * could be. Note groups can have multiple devices so 895 - * one group per device is the max. 896 - */ 897 - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 898 - vfio_pci_count_devs, 899 - &count, slot); 900 - if (ret) 901 - return ret; 902 - 903 - /* Somewhere between 1 and count is OK */ 904 - if (!hdr.count || hdr.count > count) 905 - return -EINVAL; 906 - 907 - group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); 908 - groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL); 909 - if (!group_fds || !groups) { 910 - kfree(group_fds); 911 - kfree(groups); 912 - return -ENOMEM; 913 - } 914 - 915 - if (copy_from_user(group_fds, (void __user *)(arg + minsz), 916 - hdr.count * sizeof(*group_fds))) { 917 - kfree(group_fds); 918 - kfree(groups); 919 - return -EFAULT; 920 - } 921 - 922 - /* 923 - * For each group_fd, get the group through the vfio external 924 - * user interface and store the group and iommu ID. This 925 - * ensures the group is held across the reset. 
926 - */ 927 - for (group_idx = 0; group_idx < hdr.count; group_idx++) { 928 - struct vfio_group *group; 929 - struct fd f = fdget(group_fds[group_idx]); 930 - if (!f.file) { 931 - ret = -EBADF; 932 - break; 933 - } 934 - 935 - group = vfio_group_get_external_user(f.file); 936 - fdput(f); 937 - if (IS_ERR(group)) { 938 - ret = PTR_ERR(group); 939 - break; 940 - } 941 - 942 - groups[group_idx].group = group; 943 - groups[group_idx].id = 944 - vfio_external_user_iommu_id(group); 945 - } 946 - 947 - kfree(group_fds); 948 - 949 - /* release reference to groups on error */ 950 - if (ret) 951 - goto hot_reset_release; 952 - 953 - info.count = hdr.count; 954 - info.groups = groups; 955 - 956 - /* 957 - * Test whether all the affected devices are contained 958 - * by the set of groups provided by the user. 959 - */ 960 - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 961 - vfio_pci_validate_devs, 962 - &info, slot); 963 - if (ret) 964 - goto hot_reset_release; 965 - 966 - devs.max_index = count; 967 - devs.devices = kcalloc(count, sizeof(struct vfio_device *), 968 - GFP_KERNEL); 969 - if (!devs.devices) { 970 - ret = -ENOMEM; 971 - goto hot_reset_release; 972 - } 973 - 974 - /* 975 - * We need to get memory_lock for each device, but devices 976 - * can share mmap_lock, therefore we need to zap and hold 977 - * the vma_lock for each device, and only then get each 978 - * memory_lock. 
979 - */ 980 - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 981 - vfio_pci_try_zap_and_vma_lock_cb, 982 - &devs, slot); 983 - if (ret) 984 - goto hot_reset_release; 985 - 986 - for (; mem_idx < devs.cur_index; mem_idx++) { 987 - struct vfio_pci_device *tmp = devs.devices[mem_idx]; 988 - 989 - ret = down_write_trylock(&tmp->memory_lock); 990 - if (!ret) { 991 - ret = -EBUSY; 992 - goto hot_reset_release; 993 - } 994 - mutex_unlock(&tmp->vma_lock); 995 - } 996 - 997 - /* User has access, do the reset */ 998 - ret = pci_reset_bus(vdev->pdev); 999 - 1000 - hot_reset_release: 1001 - for (i = 0; i < devs.cur_index; i++) { 1002 - struct vfio_pci_device *tmp = devs.devices[i]; 1003 - 1004 - if (i < mem_idx) 1005 - up_write(&tmp->memory_lock); 1006 - else 1007 - mutex_unlock(&tmp->vma_lock); 1008 - vfio_device_put(&tmp->vdev); 1009 - } 1010 - kfree(devs.devices); 1011 - 1012 - for (group_idx--; group_idx >= 0; group_idx--) 1013 - vfio_group_put_external_user(groups[group_idx].group); 1014 - 1015 - kfree(groups); 1016 - return ret; 1017 - } else if (cmd == VFIO_DEVICE_IOEVENTFD) { 1018 - struct vfio_device_ioeventfd ioeventfd; 1019 - int count; 1020 - 1021 - minsz = offsetofend(struct vfio_device_ioeventfd, fd); 1022 - 1023 - if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) 1024 - return -EFAULT; 1025 - 1026 - if (ioeventfd.argsz < minsz) 1027 - return -EINVAL; 1028 - 1029 - if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) 1030 - return -EINVAL; 1031 - 1032 - count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; 1033 - 1034 - if (hweight8(count) != 1 || ioeventfd.fd < -1) 1035 - return -EINVAL; 1036 - 1037 - return vfio_pci_ioeventfd(vdev, ioeventfd.offset, 1038 - ioeventfd.data, count, ioeventfd.fd); 1039 - } else if (cmd == VFIO_DEVICE_FEATURE) { 1040 - struct vfio_device_feature feature; 1041 - uuid_t uuid; 1042 - 1043 - minsz = offsetofend(struct vfio_device_feature, flags); 1044 - 1045 - if (copy_from_user(&feature, (void __user *)arg, minsz)) 
1046 - return -EFAULT; 1047 - 1048 - if (feature.argsz < minsz) 1049 - return -EINVAL; 1050 - 1051 - /* Check unknown flags */ 1052 - if (feature.flags & ~(VFIO_DEVICE_FEATURE_MASK | 1053 - VFIO_DEVICE_FEATURE_SET | 1054 - VFIO_DEVICE_FEATURE_GET | 1055 - VFIO_DEVICE_FEATURE_PROBE)) 1056 - return -EINVAL; 1057 - 1058 - /* GET & SET are mutually exclusive except with PROBE */ 1059 - if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) && 1060 - (feature.flags & VFIO_DEVICE_FEATURE_SET) && 1061 - (feature.flags & VFIO_DEVICE_FEATURE_GET)) 1062 - return -EINVAL; 1063 - 1064 - switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) { 1065 - case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: 1066 - if (!vdev->vf_token) 1067 - return -ENOTTY; 1068 - 1069 - /* 1070 - * We do not support GET of the VF Token UUID as this 1071 - * could expose the token of the previous device user. 1072 - */ 1073 - if (feature.flags & VFIO_DEVICE_FEATURE_GET) 1074 - return -EINVAL; 1075 - 1076 - if (feature.flags & VFIO_DEVICE_FEATURE_PROBE) 1077 - return 0; 1078 - 1079 - /* Don't SET unless told to do so */ 1080 - if (!(feature.flags & VFIO_DEVICE_FEATURE_SET)) 1081 - return -EINVAL; 1082 - 1083 - if (feature.argsz < minsz + sizeof(uuid)) 1084 - return -EINVAL; 1085 - 1086 - if (copy_from_user(&uuid, (void __user *)(arg + minsz), 1087 - sizeof(uuid))) 1088 - return -EFAULT; 1089 - 1090 - mutex_lock(&vdev->vf_token->lock); 1091 - uuid_copy(&vdev->vf_token->uuid, &uuid); 1092 - mutex_unlock(&vdev->vf_token->lock); 1093 - 1094 - return 0; 1095 - default: 1096 - return -ENOTTY; 1097 - } 1098 389 } 1099 390 1100 - return -ENOTTY; 1101 - } 1102 - 1103 - static ssize_t vfio_pci_rw(struct vfio_pci_device *vdev, char __user *buf, 1104 - size_t count, loff_t *ppos, bool iswrite) 1105 - { 1106 - unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 1107 - 1108 - if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) 1109 - return -EINVAL; 1110 - 1111 - switch (index) { 1112 - case VFIO_PCI_CONFIG_REGION_INDEX: 1113 - 
return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); 1114 - 1115 - case VFIO_PCI_ROM_REGION_INDEX: 1116 - if (iswrite) 1117 - return -EINVAL; 1118 - return vfio_pci_bar_rw(vdev, buf, count, ppos, false); 1119 - 1120 - case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: 1121 - return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); 1122 - 1123 - case VFIO_PCI_VGA_REGION_INDEX: 1124 - return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); 1125 - default: 1126 - index -= VFIO_PCI_NUM_REGIONS; 1127 - return vdev->region[index].ops->rw(vdev, buf, 1128 - count, ppos, iswrite); 1129 - } 1130 - 1131 - return -EINVAL; 1132 - } 1133 - 1134 - static ssize_t vfio_pci_read(struct vfio_device *core_vdev, char __user *buf, 1135 - size_t count, loff_t *ppos) 1136 - { 1137 - struct vfio_pci_device *vdev = 1138 - container_of(core_vdev, struct vfio_pci_device, vdev); 1139 - 1140 - if (!count) 1141 - return 0; 1142 - 1143 - return vfio_pci_rw(vdev, buf, count, ppos, false); 1144 - } 1145 - 1146 - static ssize_t vfio_pci_write(struct vfio_device *core_vdev, const char __user *buf, 1147 - size_t count, loff_t *ppos) 1148 - { 1149 - struct vfio_pci_device *vdev = 1150 - container_of(core_vdev, struct vfio_pci_device, vdev); 1151 - 1152 - if (!count) 1153 - return 0; 1154 - 1155 - return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true); 1156 - } 1157 - 1158 - /* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ 1159 - static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try) 1160 - { 1161 - struct vfio_pci_mmap_vma *mmap_vma, *tmp; 1162 - 1163 - /* 1164 - * Lock ordering: 1165 - * vma_lock is nested under mmap_lock for vm_ops callback paths. 1166 - * The memory_lock semaphore is used by both code paths calling 1167 - * into this function to zap vmas and the vm_ops.fault callback 1168 - * to protect the memory enable state of the device. 
1169 - * 1170 - * When zapping vmas we need to maintain the mmap_lock => vma_lock 1171 - * ordering, which requires using vma_lock to walk vma_list to 1172 - * acquire an mm, then dropping vma_lock to get the mmap_lock and 1173 - * reacquiring vma_lock. This logic is derived from similar 1174 - * requirements in uverbs_user_mmap_disassociate(). 1175 - * 1176 - * mmap_lock must always be the top-level lock when it is taken. 1177 - * Therefore we can only hold the memory_lock write lock when 1178 - * vma_list is empty, as we'd need to take mmap_lock to clear 1179 - * entries. vma_list can only be guaranteed empty when holding 1180 - * vma_lock, thus memory_lock is nested under vma_lock. 1181 - * 1182 - * This enables the vm_ops.fault callback to acquire vma_lock, 1183 - * followed by memory_lock read lock, while already holding 1184 - * mmap_lock without risk of deadlock. 1185 - */ 1186 - while (1) { 1187 - struct mm_struct *mm = NULL; 1188 - 1189 - if (try) { 1190 - if (!mutex_trylock(&vdev->vma_lock)) 1191 - return 0; 1192 - } else { 1193 - mutex_lock(&vdev->vma_lock); 1194 - } 1195 - while (!list_empty(&vdev->vma_list)) { 1196 - mmap_vma = list_first_entry(&vdev->vma_list, 1197 - struct vfio_pci_mmap_vma, 1198 - vma_next); 1199 - mm = mmap_vma->vma->vm_mm; 1200 - if (mmget_not_zero(mm)) 1201 - break; 1202 - 1203 - list_del(&mmap_vma->vma_next); 1204 - kfree(mmap_vma); 1205 - mm = NULL; 1206 - } 1207 - if (!mm) 1208 - return 1; 1209 - mutex_unlock(&vdev->vma_lock); 1210 - 1211 - if (try) { 1212 - if (!mmap_read_trylock(mm)) { 1213 - mmput(mm); 1214 - return 0; 1215 - } 1216 - } else { 1217 - mmap_read_lock(mm); 1218 - } 1219 - if (try) { 1220 - if (!mutex_trylock(&vdev->vma_lock)) { 1221 - mmap_read_unlock(mm); 1222 - mmput(mm); 1223 - return 0; 1224 - } 1225 - } else { 1226 - mutex_lock(&vdev->vma_lock); 1227 - } 1228 - list_for_each_entry_safe(mmap_vma, tmp, 1229 - &vdev->vma_list, vma_next) { 1230 - struct vm_area_struct *vma = mmap_vma->vma; 1231 - 1232 - if 
(vma->vm_mm != mm) 1233 - continue; 1234 - 1235 - list_del(&mmap_vma->vma_next); 1236 - kfree(mmap_vma); 1237 - 1238 - zap_vma_ptes(vma, vma->vm_start, 1239 - vma->vm_end - vma->vm_start); 1240 - } 1241 - mutex_unlock(&vdev->vma_lock); 1242 - mmap_read_unlock(mm); 1243 - mmput(mm); 1244 - } 1245 - } 1246 - 1247 - void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_device *vdev) 1248 - { 1249 - vfio_pci_zap_and_vma_lock(vdev, false); 1250 - down_write(&vdev->memory_lock); 1251 - mutex_unlock(&vdev->vma_lock); 1252 - } 1253 - 1254 - u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev) 1255 - { 1256 - u16 cmd; 1257 - 1258 - down_write(&vdev->memory_lock); 1259 - pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd); 1260 - if (!(cmd & PCI_COMMAND_MEMORY)) 1261 - pci_write_config_word(vdev->pdev, PCI_COMMAND, 1262 - cmd | PCI_COMMAND_MEMORY); 1263 - 1264 - return cmd; 1265 - } 1266 - 1267 - void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, u16 cmd) 1268 - { 1269 - pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd); 1270 - up_write(&vdev->memory_lock); 1271 - } 1272 - 1273 - /* Caller holds vma_lock */ 1274 - static int __vfio_pci_add_vma(struct vfio_pci_device *vdev, 1275 - struct vm_area_struct *vma) 1276 - { 1277 - struct vfio_pci_mmap_vma *mmap_vma; 1278 - 1279 - mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL); 1280 - if (!mmap_vma) 1281 - return -ENOMEM; 1282 - 1283 - mmap_vma->vma = vma; 1284 - list_add(&mmap_vma->vma_next, &vdev->vma_list); 391 + vfio_pci_core_finish_enable(vdev); 1285 392 1286 393 return 0; 1287 - } 1288 - 1289 - /* 1290 - * Zap mmaps on open so that we can fault them in on access and therefore 1291 - * our vma_list only tracks mappings accessed since last zap. 
1292 - */ 1293 - static void vfio_pci_mmap_open(struct vm_area_struct *vma) 1294 - { 1295 - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1296 - } 1297 - 1298 - static void vfio_pci_mmap_close(struct vm_area_struct *vma) 1299 - { 1300 - struct vfio_pci_device *vdev = vma->vm_private_data; 1301 - struct vfio_pci_mmap_vma *mmap_vma; 1302 - 1303 - mutex_lock(&vdev->vma_lock); 1304 - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { 1305 - if (mmap_vma->vma == vma) { 1306 - list_del(&mmap_vma->vma_next); 1307 - kfree(mmap_vma); 1308 - break; 1309 - } 1310 - } 1311 - mutex_unlock(&vdev->vma_lock); 1312 - } 1313 - 1314 - static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) 1315 - { 1316 - struct vm_area_struct *vma = vmf->vma; 1317 - struct vfio_pci_device *vdev = vma->vm_private_data; 1318 - struct vfio_pci_mmap_vma *mmap_vma; 1319 - vm_fault_t ret = VM_FAULT_NOPAGE; 1320 - 1321 - mutex_lock(&vdev->vma_lock); 1322 - down_read(&vdev->memory_lock); 1323 - 1324 - if (!__vfio_pci_memory_enabled(vdev)) { 1325 - ret = VM_FAULT_SIGBUS; 1326 - goto up_out; 1327 - } 1328 - 1329 - /* 1330 - * We populate the whole vma on fault, so we need to test whether 1331 - * the vma has already been mapped, such as for concurrent faults 1332 - * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if 1333 - * we ask it to fill the same range again. 
1334 - */ 1335 - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { 1336 - if (mmap_vma->vma == vma) 1337 - goto up_out; 1338 - } 1339 - 1340 - if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, 1341 - vma->vm_end - vma->vm_start, 1342 - vma->vm_page_prot)) { 1343 - ret = VM_FAULT_SIGBUS; 1344 - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1345 - goto up_out; 1346 - } 1347 - 1348 - if (__vfio_pci_add_vma(vdev, vma)) { 1349 - ret = VM_FAULT_OOM; 1350 - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1351 - } 1352 - 1353 - up_out: 1354 - up_read(&vdev->memory_lock); 1355 - mutex_unlock(&vdev->vma_lock); 1356 - return ret; 1357 - } 1358 - 1359 - static const struct vm_operations_struct vfio_pci_mmap_ops = { 1360 - .open = vfio_pci_mmap_open, 1361 - .close = vfio_pci_mmap_close, 1362 - .fault = vfio_pci_mmap_fault, 1363 - }; 1364 - 1365 - static int vfio_pci_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) 1366 - { 1367 - struct vfio_pci_device *vdev = 1368 - container_of(core_vdev, struct vfio_pci_device, vdev); 1369 - struct pci_dev *pdev = vdev->pdev; 1370 - unsigned int index; 1371 - u64 phys_len, req_len, pgoff, req_start; 1372 - int ret; 1373 - 1374 - index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 1375 - 1376 - if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) 1377 - return -EINVAL; 1378 - if (vma->vm_end < vma->vm_start) 1379 - return -EINVAL; 1380 - if ((vma->vm_flags & VM_SHARED) == 0) 1381 - return -EINVAL; 1382 - if (index >= VFIO_PCI_NUM_REGIONS) { 1383 - int regnum = index - VFIO_PCI_NUM_REGIONS; 1384 - struct vfio_pci_region *region = vdev->region + regnum; 1385 - 1386 - if (region->ops && region->ops->mmap && 1387 - (region->flags & VFIO_REGION_INFO_FLAG_MMAP)) 1388 - return region->ops->mmap(vdev, region, vma); 1389 - return -EINVAL; 1390 - } 1391 - if (index >= VFIO_PCI_ROM_REGION_INDEX) 1392 - return -EINVAL; 1393 - if (!vdev->bar_mmap_supported[index]) 1394 - return 
-EINVAL; 1395 - 1396 - phys_len = PAGE_ALIGN(pci_resource_len(pdev, index)); 1397 - req_len = vma->vm_end - vma->vm_start; 1398 - pgoff = vma->vm_pgoff & 1399 - ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 1400 - req_start = pgoff << PAGE_SHIFT; 1401 - 1402 - if (req_start + req_len > phys_len) 1403 - return -EINVAL; 1404 - 1405 - /* 1406 - * Even though we don't make use of the barmap for the mmap, 1407 - * we need to request the region and the barmap tracks that. 1408 - */ 1409 - if (!vdev->barmap[index]) { 1410 - ret = pci_request_selected_regions(pdev, 1411 - 1 << index, "vfio-pci"); 1412 - if (ret) 1413 - return ret; 1414 - 1415 - vdev->barmap[index] = pci_iomap(pdev, index, 0); 1416 - if (!vdev->barmap[index]) { 1417 - pci_release_selected_regions(pdev, 1 << index); 1418 - return -ENOMEM; 1419 - } 1420 - } 1421 - 1422 - vma->vm_private_data = vdev; 1423 - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 1424 - vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; 1425 - 1426 - /* 1427 - * See remap_pfn_range(), called from vfio_pci_fault() but we can't 1428 - * change vm_flags within the fault handler. Set them now. 
1429 - */ 1430 - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; 1431 - vma->vm_ops = &vfio_pci_mmap_ops; 1432 - 1433 - return 0; 1434 - } 1435 - 1436 - static void vfio_pci_request(struct vfio_device *core_vdev, unsigned int count) 1437 - { 1438 - struct vfio_pci_device *vdev = 1439 - container_of(core_vdev, struct vfio_pci_device, vdev); 1440 - struct pci_dev *pdev = vdev->pdev; 1441 - 1442 - mutex_lock(&vdev->igate); 1443 - 1444 - if (vdev->req_trigger) { 1445 - if (!(count % 10)) 1446 - pci_notice_ratelimited(pdev, 1447 - "Relaying device request to user (#%u)\n", 1448 - count); 1449 - eventfd_signal(vdev->req_trigger, 1); 1450 - } else if (count == 0) { 1451 - pci_warn(pdev, 1452 - "No device request channel registered, blocked until released by user\n"); 1453 - } 1454 - 1455 - mutex_unlock(&vdev->igate); 1456 - } 1457 - 1458 - static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, 1459 - bool vf_token, uuid_t *uuid) 1460 - { 1461 - /* 1462 - * There's always some degree of trust or collaboration between SR-IOV 1463 - * PF and VFs, even if just that the PF hosts the SR-IOV capability and 1464 - * can disrupt VFs with a reset, but often the PF has more explicit 1465 - * access to deny service to the VF or access data passed through the 1466 - * VF. We therefore require an opt-in via a shared VF token (UUID) to 1467 - * represent this trust. This both prevents that a VF driver might 1468 - * assume the PF driver is a trusted, in-kernel driver, and also that 1469 - * a PF driver might be replaced with a rogue driver, unknown to in-use 1470 - * VF drivers. 
1471 - * 1472 - * Therefore when presented with a VF, if the PF is a vfio device and 1473 - * it is bound to the vfio-pci driver, the user needs to provide a VF 1474 - * token to access the device, in the form of appending a vf_token to 1475 - * the device name, for example: 1476 - * 1477 - * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3" 1478 - * 1479 - * When presented with a PF which has VFs in use, the user must also 1480 - * provide the current VF token to prove collaboration with existing 1481 - * VF users. If VFs are not in use, the VF token provided for the PF 1482 - * device will act to set the VF token. 1483 - * 1484 - * If the VF token is provided but unused, an error is generated. 1485 - */ 1486 - if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token) 1487 - return 0; /* No VF token provided or required */ 1488 - 1489 - if (vdev->pdev->is_virtfn) { 1490 - struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev); 1491 - bool match; 1492 - 1493 - if (!pf_vdev) { 1494 - if (!vf_token) 1495 - return 0; /* PF is not vfio-pci, no VF token */ 1496 - 1497 - pci_info_ratelimited(vdev->pdev, 1498 - "VF token incorrectly provided, PF not bound to vfio-pci\n"); 1499 - return -EINVAL; 1500 - } 1501 - 1502 - if (!vf_token) { 1503 - vfio_device_put(&pf_vdev->vdev); 1504 - pci_info_ratelimited(vdev->pdev, 1505 - "VF token required to access device\n"); 1506 - return -EACCES; 1507 - } 1508 - 1509 - mutex_lock(&pf_vdev->vf_token->lock); 1510 - match = uuid_equal(uuid, &pf_vdev->vf_token->uuid); 1511 - mutex_unlock(&pf_vdev->vf_token->lock); 1512 - 1513 - vfio_device_put(&pf_vdev->vdev); 1514 - 1515 - if (!match) { 1516 - pci_info_ratelimited(vdev->pdev, 1517 - "Incorrect VF token provided for device\n"); 1518 - return -EACCES; 1519 - } 1520 - } else if (vdev->vf_token) { 1521 - mutex_lock(&vdev->vf_token->lock); 1522 - if (vdev->vf_token->users) { 1523 - if (!vf_token) { 1524 - mutex_unlock(&vdev->vf_token->lock); 1525 - pci_info_ratelimited(vdev->pdev, 
1526 - "VF token required to access device\n"); 1527 - return -EACCES; 1528 - } 1529 - 1530 - if (!uuid_equal(uuid, &vdev->vf_token->uuid)) { 1531 - mutex_unlock(&vdev->vf_token->lock); 1532 - pci_info_ratelimited(vdev->pdev, 1533 - "Incorrect VF token provided for device\n"); 1534 - return -EACCES; 1535 - } 1536 - } else if (vf_token) { 1537 - uuid_copy(&vdev->vf_token->uuid, uuid); 1538 - } 1539 - 1540 - mutex_unlock(&vdev->vf_token->lock); 1541 - } else if (vf_token) { 1542 - pci_info_ratelimited(vdev->pdev, 1543 - "VF token incorrectly provided, not a PF or VF\n"); 1544 - return -EINVAL; 1545 - } 1546 - 1547 - return 0; 1548 - } 1549 - 1550 - #define VF_TOKEN_ARG "vf_token=" 1551 - 1552 - static int vfio_pci_match(struct vfio_device *core_vdev, char *buf) 1553 - { 1554 - struct vfio_pci_device *vdev = 1555 - container_of(core_vdev, struct vfio_pci_device, vdev); 1556 - bool vf_token = false; 1557 - uuid_t uuid; 1558 - int ret; 1559 - 1560 - if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev)))) 1561 - return 0; /* No match */ 1562 - 1563 - if (strlen(buf) > strlen(pci_name(vdev->pdev))) { 1564 - buf += strlen(pci_name(vdev->pdev)); 1565 - 1566 - if (*buf != ' ') 1567 - return 0; /* No match: non-whitespace after name */ 1568 - 1569 - while (*buf) { 1570 - if (*buf == ' ') { 1571 - buf++; 1572 - continue; 1573 - } 1574 - 1575 - if (!vf_token && !strncmp(buf, VF_TOKEN_ARG, 1576 - strlen(VF_TOKEN_ARG))) { 1577 - buf += strlen(VF_TOKEN_ARG); 1578 - 1579 - if (strlen(buf) < UUID_STRING_LEN) 1580 - return -EINVAL; 1581 - 1582 - ret = uuid_parse(buf, &uuid); 1583 - if (ret) 1584 - return ret; 1585 - 1586 - vf_token = true; 1587 - buf += UUID_STRING_LEN; 1588 - } else { 1589 - /* Unknown/duplicate option */ 1590 - return -EINVAL; 1591 - } 1592 - } 1593 - } 1594 - 1595 - ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid); 1596 - if (ret) 1597 - return ret; 1598 - 1599 - return 1; /* Match */ 1600 394 } 1601 395 1602 396 static const struct 
vfio_device_ops vfio_pci_ops = { 1603 397 .name = "vfio-pci", 1604 - .open = vfio_pci_open, 1605 - .release = vfio_pci_release, 1606 - .ioctl = vfio_pci_ioctl, 1607 - .read = vfio_pci_read, 1608 - .write = vfio_pci_write, 1609 - .mmap = vfio_pci_mmap, 1610 - .request = vfio_pci_request, 1611 - .match = vfio_pci_match, 398 + .open_device = vfio_pci_open_device, 399 + .close_device = vfio_pci_core_close_device, 400 + .ioctl = vfio_pci_core_ioctl, 401 + .read = vfio_pci_core_read, 402 + .write = vfio_pci_core_write, 403 + .mmap = vfio_pci_core_mmap, 404 + .request = vfio_pci_core_request, 405 + .match = vfio_pci_core_match, 1612 406 }; 1613 - 1614 - static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev); 1615 - static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck); 1616 - 1617 - static int vfio_pci_bus_notifier(struct notifier_block *nb, 1618 - unsigned long action, void *data) 1619 - { 1620 - struct vfio_pci_device *vdev = container_of(nb, 1621 - struct vfio_pci_device, nb); 1622 - struct device *dev = data; 1623 - struct pci_dev *pdev = to_pci_dev(dev); 1624 - struct pci_dev *physfn = pci_physfn(pdev); 1625 - 1626 - if (action == BUS_NOTIFY_ADD_DEVICE && 1627 - pdev->is_virtfn && physfn == vdev->pdev) { 1628 - pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n", 1629 - pci_name(pdev)); 1630 - pdev->driver_override = kasprintf(GFP_KERNEL, "%s", 1631 - vfio_pci_ops.name); 1632 - } else if (action == BUS_NOTIFY_BOUND_DRIVER && 1633 - pdev->is_virtfn && physfn == vdev->pdev) { 1634 - struct pci_driver *drv = pci_dev_driver(pdev); 1635 - 1636 - if (drv && drv != &vfio_pci_driver) 1637 - pci_warn(vdev->pdev, 1638 - "VF %s bound to driver %s while PF bound to vfio-pci\n", 1639 - pci_name(pdev), drv->name); 1640 - } 1641 - 1642 - return 0; 1643 - } 1644 - 1645 - static int vfio_pci_vf_init(struct vfio_pci_device *vdev) 1646 - { 1647 - struct pci_dev *pdev = vdev->pdev; 1648 - int ret; 1649 - 1650 - if (!pdev->is_physfn) 1651 - return 0; 1652 - 
1653 - vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL); 1654 - if (!vdev->vf_token) 1655 - return -ENOMEM; 1656 - 1657 - mutex_init(&vdev->vf_token->lock); 1658 - uuid_gen(&vdev->vf_token->uuid); 1659 - 1660 - vdev->nb.notifier_call = vfio_pci_bus_notifier; 1661 - ret = bus_register_notifier(&pci_bus_type, &vdev->nb); 1662 - if (ret) { 1663 - kfree(vdev->vf_token); 1664 - return ret; 1665 - } 1666 - return 0; 1667 - } 1668 - 1669 - static void vfio_pci_vf_uninit(struct vfio_pci_device *vdev) 1670 - { 1671 - if (!vdev->vf_token) 1672 - return; 1673 - 1674 - bus_unregister_notifier(&pci_bus_type, &vdev->nb); 1675 - WARN_ON(vdev->vf_token->users); 1676 - mutex_destroy(&vdev->vf_token->lock); 1677 - kfree(vdev->vf_token); 1678 - } 1679 - 1680 - static int vfio_pci_vga_init(struct vfio_pci_device *vdev) 1681 - { 1682 - struct pci_dev *pdev = vdev->pdev; 1683 - int ret; 1684 - 1685 - if (!vfio_pci_is_vga(pdev)) 1686 - return 0; 1687 - 1688 - ret = vga_client_register(pdev, vfio_pci_set_decode); 1689 - if (ret) 1690 - return ret; 1691 - vga_set_legacy_decoding(pdev, vfio_pci_set_decode(pdev, false)); 1692 - return 0; 1693 - } 1694 - 1695 - static void vfio_pci_vga_uninit(struct vfio_pci_device *vdev) 1696 - { 1697 - struct pci_dev *pdev = vdev->pdev; 1698 - 1699 - if (!vfio_pci_is_vga(pdev)) 1700 - return; 1701 - vga_client_unregister(pdev); 1702 - vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | 1703 - VGA_RSRC_LEGACY_IO | 1704 - VGA_RSRC_LEGACY_MEM); 1705 - } 1706 407 1707 408 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) 1708 409 { 1709 - struct vfio_pci_device *vdev; 1710 - struct iommu_group *group; 410 + struct vfio_pci_core_device *vdev; 1711 411 int ret; 1712 412 1713 413 if (vfio_pci_is_denylisted(pdev)) 1714 414 return -EINVAL; 1715 415 1716 - if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) 1717 - return -EINVAL; 1718 - 1719 - /* 1720 - * Prevent binding to PFs with VFs enabled, the VFs might 
be in use 1721 - * by the host or other users. We cannot capture the VFs if they 1722 - * already exist, nor can we track VF users. Disabling SR-IOV here 1723 - * would initiate removing the VFs, which would unbind the driver, 1724 - * which is prone to blocking if that VF is also in use by vfio-pci. 1725 - * Just reject these PFs and let the user sort it out. 1726 - */ 1727 - if (pci_num_vf(pdev)) { 1728 - pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n"); 1729 - return -EBUSY; 1730 - } 1731 - 1732 - group = vfio_iommu_group_get(&pdev->dev); 1733 - if (!group) 1734 - return -EINVAL; 1735 - 1736 416 vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); 1737 - if (!vdev) { 1738 - ret = -ENOMEM; 1739 - goto out_group_put; 1740 - } 417 + if (!vdev) 418 + return -ENOMEM; 419 + vfio_pci_core_init_device(vdev, pdev, &vfio_pci_ops); 1741 420 1742 - vfio_init_group_dev(&vdev->vdev, &pdev->dev, &vfio_pci_ops); 1743 - vdev->pdev = pdev; 1744 - vdev->irq_type = VFIO_PCI_NUM_IRQS; 1745 - mutex_init(&vdev->igate); 1746 - spin_lock_init(&vdev->irqlock); 1747 - mutex_init(&vdev->ioeventfds_lock); 1748 - INIT_LIST_HEAD(&vdev->dummy_resources_list); 1749 - INIT_LIST_HEAD(&vdev->ioeventfds_list); 1750 - mutex_init(&vdev->vma_lock); 1751 - INIT_LIST_HEAD(&vdev->vma_list); 1752 - init_rwsem(&vdev->memory_lock); 1753 - 1754 - ret = vfio_pci_reflck_attach(vdev); 421 + ret = vfio_pci_core_register_device(vdev); 1755 422 if (ret) 1756 423 goto out_free; 1757 - ret = vfio_pci_vf_init(vdev); 1758 - if (ret) 1759 - goto out_reflck; 1760 - ret = vfio_pci_vga_init(vdev); 1761 - if (ret) 1762 - goto out_vf; 1763 - 1764 - vfio_pci_probe_power_state(vdev); 1765 - 1766 - if (!disable_idle_d3) { 1767 - /* 1768 - * pci-core sets the device power state to an unknown value at 1769 - * bootup and after being removed from a driver. The only 1770 - * transition it allows from this unknown state is to D0, which 1771 - * typically happens when a driver calls pci_enable_device(). 
1772 - * We're not ready to enable the device yet, but we do want to 1773 - * be able to get to D3. Therefore first do a D0 transition 1774 - * before going to D3. 1775 - */ 1776 - vfio_pci_set_power_state(vdev, PCI_D0); 1777 - vfio_pci_set_power_state(vdev, PCI_D3hot); 1778 - } 1779 - 1780 - ret = vfio_register_group_dev(&vdev->vdev); 1781 - if (ret) 1782 - goto out_power; 1783 424 dev_set_drvdata(&pdev->dev, vdev); 1784 425 return 0; 1785 426 1786 - out_power: 1787 - if (!disable_idle_d3) 1788 - vfio_pci_set_power_state(vdev, PCI_D0); 1789 - out_vf: 1790 - vfio_pci_vf_uninit(vdev); 1791 - out_reflck: 1792 - vfio_pci_reflck_put(vdev->reflck); 1793 427 out_free: 1794 - kfree(vdev->pm_save); 428 + vfio_pci_core_uninit_device(vdev); 1795 429 kfree(vdev); 1796 - out_group_put: 1797 - vfio_iommu_group_put(group, &pdev->dev); 1798 430 return ret; 1799 431 } 1800 432 1801 433 static void vfio_pci_remove(struct pci_dev *pdev) 1802 434 { 1803 - struct vfio_pci_device *vdev = dev_get_drvdata(&pdev->dev); 435 + struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); 1804 436 1805 - pci_disable_sriov(pdev); 1806 - 1807 - vfio_unregister_group_dev(&vdev->vdev); 1808 - 1809 - vfio_pci_vf_uninit(vdev); 1810 - vfio_pci_reflck_put(vdev->reflck); 1811 - vfio_pci_vga_uninit(vdev); 1812 - 1813 - vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); 1814 - 1815 - if (!disable_idle_d3) 1816 - vfio_pci_set_power_state(vdev, PCI_D0); 1817 - 1818 - mutex_destroy(&vdev->ioeventfds_lock); 1819 - kfree(vdev->region); 1820 - kfree(vdev->pm_save); 437 + vfio_pci_core_unregister_device(vdev); 438 + vfio_pci_core_uninit_device(vdev); 1821 439 kfree(vdev); 1822 - } 1823 - 1824 - static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, 1825 - pci_channel_state_t state) 1826 - { 1827 - struct vfio_pci_device *vdev; 1828 - struct vfio_device *device; 1829 - 1830 - device = vfio_device_get_from_dev(&pdev->dev); 1831 - if (device == NULL) 1832 - return 
PCI_ERS_RESULT_DISCONNECT; 1833 - 1834 - vdev = container_of(device, struct vfio_pci_device, vdev); 1835 - 1836 - mutex_lock(&vdev->igate); 1837 - 1838 - if (vdev->err_trigger) 1839 - eventfd_signal(vdev->err_trigger, 1); 1840 - 1841 - mutex_unlock(&vdev->igate); 1842 - 1843 - vfio_device_put(device); 1844 - 1845 - return PCI_ERS_RESULT_CAN_RECOVER; 1846 440 } 1847 441 1848 442 static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) 1849 443 { 1850 - struct vfio_device *device; 1851 - int ret = 0; 1852 - 1853 - might_sleep(); 1854 - 1855 444 if (!enable_sriov) 1856 445 return -ENOENT; 1857 446 1858 - device = vfio_device_get_from_dev(&pdev->dev); 1859 - if (!device) 1860 - return -ENODEV; 1861 - 1862 - if (nr_virtfn == 0) 1863 - pci_disable_sriov(pdev); 1864 - else 1865 - ret = pci_enable_sriov(pdev, nr_virtfn); 1866 - 1867 - vfio_device_put(device); 1868 - 1869 - return ret < 0 ? ret : nr_virtfn; 447 + return vfio_pci_core_sriov_configure(pdev, nr_virtfn); 1870 448 } 1871 449 1872 - static const struct pci_error_handlers vfio_err_handlers = { 1873 - .error_detected = vfio_pci_aer_err_detected, 450 + static const struct pci_device_id vfio_pci_table[] = { 451 + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_ANY_ID, PCI_ANY_ID) }, /* match all by default */ 452 + {} 1874 453 }; 454 + 455 + MODULE_DEVICE_TABLE(pci, vfio_pci_table); 1875 456 1876 457 static struct pci_driver vfio_pci_driver = { 1877 458 .name = "vfio-pci", 1878 - .id_table = NULL, /* only dynamic ids */ 459 + .id_table = vfio_pci_table, 1879 460 .probe = vfio_pci_probe, 1880 461 .remove = vfio_pci_remove, 1881 462 .sriov_configure = vfio_pci_sriov_configure, 1882 - .err_handler = &vfio_err_handlers, 463 + .err_handler = &vfio_pci_core_err_handlers, 1883 464 }; 1884 - 1885 - static DEFINE_MUTEX(reflck_lock); 1886 - 1887 - static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void) 1888 - { 1889 - struct vfio_pci_reflck *reflck; 1890 - 1891 - reflck = kzalloc(sizeof(*reflck), GFP_KERNEL); 1892 - 
if (!reflck) 1893 - return ERR_PTR(-ENOMEM); 1894 - 1895 - kref_init(&reflck->kref); 1896 - mutex_init(&reflck->lock); 1897 - 1898 - return reflck; 1899 - } 1900 - 1901 - static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck) 1902 - { 1903 - kref_get(&reflck->kref); 1904 - } 1905 - 1906 - static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data) 1907 - { 1908 - struct vfio_pci_reflck **preflck = data; 1909 - struct vfio_device *device; 1910 - struct vfio_pci_device *vdev; 1911 - 1912 - device = vfio_device_get_from_dev(&pdev->dev); 1913 - if (!device) 1914 - return 0; 1915 - 1916 - if (pci_dev_driver(pdev) != &vfio_pci_driver) { 1917 - vfio_device_put(device); 1918 - return 0; 1919 - } 1920 - 1921 - vdev = container_of(device, struct vfio_pci_device, vdev); 1922 - 1923 - if (vdev->reflck) { 1924 - vfio_pci_reflck_get(vdev->reflck); 1925 - *preflck = vdev->reflck; 1926 - vfio_device_put(device); 1927 - return 1; 1928 - } 1929 - 1930 - vfio_device_put(device); 1931 - return 0; 1932 - } 1933 - 1934 - static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev) 1935 - { 1936 - bool slot = !pci_probe_reset_slot(vdev->pdev->slot); 1937 - 1938 - mutex_lock(&reflck_lock); 1939 - 1940 - if (pci_is_root_bus(vdev->pdev->bus) || 1941 - vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find, 1942 - &vdev->reflck, slot) <= 0) 1943 - vdev->reflck = vfio_pci_reflck_alloc(); 1944 - 1945 - mutex_unlock(&reflck_lock); 1946 - 1947 - return PTR_ERR_OR_ZERO(vdev->reflck); 1948 - } 1949 - 1950 - static void vfio_pci_reflck_release(struct kref *kref) 1951 - { 1952 - struct vfio_pci_reflck *reflck = container_of(kref, 1953 - struct vfio_pci_reflck, 1954 - kref); 1955 - 1956 - kfree(reflck); 1957 - mutex_unlock(&reflck_lock); 1958 - } 1959 - 1960 - static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck) 1961 - { 1962 - kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock); 1963 - } 1964 - 1965 - static int vfio_pci_get_unused_devs(struct 
pci_dev *pdev, void *data) 1966 - { 1967 - struct vfio_devices *devs = data; 1968 - struct vfio_device *device; 1969 - struct vfio_pci_device *vdev; 1970 - 1971 - if (devs->cur_index == devs->max_index) 1972 - return -ENOSPC; 1973 - 1974 - device = vfio_device_get_from_dev(&pdev->dev); 1975 - if (!device) 1976 - return -EINVAL; 1977 - 1978 - if (pci_dev_driver(pdev) != &vfio_pci_driver) { 1979 - vfio_device_put(device); 1980 - return -EBUSY; 1981 - } 1982 - 1983 - vdev = container_of(device, struct vfio_pci_device, vdev); 1984 - 1985 - /* Fault if the device is not unused */ 1986 - if (vdev->refcnt) { 1987 - vfio_device_put(device); 1988 - return -EBUSY; 1989 - } 1990 - 1991 - devs->devices[devs->cur_index++] = vdev; 1992 - return 0; 1993 - } 1994 - 1995 - static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data) 1996 - { 1997 - struct vfio_devices *devs = data; 1998 - struct vfio_device *device; 1999 - struct vfio_pci_device *vdev; 2000 - 2001 - if (devs->cur_index == devs->max_index) 2002 - return -ENOSPC; 2003 - 2004 - device = vfio_device_get_from_dev(&pdev->dev); 2005 - if (!device) 2006 - return -EINVAL; 2007 - 2008 - if (pci_dev_driver(pdev) != &vfio_pci_driver) { 2009 - vfio_device_put(device); 2010 - return -EBUSY; 2011 - } 2012 - 2013 - vdev = container_of(device, struct vfio_pci_device, vdev); 2014 - 2015 - /* 2016 - * Locking multiple devices is prone to deadlock, runaway and 2017 - * unwind if we hit contention. 
2018 - */ 2019 - if (!vfio_pci_zap_and_vma_lock(vdev, true)) { 2020 - vfio_device_put(device); 2021 - return -EBUSY; 2022 - } 2023 - 2024 - devs->devices[devs->cur_index++] = vdev; 2025 - return 0; 2026 - } 2027 - 2028 - /* 2029 - * If a bus or slot reset is available for the provided device and: 2030 - * - All of the devices affected by that bus or slot reset are unused 2031 - * (!refcnt) 2032 - * - At least one of the affected devices is marked dirty via 2033 - * needs_reset (such as by lack of FLR support) 2034 - * Then attempt to perform that bus or slot reset. Callers are required 2035 - * to hold vdev->reflck->lock, protecting the bus/slot reset group from 2036 - * concurrent opens. A vfio_device reference is acquired for each device 2037 - * to prevent unbinds during the reset operation. 2038 - * 2039 - * NB: vfio-core considers a group to be viable even if some devices are 2040 - * bound to drivers like pci-stub or pcieport. Here we require all devices 2041 - * to be bound to vfio_pci since that's the only way we can be sure they 2042 - * stay put. 2043 - */ 2044 - static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) 2045 - { 2046 - struct vfio_devices devs = { .cur_index = 0 }; 2047 - int i = 0, ret = -EINVAL; 2048 - bool slot = false; 2049 - struct vfio_pci_device *tmp; 2050 - 2051 - if (!pci_probe_reset_slot(vdev->pdev->slot)) 2052 - slot = true; 2053 - else if (pci_probe_reset_bus(vdev->pdev->bus)) 2054 - return; 2055 - 2056 - if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, 2057 - &i, slot) || !i) 2058 - return; 2059 - 2060 - devs.max_index = i; 2061 - devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL); 2062 - if (!devs.devices) 2063 - return; 2064 - 2065 - if (vfio_pci_for_each_slot_or_bus(vdev->pdev, 2066 - vfio_pci_get_unused_devs, 2067 - &devs, slot)) 2068 - goto put_devs; 2069 - 2070 - /* Does at least one need a reset? 
*/ 2071 - for (i = 0; i < devs.cur_index; i++) { 2072 - tmp = devs.devices[i]; 2073 - if (tmp->needs_reset) { 2074 - ret = pci_reset_bus(vdev->pdev); 2075 - break; 2076 - } 2077 - } 2078 - 2079 - put_devs: 2080 - for (i = 0; i < devs.cur_index; i++) { 2081 - tmp = devs.devices[i]; 2082 - 2083 - /* 2084 - * If reset was successful, affected devices no longer need 2085 - * a reset and we should return all the collateral devices 2086 - * to low power. If not successful, we either didn't reset 2087 - * the bus or timed out waiting for it, so let's not touch 2088 - * the power state. 2089 - */ 2090 - if (!ret) { 2091 - tmp->needs_reset = false; 2092 - 2093 - if (tmp != vdev && !disable_idle_d3) 2094 - vfio_pci_set_power_state(tmp, PCI_D3hot); 2095 - } 2096 - 2097 - vfio_device_put(&tmp->vdev); 2098 - } 2099 - 2100 - kfree(devs.devices); 2101 - } 2102 - 2103 - static void __exit vfio_pci_cleanup(void) 2104 - { 2105 - pci_unregister_driver(&vfio_pci_driver); 2106 - vfio_pci_uninit_perm_bits(); 2107 - } 2108 465 2109 466 static void __init vfio_pci_fill_ids(void) 2110 467 { ··· 239 2418 static int __init vfio_pci_init(void) 240 2419 { 241 2420 int ret; 2421 + bool is_disable_vga = true; 242 2422 243 - /* Allocate shared config space permission data used by all devices */ 244 - ret = vfio_pci_init_perm_bits(); 245 - if (ret) 246 - return ret; 2423 + #ifdef CONFIG_VFIO_PCI_VGA 2424 + is_disable_vga = disable_vga; 2425 + #endif 2426 + 2427 + vfio_pci_core_set_params(nointxmask, is_disable_vga, disable_idle_d3); 247 2428 248 2429 /* Register and scan for devices */ 249 2430 ret = pci_register_driver(&vfio_pci_driver); 250 2431 if (ret) 251 - goto out_driver; 2432 + return ret; 252 2433 253 2434 vfio_pci_fill_ids(); 254 2435 ··· 258 2435 pr_warn("device denylist disabled.\n"); 259 2436 260 2437 return 0; 261 - 262 - out_driver: 263 - vfio_pci_uninit_perm_bits(); 264 - return ret; 265 2438 } 266 - 267 2439 module_init(vfio_pci_init); 2440 + 2441 + static void __exit 
vfio_pci_cleanup(void) 2442 + { 2443 + pci_unregister_driver(&vfio_pci_driver); 2444 + } 268 2445 module_exit(vfio_pci_cleanup); 269 2446 270 - MODULE_VERSION(DRIVER_VERSION); 271 2447 MODULE_LICENSE("GPL v2"); 272 2448 MODULE_AUTHOR(DRIVER_AUTHOR); 273 2449 MODULE_DESCRIPTION(DRIVER_DESC);
+35 -35
drivers/vfio/pci/vfio_pci_config.c
··· 26 26 #include <linux/vfio.h> 27 27 #include <linux/slab.h> 28 28 29 - #include "vfio_pci_private.h" 29 + #include <linux/vfio_pci_core.h> 30 30 31 31 /* Fake capability ID for standard config space */ 32 32 #define PCI_CAP_ID_BASIC 0 ··· 108 108 struct perm_bits { 109 109 u8 *virt; /* read/write virtual data, not hw */ 110 110 u8 *write; /* writeable bits */ 111 - int (*readfn)(struct vfio_pci_device *vdev, int pos, int count, 111 + int (*readfn)(struct vfio_pci_core_device *vdev, int pos, int count, 112 112 struct perm_bits *perm, int offset, __le32 *val); 113 - int (*writefn)(struct vfio_pci_device *vdev, int pos, int count, 113 + int (*writefn)(struct vfio_pci_core_device *vdev, int pos, int count, 114 114 struct perm_bits *perm, int offset, __le32 val); 115 115 }; 116 116 ··· 171 171 return ret; 172 172 } 173 173 174 - static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos, 174 + static int vfio_default_config_read(struct vfio_pci_core_device *vdev, int pos, 175 175 int count, struct perm_bits *perm, 176 176 int offset, __le32 *val) 177 177 { ··· 197 197 return count; 198 198 } 199 199 200 - static int vfio_default_config_write(struct vfio_pci_device *vdev, int pos, 200 + static int vfio_default_config_write(struct vfio_pci_core_device *vdev, int pos, 201 201 int count, struct perm_bits *perm, 202 202 int offset, __le32 val) 203 203 { ··· 244 244 } 245 245 246 246 /* Allow direct read from hardware, except for capability next pointer */ 247 - static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos, 247 + static int vfio_direct_config_read(struct vfio_pci_core_device *vdev, int pos, 248 248 int count, struct perm_bits *perm, 249 249 int offset, __le32 *val) 250 250 { ··· 269 269 } 270 270 271 271 /* Raw access skips any kind of virtualization */ 272 - static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos, 272 + static int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, 273 273 int count, 
struct perm_bits *perm, 274 274 int offset, __le32 val) 275 275 { ··· 282 282 return count; 283 283 } 284 284 285 - static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos, 285 + static int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, 286 286 int count, struct perm_bits *perm, 287 287 int offset, __le32 *val) 288 288 { ··· 296 296 } 297 297 298 298 /* Virt access uses only virtualization */ 299 - static int vfio_virt_config_write(struct vfio_pci_device *vdev, int pos, 299 + static int vfio_virt_config_write(struct vfio_pci_core_device *vdev, int pos, 300 300 int count, struct perm_bits *perm, 301 301 int offset, __le32 val) 302 302 { ··· 304 304 return count; 305 305 } 306 306 307 - static int vfio_virt_config_read(struct vfio_pci_device *vdev, int pos, 307 + static int vfio_virt_config_read(struct vfio_pci_core_device *vdev, int pos, 308 308 int count, struct perm_bits *perm, 309 309 int offset, __le32 *val) 310 310 { ··· 396 396 } 397 397 398 398 /* Caller should hold memory_lock semaphore */ 399 - bool __vfio_pci_memory_enabled(struct vfio_pci_device *vdev) 399 + bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev) 400 400 { 401 401 struct pci_dev *pdev = vdev->pdev; 402 402 u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]); ··· 413 413 * Restore the *real* BARs after we detect a FLR or backdoor reset. 414 414 * (backdoor = some device specific technique that we didn't catch) 415 415 */ 416 - static void vfio_bar_restore(struct vfio_pci_device *vdev) 416 + static void vfio_bar_restore(struct vfio_pci_core_device *vdev) 417 417 { 418 418 struct pci_dev *pdev = vdev->pdev; 419 419 u32 *rbar = vdev->rbar; ··· 460 460 * Pretend we're hardware and tweak the values of the *virtual* PCI BARs 461 461 * to reflect the hardware capabilities. This implements BAR sizing. 
462 462 */ 463 - static void vfio_bar_fixup(struct vfio_pci_device *vdev) 463 + static void vfio_bar_fixup(struct vfio_pci_core_device *vdev) 464 464 { 465 465 struct pci_dev *pdev = vdev->pdev; 466 466 int i; ··· 514 514 vdev->bardirty = false; 515 515 } 516 516 517 - static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos, 517 + static int vfio_basic_config_read(struct vfio_pci_core_device *vdev, int pos, 518 518 int count, struct perm_bits *perm, 519 519 int offset, __le32 *val) 520 520 { ··· 536 536 } 537 537 538 538 /* Test whether BARs match the value we think they should contain */ 539 - static bool vfio_need_bar_restore(struct vfio_pci_device *vdev) 539 + static bool vfio_need_bar_restore(struct vfio_pci_core_device *vdev) 540 540 { 541 541 int i = 0, pos = PCI_BASE_ADDRESS_0, ret; 542 542 u32 bar; ··· 552 552 return false; 553 553 } 554 554 555 - static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos, 555 + static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, 556 556 int count, struct perm_bits *perm, 557 557 int offset, __le32 val) 558 558 { ··· 692 692 return 0; 693 693 } 694 694 695 - static int vfio_pm_config_write(struct vfio_pci_device *vdev, int pos, 695 + static int vfio_pm_config_write(struct vfio_pci_core_device *vdev, int pos, 696 696 int count, struct perm_bits *perm, 697 697 int offset, __le32 val) 698 698 { ··· 747 747 return 0; 748 748 } 749 749 750 - static int vfio_vpd_config_write(struct vfio_pci_device *vdev, int pos, 750 + static int vfio_vpd_config_write(struct vfio_pci_core_device *vdev, int pos, 751 751 int count, struct perm_bits *perm, 752 752 int offset, __le32 val) 753 753 { ··· 829 829 return 0; 830 830 } 831 831 832 - static int vfio_exp_config_write(struct vfio_pci_device *vdev, int pos, 832 + static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, 833 833 int count, struct perm_bits *perm, 834 834 int offset, __le32 val) 835 835 { ··· 913 913 
return 0; 914 914 } 915 915 916 - static int vfio_af_config_write(struct vfio_pci_device *vdev, int pos, 916 + static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, 917 917 int count, struct perm_bits *perm, 918 918 int offset, __le32 val) 919 919 { ··· 1072 1072 return ret; 1073 1073 } 1074 1074 1075 - static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) 1075 + static int vfio_find_cap_start(struct vfio_pci_core_device *vdev, int pos) 1076 1076 { 1077 1077 u8 cap; 1078 1078 int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE : ··· 1089 1089 return pos; 1090 1090 } 1091 1091 1092 - static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos, 1092 + static int vfio_msi_config_read(struct vfio_pci_core_device *vdev, int pos, 1093 1093 int count, struct perm_bits *perm, 1094 1094 int offset, __le32 *val) 1095 1095 { ··· 1109 1109 return vfio_default_config_read(vdev, pos, count, perm, offset, val); 1110 1110 } 1111 1111 1112 - static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos, 1112 + static int vfio_msi_config_write(struct vfio_pci_core_device *vdev, int pos, 1113 1113 int count, struct perm_bits *perm, 1114 1114 int offset, __le32 val) 1115 1115 { ··· 1189 1189 } 1190 1190 1191 1191 /* Determine MSI CAP field length; initialize msi_perms on 1st call per vdev */ 1192 - static int vfio_msi_cap_len(struct vfio_pci_device *vdev, u8 pos) 1192 + static int vfio_msi_cap_len(struct vfio_pci_core_device *vdev, u8 pos) 1193 1193 { 1194 1194 struct pci_dev *pdev = vdev->pdev; 1195 1195 int len, ret; ··· 1222 1222 } 1223 1223 1224 1224 /* Determine extended capability length for VC (2 & 9) and MFVC */ 1225 - static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos) 1225 + static int vfio_vc_cap_len(struct vfio_pci_core_device *vdev, u16 pos) 1226 1226 { 1227 1227 struct pci_dev *pdev = vdev->pdev; 1228 1228 u32 tmp; ··· 1263 1263 return len; 1264 1264 } 1265 1265 1266 - static int 
vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) 1266 + static int vfio_cap_len(struct vfio_pci_core_device *vdev, u8 cap, u8 pos) 1267 1267 { 1268 1268 struct pci_dev *pdev = vdev->pdev; 1269 1269 u32 dword; ··· 1338 1338 return 0; 1339 1339 } 1340 1340 1341 - static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos) 1341 + static int vfio_ext_cap_len(struct vfio_pci_core_device *vdev, u16 ecap, u16 epos) 1342 1342 { 1343 1343 struct pci_dev *pdev = vdev->pdev; 1344 1344 u8 byte; ··· 1412 1412 return 0; 1413 1413 } 1414 1414 1415 - static int vfio_fill_vconfig_bytes(struct vfio_pci_device *vdev, 1415 + static int vfio_fill_vconfig_bytes(struct vfio_pci_core_device *vdev, 1416 1416 int offset, int size) 1417 1417 { 1418 1418 struct pci_dev *pdev = vdev->pdev; ··· 1459 1459 return ret; 1460 1460 } 1461 1461 1462 - static int vfio_cap_init(struct vfio_pci_device *vdev) 1462 + static int vfio_cap_init(struct vfio_pci_core_device *vdev) 1463 1463 { 1464 1464 struct pci_dev *pdev = vdev->pdev; 1465 1465 u8 *map = vdev->pci_config_map; ··· 1549 1549 return 0; 1550 1550 } 1551 1551 1552 - static int vfio_ecap_init(struct vfio_pci_device *vdev) 1552 + static int vfio_ecap_init(struct vfio_pci_core_device *vdev) 1553 1553 { 1554 1554 struct pci_dev *pdev = vdev->pdev; 1555 1555 u8 *map = vdev->pci_config_map; ··· 1669 1669 * for each area requiring emulated bits, but the array of pointers 1670 1670 * would be comparable in size (at least for standard config space). 
1671 1671 */ 1672 - int vfio_config_init(struct vfio_pci_device *vdev) 1672 + int vfio_config_init(struct vfio_pci_core_device *vdev) 1673 1673 { 1674 1674 struct pci_dev *pdev = vdev->pdev; 1675 1675 u8 *map, *vconfig; ··· 1773 1773 return pcibios_err_to_errno(ret); 1774 1774 } 1775 1775 1776 - void vfio_config_free(struct vfio_pci_device *vdev) 1776 + void vfio_config_free(struct vfio_pci_core_device *vdev) 1777 1777 { 1778 1778 kfree(vdev->vconfig); 1779 1779 vdev->vconfig = NULL; ··· 1790 1790 * Find the remaining number of bytes in a dword that match the given 1791 1791 * position. Stop at either the end of the capability or the dword boundary. 1792 1792 */ 1793 - static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev, 1793 + static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_core_device *vdev, 1794 1794 loff_t pos) 1795 1795 { 1796 1796 u8 cap = vdev->pci_config_map[pos]; ··· 1802 1802 return i; 1803 1803 } 1804 1804 1805 - static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, 1805 + static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user *buf, 1806 1806 size_t count, loff_t *ppos, bool iswrite) 1807 1807 { 1808 1808 struct pci_dev *pdev = vdev->pdev; ··· 1885 1885 return ret; 1886 1886 } 1887 1887 1888 - ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf, 1888 + ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf, 1889 1889 size_t count, loff_t *ppos, bool iswrite) 1890 1890 { 1891 1891 size_t done = 0;
+2157
drivers/vfio/pci/vfio_pci_core.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 4 + * Author: Alex Williamson <alex.williamson@redhat.com> 5 + * 6 + * Derived from original vfio: 7 + * Copyright 2010 Cisco Systems, Inc. All rights reserved. 8 + * Author: Tom Lyon, pugs@cisco.com 9 + */ 10 + 11 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12 + 13 + #include <linux/device.h> 14 + #include <linux/eventfd.h> 15 + #include <linux/file.h> 16 + #include <linux/interrupt.h> 17 + #include <linux/iommu.h> 18 + #include <linux/module.h> 19 + #include <linux/mutex.h> 20 + #include <linux/notifier.h> 21 + #include <linux/pci.h> 22 + #include <linux/pm_runtime.h> 23 + #include <linux/slab.h> 24 + #include <linux/types.h> 25 + #include <linux/uaccess.h> 26 + #include <linux/vgaarb.h> 27 + #include <linux/nospec.h> 28 + #include <linux/sched/mm.h> 29 + 30 + #include <linux/vfio_pci_core.h> 31 + 32 + #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" 33 + #define DRIVER_DESC "core driver for VFIO based PCI devices" 34 + 35 + static bool nointxmask; 36 + static bool disable_vga; 37 + static bool disable_idle_d3; 38 + 39 + static inline bool vfio_vga_disabled(void) 40 + { 41 + #ifdef CONFIG_VFIO_PCI_VGA 42 + return disable_vga; 43 + #else 44 + return true; 45 + #endif 46 + } 47 + 48 + /* 49 + * Our VGA arbiter participation is limited since we don't know anything 50 + * about the device itself. However, if the device is the only VGA device 51 + * downstream of a bridge and VFIO VGA support is disabled, then we can 52 + * safely return legacy VGA IO and memory as not decoded since the user 53 + * has no way to get to it and routing can be disabled externally at the 54 + * bridge. 
55 + */ 56 + static unsigned int vfio_pci_set_decode(struct pci_dev *pdev, bool single_vga) 57 + { 58 + struct pci_dev *tmp = NULL; 59 + unsigned char max_busnr; 60 + unsigned int decodes; 61 + 62 + if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus)) 63 + return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | 64 + VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; 65 + 66 + max_busnr = pci_bus_max_busnr(pdev->bus); 67 + decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 68 + 69 + while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) { 70 + if (tmp == pdev || 71 + pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) || 72 + pci_is_root_bus(tmp->bus)) 73 + continue; 74 + 75 + if (tmp->bus->number >= pdev->bus->number && 76 + tmp->bus->number <= max_busnr) { 77 + pci_dev_put(tmp); 78 + decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; 79 + break; 80 + } 81 + } 82 + 83 + return decodes; 84 + } 85 + 86 + static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev) 87 + { 88 + struct resource *res; 89 + int i; 90 + struct vfio_pci_dummy_resource *dummy_res; 91 + 92 + for (i = 0; i < PCI_STD_NUM_BARS; i++) { 93 + int bar = i + PCI_STD_RESOURCES; 94 + 95 + res = &vdev->pdev->resource[bar]; 96 + 97 + if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP)) 98 + goto no_mmap; 99 + 100 + if (!(res->flags & IORESOURCE_MEM)) 101 + goto no_mmap; 102 + 103 + /* 104 + * The PCI core shouldn't set up a resource with a 105 + * type but zero size. But there may be bugs that 106 + * cause us to do that. 107 + */ 108 + if (!resource_size(res)) 109 + goto no_mmap; 110 + 111 + if (resource_size(res) >= PAGE_SIZE) { 112 + vdev->bar_mmap_supported[bar] = true; 113 + continue; 114 + } 115 + 116 + if (!(res->start & ~PAGE_MASK)) { 117 + /* 118 + * Add a dummy resource to reserve the remainder 119 + * of the exclusive page in case that hot-add 120 + * device's bar is assigned into it. 
121 + */ 122 + dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL); 123 + if (dummy_res == NULL) 124 + goto no_mmap; 125 + 126 + dummy_res->resource.name = "vfio sub-page reserved"; 127 + dummy_res->resource.start = res->end + 1; 128 + dummy_res->resource.end = res->start + PAGE_SIZE - 1; 129 + dummy_res->resource.flags = res->flags; 130 + if (request_resource(res->parent, 131 + &dummy_res->resource)) { 132 + kfree(dummy_res); 133 + goto no_mmap; 134 + } 135 + dummy_res->index = bar; 136 + list_add(&dummy_res->res_next, 137 + &vdev->dummy_resources_list); 138 + vdev->bar_mmap_supported[bar] = true; 139 + continue; 140 + } 141 + /* 142 + * Here we don't handle the case when the BAR is not page 143 + * aligned because we can't expect the BAR will be 144 + * assigned into the same location in a page in guest 145 + * when we passthrough the BAR. And it's hard to access 146 + * this BAR in userspace because we have no way to get 147 + * the BAR's location in a page. 148 + */ 149 + no_mmap: 150 + vdev->bar_mmap_supported[bar] = false; 151 + } 152 + } 153 + 154 + struct vfio_pci_group_info; 155 + static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set); 156 + static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, 157 + struct vfio_pci_group_info *groups); 158 + 159 + /* 160 + * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND 161 + * _and_ the ability to detect when the device is asserting INTx via PCI_STATUS. 162 + * If a device implements the former but not the latter we would typically 163 + * expect broken_intx_masking to be set and require an exclusive interrupt. 164 + * However since we do have control of the device's ability to assert INTx, 165 + * we can instead pretend that the device does not implement INTx, virtualizing 166 + * the pin register to report zero and maintaining DisINTx set on the host. 
167 + */ 168 + static bool vfio_pci_nointx(struct pci_dev *pdev) 169 + { 170 + switch (pdev->vendor) { 171 + case PCI_VENDOR_ID_INTEL: 172 + switch (pdev->device) { 173 + /* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */ 174 + case 0x1572: 175 + case 0x1574: 176 + case 0x1580 ... 0x1581: 177 + case 0x1583 ... 0x158b: 178 + case 0x37d0 ... 0x37d2: 179 + /* X550 */ 180 + case 0x1563: 181 + return true; 182 + default: 183 + return false; 184 + } 185 + } 186 + 187 + return false; 188 + } 189 + 190 + static void vfio_pci_probe_power_state(struct vfio_pci_core_device *vdev) 191 + { 192 + struct pci_dev *pdev = vdev->pdev; 193 + u16 pmcsr; 194 + 195 + if (!pdev->pm_cap) 196 + return; 197 + 198 + pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr); 199 + 200 + vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET); 201 + } 202 + 203 + /* 204 + * pci_set_power_state() wrapper handling devices which perform a soft reset on 205 + * D3->D0 transition. Save state prior to D0/1/2->D3, stash it on the vdev, 206 + * restore when returned to D0. Saved separately from pci_saved_state for use 207 + * by PM capability emulation and separately from pci_dev internal saved state 208 + * to avoid it being overwritten and consumed around other resets. 
209 + */ 210 + int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state) 211 + { 212 + struct pci_dev *pdev = vdev->pdev; 213 + bool needs_restore = false, needs_save = false; 214 + int ret; 215 + 216 + if (vdev->needs_pm_restore) { 217 + if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) { 218 + pci_save_state(pdev); 219 + needs_save = true; 220 + } 221 + 222 + if (pdev->current_state >= PCI_D3hot && state <= PCI_D0) 223 + needs_restore = true; 224 + } 225 + 226 + ret = pci_set_power_state(pdev, state); 227 + 228 + if (!ret) { 229 + /* D3 might be unsupported via quirk, skip unless in D3 */ 230 + if (needs_save && pdev->current_state >= PCI_D3hot) { 231 + vdev->pm_save = pci_store_saved_state(pdev); 232 + } else if (needs_restore) { 233 + pci_load_and_free_saved_state(pdev, &vdev->pm_save); 234 + pci_restore_state(pdev); 235 + } 236 + } 237 + 238 + return ret; 239 + } 240 + 241 + int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) 242 + { 243 + struct pci_dev *pdev = vdev->pdev; 244 + int ret; 245 + u16 cmd; 246 + u8 msix_pos; 247 + 248 + vfio_pci_set_power_state(vdev, PCI_D0); 249 + 250 + /* Don't allow our initial saved state to include busmaster */ 251 + pci_clear_master(pdev); 252 + 253 + ret = pci_enable_device(pdev); 254 + if (ret) 255 + return ret; 256 + 257 + /* If reset fails because of the device lock, fail this path entirely */ 258 + ret = pci_try_reset_function(pdev); 259 + if (ret == -EAGAIN) { 260 + pci_disable_device(pdev); 261 + return ret; 262 + } 263 + 264 + vdev->reset_works = !ret; 265 + pci_save_state(pdev); 266 + vdev->pci_saved_state = pci_store_saved_state(pdev); 267 + if (!vdev->pci_saved_state) 268 + pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__); 269 + 270 + if (likely(!nointxmask)) { 271 + if (vfio_pci_nointx(pdev)) { 272 + pci_info(pdev, "Masking broken INTx support\n"); 273 + vdev->nointx = true; 274 + pci_intx(pdev, 0); 275 + } else 276 + vdev->pci_2_3 = 
pci_intx_mask_supported(pdev); 277 + } 278 + 279 + pci_read_config_word(pdev, PCI_COMMAND, &cmd); 280 + if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) { 281 + cmd &= ~PCI_COMMAND_INTX_DISABLE; 282 + pci_write_config_word(pdev, PCI_COMMAND, cmd); 283 + } 284 + 285 + ret = vfio_config_init(vdev); 286 + if (ret) { 287 + kfree(vdev->pci_saved_state); 288 + vdev->pci_saved_state = NULL; 289 + pci_disable_device(pdev); 290 + return ret; 291 + } 292 + 293 + msix_pos = pdev->msix_cap; 294 + if (msix_pos) { 295 + u16 flags; 296 + u32 table; 297 + 298 + pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags); 299 + pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table); 300 + 301 + vdev->msix_bar = table & PCI_MSIX_TABLE_BIR; 302 + vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET; 303 + vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; 304 + } else 305 + vdev->msix_bar = 0xFF; 306 + 307 + if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) 308 + vdev->has_vga = true; 309 + 310 + 311 + return 0; 312 + } 313 + EXPORT_SYMBOL_GPL(vfio_pci_core_enable); 314 + 315 + void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) 316 + { 317 + struct pci_dev *pdev = vdev->pdev; 318 + struct vfio_pci_dummy_resource *dummy_res, *tmp; 319 + struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; 320 + int i, bar; 321 + 322 + /* For needs_reset */ 323 + lockdep_assert_held(&vdev->vdev.dev_set->lock); 324 + 325 + /* Stop the device from further DMA */ 326 + pci_clear_master(pdev); 327 + 328 + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | 329 + VFIO_IRQ_SET_ACTION_TRIGGER, 330 + vdev->irq_type, 0, 0, NULL); 331 + 332 + /* Device closed, don't need mutex here */ 333 + list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, 334 + &vdev->ioeventfds_list, next) { 335 + vfio_virqfd_disable(&ioeventfd->virqfd); 336 + list_del(&ioeventfd->next); 337 + kfree(ioeventfd); 338 + } 339 + vdev->ioeventfds_nr = 0; 340 + 341 + vdev->virq_disabled = false; 342 + 343 + 
for (i = 0; i < vdev->num_regions; i++) 344 + vdev->region[i].ops->release(vdev, &vdev->region[i]); 345 + 346 + vdev->num_regions = 0; 347 + kfree(vdev->region); 348 + vdev->region = NULL; /* don't krealloc a freed pointer */ 349 + 350 + vfio_config_free(vdev); 351 + 352 + for (i = 0; i < PCI_STD_NUM_BARS; i++) { 353 + bar = i + PCI_STD_RESOURCES; 354 + if (!vdev->barmap[bar]) 355 + continue; 356 + pci_iounmap(pdev, vdev->barmap[bar]); 357 + pci_release_selected_regions(pdev, 1 << bar); 358 + vdev->barmap[bar] = NULL; 359 + } 360 + 361 + list_for_each_entry_safe(dummy_res, tmp, 362 + &vdev->dummy_resources_list, res_next) { 363 + list_del(&dummy_res->res_next); 364 + release_resource(&dummy_res->resource); 365 + kfree(dummy_res); 366 + } 367 + 368 + vdev->needs_reset = true; 369 + 370 + /* 371 + * If we have saved state, restore it. If we can reset the device, 372 + * even better. Resetting with current state seems better than 373 + * nothing, but saving and restoring current state without reset 374 + * is just busy work. 375 + */ 376 + if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) { 377 + pci_info(pdev, "%s: Couldn't reload saved state\n", __func__); 378 + 379 + if (!vdev->reset_works) 380 + goto out; 381 + 382 + pci_save_state(pdev); 383 + } 384 + 385 + /* 386 + * Disable INTx and MSI, presumably to avoid spurious interrupts 387 + * during reset. Stolen from pci_reset_function() 388 + */ 389 + pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); 390 + 391 + /* 392 + * Try to get the locks ourselves to prevent a deadlock. The 393 + * success of this is dependent on being able to lock the device, 394 + * which is not always possible. 395 + * We can not use the "try" reset interface here, which will 396 + * overwrite the previously restored configuration information. 
397 + */ 398 + if (vdev->reset_works && pci_dev_trylock(pdev)) { 399 + if (!__pci_reset_function_locked(pdev)) 400 + vdev->needs_reset = false; 401 + pci_dev_unlock(pdev); 402 + } 403 + 404 + pci_restore_state(pdev); 405 + out: 406 + pci_disable_device(pdev); 407 + 408 + if (!vfio_pci_dev_set_try_reset(vdev->vdev.dev_set) && !disable_idle_d3) 409 + vfio_pci_set_power_state(vdev, PCI_D3hot); 410 + } 411 + EXPORT_SYMBOL_GPL(vfio_pci_core_disable); 412 + 413 + static struct vfio_pci_core_device *get_pf_vdev(struct vfio_pci_core_device *vdev) 414 + { 415 + struct pci_dev *physfn = pci_physfn(vdev->pdev); 416 + struct vfio_device *pf_dev; 417 + 418 + if (!vdev->pdev->is_virtfn) 419 + return NULL; 420 + 421 + pf_dev = vfio_device_get_from_dev(&physfn->dev); 422 + if (!pf_dev) 423 + return NULL; 424 + 425 + if (pci_dev_driver(physfn) != pci_dev_driver(vdev->pdev)) { 426 + vfio_device_put(pf_dev); 427 + return NULL; 428 + } 429 + 430 + return container_of(pf_dev, struct vfio_pci_core_device, vdev); 431 + } 432 + 433 + static void vfio_pci_vf_token_user_add(struct vfio_pci_core_device *vdev, int val) 434 + { 435 + struct vfio_pci_core_device *pf_vdev = get_pf_vdev(vdev); 436 + 437 + if (!pf_vdev) 438 + return; 439 + 440 + mutex_lock(&pf_vdev->vf_token->lock); 441 + pf_vdev->vf_token->users += val; 442 + WARN_ON(pf_vdev->vf_token->users < 0); 443 + mutex_unlock(&pf_vdev->vf_token->lock); 444 + 445 + vfio_device_put(&pf_vdev->vdev); 446 + } 447 + 448 + void vfio_pci_core_close_device(struct vfio_device *core_vdev) 449 + { 450 + struct vfio_pci_core_device *vdev = 451 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 452 + 453 + vfio_pci_vf_token_user_add(vdev, -1); 454 + vfio_spapr_pci_eeh_release(vdev->pdev); 455 + vfio_pci_core_disable(vdev); 456 + 457 + mutex_lock(&vdev->igate); 458 + if (vdev->err_trigger) { 459 + eventfd_ctx_put(vdev->err_trigger); 460 + vdev->err_trigger = NULL; 461 + } 462 + if (vdev->req_trigger) { 463 + eventfd_ctx_put(vdev->req_trigger); 
464 + vdev->req_trigger = NULL; 465 + } 466 + mutex_unlock(&vdev->igate); 467 + } 468 + EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); 469 + 470 + void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) 471 + { 472 + vfio_pci_probe_mmaps(vdev); 473 + vfio_spapr_pci_eeh_open(vdev->pdev); 474 + vfio_pci_vf_token_user_add(vdev, 1); 475 + } 476 + EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable); 477 + 478 + static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type) 479 + { 480 + if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { 481 + u8 pin; 482 + 483 + if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || 484 + vdev->nointx || vdev->pdev->is_virtfn) 485 + return 0; 486 + 487 + pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); 488 + 489 + return pin ? 1 : 0; 490 + } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { 491 + u8 pos; 492 + u16 flags; 493 + 494 + pos = vdev->pdev->msi_cap; 495 + if (pos) { 496 + pci_read_config_word(vdev->pdev, 497 + pos + PCI_MSI_FLAGS, &flags); 498 + return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1); 499 + } 500 + } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { 501 + u8 pos; 502 + u16 flags; 503 + 504 + pos = vdev->pdev->msix_cap; 505 + if (pos) { 506 + pci_read_config_word(vdev->pdev, 507 + pos + PCI_MSIX_FLAGS, &flags); 508 + 509 + return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; 510 + } 511 + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { 512 + if (pci_is_pcie(vdev->pdev)) 513 + return 1; 514 + } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { 515 + return 1; 516 + } 517 + 518 + return 0; 519 + } 520 + 521 + static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) 522 + { 523 + (*(int *)data)++; 524 + return 0; 525 + } 526 + 527 + struct vfio_pci_fill_info { 528 + int max; 529 + int cur; 530 + struct vfio_pci_dependent_device *devices; 531 + }; 532 + 533 + static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) 534 + { 535 + struct vfio_pci_fill_info *fill = data; 536 + struct iommu_group *iommu_group; 
537 + 538 + if (fill->cur == fill->max) 539 + return -EAGAIN; /* Something changed, try again */ 540 + 541 + iommu_group = iommu_group_get(&pdev->dev); 542 + if (!iommu_group) 543 + return -EPERM; /* Cannot reset non-isolated devices */ 544 + 545 + fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); 546 + fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); 547 + fill->devices[fill->cur].bus = pdev->bus->number; 548 + fill->devices[fill->cur].devfn = pdev->devfn; 549 + fill->cur++; 550 + iommu_group_put(iommu_group); 551 + return 0; 552 + } 553 + 554 + struct vfio_pci_group_info { 555 + int count; 556 + struct vfio_group **groups; 557 + }; 558 + 559 + static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) 560 + { 561 + for (; pdev; pdev = pdev->bus->self) 562 + if (pdev->bus == slot->bus) 563 + return (pdev->slot == slot); 564 + return false; 565 + } 566 + 567 + struct vfio_pci_walk_info { 568 + int (*fn)(struct pci_dev *, void *data); 569 + void *data; 570 + struct pci_dev *pdev; 571 + bool slot; 572 + int ret; 573 + }; 574 + 575 + static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data) 576 + { 577 + struct vfio_pci_walk_info *walk = data; 578 + 579 + if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot)) 580 + walk->ret = walk->fn(pdev, walk->data); 581 + 582 + return walk->ret; 583 + } 584 + 585 + static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, 586 + int (*fn)(struct pci_dev *, 587 + void *data), void *data, 588 + bool slot) 589 + { 590 + struct vfio_pci_walk_info walk = { 591 + .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0, 592 + }; 593 + 594 + pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk); 595 + 596 + return walk.ret; 597 + } 598 + 599 + static int msix_mmappable_cap(struct vfio_pci_core_device *vdev, 600 + struct vfio_info_cap *caps) 601 + { 602 + struct vfio_info_cap_header header = { 603 + .id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, 604 + .version = 1 
605 + }; 606 + 607 + return vfio_info_add_capability(caps, &header, sizeof(header)); 608 + } 609 + 610 + int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, 611 + unsigned int type, unsigned int subtype, 612 + const struct vfio_pci_regops *ops, 613 + size_t size, u32 flags, void *data) 614 + { 615 + struct vfio_pci_region *region; 616 + 617 + region = krealloc(vdev->region, 618 + (vdev->num_regions + 1) * sizeof(*region), 619 + GFP_KERNEL); 620 + if (!region) 621 + return -ENOMEM; 622 + 623 + vdev->region = region; 624 + vdev->region[vdev->num_regions].type = type; 625 + vdev->region[vdev->num_regions].subtype = subtype; 626 + vdev->region[vdev->num_regions].ops = ops; 627 + vdev->region[vdev->num_regions].size = size; 628 + vdev->region[vdev->num_regions].flags = flags; 629 + vdev->region[vdev->num_regions].data = data; 630 + 631 + vdev->num_regions++; 632 + 633 + return 0; 634 + } 635 + EXPORT_SYMBOL_GPL(vfio_pci_register_dev_region); 636 + 637 + long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, 638 + unsigned long arg) 639 + { 640 + struct vfio_pci_core_device *vdev = 641 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 642 + unsigned long minsz; 643 + 644 + if (cmd == VFIO_DEVICE_GET_INFO) { 645 + struct vfio_device_info info; 646 + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 647 + unsigned long capsz; 648 + int ret; 649 + 650 + minsz = offsetofend(struct vfio_device_info, num_irqs); 651 + 652 + /* For backward compatibility, cannot require this */ 653 + capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); 654 + 655 + if (copy_from_user(&info, (void __user *)arg, minsz)) 656 + return -EFAULT; 657 + 658 + if (info.argsz < minsz) 659 + return -EINVAL; 660 + 661 + if (info.argsz >= capsz) { 662 + minsz = capsz; 663 + info.cap_offset = 0; 664 + } 665 + 666 + info.flags = VFIO_DEVICE_FLAGS_PCI; 667 + 668 + if (vdev->reset_works) 669 + info.flags |= VFIO_DEVICE_FLAGS_RESET; 670 + 671 + 
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; 672 + info.num_irqs = VFIO_PCI_NUM_IRQS; 673 + 674 + ret = vfio_pci_info_zdev_add_caps(vdev, &caps); 675 + if (ret && ret != -ENODEV) { 676 + pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n"); 677 + return ret; 678 + } 679 + 680 + if (caps.size) { 681 + info.flags |= VFIO_DEVICE_FLAGS_CAPS; 682 + if (info.argsz < sizeof(info) + caps.size) { 683 + info.argsz = sizeof(info) + caps.size; 684 + } else { 685 + vfio_info_cap_shift(&caps, sizeof(info)); 686 + if (copy_to_user((void __user *)arg + 687 + sizeof(info), caps.buf, 688 + caps.size)) { 689 + kfree(caps.buf); 690 + return -EFAULT; 691 + } 692 + info.cap_offset = sizeof(info); 693 + } 694 + 695 + kfree(caps.buf); 696 + } 697 + 698 + return copy_to_user((void __user *)arg, &info, minsz) ? 699 + -EFAULT : 0; 700 + 701 + } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { 702 + struct pci_dev *pdev = vdev->pdev; 703 + struct vfio_region_info info; 704 + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 705 + int i, ret; 706 + 707 + minsz = offsetofend(struct vfio_region_info, offset); 708 + 709 + if (copy_from_user(&info, (void __user *)arg, minsz)) 710 + return -EFAULT; 711 + 712 + if (info.argsz < minsz) 713 + return -EINVAL; 714 + 715 + switch (info.index) { 716 + case VFIO_PCI_CONFIG_REGION_INDEX: 717 + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 718 + info.size = pdev->cfg_size; 719 + info.flags = VFIO_REGION_INFO_FLAG_READ | 720 + VFIO_REGION_INFO_FLAG_WRITE; 721 + break; 722 + case VFIO_PCI_BAR0_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: 723 + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 724 + info.size = pci_resource_len(pdev, info.index); 725 + if (!info.size) { 726 + info.flags = 0; 727 + break; 728 + } 729 + 730 + info.flags = VFIO_REGION_INFO_FLAG_READ | 731 + VFIO_REGION_INFO_FLAG_WRITE; 732 + if (vdev->bar_mmap_supported[info.index]) { 733 + info.flags |= VFIO_REGION_INFO_FLAG_MMAP; 734 + if (info.index == vdev->msix_bar) { 735 + ret = msix_mmappable_cap(vdev, &caps); 736 + if (ret) 737 + return ret; 738 + } 739 + } 740 + 741 + break; 742 + case VFIO_PCI_ROM_REGION_INDEX: 743 + { 744 + void __iomem *io; 745 + size_t size; 746 + u16 cmd; 747 + 748 + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 749 + info.flags = 0; 750 + 751 + /* Report the BAR size, not the ROM size */ 752 + info.size = pci_resource_len(pdev, info.index); 753 + if (!info.size) { 754 + /* Shadow ROMs appear as PCI option ROMs */ 755 + if (pdev->resource[PCI_ROM_RESOURCE].flags & 756 + IORESOURCE_ROM_SHADOW) 757 + info.size = 0x20000; 758 + else 759 + break; 760 + } 761 + 762 + /* 763 + * Is it really there? Enable memory decode for 764 + * implicit access in pci_map_rom(). 
765 + */ 766 + cmd = vfio_pci_memory_lock_and_enable(vdev); 767 + io = pci_map_rom(pdev, &size); 768 + if (io) { 769 + info.flags = VFIO_REGION_INFO_FLAG_READ; 770 + pci_unmap_rom(pdev, io); 771 + } else { 772 + info.size = 0; 773 + } 774 + vfio_pci_memory_unlock_and_restore(vdev, cmd); 775 + 776 + break; 777 + } 778 + case VFIO_PCI_VGA_REGION_INDEX: 779 + if (!vdev->has_vga) 780 + return -EINVAL; 781 + 782 + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 783 + info.size = 0xc0000; 784 + info.flags = VFIO_REGION_INFO_FLAG_READ | 785 + VFIO_REGION_INFO_FLAG_WRITE; 786 + 787 + break; 788 + default: 789 + { 790 + struct vfio_region_info_cap_type cap_type = { 791 + .header.id = VFIO_REGION_INFO_CAP_TYPE, 792 + .header.version = 1 }; 793 + 794 + if (info.index >= 795 + VFIO_PCI_NUM_REGIONS + vdev->num_regions) 796 + return -EINVAL; 797 + info.index = array_index_nospec(info.index, 798 + VFIO_PCI_NUM_REGIONS + 799 + vdev->num_regions); 800 + 801 + i = info.index - VFIO_PCI_NUM_REGIONS; 802 + 803 + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 804 + info.size = vdev->region[i].size; 805 + info.flags = vdev->region[i].flags; 806 + 807 + cap_type.type = vdev->region[i].type; 808 + cap_type.subtype = vdev->region[i].subtype; 809 + 810 + ret = vfio_info_add_capability(&caps, &cap_type.header, 811 + sizeof(cap_type)); 812 + if (ret) 813 + return ret; 814 + 815 + if (vdev->region[i].ops->add_capability) { 816 + ret = vdev->region[i].ops->add_capability(vdev, 817 + &vdev->region[i], &caps); 818 + if (ret) 819 + return ret; 820 + } 821 + } 822 + } 823 + 824 + if (caps.size) { 825 + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; 826 + if (info.argsz < sizeof(info) + caps.size) { 827 + info.argsz = sizeof(info) + caps.size; 828 + info.cap_offset = 0; 829 + } else { 830 + vfio_info_cap_shift(&caps, sizeof(info)); 831 + if (copy_to_user((void __user *)arg + 832 + sizeof(info), caps.buf, 833 + caps.size)) { 834 + kfree(caps.buf); 835 + return -EFAULT; 836 + } 837 + 
info.cap_offset = sizeof(info); 838 + } 839 + 840 + kfree(caps.buf); 841 + } 842 + 843 + return copy_to_user((void __user *)arg, &info, minsz) ? 844 + -EFAULT : 0; 845 + 846 + } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { 847 + struct vfio_irq_info info; 848 + 849 + minsz = offsetofend(struct vfio_irq_info, count); 850 + 851 + if (copy_from_user(&info, (void __user *)arg, minsz)) 852 + return -EFAULT; 853 + 854 + if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) 855 + return -EINVAL; 856 + 857 + switch (info.index) { 858 + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: 859 + case VFIO_PCI_REQ_IRQ_INDEX: 860 + break; 861 + case VFIO_PCI_ERR_IRQ_INDEX: 862 + if (pci_is_pcie(vdev->pdev)) 863 + break; 864 + fallthrough; 865 + default: 866 + return -EINVAL; 867 + } 868 + 869 + info.flags = VFIO_IRQ_INFO_EVENTFD; 870 + 871 + info.count = vfio_pci_get_irq_count(vdev, info.index); 872 + 873 + if (info.index == VFIO_PCI_INTX_IRQ_INDEX) 874 + info.flags |= (VFIO_IRQ_INFO_MASKABLE | 875 + VFIO_IRQ_INFO_AUTOMASKED); 876 + else 877 + info.flags |= VFIO_IRQ_INFO_NORESIZE; 878 + 879 + return copy_to_user((void __user *)arg, &info, minsz) ? 
880 + -EFAULT : 0; 881 + 882 + } else if (cmd == VFIO_DEVICE_SET_IRQS) { 883 + struct vfio_irq_set hdr; 884 + u8 *data = NULL; 885 + int max, ret = 0; 886 + size_t data_size = 0; 887 + 888 + minsz = offsetofend(struct vfio_irq_set, count); 889 + 890 + if (copy_from_user(&hdr, (void __user *)arg, minsz)) 891 + return -EFAULT; 892 + 893 + max = vfio_pci_get_irq_count(vdev, hdr.index); 894 + 895 + ret = vfio_set_irqs_validate_and_prepare(&hdr, max, 896 + VFIO_PCI_NUM_IRQS, &data_size); 897 + if (ret) 898 + return ret; 899 + 900 + if (data_size) { 901 + data = memdup_user((void __user *)(arg + minsz), 902 + data_size); 903 + if (IS_ERR(data)) 904 + return PTR_ERR(data); 905 + } 906 + 907 + mutex_lock(&vdev->igate); 908 + 909 + ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, 910 + hdr.start, hdr.count, data); 911 + 912 + mutex_unlock(&vdev->igate); 913 + kfree(data); 914 + 915 + return ret; 916 + 917 + } else if (cmd == VFIO_DEVICE_RESET) { 918 + int ret; 919 + 920 + if (!vdev->reset_works) 921 + return -EINVAL; 922 + 923 + vfio_pci_zap_and_down_write_memory_lock(vdev); 924 + ret = pci_try_reset_function(vdev->pdev); 925 + up_write(&vdev->memory_lock); 926 + 927 + return ret; 928 + 929 + } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { 930 + struct vfio_pci_hot_reset_info hdr; 931 + struct vfio_pci_fill_info fill = { 0 }; 932 + struct vfio_pci_dependent_device *devices = NULL; 933 + bool slot = false; 934 + int ret = 0; 935 + 936 + minsz = offsetofend(struct vfio_pci_hot_reset_info, count); 937 + 938 + if (copy_from_user(&hdr, (void __user *)arg, minsz)) 939 + return -EFAULT; 940 + 941 + if (hdr.argsz < minsz) 942 + return -EINVAL; 943 + 944 + hdr.flags = 0; 945 + 946 + /* Can we do a slot or bus reset or neither? */ 947 + if (!pci_probe_reset_slot(vdev->pdev->slot)) 948 + slot = true; 949 + else if (pci_probe_reset_bus(vdev->pdev->bus)) 950 + return -ENODEV; 951 + 952 + /* How many devices are affected? 
*/ 953 + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 954 + vfio_pci_count_devs, 955 + &fill.max, slot); 956 + if (ret) 957 + return ret; 958 + 959 + WARN_ON(!fill.max); /* Should always be at least one */ 960 + 961 + /* 962 + * If there's enough space, fill it now, otherwise return 963 + * -ENOSPC and the number of devices affected. 964 + */ 965 + if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { 966 + ret = -ENOSPC; 967 + hdr.count = fill.max; 968 + goto reset_info_exit; 969 + } 970 + 971 + devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); 972 + if (!devices) 973 + return -ENOMEM; 974 + 975 + fill.devices = devices; 976 + 977 + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 978 + vfio_pci_fill_devs, 979 + &fill, slot); 980 + 981 + /* 982 + * If a device was removed between counting and filling, 983 + * we may come up short of fill.max. If a device was 984 + * added, we'll have a return of -EAGAIN above. 985 + */ 986 + if (!ret) 987 + hdr.count = fill.cur; 988 + 989 + reset_info_exit: 990 + if (copy_to_user((void __user *)arg, &hdr, minsz)) 991 + ret = -EFAULT; 992 + 993 + if (!ret) { 994 + if (copy_to_user((void __user *)(arg + minsz), devices, 995 + hdr.count * sizeof(*devices))) 996 + ret = -EFAULT; 997 + } 998 + 999 + kfree(devices); 1000 + return ret; 1001 + 1002 + } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { 1003 + struct vfio_pci_hot_reset hdr; 1004 + int32_t *group_fds; 1005 + struct vfio_group **groups; 1006 + struct vfio_pci_group_info info; 1007 + bool slot = false; 1008 + int group_idx, count = 0, ret = 0; 1009 + 1010 + minsz = offsetofend(struct vfio_pci_hot_reset, count); 1011 + 1012 + if (copy_from_user(&hdr, (void __user *)arg, minsz)) 1013 + return -EFAULT; 1014 + 1015 + if (hdr.argsz < minsz || hdr.flags) 1016 + return -EINVAL; 1017 + 1018 + /* Can we do a slot or bus reset or neither? 
*/ 1019 + if (!pci_probe_reset_slot(vdev->pdev->slot)) 1020 + slot = true; 1021 + else if (pci_probe_reset_bus(vdev->pdev->bus)) 1022 + return -ENODEV; 1023 + 1024 + /* 1025 + * We can't let userspace give us an arbitrarily large 1026 + * buffer to copy, so verify how many we think there 1027 + * could be. Note groups can have multiple devices so 1028 + * one group per device is the max. 1029 + */ 1030 + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 1031 + vfio_pci_count_devs, 1032 + &count, slot); 1033 + if (ret) 1034 + return ret; 1035 + 1036 + /* Somewhere between 1 and count is OK */ 1037 + if (!hdr.count || hdr.count > count) 1038 + return -EINVAL; 1039 + 1040 + group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); 1041 + groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL); 1042 + if (!group_fds || !groups) { 1043 + kfree(group_fds); 1044 + kfree(groups); 1045 + return -ENOMEM; 1046 + } 1047 + 1048 + if (copy_from_user(group_fds, (void __user *)(arg + minsz), 1049 + hdr.count * sizeof(*group_fds))) { 1050 + kfree(group_fds); 1051 + kfree(groups); 1052 + return -EFAULT; 1053 + } 1054 + 1055 + /* 1056 + * For each group_fd, get the group through the vfio external 1057 + * user interface and store the group and iommu ID. This 1058 + * ensures the group is held across the reset. 
1059 + */ 1060 + for (group_idx = 0; group_idx < hdr.count; group_idx++) { 1061 + struct vfio_group *group; 1062 + struct fd f = fdget(group_fds[group_idx]); 1063 + if (!f.file) { 1064 + ret = -EBADF; 1065 + break; 1066 + } 1067 + 1068 + group = vfio_group_get_external_user(f.file); 1069 + fdput(f); 1070 + if (IS_ERR(group)) { 1071 + ret = PTR_ERR(group); 1072 + break; 1073 + } 1074 + 1075 + groups[group_idx] = group; 1076 + } 1077 + 1078 + kfree(group_fds); 1079 + 1080 + /* release reference to groups on error */ 1081 + if (ret) 1082 + goto hot_reset_release; 1083 + 1084 + info.count = hdr.count; 1085 + info.groups = groups; 1086 + 1087 + ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); 1088 + 1089 + hot_reset_release: 1090 + for (group_idx--; group_idx >= 0; group_idx--) 1091 + vfio_group_put_external_user(groups[group_idx]); 1092 + 1093 + kfree(groups); 1094 + return ret; 1095 + } else if (cmd == VFIO_DEVICE_IOEVENTFD) { 1096 + struct vfio_device_ioeventfd ioeventfd; 1097 + int count; 1098 + 1099 + minsz = offsetofend(struct vfio_device_ioeventfd, fd); 1100 + 1101 + if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) 1102 + return -EFAULT; 1103 + 1104 + if (ioeventfd.argsz < minsz) 1105 + return -EINVAL; 1106 + 1107 + if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) 1108 + return -EINVAL; 1109 + 1110 + count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; 1111 + 1112 + if (hweight8(count) != 1 || ioeventfd.fd < -1) 1113 + return -EINVAL; 1114 + 1115 + return vfio_pci_ioeventfd(vdev, ioeventfd.offset, 1116 + ioeventfd.data, count, ioeventfd.fd); 1117 + } else if (cmd == VFIO_DEVICE_FEATURE) { 1118 + struct vfio_device_feature feature; 1119 + uuid_t uuid; 1120 + 1121 + minsz = offsetofend(struct vfio_device_feature, flags); 1122 + 1123 + if (copy_from_user(&feature, (void __user *)arg, minsz)) 1124 + return -EFAULT; 1125 + 1126 + if (feature.argsz < minsz) 1127 + return -EINVAL; 1128 + 1129 + /* Check unknown flags */ 1130 + if 
(feature.flags & ~(VFIO_DEVICE_FEATURE_MASK | 1131 + VFIO_DEVICE_FEATURE_SET | 1132 + VFIO_DEVICE_FEATURE_GET | 1133 + VFIO_DEVICE_FEATURE_PROBE)) 1134 + return -EINVAL; 1135 + 1136 + /* GET & SET are mutually exclusive except with PROBE */ 1137 + if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) && 1138 + (feature.flags & VFIO_DEVICE_FEATURE_SET) && 1139 + (feature.flags & VFIO_DEVICE_FEATURE_GET)) 1140 + return -EINVAL; 1141 + 1142 + switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) { 1143 + case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: 1144 + if (!vdev->vf_token) 1145 + return -ENOTTY; 1146 + 1147 + /* 1148 + * We do not support GET of the VF Token UUID as this 1149 + * could expose the token of the previous device user. 1150 + */ 1151 + if (feature.flags & VFIO_DEVICE_FEATURE_GET) 1152 + return -EINVAL; 1153 + 1154 + if (feature.flags & VFIO_DEVICE_FEATURE_PROBE) 1155 + return 0; 1156 + 1157 + /* Don't SET unless told to do so */ 1158 + if (!(feature.flags & VFIO_DEVICE_FEATURE_SET)) 1159 + return -EINVAL; 1160 + 1161 + if (feature.argsz < minsz + sizeof(uuid)) 1162 + return -EINVAL; 1163 + 1164 + if (copy_from_user(&uuid, (void __user *)(arg + minsz), 1165 + sizeof(uuid))) 1166 + return -EFAULT; 1167 + 1168 + mutex_lock(&vdev->vf_token->lock); 1169 + uuid_copy(&vdev->vf_token->uuid, &uuid); 1170 + mutex_unlock(&vdev->vf_token->lock); 1171 + 1172 + return 0; 1173 + default: 1174 + return -ENOTTY; 1175 + } 1176 + } 1177 + 1178 + return -ENOTTY; 1179 + } 1180 + EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl); 1181 + 1182 + static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf, 1183 + size_t count, loff_t *ppos, bool iswrite) 1184 + { 1185 + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 1186 + 1187 + if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) 1188 + return -EINVAL; 1189 + 1190 + switch (index) { 1191 + case VFIO_PCI_CONFIG_REGION_INDEX: 1192 + return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); 1193 + 1194 + case 
VFIO_PCI_ROM_REGION_INDEX: 1195 + if (iswrite) 1196 + return -EINVAL; 1197 + return vfio_pci_bar_rw(vdev, buf, count, ppos, false); 1198 + 1199 + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: 1200 + return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); 1201 + 1202 + case VFIO_PCI_VGA_REGION_INDEX: 1203 + return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); 1204 + default: 1205 + index -= VFIO_PCI_NUM_REGIONS; 1206 + return vdev->region[index].ops->rw(vdev, buf, 1207 + count, ppos, iswrite); 1208 + } 1209 + 1210 + return -EINVAL; 1211 + } 1212 + 1213 + ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, 1214 + size_t count, loff_t *ppos) 1215 + { 1216 + struct vfio_pci_core_device *vdev = 1217 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 1218 + 1219 + if (!count) 1220 + return 0; 1221 + 1222 + return vfio_pci_rw(vdev, buf, count, ppos, false); 1223 + } 1224 + EXPORT_SYMBOL_GPL(vfio_pci_core_read); 1225 + 1226 + ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, 1227 + size_t count, loff_t *ppos) 1228 + { 1229 + struct vfio_pci_core_device *vdev = 1230 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 1231 + 1232 + if (!count) 1233 + return 0; 1234 + 1235 + return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true); 1236 + } 1237 + EXPORT_SYMBOL_GPL(vfio_pci_core_write); 1238 + 1239 + /* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ 1240 + static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try) 1241 + { 1242 + struct vfio_pci_mmap_vma *mmap_vma, *tmp; 1243 + 1244 + /* 1245 + * Lock ordering: 1246 + * vma_lock is nested under mmap_lock for vm_ops callback paths. 1247 + * The memory_lock semaphore is used by both code paths calling 1248 + * into this function to zap vmas and the vm_ops.fault callback 1249 + * to protect the memory enable state of the device. 
1250 + * 1251 + * When zapping vmas we need to maintain the mmap_lock => vma_lock 1252 + * ordering, which requires using vma_lock to walk vma_list to 1253 + * acquire an mm, then dropping vma_lock to get the mmap_lock and 1254 + * reacquiring vma_lock. This logic is derived from similar 1255 + * requirements in uverbs_user_mmap_disassociate(). 1256 + * 1257 + * mmap_lock must always be the top-level lock when it is taken. 1258 + * Therefore we can only hold the memory_lock write lock when 1259 + * vma_list is empty, as we'd need to take mmap_lock to clear 1260 + * entries. vma_list can only be guaranteed empty when holding 1261 + * vma_lock, thus memory_lock is nested under vma_lock. 1262 + * 1263 + * This enables the vm_ops.fault callback to acquire vma_lock, 1264 + * followed by memory_lock read lock, while already holding 1265 + * mmap_lock without risk of deadlock. 1266 + */ 1267 + while (1) { 1268 + struct mm_struct *mm = NULL; 1269 + 1270 + if (try) { 1271 + if (!mutex_trylock(&vdev->vma_lock)) 1272 + return 0; 1273 + } else { 1274 + mutex_lock(&vdev->vma_lock); 1275 + } 1276 + while (!list_empty(&vdev->vma_list)) { 1277 + mmap_vma = list_first_entry(&vdev->vma_list, 1278 + struct vfio_pci_mmap_vma, 1279 + vma_next); 1280 + mm = mmap_vma->vma->vm_mm; 1281 + if (mmget_not_zero(mm)) 1282 + break; 1283 + 1284 + list_del(&mmap_vma->vma_next); 1285 + kfree(mmap_vma); 1286 + mm = NULL; 1287 + } 1288 + if (!mm) 1289 + return 1; 1290 + mutex_unlock(&vdev->vma_lock); 1291 + 1292 + if (try) { 1293 + if (!mmap_read_trylock(mm)) { 1294 + mmput(mm); 1295 + return 0; 1296 + } 1297 + } else { 1298 + mmap_read_lock(mm); 1299 + } 1300 + if (try) { 1301 + if (!mutex_trylock(&vdev->vma_lock)) { 1302 + mmap_read_unlock(mm); 1303 + mmput(mm); 1304 + return 0; 1305 + } 1306 + } else { 1307 + mutex_lock(&vdev->vma_lock); 1308 + } 1309 + list_for_each_entry_safe(mmap_vma, tmp, 1310 + &vdev->vma_list, vma_next) { 1311 + struct vm_area_struct *vma = mmap_vma->vma; 1312 + 1313 + if 
(vma->vm_mm != mm) 1314 + continue; 1315 + 1316 + list_del(&mmap_vma->vma_next); 1317 + kfree(mmap_vma); 1318 + 1319 + zap_vma_ptes(vma, vma->vm_start, 1320 + vma->vm_end - vma->vm_start); 1321 + } 1322 + mutex_unlock(&vdev->vma_lock); 1323 + mmap_read_unlock(mm); 1324 + mmput(mm); 1325 + } 1326 + } 1327 + 1328 + void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev) 1329 + { 1330 + vfio_pci_zap_and_vma_lock(vdev, false); 1331 + down_write(&vdev->memory_lock); 1332 + mutex_unlock(&vdev->vma_lock); 1333 + } 1334 + 1335 + u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev) 1336 + { 1337 + u16 cmd; 1338 + 1339 + down_write(&vdev->memory_lock); 1340 + pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd); 1341 + if (!(cmd & PCI_COMMAND_MEMORY)) 1342 + pci_write_config_word(vdev->pdev, PCI_COMMAND, 1343 + cmd | PCI_COMMAND_MEMORY); 1344 + 1345 + return cmd; 1346 + } 1347 + 1348 + void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd) 1349 + { 1350 + pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd); 1351 + up_write(&vdev->memory_lock); 1352 + } 1353 + 1354 + /* Caller holds vma_lock */ 1355 + static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev, 1356 + struct vm_area_struct *vma) 1357 + { 1358 + struct vfio_pci_mmap_vma *mmap_vma; 1359 + 1360 + mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL); 1361 + if (!mmap_vma) 1362 + return -ENOMEM; 1363 + 1364 + mmap_vma->vma = vma; 1365 + list_add(&mmap_vma->vma_next, &vdev->vma_list); 1366 + 1367 + return 0; 1368 + } 1369 + 1370 + /* 1371 + * Zap mmaps on open so that we can fault them in on access and therefore 1372 + * our vma_list only tracks mappings accessed since last zap. 
1373 + */ 1374 + static void vfio_pci_mmap_open(struct vm_area_struct *vma) 1375 + { 1376 + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1377 + } 1378 + 1379 + static void vfio_pci_mmap_close(struct vm_area_struct *vma) 1380 + { 1381 + struct vfio_pci_core_device *vdev = vma->vm_private_data; 1382 + struct vfio_pci_mmap_vma *mmap_vma; 1383 + 1384 + mutex_lock(&vdev->vma_lock); 1385 + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { 1386 + if (mmap_vma->vma == vma) { 1387 + list_del(&mmap_vma->vma_next); 1388 + kfree(mmap_vma); 1389 + break; 1390 + } 1391 + } 1392 + mutex_unlock(&vdev->vma_lock); 1393 + } 1394 + 1395 + static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) 1396 + { 1397 + struct vm_area_struct *vma = vmf->vma; 1398 + struct vfio_pci_core_device *vdev = vma->vm_private_data; 1399 + struct vfio_pci_mmap_vma *mmap_vma; 1400 + vm_fault_t ret = VM_FAULT_NOPAGE; 1401 + 1402 + mutex_lock(&vdev->vma_lock); 1403 + down_read(&vdev->memory_lock); 1404 + 1405 + if (!__vfio_pci_memory_enabled(vdev)) { 1406 + ret = VM_FAULT_SIGBUS; 1407 + goto up_out; 1408 + } 1409 + 1410 + /* 1411 + * We populate the whole vma on fault, so we need to test whether 1412 + * the vma has already been mapped, such as for concurrent faults 1413 + * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if 1414 + * we ask it to fill the same range again. 
1415 + */ 1416 + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { 1417 + if (mmap_vma->vma == vma) 1418 + goto up_out; 1419 + } 1420 + 1421 + if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, 1422 + vma->vm_end - vma->vm_start, 1423 + vma->vm_page_prot)) { 1424 + ret = VM_FAULT_SIGBUS; 1425 + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1426 + goto up_out; 1427 + } 1428 + 1429 + if (__vfio_pci_add_vma(vdev, vma)) { 1430 + ret = VM_FAULT_OOM; 1431 + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1432 + } 1433 + 1434 + up_out: 1435 + up_read(&vdev->memory_lock); 1436 + mutex_unlock(&vdev->vma_lock); 1437 + return ret; 1438 + } 1439 + 1440 + static const struct vm_operations_struct vfio_pci_mmap_ops = { 1441 + .open = vfio_pci_mmap_open, 1442 + .close = vfio_pci_mmap_close, 1443 + .fault = vfio_pci_mmap_fault, 1444 + }; 1445 + 1446 + int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) 1447 + { 1448 + struct vfio_pci_core_device *vdev = 1449 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 1450 + struct pci_dev *pdev = vdev->pdev; 1451 + unsigned int index; 1452 + u64 phys_len, req_len, pgoff, req_start; 1453 + int ret; 1454 + 1455 + index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 1456 + 1457 + if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) 1458 + return -EINVAL; 1459 + if (vma->vm_end < vma->vm_start) 1460 + return -EINVAL; 1461 + if ((vma->vm_flags & VM_SHARED) == 0) 1462 + return -EINVAL; 1463 + if (index >= VFIO_PCI_NUM_REGIONS) { 1464 + int regnum = index - VFIO_PCI_NUM_REGIONS; 1465 + struct vfio_pci_region *region = vdev->region + regnum; 1466 + 1467 + if (region->ops && region->ops->mmap && 1468 + (region->flags & VFIO_REGION_INFO_FLAG_MMAP)) 1469 + return region->ops->mmap(vdev, region, vma); 1470 + return -EINVAL; 1471 + } 1472 + if (index >= VFIO_PCI_ROM_REGION_INDEX) 1473 + return -EINVAL; 1474 + if (!vdev->bar_mmap_supported[index]) 1475 + 
return -EINVAL; 1476 + 1477 + phys_len = PAGE_ALIGN(pci_resource_len(pdev, index)); 1478 + req_len = vma->vm_end - vma->vm_start; 1479 + pgoff = vma->vm_pgoff & 1480 + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 1481 + req_start = pgoff << PAGE_SHIFT; 1482 + 1483 + if (req_start + req_len > phys_len) 1484 + return -EINVAL; 1485 + 1486 + /* 1487 + * Even though we don't make use of the barmap for the mmap, 1488 + * we need to request the region and the barmap tracks that. 1489 + */ 1490 + if (!vdev->barmap[index]) { 1491 + ret = pci_request_selected_regions(pdev, 1492 + 1 << index, "vfio-pci"); 1493 + if (ret) 1494 + return ret; 1495 + 1496 + vdev->barmap[index] = pci_iomap(pdev, index, 0); 1497 + if (!vdev->barmap[index]) { 1498 + pci_release_selected_regions(pdev, 1 << index); 1499 + return -ENOMEM; 1500 + } 1501 + } 1502 + 1503 + vma->vm_private_data = vdev; 1504 + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 1505 + vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; 1506 + 1507 + /* 1508 + * See remap_pfn_range(), called from vfio_pci_fault() but we can't 1509 + * change vm_flags within the fault handler. Set them now. 
1510 + */ 1511 + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; 1512 + vma->vm_ops = &vfio_pci_mmap_ops; 1513 + 1514 + return 0; 1515 + } 1516 + EXPORT_SYMBOL_GPL(vfio_pci_core_mmap); 1517 + 1518 + void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) 1519 + { 1520 + struct vfio_pci_core_device *vdev = 1521 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 1522 + struct pci_dev *pdev = vdev->pdev; 1523 + 1524 + mutex_lock(&vdev->igate); 1525 + 1526 + if (vdev->req_trigger) { 1527 + if (!(count % 10)) 1528 + pci_notice_ratelimited(pdev, 1529 + "Relaying device request to user (#%u)\n", 1530 + count); 1531 + eventfd_signal(vdev->req_trigger, 1); 1532 + } else if (count == 0) { 1533 + pci_warn(pdev, 1534 + "No device request channel registered, blocked until released by user\n"); 1535 + } 1536 + 1537 + mutex_unlock(&vdev->igate); 1538 + } 1539 + EXPORT_SYMBOL_GPL(vfio_pci_core_request); 1540 + 1541 + static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, 1542 + bool vf_token, uuid_t *uuid) 1543 + { 1544 + /* 1545 + * There's always some degree of trust or collaboration between SR-IOV 1546 + * PF and VFs, even if just that the PF hosts the SR-IOV capability and 1547 + * can disrupt VFs with a reset, but often the PF has more explicit 1548 + * access to deny service to the VF or access data passed through the 1549 + * VF. We therefore require an opt-in via a shared VF token (UUID) to 1550 + * represent this trust. This both prevents that a VF driver might 1551 + * assume the PF driver is a trusted, in-kernel driver, and also that 1552 + * a PF driver might be replaced with a rogue driver, unknown to in-use 1553 + * VF drivers. 
1554 + * 1555 + * Therefore when presented with a VF, if the PF is a vfio device and 1556 + * it is bound to the vfio-pci driver, the user needs to provide a VF 1557 + * token to access the device, in the form of appending a vf_token to 1558 + * the device name, for example: 1559 + * 1560 + * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3" 1561 + * 1562 + * When presented with a PF which has VFs in use, the user must also 1563 + * provide the current VF token to prove collaboration with existing 1564 + * VF users. If VFs are not in use, the VF token provided for the PF 1565 + * device will act to set the VF token. 1566 + * 1567 + * If the VF token is provided but unused, an error is generated. 1568 + */ 1569 + if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token) 1570 + return 0; /* No VF token provided or required */ 1571 + 1572 + if (vdev->pdev->is_virtfn) { 1573 + struct vfio_pci_core_device *pf_vdev = get_pf_vdev(vdev); 1574 + bool match; 1575 + 1576 + if (!pf_vdev) { 1577 + if (!vf_token) 1578 + return 0; /* PF is not vfio-pci, no VF token */ 1579 + 1580 + pci_info_ratelimited(vdev->pdev, 1581 + "VF token incorrectly provided, PF not bound to vfio-pci\n"); 1582 + return -EINVAL; 1583 + } 1584 + 1585 + if (!vf_token) { 1586 + vfio_device_put(&pf_vdev->vdev); 1587 + pci_info_ratelimited(vdev->pdev, 1588 + "VF token required to access device\n"); 1589 + return -EACCES; 1590 + } 1591 + 1592 + mutex_lock(&pf_vdev->vf_token->lock); 1593 + match = uuid_equal(uuid, &pf_vdev->vf_token->uuid); 1594 + mutex_unlock(&pf_vdev->vf_token->lock); 1595 + 1596 + vfio_device_put(&pf_vdev->vdev); 1597 + 1598 + if (!match) { 1599 + pci_info_ratelimited(vdev->pdev, 1600 + "Incorrect VF token provided for device\n"); 1601 + return -EACCES; 1602 + } 1603 + } else if (vdev->vf_token) { 1604 + mutex_lock(&vdev->vf_token->lock); 1605 + if (vdev->vf_token->users) { 1606 + if (!vf_token) { 1607 + mutex_unlock(&vdev->vf_token->lock); 1608 + 
pci_info_ratelimited(vdev->pdev, 1609 + "VF token required to access device\n"); 1610 + return -EACCES; 1611 + } 1612 + 1613 + if (!uuid_equal(uuid, &vdev->vf_token->uuid)) { 1614 + mutex_unlock(&vdev->vf_token->lock); 1615 + pci_info_ratelimited(vdev->pdev, 1616 + "Incorrect VF token provided for device\n"); 1617 + return -EACCES; 1618 + } 1619 + } else if (vf_token) { 1620 + uuid_copy(&vdev->vf_token->uuid, uuid); 1621 + } 1622 + 1623 + mutex_unlock(&vdev->vf_token->lock); 1624 + } else if (vf_token) { 1625 + pci_info_ratelimited(vdev->pdev, 1626 + "VF token incorrectly provided, not a PF or VF\n"); 1627 + return -EINVAL; 1628 + } 1629 + 1630 + return 0; 1631 + } 1632 + 1633 + #define VF_TOKEN_ARG "vf_token=" 1634 + 1635 + int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf) 1636 + { 1637 + struct vfio_pci_core_device *vdev = 1638 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 1639 + bool vf_token = false; 1640 + uuid_t uuid; 1641 + int ret; 1642 + 1643 + if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev)))) 1644 + return 0; /* No match */ 1645 + 1646 + if (strlen(buf) > strlen(pci_name(vdev->pdev))) { 1647 + buf += strlen(pci_name(vdev->pdev)); 1648 + 1649 + if (*buf != ' ') 1650 + return 0; /* No match: non-whitespace after name */ 1651 + 1652 + while (*buf) { 1653 + if (*buf == ' ') { 1654 + buf++; 1655 + continue; 1656 + } 1657 + 1658 + if (!vf_token && !strncmp(buf, VF_TOKEN_ARG, 1659 + strlen(VF_TOKEN_ARG))) { 1660 + buf += strlen(VF_TOKEN_ARG); 1661 + 1662 + if (strlen(buf) < UUID_STRING_LEN) 1663 + return -EINVAL; 1664 + 1665 + ret = uuid_parse(buf, &uuid); 1666 + if (ret) 1667 + return ret; 1668 + 1669 + vf_token = true; 1670 + buf += UUID_STRING_LEN; 1671 + } else { 1672 + /* Unknown/duplicate option */ 1673 + return -EINVAL; 1674 + } 1675 + } 1676 + } 1677 + 1678 + ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid); 1679 + if (ret) 1680 + return ret; 1681 + 1682 + return 1; /* Match */ 1683 + } 1684 + 
EXPORT_SYMBOL_GPL(vfio_pci_core_match); 1685 + 1686 + static int vfio_pci_bus_notifier(struct notifier_block *nb, 1687 + unsigned long action, void *data) 1688 + { 1689 + struct vfio_pci_core_device *vdev = container_of(nb, 1690 + struct vfio_pci_core_device, nb); 1691 + struct device *dev = data; 1692 + struct pci_dev *pdev = to_pci_dev(dev); 1693 + struct pci_dev *physfn = pci_physfn(pdev); 1694 + 1695 + if (action == BUS_NOTIFY_ADD_DEVICE && 1696 + pdev->is_virtfn && physfn == vdev->pdev) { 1697 + pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n", 1698 + pci_name(pdev)); 1699 + pdev->driver_override = kasprintf(GFP_KERNEL, "%s", 1700 + vdev->vdev.ops->name); 1701 + } else if (action == BUS_NOTIFY_BOUND_DRIVER && 1702 + pdev->is_virtfn && physfn == vdev->pdev) { 1703 + struct pci_driver *drv = pci_dev_driver(pdev); 1704 + 1705 + if (drv && drv != pci_dev_driver(vdev->pdev)) 1706 + pci_warn(vdev->pdev, 1707 + "VF %s bound to driver %s while PF bound to driver %s\n", 1708 + pci_name(pdev), drv->name, 1709 + pci_dev_driver(vdev->pdev)->name); 1710 + } 1711 + 1712 + return 0; 1713 + } 1714 + 1715 + static int vfio_pci_vf_init(struct vfio_pci_core_device *vdev) 1716 + { 1717 + struct pci_dev *pdev = vdev->pdev; 1718 + int ret; 1719 + 1720 + if (!pdev->is_physfn) 1721 + return 0; 1722 + 1723 + vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL); 1724 + if (!vdev->vf_token) 1725 + return -ENOMEM; 1726 + 1727 + mutex_init(&vdev->vf_token->lock); 1728 + uuid_gen(&vdev->vf_token->uuid); 1729 + 1730 + vdev->nb.notifier_call = vfio_pci_bus_notifier; 1731 + ret = bus_register_notifier(&pci_bus_type, &vdev->nb); 1732 + if (ret) { 1733 + kfree(vdev->vf_token); 1734 + return ret; 1735 + } 1736 + return 0; 1737 + } 1738 + 1739 + static void vfio_pci_vf_uninit(struct vfio_pci_core_device *vdev) 1740 + { 1741 + if (!vdev->vf_token) 1742 + return; 1743 + 1744 + bus_unregister_notifier(&pci_bus_type, &vdev->nb); 1745 + WARN_ON(vdev->vf_token->users); 1746 + 
mutex_destroy(&vdev->vf_token->lock); 1747 + kfree(vdev->vf_token); 1748 + } 1749 + 1750 + static int vfio_pci_vga_init(struct vfio_pci_core_device *vdev) 1751 + { 1752 + struct pci_dev *pdev = vdev->pdev; 1753 + int ret; 1754 + 1755 + if (!vfio_pci_is_vga(pdev)) 1756 + return 0; 1757 + 1758 + ret = vga_client_register(pdev, vfio_pci_set_decode); 1759 + if (ret) 1760 + return ret; 1761 + vga_set_legacy_decoding(pdev, vfio_pci_set_decode(pdev, false)); 1762 + return 0; 1763 + } 1764 + 1765 + static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev) 1766 + { 1767 + struct pci_dev *pdev = vdev->pdev; 1768 + 1769 + if (!vfio_pci_is_vga(pdev)) 1770 + return; 1771 + vga_client_unregister(pdev); 1772 + vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | 1773 + VGA_RSRC_LEGACY_IO | 1774 + VGA_RSRC_LEGACY_MEM); 1775 + } 1776 + 1777 + void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, 1778 + struct pci_dev *pdev, 1779 + const struct vfio_device_ops *vfio_pci_ops) 1780 + { 1781 + vfio_init_group_dev(&vdev->vdev, &pdev->dev, vfio_pci_ops); 1782 + vdev->pdev = pdev; 1783 + vdev->irq_type = VFIO_PCI_NUM_IRQS; 1784 + mutex_init(&vdev->igate); 1785 + spin_lock_init(&vdev->irqlock); 1786 + mutex_init(&vdev->ioeventfds_lock); 1787 + INIT_LIST_HEAD(&vdev->dummy_resources_list); 1788 + INIT_LIST_HEAD(&vdev->ioeventfds_list); 1789 + mutex_init(&vdev->vma_lock); 1790 + INIT_LIST_HEAD(&vdev->vma_list); 1791 + init_rwsem(&vdev->memory_lock); 1792 + } 1793 + EXPORT_SYMBOL_GPL(vfio_pci_core_init_device); 1794 + 1795 + void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev) 1796 + { 1797 + mutex_destroy(&vdev->igate); 1798 + mutex_destroy(&vdev->ioeventfds_lock); 1799 + mutex_destroy(&vdev->vma_lock); 1800 + vfio_uninit_group_dev(&vdev->vdev); 1801 + kfree(vdev->region); 1802 + kfree(vdev->pm_save); 1803 + } 1804 + EXPORT_SYMBOL_GPL(vfio_pci_core_uninit_device); 1805 + 1806 + int vfio_pci_core_register_device(struct 
vfio_pci_core_device *vdev) 1807 + { 1808 + struct pci_dev *pdev = vdev->pdev; 1809 + struct iommu_group *group; 1810 + int ret; 1811 + 1812 + if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) 1813 + return -EINVAL; 1814 + 1815 + /* 1816 + * Prevent binding to PFs with VFs enabled, the VFs might be in use 1817 + * by the host or other users. We cannot capture the VFs if they 1818 + * already exist, nor can we track VF users. Disabling SR-IOV here 1819 + * would initiate removing the VFs, which would unbind the driver, 1820 + * which is prone to blocking if that VF is also in use by vfio-pci. 1821 + * Just reject these PFs and let the user sort it out. 1822 + */ 1823 + if (pci_num_vf(pdev)) { 1824 + pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n"); 1825 + return -EBUSY; 1826 + } 1827 + 1828 + group = vfio_iommu_group_get(&pdev->dev); 1829 + if (!group) 1830 + return -EINVAL; 1831 + 1832 + if (pci_is_root_bus(pdev->bus)) { 1833 + ret = vfio_assign_device_set(&vdev->vdev, vdev); 1834 + } else if (!pci_probe_reset_slot(pdev->slot)) { 1835 + ret = vfio_assign_device_set(&vdev->vdev, pdev->slot); 1836 + } else { 1837 + /* 1838 + * If there is no slot reset support for this device, the whole 1839 + * bus needs to be grouped together to support bus-wide resets. 1840 + */ 1841 + ret = vfio_assign_device_set(&vdev->vdev, pdev->bus); 1842 + } 1843 + 1844 + if (ret) 1845 + goto out_group_put; 1846 + ret = vfio_pci_vf_init(vdev); 1847 + if (ret) 1848 + goto out_group_put; 1849 + ret = vfio_pci_vga_init(vdev); 1850 + if (ret) 1851 + goto out_vf; 1852 + 1853 + vfio_pci_probe_power_state(vdev); 1854 + 1855 + if (!disable_idle_d3) { 1856 + /* 1857 + * pci-core sets the device power state to an unknown value at 1858 + * bootup and after being removed from a driver. The only 1859 + * transition it allows from this unknown state is to D0, which 1860 + * typically happens when a driver calls pci_enable_device(). 
1861 + * We're not ready to enable the device yet, but we do want to 1862 + * be able to get to D3. Therefore first do a D0 transition 1863 + * before going to D3. 1864 + */ 1865 + vfio_pci_set_power_state(vdev, PCI_D0); 1866 + vfio_pci_set_power_state(vdev, PCI_D3hot); 1867 + } 1868 + 1869 + ret = vfio_register_group_dev(&vdev->vdev); 1870 + if (ret) 1871 + goto out_power; 1872 + return 0; 1873 + 1874 + out_power: 1875 + if (!disable_idle_d3) 1876 + vfio_pci_set_power_state(vdev, PCI_D0); 1877 + out_vf: 1878 + vfio_pci_vf_uninit(vdev); 1879 + out_group_put: 1880 + vfio_iommu_group_put(group, &pdev->dev); 1881 + return ret; 1882 + } 1883 + EXPORT_SYMBOL_GPL(vfio_pci_core_register_device); 1884 + 1885 + void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) 1886 + { 1887 + struct pci_dev *pdev = vdev->pdev; 1888 + 1889 + pci_disable_sriov(pdev); 1890 + 1891 + vfio_unregister_group_dev(&vdev->vdev); 1892 + 1893 + vfio_pci_vf_uninit(vdev); 1894 + vfio_pci_vga_uninit(vdev); 1895 + 1896 + vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); 1897 + 1898 + if (!disable_idle_d3) 1899 + vfio_pci_set_power_state(vdev, PCI_D0); 1900 + } 1901 + EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device); 1902 + 1903 + static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, 1904 + pci_channel_state_t state) 1905 + { 1906 + struct vfio_pci_core_device *vdev; 1907 + struct vfio_device *device; 1908 + 1909 + device = vfio_device_get_from_dev(&pdev->dev); 1910 + if (device == NULL) 1911 + return PCI_ERS_RESULT_DISCONNECT; 1912 + 1913 + vdev = container_of(device, struct vfio_pci_core_device, vdev); 1914 + 1915 + mutex_lock(&vdev->igate); 1916 + 1917 + if (vdev->err_trigger) 1918 + eventfd_signal(vdev->err_trigger, 1); 1919 + 1920 + mutex_unlock(&vdev->igate); 1921 + 1922 + vfio_device_put(device); 1923 + 1924 + return PCI_ERS_RESULT_CAN_RECOVER; 1925 + } 1926 + 1927 + int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn) 1928 + { 1929 + 
struct vfio_device *device; 1930 + int ret = 0; 1931 + 1932 + device = vfio_device_get_from_dev(&pdev->dev); 1933 + if (!device) 1934 + return -ENODEV; 1935 + 1936 + if (nr_virtfn == 0) 1937 + pci_disable_sriov(pdev); 1938 + else 1939 + ret = pci_enable_sriov(pdev, nr_virtfn); 1940 + 1941 + vfio_device_put(device); 1942 + 1943 + return ret < 0 ? ret : nr_virtfn; 1944 + } 1945 + EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure); 1946 + 1947 + const struct pci_error_handlers vfio_pci_core_err_handlers = { 1948 + .error_detected = vfio_pci_aer_err_detected, 1949 + }; 1950 + EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers); 1951 + 1952 + static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev, 1953 + struct vfio_pci_group_info *groups) 1954 + { 1955 + unsigned int i; 1956 + 1957 + for (i = 0; i < groups->count; i++) 1958 + if (groups->groups[i] == vdev->vdev.group) 1959 + return true; 1960 + return false; 1961 + } 1962 + 1963 + static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data) 1964 + { 1965 + struct vfio_device_set *dev_set = data; 1966 + struct vfio_device *cur; 1967 + 1968 + list_for_each_entry(cur, &dev_set->device_list, dev_set_list) 1969 + if (cur->dev == &pdev->dev) 1970 + return 0; 1971 + return -EBUSY; 1972 + } 1973 + 1974 + /* 1975 + * vfio-core considers a group to be viable and will create a vfio_device even 1976 + * if some devices are bound to drivers like pci-stub or pcieport. Here we 1977 + * require all PCI devices to be inside our dev_set since that ensures they stay 1978 + * put and that every driver controlling the device can co-ordinate with the 1979 + * device reset. 1980 + * 1981 + * Returns the pci_dev to pass to pci_reset_bus() if every PCI device to be 1982 + * reset is inside the dev_set, and pci_reset_bus() can succeed. NULL otherwise. 
1983 + */ 1984 + static struct pci_dev * 1985 + vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set) 1986 + { 1987 + struct pci_dev *pdev; 1988 + 1989 + lockdep_assert_held(&dev_set->lock); 1990 + 1991 + /* 1992 + * By definition all PCI devices in the dev_set share the same PCI 1993 + * reset, so any pci_dev will have the same outcomes for 1994 + * pci_probe_reset_*() and pci_reset_bus(). 1995 + */ 1996 + pdev = list_first_entry(&dev_set->device_list, 1997 + struct vfio_pci_core_device, 1998 + vdev.dev_set_list)->pdev; 1999 + 2000 + /* pci_reset_bus() is supported */ 2001 + if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus)) 2002 + return NULL; 2003 + 2004 + if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set, 2005 + dev_set, 2006 + !pci_probe_reset_slot(pdev->slot))) 2007 + return NULL; 2008 + return pdev; 2009 + } 2010 + 2011 + /* 2012 + * We need to get memory_lock for each device, but devices can share mmap_lock, 2013 + * therefore we need to zap and hold the vma_lock for each device, and only then 2014 + * get each memory_lock. 2015 + */ 2016 + static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, 2017 + struct vfio_pci_group_info *groups) 2018 + { 2019 + struct vfio_pci_core_device *cur_mem; 2020 + struct vfio_pci_core_device *cur_vma; 2021 + struct vfio_pci_core_device *cur; 2022 + struct pci_dev *pdev; 2023 + bool is_mem = true; 2024 + int ret; 2025 + 2026 + mutex_lock(&dev_set->lock); 2027 + cur_mem = list_first_entry(&dev_set->device_list, 2028 + struct vfio_pci_core_device, 2029 + vdev.dev_set_list); 2030 + 2031 + pdev = vfio_pci_dev_set_resettable(dev_set); 2032 + if (!pdev) { 2033 + ret = -EINVAL; 2034 + goto err_unlock; 2035 + } 2036 + 2037 + list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { 2038 + /* 2039 + * Test whether all the affected devices are contained by the 2040 + * set of groups provided by the user. 
2041 + */ 2042 + if (!vfio_dev_in_groups(cur_vma, groups)) { 2043 + ret = -EINVAL; 2044 + goto err_undo; 2045 + } 2046 + 2047 + /* 2048 + * Locking multiple devices is prone to deadlock, runaway and 2049 + * unwind if we hit contention. 2050 + */ 2051 + if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) { 2052 + ret = -EBUSY; 2053 + goto err_undo; 2054 + } 2055 + } 2056 + cur_vma = NULL; 2057 + 2058 + list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) { 2059 + if (!down_write_trylock(&cur_mem->memory_lock)) { 2060 + ret = -EBUSY; 2061 + goto err_undo; 2062 + } 2063 + mutex_unlock(&cur_mem->vma_lock); 2064 + } 2065 + cur_mem = NULL; 2066 + 2067 + ret = pci_reset_bus(pdev); 2068 + 2069 + err_undo: 2070 + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { 2071 + if (cur == cur_mem) 2072 + is_mem = false; 2073 + if (cur == cur_vma) 2074 + break; 2075 + if (is_mem) 2076 + up_write(&cur->memory_lock); 2077 + else 2078 + mutex_unlock(&cur->vma_lock); 2079 + } 2080 + err_unlock: 2081 + mutex_unlock(&dev_set->lock); 2082 + return ret; 2083 + } 2084 + 2085 + static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set) 2086 + { 2087 + struct vfio_pci_core_device *cur; 2088 + bool needs_reset = false; 2089 + 2090 + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { 2091 + /* No VFIO device in the set can have an open device FD */ 2092 + if (cur->vdev.open_count) 2093 + return false; 2094 + needs_reset |= cur->needs_reset; 2095 + } 2096 + return needs_reset; 2097 + } 2098 + 2099 + /* 2100 + * If a bus or slot reset is available for the provided dev_set and: 2101 + * - All of the devices affected by that bus or slot reset are unused 2102 + * - At least one of the affected devices is marked dirty via 2103 + * needs_reset (such as by lack of FLR support) 2104 + * Then attempt to perform that bus or slot reset. 2105 + * Returns true if the dev_set was reset. 
2106 + */ 2107 + static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set) 2108 + { 2109 + struct vfio_pci_core_device *cur; 2110 + struct pci_dev *pdev; 2111 + int ret; 2112 + 2113 + if (!vfio_pci_dev_set_needs_reset(dev_set)) 2114 + return false; 2115 + 2116 + pdev = vfio_pci_dev_set_resettable(dev_set); 2117 + if (!pdev) 2118 + return false; 2119 + 2120 + ret = pci_reset_bus(pdev); 2121 + if (ret) 2122 + return false; 2123 + 2124 + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { 2125 + cur->needs_reset = false; 2126 + if (!disable_idle_d3) 2127 + vfio_pci_set_power_state(cur, PCI_D3hot); 2128 + } 2129 + return true; 2130 + } 2131 + 2132 + void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga, 2133 + bool is_disable_idle_d3) 2134 + { 2135 + nointxmask = is_nointxmask; 2136 + disable_vga = is_disable_vga; 2137 + disable_idle_d3 = is_disable_idle_d3; 2138 + } 2139 + EXPORT_SYMBOL_GPL(vfio_pci_core_set_params); 2140 + 2141 + static void vfio_pci_core_cleanup(void) 2142 + { 2143 + vfio_pci_uninit_perm_bits(); 2144 + } 2145 + 2146 + static int __init vfio_pci_core_init(void) 2147 + { 2148 + /* Allocate shared config space permission data used by all devices */ 2149 + return vfio_pci_init_perm_bits(); 2150 + } 2151 + 2152 + module_init(vfio_pci_core_init); 2153 + module_exit(vfio_pci_core_cleanup); 2154 + 2155 + MODULE_LICENSE("GPL v2"); 2156 + MODULE_AUTHOR(DRIVER_AUTHOR); 2157 + MODULE_DESCRIPTION(DRIVER_DESC);
+12 -11
drivers/vfio/pci/vfio_pci_igd.c
··· 15 15 #include <linux/uaccess.h> 16 16 #include <linux/vfio.h> 17 17 18 - #include "vfio_pci_private.h" 18 + #include <linux/vfio_pci_core.h> 19 19 20 20 #define OPREGION_SIGNATURE "IntelGraphicsMem" 21 21 #define OPREGION_SIZE (8 * 1024) ··· 25 25 #define OPREGION_RVDS 0x3c2 26 26 #define OPREGION_VERSION 0x16 27 27 28 - static size_t vfio_pci_igd_rw(struct vfio_pci_device *vdev, char __user *buf, 29 - size_t count, loff_t *ppos, bool iswrite) 28 + static ssize_t vfio_pci_igd_rw(struct vfio_pci_core_device *vdev, 29 + char __user *buf, size_t count, loff_t *ppos, 30 + bool iswrite) 30 31 { 31 32 unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; 32 33 void *base = vdev->region[i].data; ··· 46 45 return count; 47 46 } 48 47 49 - static void vfio_pci_igd_release(struct vfio_pci_device *vdev, 48 + static void vfio_pci_igd_release(struct vfio_pci_core_device *vdev, 50 49 struct vfio_pci_region *region) 51 50 { 52 51 memunmap(region->data); ··· 57 56 .release = vfio_pci_igd_release, 58 57 }; 59 58 60 - static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev) 59 + static int vfio_pci_igd_opregion_init(struct vfio_pci_core_device *vdev) 61 60 { 62 61 __le32 *dwordp = (__le32 *)(vdev->vconfig + OPREGION_PCI_ADDR); 63 62 u32 addr, size; ··· 161 160 return ret; 162 161 } 163 162 164 - static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, 165 - char __user *buf, size_t count, loff_t *ppos, 166 - bool iswrite) 163 + static ssize_t vfio_pci_igd_cfg_rw(struct vfio_pci_core_device *vdev, 164 + char __user *buf, size_t count, loff_t *ppos, 165 + bool iswrite) 167 166 { 168 167 unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; 169 168 struct pci_dev *pdev = vdev->region[i].data; ··· 254 253 return count; 255 254 } 256 255 257 - static void vfio_pci_igd_cfg_release(struct vfio_pci_device *vdev, 256 + static void vfio_pci_igd_cfg_release(struct vfio_pci_core_device *vdev, 258 257 struct vfio_pci_region *region) 
259 258 { 260 259 struct pci_dev *pdev = region->data; ··· 267 266 .release = vfio_pci_igd_cfg_release, 268 267 }; 269 268 270 - static int vfio_pci_igd_cfg_init(struct vfio_pci_device *vdev) 269 + static int vfio_pci_igd_cfg_init(struct vfio_pci_core_device *vdev) 271 270 { 272 271 struct pci_dev *host_bridge, *lpc_bridge; 273 272 int ret; ··· 315 314 return 0; 316 315 } 317 316 318 - int vfio_pci_igd_init(struct vfio_pci_device *vdev) 317 + int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) 319 318 { 320 319 int ret; 321 320
+21 -21
drivers/vfio/pci/vfio_pci_intrs.c
··· 20 20 #include <linux/wait.h> 21 21 #include <linux/slab.h> 22 22 23 - #include "vfio_pci_private.h" 23 + #include <linux/vfio_pci_core.h> 24 24 25 25 /* 26 26 * INTx 27 27 */ 28 28 static void vfio_send_intx_eventfd(void *opaque, void *unused) 29 29 { 30 - struct vfio_pci_device *vdev = opaque; 30 + struct vfio_pci_core_device *vdev = opaque; 31 31 32 32 if (likely(is_intx(vdev) && !vdev->virq_disabled)) 33 33 eventfd_signal(vdev->ctx[0].trigger, 1); 34 34 } 35 35 36 - void vfio_pci_intx_mask(struct vfio_pci_device *vdev) 36 + void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) 37 37 { 38 38 struct pci_dev *pdev = vdev->pdev; 39 39 unsigned long flags; ··· 73 73 */ 74 74 static int vfio_pci_intx_unmask_handler(void *opaque, void *unused) 75 75 { 76 - struct vfio_pci_device *vdev = opaque; 76 + struct vfio_pci_core_device *vdev = opaque; 77 77 struct pci_dev *pdev = vdev->pdev; 78 78 unsigned long flags; 79 79 int ret = 0; ··· 107 107 return ret; 108 108 } 109 109 110 - void vfio_pci_intx_unmask(struct vfio_pci_device *vdev) 110 + void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) 111 111 { 112 112 if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0) 113 113 vfio_send_intx_eventfd(vdev, NULL); ··· 115 115 116 116 static irqreturn_t vfio_intx_handler(int irq, void *dev_id) 117 117 { 118 - struct vfio_pci_device *vdev = dev_id; 118 + struct vfio_pci_core_device *vdev = dev_id; 119 119 unsigned long flags; 120 120 int ret = IRQ_NONE; 121 121 ··· 139 139 return ret; 140 140 } 141 141 142 - static int vfio_intx_enable(struct vfio_pci_device *vdev) 142 + static int vfio_intx_enable(struct vfio_pci_core_device *vdev) 143 143 { 144 144 if (!is_irq_none(vdev)) 145 145 return -EINVAL; ··· 168 168 return 0; 169 169 } 170 170 171 - static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd) 171 + static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd) 172 172 { 173 173 struct pci_dev *pdev = vdev->pdev; 174 174 unsigned long 
irqflags = IRQF_SHARED; ··· 223 223 return 0; 224 224 } 225 225 226 - static void vfio_intx_disable(struct vfio_pci_device *vdev) 226 + static void vfio_intx_disable(struct vfio_pci_core_device *vdev) 227 227 { 228 228 vfio_virqfd_disable(&vdev->ctx[0].unmask); 229 229 vfio_virqfd_disable(&vdev->ctx[0].mask); ··· 244 244 return IRQ_HANDLED; 245 245 } 246 246 247 - static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix) 247 + static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msix) 248 248 { 249 249 struct pci_dev *pdev = vdev->pdev; 250 250 unsigned int flag = msix ? PCI_IRQ_MSIX : PCI_IRQ_MSI; ··· 285 285 return 0; 286 286 } 287 287 288 - static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, 288 + static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, 289 289 int vector, int fd, bool msix) 290 290 { 291 291 struct pci_dev *pdev = vdev->pdev; ··· 364 364 return 0; 365 365 } 366 366 367 - static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start, 367 + static int vfio_msi_set_block(struct vfio_pci_core_device *vdev, unsigned start, 368 368 unsigned count, int32_t *fds, bool msix) 369 369 { 370 370 int i, j, ret = 0; ··· 385 385 return ret; 386 386 } 387 387 388 - static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix) 388 + static void vfio_msi_disable(struct vfio_pci_core_device *vdev, bool msix) 389 389 { 390 390 struct pci_dev *pdev = vdev->pdev; 391 391 int i; ··· 417 417 /* 418 418 * IOCTL support 419 419 */ 420 - static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev, 420 + static int vfio_pci_set_intx_unmask(struct vfio_pci_core_device *vdev, 421 421 unsigned index, unsigned start, 422 422 unsigned count, uint32_t flags, void *data) 423 423 { ··· 444 444 return 0; 445 445 } 446 446 447 - static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev, 447 + static int vfio_pci_set_intx_mask(struct vfio_pci_core_device *vdev, 448 448 unsigned 
index, unsigned start, 449 449 unsigned count, uint32_t flags, void *data) 450 450 { ··· 464 464 return 0; 465 465 } 466 466 467 - static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev, 467 + static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev, 468 468 unsigned index, unsigned start, 469 469 unsigned count, uint32_t flags, void *data) 470 470 { ··· 507 507 return 0; 508 508 } 509 509 510 - static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev, 510 + static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, 511 511 unsigned index, unsigned start, 512 512 unsigned count, uint32_t flags, void *data) 513 513 { ··· 613 613 return -EINVAL; 614 614 } 615 615 616 - static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, 616 + static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev, 617 617 unsigned index, unsigned start, 618 618 unsigned count, uint32_t flags, void *data) 619 619 { ··· 624 624 count, flags, data); 625 625 } 626 626 627 - static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, 627 + static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev, 628 628 unsigned index, unsigned start, 629 629 unsigned count, uint32_t flags, void *data) 630 630 { ··· 635 635 count, flags, data); 636 636 } 637 637 638 - int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, 638 + int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, 639 639 unsigned index, unsigned start, unsigned count, 640 640 void *data) 641 641 { 642 - int (*func)(struct vfio_pci_device *vdev, unsigned index, 642 + int (*func)(struct vfio_pci_core_device *vdev, unsigned index, 643 643 unsigned start, unsigned count, uint32_t flags, 644 644 void *data) = NULL; 645 645
+60 -36
drivers/vfio/pci/vfio_pci_private.h include/linux/vfio_pci_core.h
··· 10 10 11 11 #include <linux/mutex.h> 12 12 #include <linux/pci.h> 13 + #include <linux/vfio.h> 13 14 #include <linux/irqbypass.h> 14 15 #include <linux/types.h> 15 16 #include <linux/uuid.h> 16 17 #include <linux/notifier.h> 17 18 18 - #ifndef VFIO_PCI_PRIVATE_H 19 - #define VFIO_PCI_PRIVATE_H 19 + #ifndef VFIO_PCI_CORE_H 20 + #define VFIO_PCI_CORE_H 20 21 21 22 #define VFIO_PCI_OFFSET_SHIFT 40 22 23 ··· 34 33 35 34 struct vfio_pci_ioeventfd { 36 35 struct list_head next; 37 - struct vfio_pci_device *vdev; 36 + struct vfio_pci_core_device *vdev; 38 37 struct virqfd *virqfd; 39 38 void __iomem *addr; 40 39 uint64_t data; ··· 53 52 struct irq_bypass_producer producer; 54 53 }; 55 54 56 - struct vfio_pci_device; 55 + struct vfio_pci_core_device; 57 56 struct vfio_pci_region; 58 57 59 58 struct vfio_pci_regops { 60 - size_t (*rw)(struct vfio_pci_device *vdev, char __user *buf, 59 + ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, 61 60 size_t count, loff_t *ppos, bool iswrite); 62 - void (*release)(struct vfio_pci_device *vdev, 61 + void (*release)(struct vfio_pci_core_device *vdev, 63 62 struct vfio_pci_region *region); 64 - int (*mmap)(struct vfio_pci_device *vdev, 63 + int (*mmap)(struct vfio_pci_core_device *vdev, 65 64 struct vfio_pci_region *region, 66 65 struct vm_area_struct *vma); 67 - int (*add_capability)(struct vfio_pci_device *vdev, 66 + int (*add_capability)(struct vfio_pci_core_device *vdev, 68 67 struct vfio_pci_region *region, 69 68 struct vfio_info_cap *caps); 70 69 }; ··· 84 83 struct list_head res_next; 85 84 }; 86 85 87 - struct vfio_pci_reflck { 88 - struct kref kref; 89 - struct mutex lock; 90 - }; 91 - 92 86 struct vfio_pci_vf_token { 93 87 struct mutex lock; 94 88 uuid_t uuid; ··· 95 99 struct list_head vma_next; 96 100 }; 97 101 98 - struct vfio_pci_device { 102 + struct vfio_pci_core_device { 99 103 struct vfio_device vdev; 100 104 struct pci_dev *pdev; 101 105 void __iomem *barmap[PCI_STD_NUM_BARS]; ··· 126 130 bool 
needs_pm_restore; 127 131 struct pci_saved_state *pci_saved_state; 128 132 struct pci_saved_state *pm_save; 129 - struct vfio_pci_reflck *reflck; 130 - int refcnt; 131 133 int ioeventfds_nr; 132 134 struct eventfd_ctx *err_trigger; 133 135 struct eventfd_ctx *req_trigger; ··· 145 151 #define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) 146 152 #define irq_is(vdev, type) (vdev->irq_type == type) 147 153 148 - extern void vfio_pci_intx_mask(struct vfio_pci_device *vdev); 149 - extern void vfio_pci_intx_unmask(struct vfio_pci_device *vdev); 154 + extern void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); 155 + extern void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); 150 156 151 - extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, 157 + extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, 152 158 uint32_t flags, unsigned index, 153 159 unsigned start, unsigned count, void *data); 154 160 155 - extern ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, 161 + extern ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, 156 162 char __user *buf, size_t count, 157 163 loff_t *ppos, bool iswrite); 158 164 159 - extern ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, 165 + extern ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, 160 166 size_t count, loff_t *ppos, bool iswrite); 161 167 162 - extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, 168 + extern ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, 163 169 size_t count, loff_t *ppos, bool iswrite); 164 170 165 - extern long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, 171 + extern long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, 166 172 uint64_t data, int count, int fd); 167 173 168 174 extern int vfio_pci_init_perm_bits(void); 169 175 extern void vfio_pci_uninit_perm_bits(void); 170 
176 171 - extern int vfio_config_init(struct vfio_pci_device *vdev); 172 - extern void vfio_config_free(struct vfio_pci_device *vdev); 177 + extern int vfio_config_init(struct vfio_pci_core_device *vdev); 178 + extern void vfio_config_free(struct vfio_pci_core_device *vdev); 173 179 174 - extern int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, 180 + extern int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, 175 181 unsigned int type, unsigned int subtype, 176 182 const struct vfio_pci_regops *ops, 177 183 size_t size, u32 flags, void *data); 178 184 179 - extern int vfio_pci_set_power_state(struct vfio_pci_device *vdev, 185 + extern int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, 180 186 pci_power_t state); 181 187 182 - extern bool __vfio_pci_memory_enabled(struct vfio_pci_device *vdev); 183 - extern void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_device 188 + extern bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); 189 + extern void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device 184 190 *vdev); 185 - extern u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev); 186 - extern void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, 191 + extern u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); 192 + extern void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, 187 193 u16 cmd); 188 194 189 195 #ifdef CONFIG_VFIO_PCI_IGD 190 - extern int vfio_pci_igd_init(struct vfio_pci_device *vdev); 196 + extern int vfio_pci_igd_init(struct vfio_pci_core_device *vdev); 191 197 #else 192 - static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev) 198 + static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) 193 199 { 194 200 return -ENODEV; 195 201 } 196 202 #endif 197 203 198 204 #ifdef CONFIG_S390 199 - extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, 205 + extern int 
vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, 200 206 struct vfio_info_cap *caps); 201 207 #else 202 - static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, 208 + static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, 203 209 struct vfio_info_cap *caps) 204 210 { 205 211 return -ENODEV; 206 212 } 207 213 #endif 208 214 209 - #endif /* VFIO_PCI_PRIVATE_H */ 215 + /* Will be exported for vfio pci drivers usage */ 216 + void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, 217 + bool is_disable_idle_d3); 218 + void vfio_pci_core_close_device(struct vfio_device *core_vdev); 219 + void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, 220 + struct pci_dev *pdev, 221 + const struct vfio_device_ops *vfio_pci_ops); 222 + int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); 223 + void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); 224 + void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev); 225 + int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn); 226 + extern const struct pci_error_handlers vfio_pci_core_err_handlers; 227 + long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, 228 + unsigned long arg); 229 + ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, 230 + size_t count, loff_t *ppos); 231 + ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, 232 + size_t count, loff_t *ppos); 233 + int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma); 234 + void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count); 235 + int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); 236 + int vfio_pci_core_enable(struct vfio_pci_core_device *vdev); 237 + void vfio_pci_core_disable(struct vfio_pci_core_device *vdev); 238 + void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); 239 + 
240 + static inline bool vfio_pci_is_vga(struct pci_dev *pdev) 241 + { 242 + return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; 243 + } 244 + 245 + #endif /* VFIO_PCI_CORE_H */
+9 -9
drivers/vfio/pci/vfio_pci_rdwr.c
··· 17 17 #include <linux/vfio.h> 18 18 #include <linux/vgaarb.h> 19 19 20 - #include "vfio_pci_private.h" 20 + #include <linux/vfio_pci_core.h> 21 21 22 22 #ifdef __LITTLE_ENDIAN 23 23 #define vfio_ioread64 ioread64 ··· 38 38 #define vfio_iowrite8 iowrite8 39 39 40 40 #define VFIO_IOWRITE(size) \ 41 - static int vfio_pci_iowrite##size(struct vfio_pci_device *vdev, \ 41 + static int vfio_pci_iowrite##size(struct vfio_pci_core_device *vdev, \ 42 42 bool test_mem, u##size val, void __iomem *io) \ 43 43 { \ 44 44 if (test_mem) { \ ··· 65 65 #endif 66 66 67 67 #define VFIO_IOREAD(size) \ 68 - static int vfio_pci_ioread##size(struct vfio_pci_device *vdev, \ 68 + static int vfio_pci_ioread##size(struct vfio_pci_core_device *vdev, \ 69 69 bool test_mem, u##size *val, void __iomem *io) \ 70 70 { \ 71 71 if (test_mem) { \ ··· 94 94 * reads with -1. This is intended for handling MSI-X vector tables and 95 95 * leftover space for ROM BARs. 96 96 */ 97 - static ssize_t do_io_rw(struct vfio_pci_device *vdev, bool test_mem, 97 + static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, 98 98 void __iomem *io, char __user *buf, 99 99 loff_t off, size_t count, size_t x_start, 100 100 size_t x_end, bool iswrite) ··· 200 200 return done; 201 201 } 202 202 203 - static int vfio_pci_setup_barmap(struct vfio_pci_device *vdev, int bar) 203 + static int vfio_pci_setup_barmap(struct vfio_pci_core_device *vdev, int bar) 204 204 { 205 205 struct pci_dev *pdev = vdev->pdev; 206 206 int ret; ··· 224 224 return 0; 225 225 } 226 226 227 - ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, 227 + ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, 228 228 size_t count, loff_t *ppos, bool iswrite) 229 229 { 230 230 struct pci_dev *pdev = vdev->pdev; ··· 288 288 return done; 289 289 } 290 290 291 - ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, 291 + ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char 
__user *buf, 292 292 size_t count, loff_t *ppos, bool iswrite) 293 293 { 294 294 int ret; ··· 384 384 static int vfio_pci_ioeventfd_handler(void *opaque, void *unused) 385 385 { 386 386 struct vfio_pci_ioeventfd *ioeventfd = opaque; 387 - struct vfio_pci_device *vdev = ioeventfd->vdev; 387 + struct vfio_pci_core_device *vdev = ioeventfd->vdev; 388 388 389 389 if (ioeventfd->test_mem) { 390 390 if (!down_read_trylock(&vdev->memory_lock)) ··· 410 410 vfio_pci_ioeventfd_do_write(ioeventfd, ioeventfd->test_mem); 411 411 } 412 412 413 - long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, 413 + long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, 414 414 uint64_t data, int count, int fd) 415 415 { 416 416 struct pci_dev *pdev = vdev->pdev;
+3 -8
drivers/vfio/pci/vfio_pci_zdev.c
··· 1 - // SPDX-License-Identifier: GPL-2.0+ 1 + // SPDX-License-Identifier: GPL-2.0-only 2 2 /* 3 3 * VFIO ZPCI devices support 4 4 * 5 5 * Copyright (C) IBM Corp. 2020. All rights reserved. 6 6 * Author(s): Pierre Morel <pmorel@linux.ibm.com> 7 7 * Matthew Rosato <mjrosato@linux.ibm.com> 8 - * 9 - * This program is free software; you can redistribute it and/or modify 10 - * it under the terms of the GNU General Public License version 2 as 11 - * published by the Free Software Foundation. 12 - * 13 8 */ 14 9 #include <linux/io.h> 15 10 #include <linux/pci.h> ··· 14 19 #include <asm/pci_clp.h> 15 20 #include <asm/pci_io.h> 16 21 17 - #include "vfio_pci_private.h" 22 + #include <linux/vfio_pci_core.h> 18 23 19 24 /* 20 25 * Add the Base PCI Function information to the device info region. ··· 109 114 /* 110 115 * Add all supported capabilities to the VFIO_DEVICE_GET_INFO capability chain. 111 116 */ 112 - int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, 117 + int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, 113 118 struct vfio_info_cap *caps) 114 119 { 115 120 struct zpci_dev *zdev = to_zpci(vdev->pdev);
+4 -2
drivers/vfio/platform/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config VFIO_PLATFORM 3 3 tristate "VFIO support for platform devices" 4 - depends on VFIO && EVENTFD && (ARM || ARM64 || COMPILE_TEST) 4 + depends on ARM || ARM64 || COMPILE_TEST 5 5 select VFIO_VIRQFD 6 6 help 7 7 Support for platform devices with VFIO. This is required to make ··· 10 10 11 11 If you don't know what to do here, say N. 12 12 13 + if VFIO_PLATFORM 13 14 config VFIO_AMBA 14 15 tristate "VFIO support for AMBA devices" 15 - depends on VFIO_PLATFORM && (ARM_AMBA || COMPILE_TEST) 16 + depends on ARM_AMBA || COMPILE_TEST 16 17 help 17 18 Support for ARM AMBA devices with VFIO. This is required to make 18 19 use of ARM AMBA devices present on the system using the VFIO ··· 22 21 If you don't know what to do here, say N. 23 22 24 23 source "drivers/vfio/platform/reset/Kconfig" 24 + endif
+1 -3
drivers/vfio/platform/reset/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config VFIO_PLATFORM_CALXEDAXGMAC_RESET 3 3 tristate "VFIO support for calxeda xgmac reset" 4 - depends on VFIO_PLATFORM 5 4 help 6 5 Enables the VFIO platform driver to handle reset for Calxeda xgmac 7 6 ··· 8 9 9 10 config VFIO_PLATFORM_AMDXGBE_RESET 10 11 tristate "VFIO support for AMD XGBE reset" 11 - depends on VFIO_PLATFORM 12 12 help 13 13 Enables the VFIO platform driver to handle reset for AMD XGBE 14 14 ··· 15 17 16 18 config VFIO_PLATFORM_BCMFLEXRM_RESET 17 19 tristate "VFIO support for Broadcom FlexRM reset" 18 - depends on VFIO_PLATFORM && (ARCH_BCM_IPROC || COMPILE_TEST) 20 + depends on ARCH_BCM_IPROC || COMPILE_TEST 19 21 default ARCH_BCM_IPROC 20 22 help 21 23 Enables the VFIO platform driver to handle reset for Broadcom FlexRM
+1 -9
drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 1 2 /* 2 3 * Copyright (C) 2017 Broadcom 3 - * 4 - * This program is free software; you can redistribute it and/or 5 - * modify it under the terms of the GNU General Public License as 6 - * published by the Free Software Foundation version 2. 7 - * 8 - * This program is distributed "as is" WITHOUT ANY WARRANTY of any 9 - * kind, whether express or implied; without even the implied warranty 10 - * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 - * GNU General Public License for more details. 12 4 */ 13 5 14 6 /*
+45 -57
drivers/vfio/platform/vfio_platform_common.c
··· 218 218 return -EINVAL; 219 219 } 220 220 221 - static void vfio_platform_release(struct vfio_device *core_vdev) 221 + static void vfio_platform_close_device(struct vfio_device *core_vdev) 222 222 { 223 223 struct vfio_platform_device *vdev = 224 224 container_of(core_vdev, struct vfio_platform_device, vdev); 225 - 226 - mutex_lock(&driver_lock); 227 - 228 - if (!(--vdev->refcnt)) { 229 - const char *extra_dbg = NULL; 230 - int ret; 231 - 232 - ret = vfio_platform_call_reset(vdev, &extra_dbg); 233 - if (ret && vdev->reset_required) { 234 - dev_warn(vdev->device, "reset driver is required and reset call failed in release (%d) %s\n", 235 - ret, extra_dbg ? extra_dbg : ""); 236 - WARN_ON(1); 237 - } 238 - pm_runtime_put(vdev->device); 239 - vfio_platform_regions_cleanup(vdev); 240 - vfio_platform_irq_cleanup(vdev); 241 - } 242 - 243 - mutex_unlock(&driver_lock); 244 - } 245 - 246 - static int vfio_platform_open(struct vfio_device *core_vdev) 247 - { 248 - struct vfio_platform_device *vdev = 249 - container_of(core_vdev, struct vfio_platform_device, vdev); 225 + const char *extra_dbg = NULL; 250 226 int ret; 251 227 252 - mutex_lock(&driver_lock); 253 - 254 - if (!vdev->refcnt) { 255 - const char *extra_dbg = NULL; 256 - 257 - ret = vfio_platform_regions_init(vdev); 258 - if (ret) 259 - goto err_reg; 260 - 261 - ret = vfio_platform_irq_init(vdev); 262 - if (ret) 263 - goto err_irq; 264 - 265 - ret = pm_runtime_get_sync(vdev->device); 266 - if (ret < 0) 267 - goto err_rst; 268 - 269 - ret = vfio_platform_call_reset(vdev, &extra_dbg); 270 - if (ret && vdev->reset_required) { 271 - dev_warn(vdev->device, "reset driver is required and reset call failed in open (%d) %s\n", 272 - ret, extra_dbg ? 
extra_dbg : ""); 273 - goto err_rst; 274 - } 228 + ret = vfio_platform_call_reset(vdev, &extra_dbg); 229 + if (WARN_ON(ret && vdev->reset_required)) { 230 + dev_warn( 231 + vdev->device, 232 + "reset driver is required and reset call failed in release (%d) %s\n", 233 + ret, extra_dbg ? extra_dbg : ""); 275 234 } 235 + pm_runtime_put(vdev->device); 236 + vfio_platform_regions_cleanup(vdev); 237 + vfio_platform_irq_cleanup(vdev); 238 + } 276 239 277 - vdev->refcnt++; 240 + static int vfio_platform_open_device(struct vfio_device *core_vdev) 241 + { 242 + struct vfio_platform_device *vdev = 243 + container_of(core_vdev, struct vfio_platform_device, vdev); 244 + const char *extra_dbg = NULL; 245 + int ret; 278 246 279 - mutex_unlock(&driver_lock); 247 + ret = vfio_platform_regions_init(vdev); 248 + if (ret) 249 + return ret; 250 + 251 + ret = vfio_platform_irq_init(vdev); 252 + if (ret) 253 + goto err_irq; 254 + 255 + ret = pm_runtime_get_sync(vdev->device); 256 + if (ret < 0) 257 + goto err_rst; 258 + 259 + ret = vfio_platform_call_reset(vdev, &extra_dbg); 260 + if (ret && vdev->reset_required) { 261 + dev_warn( 262 + vdev->device, 263 + "reset driver is required and reset call failed in open (%d) %s\n", 264 + ret, extra_dbg ? 
extra_dbg : ""); 265 + goto err_rst; 266 + } 280 267 return 0; 281 268 282 269 err_rst: ··· 271 284 vfio_platform_irq_cleanup(vdev); 272 285 err_irq: 273 286 vfio_platform_regions_cleanup(vdev); 274 - err_reg: 275 - mutex_unlock(&driver_lock); 276 287 return ret; 277 288 } 278 289 ··· 601 616 602 617 static const struct vfio_device_ops vfio_platform_ops = { 603 618 .name = "vfio-platform", 604 - .open = vfio_platform_open, 605 - .release = vfio_platform_release, 619 + .open_device = vfio_platform_open_device, 620 + .close_device = vfio_platform_close_device, 606 621 .ioctl = vfio_platform_ioctl, 607 622 .read = vfio_platform_read, 608 623 .write = vfio_platform_write, ··· 652 667 ret = vfio_platform_of_probe(vdev, dev); 653 668 654 669 if (ret) 655 - return ret; 670 + goto out_uninit; 656 671 657 672 vdev->device = dev; 658 673 ··· 660 675 if (ret && vdev->reset_required) { 661 676 dev_err(dev, "No reset function found for device %s\n", 662 677 vdev->name); 663 - return ret; 678 + goto out_uninit; 664 679 } 665 680 666 681 group = vfio_iommu_group_get(dev); ··· 683 698 vfio_iommu_group_put(group, dev); 684 699 put_reset: 685 700 vfio_platform_put_reset(vdev); 701 + out_uninit: 702 + vfio_uninit_group_dev(&vdev->vdev); 686 703 return ret; 687 704 } 688 705 EXPORT_SYMBOL_GPL(vfio_platform_probe_common); ··· 695 708 696 709 pm_runtime_disable(vdev->device); 697 710 vfio_platform_put_reset(vdev); 711 + vfio_uninit_group_dev(&vdev->vdev); 698 712 vfio_iommu_group_put(vdev->vdev.dev->iommu_group, vdev->vdev.dev); 699 713 } 700 714 EXPORT_SYMBOL_GPL(vfio_platform_remove_common);
-1
drivers/vfio/platform/vfio_platform_private.h
··· 48 48 u32 num_regions; 49 49 struct vfio_platform_irq *irqs; 50 50 u32 num_irqs; 51 - int refcnt; 52 51 struct mutex igate; 53 52 const char *compat; 54 53 const char *acpihid;
+120 -22
drivers/vfio/vfio.c
··· 96 96 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)"); 97 97 #endif 98 98 99 + static DEFINE_XARRAY(vfio_device_set_xa); 100 + 101 + int vfio_assign_device_set(struct vfio_device *device, void *set_id) 102 + { 103 + unsigned long idx = (unsigned long)set_id; 104 + struct vfio_device_set *new_dev_set; 105 + struct vfio_device_set *dev_set; 106 + 107 + if (WARN_ON(!set_id)) 108 + return -EINVAL; 109 + 110 + /* 111 + * Atomically acquire a singleton object in the xarray for this set_id 112 + */ 113 + xa_lock(&vfio_device_set_xa); 114 + dev_set = xa_load(&vfio_device_set_xa, idx); 115 + if (dev_set) 116 + goto found_get_ref; 117 + xa_unlock(&vfio_device_set_xa); 118 + 119 + new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL); 120 + if (!new_dev_set) 121 + return -ENOMEM; 122 + mutex_init(&new_dev_set->lock); 123 + INIT_LIST_HEAD(&new_dev_set->device_list); 124 + new_dev_set->set_id = set_id; 125 + 126 + xa_lock(&vfio_device_set_xa); 127 + dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set, 128 + GFP_KERNEL); 129 + if (!dev_set) { 130 + dev_set = new_dev_set; 131 + goto found_get_ref; 132 + } 133 + 134 + kfree(new_dev_set); 135 + if (xa_is_err(dev_set)) { 136 + xa_unlock(&vfio_device_set_xa); 137 + return xa_err(dev_set); 138 + } 139 + 140 + found_get_ref: 141 + dev_set->device_count++; 142 + xa_unlock(&vfio_device_set_xa); 143 + mutex_lock(&dev_set->lock); 144 + device->dev_set = dev_set; 145 + list_add_tail(&device->dev_set_list, &dev_set->device_list); 146 + mutex_unlock(&dev_set->lock); 147 + return 0; 148 + } 149 + EXPORT_SYMBOL_GPL(vfio_assign_device_set); 150 + 151 + static void vfio_release_device_set(struct vfio_device *device) 152 + { 153 + struct 
vfio_device_set *dev_set = device->dev_set; 154 + 155 + if (!dev_set) 156 + return; 157 + 158 + mutex_lock(&dev_set->lock); 159 + list_del(&device->dev_set_list); 160 + mutex_unlock(&dev_set->lock); 161 + 162 + xa_lock(&vfio_device_set_xa); 163 + if (!--dev_set->device_count) { 164 + __xa_erase(&vfio_device_set_xa, 165 + (unsigned long)dev_set->set_id); 166 + mutex_destroy(&dev_set->lock); 167 + kfree(dev_set); 168 + } 169 + xa_unlock(&vfio_device_set_xa); 170 + } 171 + 99 172 /* 100 173 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe 101 174 * and remove functions, any use cases other than acquiring the first ··· 822 749 } 823 750 EXPORT_SYMBOL_GPL(vfio_init_group_dev); 824 751 752 + void vfio_uninit_group_dev(struct vfio_device *device) 753 + { 754 + vfio_release_device_set(device); 755 + } 756 + EXPORT_SYMBOL_GPL(vfio_uninit_group_dev); 757 + 825 758 int vfio_register_group_dev(struct vfio_device *device) 826 759 { 827 760 struct vfio_device *existing_device; 828 761 struct iommu_group *iommu_group; 829 762 struct vfio_group *group; 763 + 764 + /* 765 + * If the driver doesn't specify a set then the device is added to a 766 + * singleton set just for itself. 
767 + */ 768 + if (!device->dev_set) 769 + vfio_assign_device_set(device, device); 830 770 831 771 iommu_group = iommu_group_get(device->dev); 832 772 if (!iommu_group) ··· 1442 1356 { 1443 1357 struct vfio_device *device; 1444 1358 struct file *filep; 1445 - int ret; 1359 + int fdno; 1360 + int ret = 0; 1446 1361 1447 1362 if (0 == atomic_read(&group->container_users) || 1448 1363 !group->container->iommu_driver || !vfio_group_viable(group)) ··· 1457 1370 return PTR_ERR(device); 1458 1371 1459 1372 if (!try_module_get(device->dev->driver->owner)) { 1460 - vfio_device_put(device); 1461 - return -ENODEV; 1373 + ret = -ENODEV; 1374 + goto err_device_put; 1462 1375 } 1463 1376 1464 - ret = device->ops->open(device); 1465 - if (ret) { 1466 - module_put(device->dev->driver->owner); 1467 - vfio_device_put(device); 1468 - return ret; 1377 + mutex_lock(&device->dev_set->lock); 1378 + device->open_count++; 1379 + if (device->open_count == 1 && device->ops->open_device) { 1380 + ret = device->ops->open_device(device); 1381 + if (ret) 1382 + goto err_undo_count; 1469 1383 } 1384 + mutex_unlock(&device->dev_set->lock); 1470 1385 1471 1386 /* 1472 1387 * We can't use anon_inode_getfd() because we need to modify 1473 1388 * the f_mode flags directly to allow more than just ioctls 1474 1389 */ 1475 - ret = get_unused_fd_flags(O_CLOEXEC); 1476 - if (ret < 0) { 1477 - device->ops->release(device); 1478 - module_put(device->dev->driver->owner); 1479 - vfio_device_put(device); 1480 - return ret; 1481 - } 1390 + fdno = ret = get_unused_fd_flags(O_CLOEXEC); 1391 + if (ret < 0) 1392 + goto err_close_device; 1482 1393 1483 1394 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, 1484 1395 device, O_RDWR); 1485 1396 if (IS_ERR(filep)) { 1486 - put_unused_fd(ret); 1487 1397 ret = PTR_ERR(filep); 1488 - device->ops->release(device); 1489 - module_put(device->dev->driver->owner); 1490 - vfio_device_put(device); 1491 - return ret; 1398 + goto err_fd; 1492 1399 } 1493 1400 1494 1401 
/* ··· 1494 1413 1495 1414 atomic_inc(&group->container_users); 1496 1415 1497 - fd_install(ret, filep); 1416 + fd_install(fdno, filep); 1498 1417 1499 1418 if (group->noiommu) 1500 1419 dev_warn(device->dev, "vfio-noiommu device opened by user " 1501 1420 "(%s:%d)\n", current->comm, task_pid_nr(current)); 1421 + return fdno; 1502 1422 1423 + err_fd: 1424 + put_unused_fd(fdno); 1425 + err_close_device: 1426 + mutex_lock(&device->dev_set->lock); 1427 + if (device->open_count == 1 && device->ops->close_device) 1428 + device->ops->close_device(device); 1429 + err_undo_count: 1430 + device->open_count--; 1431 + mutex_unlock(&device->dev_set->lock); 1432 + module_put(device->dev->driver->owner); 1433 + err_device_put: 1434 + vfio_device_put(device); 1503 1435 return ret; 1504 1436 } 1505 1437 ··· 1650 1556 { 1651 1557 struct vfio_device *device = filep->private_data; 1652 1558 1653 - device->ops->release(device); 1559 + mutex_lock(&device->dev_set->lock); 1560 + if (!--device->open_count && device->ops->close_device) 1561 + device->ops->close_device(device); 1562 + mutex_unlock(&device->dev_set->lock); 1654 1563 1655 1564 module_put(device->dev->driver->owner); 1656 1565 ··· 2456 2359 class_destroy(vfio.class); 2457 2360 vfio.class = NULL; 2458 2361 misc_deregister(&vfio_dev); 2362 + xa_destroy(&vfio_device_set_xa); 2459 2363 } 2460 2364 2461 2365 module_init(vfio_init);
+4 -4
drivers/vfio/vfio_iommu_type1.c
··· 612 612 static int vfio_find_dma_valid(struct vfio_iommu *iommu, dma_addr_t start, 613 613 size_t size, struct vfio_dma **dma_p) 614 614 { 615 - int ret; 615 + int ret = 0; 616 616 617 617 do { 618 618 *dma_p = vfio_find_dma(iommu, start, size); 619 619 if (!*dma_p) 620 - ret = -EINVAL; 620 + return -EINVAL; 621 621 else if (!(*dma_p)->vaddr_invalid) 622 - ret = 0; 622 + return ret; 623 623 else 624 624 ret = vfio_wait(iommu); 625 - } while (ret > 0); 625 + } while (ret == WAITED); 626 626 627 627 return ret; 628 628 }
+2 -7
include/linux/mdev.h
··· 72 72 * @mdev: mdev_device device structure which is being 73 73 * destroyed 74 74 * Returns integer: success (0) or error (< 0) 75 - * @open: Open mediated device. 76 - * @mdev: mediated device. 77 - * Returns integer: success (0) or error (< 0) 78 - * @release: release mediated device 79 - * @mdev: mediated device. 80 75 * @read: Read emulation callback 81 76 * @mdev: mediated device structure 82 77 * @buf: read buffer ··· 106 111 107 112 int (*create)(struct mdev_device *mdev); 108 113 int (*remove)(struct mdev_device *mdev); 109 - int (*open)(struct mdev_device *mdev); 110 - void (*release)(struct mdev_device *mdev); 114 + int (*open_device)(struct mdev_device *mdev); 115 + void (*close_device)(struct mdev_device *mdev); 111 116 ssize_t (*read)(struct mdev_device *mdev, char __user *buf, 112 117 size_t count, loff_t *ppos); 113 118 ssize_t (*write)(struct mdev_device *mdev, const char __user *buf,
+6
include/linux/mod_devicetable.h
··· 16 16 17 17 #define PCI_ANY_ID (~0) 18 18 19 + enum { 20 + PCI_ID_F_VFIO_DRIVER_OVERRIDE = 1, 21 + }; 22 + 19 23 /** 20 24 * struct pci_device_id - PCI device ID structure 21 25 * @vendor: Vendor ID to match (or PCI_ANY_ID) ··· 38 34 * Best practice is to use driver_data as an index 39 35 * into a static list of equivalent device types, 40 36 * instead of using it as a pointer. 37 + * @override_only: Match only when dev->driver_override is this driver. 41 38 */ 42 39 struct pci_device_id { 43 40 __u32 vendor, device; /* Vendor and device ID or PCI_ANY_ID*/ 44 41 __u32 subvendor, subdevice; /* Subsystem ID's or PCI_ANY_ID */ 45 42 __u32 class, class_mask; /* (class,subclass,prog-if) triplet */ 46 43 kernel_ulong_t driver_data; /* Data private to the driver */ 44 + __u32 override_only; 47 45 }; 48 46 49 47
+29
include/linux/pci.h
··· 902 902 .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID 903 903 904 904 /** 905 + * PCI_DEVICE_DRIVER_OVERRIDE - macro used to describe a PCI device with 906 + * override_only flags. 907 + * @vend: the 16 bit PCI Vendor ID 908 + * @dev: the 16 bit PCI Device ID 909 + * @driver_override: the 32 bit PCI Device override_only 910 + * 911 + * This macro is used to create a struct pci_device_id that matches only a 912 + * driver_override device. The subvendor and subdevice fields will be set to 913 + * PCI_ANY_ID. 914 + */ 915 + #define PCI_DEVICE_DRIVER_OVERRIDE(vend, dev, driver_override) \ 916 + .vendor = (vend), .device = (dev), .subvendor = PCI_ANY_ID, \ 917 + .subdevice = PCI_ANY_ID, .override_only = (driver_override) 918 + 919 + /** 920 + * PCI_DRIVER_OVERRIDE_DEVICE_VFIO - macro used to describe a VFIO 921 + * "driver_override" PCI device. 922 + * @vend: the 16 bit PCI Vendor ID 923 + * @dev: the 16 bit PCI Device ID 924 + * 925 + * This macro is used to create a struct pci_device_id that matches a 926 + * specific device. The subvendor and subdevice fields will be set to 927 + * PCI_ANY_ID and the driver_override will be set to 928 + * PCI_ID_F_VFIO_DRIVER_OVERRIDE. 929 + */ 930 + #define PCI_DRIVER_OVERRIDE_DEVICE_VFIO(vend, dev) \ 931 + PCI_DEVICE_DRIVER_OVERRIDE(vend, dev, PCI_ID_F_VFIO_DRIVER_OVERRIDE) 932 + 933 + /** 905 934 * PCI_DEVICE_SUB - macro used to describe a specific PCI device with subsystem 906 935 * @vend: the 16 bit PCI Vendor ID 907 936 * @dev: the 16 bit PCI Device ID
+22 -4
include/linux/vfio.h
··· 15 15 #include <linux/poll.h> 16 16 #include <uapi/linux/vfio.h> 17 17 18 + /* 19 + * VFIO devices can be placed in a set, this allows all devices to share this 20 + * structure and the VFIO core will provide a lock that is held around 21 + * open_device()/close_device() for all devices in the set. 22 + */ 23 + struct vfio_device_set { 24 + void *set_id; 25 + struct mutex lock; 26 + struct list_head device_list; 27 + unsigned int device_count; 28 + }; 29 + 18 30 struct vfio_device { 19 31 struct device *dev; 20 32 const struct vfio_device_ops *ops; 21 33 struct vfio_group *group; 34 + struct vfio_device_set *dev_set; 35 + struct list_head dev_set_list; 22 36 23 37 /* Members below here are private, not for driver use */ 24 38 refcount_t refcount; 39 + unsigned int open_count; 25 40 struct completion comp; 26 41 struct list_head group_next; 27 42 }; ··· 44 29 /** 45 30 * struct vfio_device_ops - VFIO bus driver device callbacks 46 31 * 47 - * @open: Called when userspace creates new file descriptor for device 48 - * @release: Called when userspace releases file descriptor for device 32 + * @open_device: Called when the first file descriptor is opened for this device 33 + * @close_device: Opposite of open_device 49 34 * @read: Perform read(2) on device file descriptor 50 35 * @write: Perform write(2) on device file descriptor 51 36 * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_* ··· 58 43 */ 59 44 struct vfio_device_ops { 60 45 char *name; 61 - int (*open)(struct vfio_device *vdev); 62 - void (*release)(struct vfio_device *vdev); 46 + int (*open_device)(struct vfio_device *vdev); 47 + void (*close_device)(struct vfio_device *vdev); 63 48 ssize_t (*read)(struct vfio_device *vdev, char __user *buf, 64 49 size_t count, loff_t *ppos); 65 50 ssize_t (*write)(struct vfio_device *vdev, const char __user *buf, ··· 76 61 77 62 void vfio_init_group_dev(struct vfio_device *device, struct device *dev, 78 63 const struct vfio_device_ops *ops); 
64 + void vfio_uninit_group_dev(struct vfio_device *device); 79 65 int vfio_register_group_dev(struct vfio_device *device); 80 66 void vfio_unregister_group_dev(struct vfio_device *device); 81 67 extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); 82 68 extern void vfio_device_put(struct vfio_device *device); 69 + 70 + int vfio_assign_device_set(struct vfio_device *device, void *set_id); 83 71 84 72 /* events for the backend driver notify callback */ 85 73 enum vfio_iommu_notify_type {
+19 -21
samples/vfio-mdev/mbochs.c
··· 129 129 static struct class *mbochs_class; 130 130 static struct cdev mbochs_cdev; 131 131 static struct device mbochs_dev; 132 - static int mbochs_used_mbytes; 132 + static atomic_t mbochs_avail_mbytes; 133 133 static const struct vfio_device_ops mbochs_dev_ops; 134 134 135 135 struct vfio_region_info_ext { ··· 507 507 508 508 static int mbochs_probe(struct mdev_device *mdev) 509 509 { 510 + int avail_mbytes = atomic_read(&mbochs_avail_mbytes); 510 511 const struct mbochs_type *type = 511 512 &mbochs_types[mdev_get_type_group_id(mdev)]; 512 513 struct device *dev = mdev_dev(mdev); 513 514 struct mdev_state *mdev_state; 514 515 int ret = -ENOMEM; 515 516 516 - if (type->mbytes + mbochs_used_mbytes > max_mbytes) 517 - return -ENOMEM; 517 + do { 518 + if (avail_mbytes < type->mbytes) 519 + return -ENOSPC; 520 + } while (!atomic_try_cmpxchg(&mbochs_avail_mbytes, &avail_mbytes, 521 + avail_mbytes - type->mbytes)); 518 522 519 523 mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); 520 524 if (mdev_state == NULL) 521 - return -ENOMEM; 525 + goto err_avail; 522 526 vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mbochs_dev_ops); 523 527 524 528 mdev_state->vconfig = kzalloc(MBOCHS_CONFIG_SPACE_SIZE, GFP_KERNEL); ··· 553 549 mbochs_create_config_space(mdev_state); 554 550 mbochs_reset(mdev_state); 555 551 556 - mbochs_used_mbytes += type->mbytes; 557 - 558 552 ret = vfio_register_group_dev(&mdev_state->vdev); 559 553 if (ret) 560 554 goto err_mem; 561 555 dev_set_drvdata(&mdev->dev, mdev_state); 562 556 return 0; 563 - 564 557 err_mem: 558 + vfio_uninit_group_dev(&mdev_state->vdev); 559 + kfree(mdev_state->pages); 565 560 kfree(mdev_state->vconfig); 566 561 kfree(mdev_state); 562 + err_avail: 563 + atomic_add(type->mbytes, &mbochs_avail_mbytes); 567 564 return ret; 568 565 } 569 566 ··· 572 567 { 573 568 struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); 574 569 575 - mbochs_used_mbytes -= mdev_state->type->mbytes; 576 570 
vfio_unregister_group_dev(&mdev_state->vdev); 571 + vfio_uninit_group_dev(&mdev_state->vdev); 572 + atomic_add(mdev_state->type->mbytes, &mbochs_avail_mbytes); 577 573 kfree(mdev_state->pages); 578 574 kfree(mdev_state->vconfig); 579 575 kfree(mdev_state); ··· 1278 1272 return -ENOTTY; 1279 1273 } 1280 1274 1281 - static int mbochs_open(struct vfio_device *vdev) 1282 - { 1283 - if (!try_module_get(THIS_MODULE)) 1284 - return -ENODEV; 1285 - 1286 - return 0; 1287 - } 1288 - 1289 - static void mbochs_close(struct vfio_device *vdev) 1275 + static void mbochs_close_device(struct vfio_device *vdev) 1290 1276 { 1291 1277 struct mdev_state *mdev_state = 1292 1278 container_of(vdev, struct mdev_state, vdev); ··· 1298 1300 mbochs_put_pages(mdev_state); 1299 1301 1300 1302 mutex_unlock(&mdev_state->ops_lock); 1301 - module_put(THIS_MODULE); 1302 1303 } 1303 1304 1304 1305 static ssize_t ··· 1352 1355 { 1353 1356 const struct mbochs_type *type = 1354 1357 &mbochs_types[mtype_get_type_group_id(mtype)]; 1355 - int count = (max_mbytes - mbochs_used_mbytes) / type->mbytes; 1358 + int count = atomic_read(&mbochs_avail_mbytes) / type->mbytes; 1356 1359 1357 1360 return sprintf(buf, "%d\n", count); 1358 1361 } ··· 1396 1399 }; 1397 1400 1398 1401 static const struct vfio_device_ops mbochs_dev_ops = { 1399 - .open = mbochs_open, 1400 - .release = mbochs_close, 1402 + .close_device = mbochs_close_device, 1401 1403 .read = mbochs_read, 1402 1404 .write = mbochs_write, 1403 1405 .ioctl = mbochs_ioctl, ··· 1432 1436 static int __init mbochs_dev_init(void) 1433 1437 { 1434 1438 int ret = 0; 1439 + 1440 + atomic_set(&mbochs_avail_mbytes, max_mbytes); 1435 1441 1436 1442 ret = alloc_chrdev_region(&mbochs_devt, 0, MINORMASK + 1, MBOCHS_NAME); 1437 1443 if (ret < 0) {
+15 -25
samples/vfio-mdev/mdpy.c
··· 235 235 236 236 mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL); 237 237 if (mdev_state->vconfig == NULL) { 238 - kfree(mdev_state); 239 - return -ENOMEM; 238 + ret = -ENOMEM; 239 + goto err_state; 240 240 } 241 241 242 242 fbsize = roundup_pow_of_two(type->width * type->height * type->bytepp); 243 243 244 244 mdev_state->memblk = vmalloc_user(fbsize); 245 245 if (!mdev_state->memblk) { 246 - kfree(mdev_state->vconfig); 247 - kfree(mdev_state); 248 - return -ENOMEM; 246 + ret = -ENOMEM; 247 + goto err_vconfig; 249 248 } 250 249 dev_info(dev, "%s: %s (%dx%d)\n", __func__, type->name, type->width, 251 250 type->height); ··· 259 260 mdpy_count++; 260 261 261 262 ret = vfio_register_group_dev(&mdev_state->vdev); 262 - if (ret) { 263 - kfree(mdev_state->vconfig); 264 - kfree(mdev_state); 265 - return ret; 266 - } 263 + if (ret) 264 + goto err_mem; 267 265 dev_set_drvdata(&mdev->dev, mdev_state); 268 266 return 0; 267 + err_mem: 268 + vfree(mdev_state->memblk); 269 + err_vconfig: 270 + kfree(mdev_state->vconfig); 271 + err_state: 272 + vfio_uninit_group_dev(&mdev_state->vdev); 273 + kfree(mdev_state); 274 + return ret; 269 275 } 270 276 271 277 static void mdpy_remove(struct mdev_device *mdev) ··· 282 278 vfio_unregister_group_dev(&mdev_state->vdev); 283 279 vfree(mdev_state->memblk); 284 280 kfree(mdev_state->vconfig); 281 + vfio_uninit_group_dev(&mdev_state->vdev); 285 282 kfree(mdev_state); 286 283 287 284 mdpy_count--; ··· 614 609 return -ENOTTY; 615 610 } 616 611 617 - static int mdpy_open(struct vfio_device *vdev) 618 - { 619 - if (!try_module_get(THIS_MODULE)) 620 - return -ENODEV; 621 - 622 - return 0; 623 - } 624 - 625 - static void mdpy_close(struct vfio_device *vdev) 626 - { 627 - module_put(THIS_MODULE); 628 - } 629 - 630 612 static ssize_t 631 613 resolution_show(struct device *dev, struct device_attribute *attr, 632 614 char *buf) ··· 708 716 }; 709 717 710 718 static const struct vfio_device_ops mdpy_dev_ops = { 711 - .open = 
mdpy_open, 712 - .release = mdpy_close, 713 719 .read = mdpy_read, 714 720 .write = mdpy_write, 715 721 .ioctl = mdpy_ioctl,
+16 -24
samples/vfio-mdev/mtty.c
··· 718 718 719 719 mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); 720 720 if (mdev_state == NULL) { 721 - atomic_add(nr_ports, &mdev_avail_ports); 722 - return -ENOMEM; 721 + ret = -ENOMEM; 722 + goto err_nr_ports; 723 723 } 724 724 725 725 vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mtty_dev_ops); ··· 732 732 mdev_state->vconfig = kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL); 733 733 734 734 if (mdev_state->vconfig == NULL) { 735 - kfree(mdev_state); 736 - atomic_add(nr_ports, &mdev_avail_ports); 737 - return -ENOMEM; 735 + ret = -ENOMEM; 736 + goto err_state; 738 737 } 739 738 740 739 mutex_init(&mdev_state->ops_lock); ··· 742 743 mtty_create_config_space(mdev_state); 743 744 744 745 ret = vfio_register_group_dev(&mdev_state->vdev); 745 - if (ret) { 746 - kfree(mdev_state); 747 - atomic_add(nr_ports, &mdev_avail_ports); 748 - return ret; 749 - } 750 - 746 + if (ret) 747 + goto err_vconfig; 751 748 dev_set_drvdata(&mdev->dev, mdev_state); 752 749 return 0; 750 + 751 + err_vconfig: 752 + kfree(mdev_state->vconfig); 753 + err_state: 754 + vfio_uninit_group_dev(&mdev_state->vdev); 755 + kfree(mdev_state); 756 + err_nr_ports: 757 + atomic_add(nr_ports, &mdev_avail_ports); 758 + return ret; 753 759 } 754 760 755 761 static void mtty_remove(struct mdev_device *mdev) ··· 765 761 vfio_unregister_group_dev(&mdev_state->vdev); 766 762 767 763 kfree(mdev_state->vconfig); 764 + vfio_uninit_group_dev(&mdev_state->vdev); 768 765 kfree(mdev_state); 769 766 atomic_add(nr_ports, &mdev_avail_ports); 770 767 } ··· 1207 1202 return -ENOTTY; 1208 1203 } 1209 1204 1210 - static int mtty_open(struct vfio_device *vdev) 1211 - { 1212 - pr_info("%s\n", __func__); 1213 - return 0; 1214 - } 1215 - 1216 - static void mtty_close(struct vfio_device *mdev) 1217 - { 1218 - pr_info("%s\n", __func__); 1219 - } 1220 - 1221 1205 static ssize_t 1222 1206 sample_mtty_dev_show(struct device *dev, struct device_attribute *attr, 1223 1207 char *buf) ··· 1314 1320 1315 1321 static 
const struct vfio_device_ops mtty_dev_ops = { 1316 1322 .name = "vfio-mtty", 1317 - .open = mtty_open, 1318 - .release = mtty_close, 1319 1323 .read = mtty_read, 1320 1324 .write = mtty_write, 1321 1325 .ioctl = mtty_ioctl,
+1
scripts/mod/devicetable-offsets.c
··· 42 42 DEVID_FIELD(pci_device_id, subdevice); 43 43 DEVID_FIELD(pci_device_id, class); 44 44 DEVID_FIELD(pci_device_id, class_mask); 45 + DEVID_FIELD(pci_device_id, override_only); 45 46 46 47 DEVID(ccw_device_id); 47 48 DEVID_FIELD(ccw_device_id, match_flags);
+15 -2
scripts/mod/file2alias.c
··· 426 426 return 1; 427 427 } 428 428 429 - /* Looks like: pci:vNdNsvNsdNbcNscNiN. */ 429 + /* Looks like: pci:vNdNsvNsdNbcNscNiN or <prefix>_pci:vNdNsvNsdNbcNscNiN. */ 430 430 static int do_pci_entry(const char *filename, 431 431 void *symval, char *alias) 432 432 { ··· 440 440 DEF_FIELD(symval, pci_device_id, subdevice); 441 441 DEF_FIELD(symval, pci_device_id, class); 442 442 DEF_FIELD(symval, pci_device_id, class_mask); 443 + DEF_FIELD(symval, pci_device_id, override_only); 443 444 444 - strcpy(alias, "pci:"); 445 + switch (override_only) { 446 + case 0: 447 + strcpy(alias, "pci:"); 448 + break; 449 + case PCI_ID_F_VFIO_DRIVER_OVERRIDE: 450 + strcpy(alias, "vfio_pci:"); 451 + break; 452 + default: 453 + warn("Unknown PCI driver_override alias %08X\n", 454 + override_only); 455 + return 0; 456 + } 457 + 445 458 ADD(alias, "v", vendor != PCI_ANY_ID, vendor); 446 459 ADD(alias, "d", device != PCI_ANY_ID, device); 447 460 ADD(alias, "sv", subvendor != PCI_ANY_ID, subvendor);