Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branches 'v5.15/vfio/spdx-license-cleanups', 'v5.15/vfio/dma-valid-waited-v3', 'v5.15/vfio/vfio-pci-core-v5' and 'v5.15/vfio/vfio-ap' into v5.15/vfio/next

+2651 -2430
+1
Documentation/PCI/pci.rst
··· 103 103 - subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF) 104 104 - class and classmask fields default to 0 105 105 - driver_data defaults to 0UL. 106 + - override_only field defaults to 0. 106 107 107 108 Note that driver_data must match the value used by any of the pci_device_id 108 109 entries defined in the driver. This makes the driver_data field mandatory
+1
MAINTAINERS
··· 19466 19466 F: Documentation/driver-api/vfio.rst 19467 19467 F: drivers/vfio/ 19468 19468 F: include/linux/vfio.h 19469 + F: include/linux/vfio_pci_core.h 19469 19470 F: include/uapi/linux/vfio.h 19470 19471 19471 19472 VFIO FSL-MC DRIVER
+3 -5
arch/s390/include/asm/kvm_host.h
··· 798 798 unsigned short ibc; 799 799 }; 800 800 801 - struct kvm_s390_module_hook { 802 - int (*hook)(struct kvm_vcpu *vcpu); 803 - struct module *owner; 804 - }; 801 + typedef int (*crypto_hook)(struct kvm_vcpu *vcpu); 805 802 806 803 struct kvm_s390_crypto { 807 804 struct kvm_s390_crypto_cb *crycb; 808 - struct kvm_s390_module_hook *pqap_hook; 805 + struct rw_semaphore pqap_hook_rwsem; 806 + crypto_hook *pqap_hook; 809 807 __u32 crycbd; 810 808 __u8 aes_kw; 811 809 __u8 dea_kw;
+28 -4
arch/s390/kvm/kvm-s390.c
··· 2559 2559 kvm->arch.crypto.crycbd |= CRYCB_FORMAT1; 2560 2560 } 2561 2561 2562 + /* 2563 + * kvm_arch_crypto_set_masks 2564 + * 2565 + * @kvm: pointer to the target guest's KVM struct containing the crypto masks 2566 + * to be set. 2567 + * @apm: the mask identifying the accessible AP adapters 2568 + * @aqm: the mask identifying the accessible AP domains 2569 + * @adm: the mask identifying the accessible AP control domains 2570 + * 2571 + * Set the masks that identify the adapters, domains and control domains to 2572 + * which the KVM guest is granted access. 2573 + * 2574 + * Note: The kvm->lock mutex must be locked by the caller before invoking this 2575 + * function. 2576 + */ 2562 2577 void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, 2563 2578 unsigned long *aqm, unsigned long *adm) 2564 2579 { 2565 2580 struct kvm_s390_crypto_cb *crycb = kvm->arch.crypto.crycb; 2566 2581 2567 - mutex_lock(&kvm->lock); 2568 2582 kvm_s390_vcpu_block_all(kvm); 2569 2583 2570 2584 switch (kvm->arch.crypto.crycbd & CRYCB_FORMAT_MASK) { ··· 2609 2595 /* recreate the shadow crycb for each vcpu */ 2610 2596 kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); 2611 2597 kvm_s390_vcpu_unblock_all(kvm); 2612 - mutex_unlock(&kvm->lock); 2613 2598 } 2614 2599 EXPORT_SYMBOL_GPL(kvm_arch_crypto_set_masks); 2615 2600 2601 + /* 2602 + * kvm_arch_crypto_clear_masks 2603 + * 2604 + * @kvm: pointer to the target guest's KVM struct containing the crypto masks 2605 + * to be cleared. 2606 + * 2607 + * Clear the masks that identify the adapters, domains and control domains to 2608 + * which the KVM guest is granted access. 2609 + * 2610 + * Note: The kvm->lock mutex must be locked by the caller before invoking this 2611 + * function. 
2612 + */ 2616 2613 void kvm_arch_crypto_clear_masks(struct kvm *kvm) 2617 2614 { 2618 - mutex_lock(&kvm->lock); 2619 2615 kvm_s390_vcpu_block_all(kvm); 2620 2616 2621 2617 memset(&kvm->arch.crypto.crycb->apcb0, 0, ··· 2637 2613 /* recreate the shadow crycb for each vcpu */ 2638 2614 kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); 2639 2615 kvm_s390_vcpu_unblock_all(kvm); 2640 - mutex_unlock(&kvm->lock); 2641 2616 } 2642 2617 EXPORT_SYMBOL_GPL(kvm_arch_crypto_clear_masks); 2643 2618 ··· 2653 2630 { 2654 2631 kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb; 2655 2632 kvm_s390_set_crycb_format(kvm); 2633 + init_rwsem(&kvm->arch.crypto.pqap_hook_rwsem); 2656 2634 2657 2635 if (!test_kvm_facility(kvm, 76)) 2658 2636 return;
+9 -6
arch/s390/kvm/priv.c
··· 610 610 static int handle_pqap(struct kvm_vcpu *vcpu) 611 611 { 612 612 struct ap_queue_status status = {}; 613 + crypto_hook pqap_hook; 613 614 unsigned long reg0; 614 615 int ret; 615 616 uint8_t fc; ··· 655 654 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 656 655 657 656 /* 658 - * Verify that the hook callback is registered, lock the owner 659 - * and call the hook. 657 + * If the hook callback is registered, there will be a pointer to the 658 + * hook function pointer in the kvm_s390_crypto structure. Lock the 659 + * owner, retrieve the hook function pointer and call the hook. 660 660 */ 661 + down_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); 661 662 if (vcpu->kvm->arch.crypto.pqap_hook) { 662 - if (!try_module_get(vcpu->kvm->arch.crypto.pqap_hook->owner)) 663 - return -EOPNOTSUPP; 664 - ret = vcpu->kvm->arch.crypto.pqap_hook->hook(vcpu); 665 - module_put(vcpu->kvm->arch.crypto.pqap_hook->owner); 663 + pqap_hook = *vcpu->kvm->arch.crypto.pqap_hook; 664 + ret = pqap_hook(vcpu); 666 665 if (!ret && vcpu->run->s.regs.gprs[1] & 0x00ff0000) 667 666 kvm_s390_set_psw_cc(vcpu, 3); 667 + up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); 668 668 return ret; 669 669 } 670 + up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); 670 671 /* 671 672 * A vfio_driver must register a hook. 672 673 * No hook means no driver to enable the SIE CRYCB and no queues.
+21 -7
drivers/pci/pci-driver.c
··· 136 136 struct pci_dev *dev) 137 137 { 138 138 struct pci_dynid *dynid; 139 - const struct pci_device_id *found_id = NULL; 139 + const struct pci_device_id *found_id = NULL, *ids; 140 140 141 141 /* When driver_override is set, only bind to the matching driver */ 142 142 if (dev->driver_override && strcmp(dev->driver_override, drv->name)) ··· 152 152 } 153 153 spin_unlock(&drv->dynids.lock); 154 154 155 - if (!found_id) 156 - found_id = pci_match_id(drv->id_table, dev); 155 + if (found_id) 156 + return found_id; 157 + 158 + for (ids = drv->id_table; (found_id = pci_match_id(ids, dev)); 159 + ids = found_id + 1) { 160 + /* 161 + * The match table is split based on driver_override. 162 + * In case override_only was set, enforce driver_override 163 + * matching. 164 + */ 165 + if (found_id->override_only) { 166 + if (dev->driver_override) 167 + return found_id; 168 + } else { 169 + return found_id; 170 + } 171 + } 157 172 158 173 /* driver_override will always match, send a dummy id */ 159 - if (!found_id && dev->driver_override) 160 - found_id = &pci_device_id_any; 161 - 162 - return found_id; 174 + if (dev->driver_override) 175 + return &pci_device_id_any; 176 + return NULL; 163 177 } 164 178 165 179 /**
+132 -150
drivers/s390/crypto/vfio_ap_ops.c
··· 24 24 #define VFIO_AP_MDEV_TYPE_HWVIRT "passthrough" 25 25 #define VFIO_AP_MDEV_NAME_HWVIRT "VFIO AP Passthrough Device" 26 26 27 - static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev); 27 + static int vfio_ap_mdev_reset_queues(struct ap_matrix_mdev *matrix_mdev); 28 28 static struct vfio_ap_queue *vfio_ap_find_queue(int apqn); 29 + static const struct vfio_device_ops vfio_ap_matrix_dev_ops; 29 30 30 31 static int match_apqn(struct device *dev, const void *data) 31 32 { ··· 295 294 matrix_mdev = container_of(vcpu->kvm->arch.crypto.pqap_hook, 296 295 struct ap_matrix_mdev, pqap_hook); 297 296 298 - /* 299 - * If the KVM pointer is in the process of being set, wait until the 300 - * process has completed. 301 - */ 302 - wait_event_cmd(matrix_mdev->wait_for_kvm, 303 - !matrix_mdev->kvm_busy, 304 - mutex_unlock(&matrix_dev->lock), 305 - mutex_lock(&matrix_dev->lock)); 306 - 307 297 /* If the there is no guest using the mdev, there is nothing to do */ 308 298 if (!matrix_mdev->kvm) 309 299 goto out_unlock; ··· 327 335 matrix->adm_max = info->apxa ? 
info->Nd : 15; 328 336 } 329 337 330 - static int vfio_ap_mdev_create(struct mdev_device *mdev) 338 + static int vfio_ap_mdev_probe(struct mdev_device *mdev) 331 339 { 332 340 struct ap_matrix_mdev *matrix_mdev; 341 + int ret; 333 342 334 343 if ((atomic_dec_if_positive(&matrix_dev->available_instances) < 0)) 335 344 return -EPERM; 336 345 337 346 matrix_mdev = kzalloc(sizeof(*matrix_mdev), GFP_KERNEL); 338 347 if (!matrix_mdev) { 339 - atomic_inc(&matrix_dev->available_instances); 340 - return -ENOMEM; 348 + ret = -ENOMEM; 349 + goto err_dec_available; 341 350 } 351 + vfio_init_group_dev(&matrix_mdev->vdev, &mdev->dev, 352 + &vfio_ap_matrix_dev_ops); 342 353 343 354 matrix_mdev->mdev = mdev; 344 355 vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->matrix); 345 - init_waitqueue_head(&matrix_mdev->wait_for_kvm); 346 - mdev_set_drvdata(mdev, matrix_mdev); 347 - matrix_mdev->pqap_hook.hook = handle_pqap; 348 - matrix_mdev->pqap_hook.owner = THIS_MODULE; 356 + matrix_mdev->pqap_hook = handle_pqap; 349 357 mutex_lock(&matrix_dev->lock); 350 358 list_add(&matrix_mdev->node, &matrix_dev->mdev_list); 351 359 mutex_unlock(&matrix_dev->lock); 352 360 361 + ret = vfio_register_group_dev(&matrix_mdev->vdev); 362 + if (ret) 363 + goto err_list; 364 + dev_set_drvdata(&mdev->dev, matrix_mdev); 353 365 return 0; 366 + 367 + err_list: 368 + mutex_lock(&matrix_dev->lock); 369 + list_del(&matrix_mdev->node); 370 + mutex_unlock(&matrix_dev->lock); 371 + kfree(matrix_mdev); 372 + err_dec_available: 373 + atomic_inc(&matrix_dev->available_instances); 374 + return ret; 354 375 } 355 376 356 - static int vfio_ap_mdev_remove(struct mdev_device *mdev) 377 + static void vfio_ap_mdev_remove(struct mdev_device *mdev) 357 378 { 358 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 379 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(&mdev->dev); 380 + 381 + vfio_unregister_group_dev(&matrix_mdev->vdev); 359 382 360 383 mutex_lock(&matrix_dev->lock); 361 - 
vfio_ap_mdev_reset_queues(mdev); 384 + vfio_ap_mdev_reset_queues(matrix_mdev); 362 385 list_del(&matrix_mdev->node); 363 386 kfree(matrix_mdev); 364 - mdev_set_drvdata(mdev, NULL); 365 387 atomic_inc(&matrix_dev->available_instances); 366 388 mutex_unlock(&matrix_dev->lock); 367 - 368 - return 0; 369 389 } 370 390 371 391 static ssize_t name_show(struct mdev_type *mtype, ··· 619 615 { 620 616 int ret; 621 617 unsigned long apid; 622 - struct mdev_device *mdev = mdev_from_dev(dev); 623 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 618 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); 624 619 625 620 mutex_lock(&matrix_dev->lock); 626 621 627 - /* 628 - * If the KVM pointer is in flux or the guest is running, disallow 629 - * un-assignment of adapter 630 - */ 631 - if (matrix_mdev->kvm_busy || matrix_mdev->kvm) { 622 + /* If the KVM guest is running, disallow assignment of adapter */ 623 + if (matrix_mdev->kvm) { 632 624 ret = -EBUSY; 633 625 goto done; 634 626 } ··· 688 688 { 689 689 int ret; 690 690 unsigned long apid; 691 - struct mdev_device *mdev = mdev_from_dev(dev); 692 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 691 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); 693 692 694 693 mutex_lock(&matrix_dev->lock); 695 694 696 - /* 697 - * If the KVM pointer is in flux or the guest is running, disallow 698 - * un-assignment of adapter 699 - */ 700 - if (matrix_mdev->kvm_busy || matrix_mdev->kvm) { 695 + /* If the KVM guest is running, disallow unassignment of adapter */ 696 + if (matrix_mdev->kvm) { 701 697 ret = -EBUSY; 702 698 goto done; 703 699 } ··· 773 777 { 774 778 int ret; 775 779 unsigned long apqi; 776 - struct mdev_device *mdev = mdev_from_dev(dev); 777 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 780 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); 778 781 unsigned long max_apqi = matrix_mdev->matrix.aqm_max; 779 782 780 783 mutex_lock(&matrix_dev->lock); 781 784 
782 - /* 783 - * If the KVM pointer is in flux or the guest is running, disallow 784 - * assignment of domain 785 - */ 786 - if (matrix_mdev->kvm_busy || matrix_mdev->kvm) { 785 + /* If the KVM guest is running, disallow assignment of domain */ 786 + if (matrix_mdev->kvm) { 787 787 ret = -EBUSY; 788 788 goto done; 789 789 } ··· 838 846 { 839 847 int ret; 840 848 unsigned long apqi; 841 - struct mdev_device *mdev = mdev_from_dev(dev); 842 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 849 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); 843 850 844 851 mutex_lock(&matrix_dev->lock); 845 852 846 - /* 847 - * If the KVM pointer is in flux or the guest is running, disallow 848 - * un-assignment of domain 849 - */ 850 - if (matrix_mdev->kvm_busy || matrix_mdev->kvm) { 853 + /* If the KVM guest is running, disallow unassignment of domain */ 854 + if (matrix_mdev->kvm) { 851 855 ret = -EBUSY; 852 856 goto done; 853 857 } ··· 888 900 { 889 901 int ret; 890 902 unsigned long id; 891 - struct mdev_device *mdev = mdev_from_dev(dev); 892 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 903 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); 893 904 894 905 mutex_lock(&matrix_dev->lock); 895 906 896 - /* 897 - * If the KVM pointer is in flux or the guest is running, disallow 898 - * assignment of control domain. 
899 - */ 900 - if (matrix_mdev->kvm_busy || matrix_mdev->kvm) { 907 + /* If the KVM guest is running, disallow assignment of control domain */ 908 + if (matrix_mdev->kvm) { 901 909 ret = -EBUSY; 902 910 goto done; 903 911 } ··· 942 958 { 943 959 int ret; 944 960 unsigned long domid; 945 - struct mdev_device *mdev = mdev_from_dev(dev); 946 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 961 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); 947 962 unsigned long max_domid = matrix_mdev->matrix.adm_max; 948 963 949 964 mutex_lock(&matrix_dev->lock); 950 965 951 - /* 952 - * If the KVM pointer is in flux or the guest is running, disallow 953 - * un-assignment of control domain. 954 - */ 955 - if (matrix_mdev->kvm_busy || matrix_mdev->kvm) { 966 + /* If a KVM guest is running, disallow unassignment of control domain */ 967 + if (matrix_mdev->kvm) { 956 968 ret = -EBUSY; 957 969 goto done; 958 970 } ··· 977 997 int nchars = 0; 978 998 int n; 979 999 char *bufpos = buf; 980 - struct mdev_device *mdev = mdev_from_dev(dev); 981 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 1000 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); 982 1001 unsigned long max_domid = matrix_mdev->matrix.adm_max; 983 1002 984 1003 mutex_lock(&matrix_dev->lock); ··· 995 1016 static ssize_t matrix_show(struct device *dev, struct device_attribute *attr, 996 1017 char *buf) 997 1018 { 998 - struct mdev_device *mdev = mdev_from_dev(dev); 999 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 1019 + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); 1000 1020 char *bufpos = buf; 1001 1021 unsigned long apid; 1002 1022 unsigned long apqi; ··· 1087 1109 struct ap_matrix_mdev *m; 1088 1110 1089 1111 if (kvm->arch.crypto.crycbd) { 1112 + down_write(&kvm->arch.crypto.pqap_hook_rwsem); 1113 + kvm->arch.crypto.pqap_hook = &matrix_mdev->pqap_hook; 1114 + up_write(&kvm->arch.crypto.pqap_hook_rwsem); 1115 + 1116 + mutex_lock(&kvm->lock); 
1117 + mutex_lock(&matrix_dev->lock); 1118 + 1090 1119 list_for_each_entry(m, &matrix_dev->mdev_list, node) { 1091 - if (m != matrix_mdev && m->kvm == kvm) 1120 + if (m != matrix_mdev && m->kvm == kvm) { 1121 + mutex_unlock(&kvm->lock); 1122 + mutex_unlock(&matrix_dev->lock); 1092 1123 return -EPERM; 1124 + } 1093 1125 } 1094 1126 1095 1127 kvm_get_kvm(kvm); 1096 - matrix_mdev->kvm_busy = true; 1097 - mutex_unlock(&matrix_dev->lock); 1128 + matrix_mdev->kvm = kvm; 1098 1129 kvm_arch_crypto_set_masks(kvm, 1099 1130 matrix_mdev->matrix.apm, 1100 1131 matrix_mdev->matrix.aqm, 1101 1132 matrix_mdev->matrix.adm); 1102 - mutex_lock(&matrix_dev->lock); 1103 - kvm->arch.crypto.pqap_hook = &matrix_mdev->pqap_hook; 1104 - matrix_mdev->kvm = kvm; 1105 - matrix_mdev->kvm_busy = false; 1106 - wake_up_all(&matrix_mdev->wait_for_kvm); 1133 + 1134 + mutex_unlock(&kvm->lock); 1135 + mutex_unlock(&matrix_dev->lock); 1107 1136 } 1108 1137 1109 1138 return 0; ··· 1160 1175 * done under the @matrix_mdev->lock. 1161 1176 * 1162 1177 */ 1163 - static void vfio_ap_mdev_unset_kvm(struct ap_matrix_mdev *matrix_mdev) 1178 + static void vfio_ap_mdev_unset_kvm(struct ap_matrix_mdev *matrix_mdev, 1179 + struct kvm *kvm) 1164 1180 { 1165 - /* 1166 - * If the KVM pointer is in the process of being set, wait until the 1167 - * process has completed. 
1168 - */ 1169 - wait_event_cmd(matrix_mdev->wait_for_kvm, 1170 - !matrix_mdev->kvm_busy, 1171 - mutex_unlock(&matrix_dev->lock), 1172 - mutex_lock(&matrix_dev->lock)); 1181 + if (kvm && kvm->arch.crypto.crycbd) { 1182 + down_write(&kvm->arch.crypto.pqap_hook_rwsem); 1183 + kvm->arch.crypto.pqap_hook = NULL; 1184 + up_write(&kvm->arch.crypto.pqap_hook_rwsem); 1173 1185 1174 - if (matrix_mdev->kvm) { 1175 - matrix_mdev->kvm_busy = true; 1176 - mutex_unlock(&matrix_dev->lock); 1177 - kvm_arch_crypto_clear_masks(matrix_mdev->kvm); 1186 + mutex_lock(&kvm->lock); 1178 1187 mutex_lock(&matrix_dev->lock); 1179 - vfio_ap_mdev_reset_queues(matrix_mdev->mdev); 1180 - matrix_mdev->kvm->arch.crypto.pqap_hook = NULL; 1181 - kvm_put_kvm(matrix_mdev->kvm); 1188 + 1189 + kvm_arch_crypto_clear_masks(kvm); 1190 + vfio_ap_mdev_reset_queues(matrix_mdev); 1191 + kvm_put_kvm(kvm); 1182 1192 matrix_mdev->kvm = NULL; 1183 - matrix_mdev->kvm_busy = false; 1184 - wake_up_all(&matrix_mdev->wait_for_kvm); 1193 + 1194 + mutex_unlock(&kvm->lock); 1195 + mutex_unlock(&matrix_dev->lock); 1185 1196 } 1186 1197 } 1187 1198 ··· 1190 1209 if (action != VFIO_GROUP_NOTIFY_SET_KVM) 1191 1210 return NOTIFY_OK; 1192 1211 1193 - mutex_lock(&matrix_dev->lock); 1194 1212 matrix_mdev = container_of(nb, struct ap_matrix_mdev, group_notifier); 1195 1213 1196 1214 if (!data) 1197 - vfio_ap_mdev_unset_kvm(matrix_mdev); 1215 + vfio_ap_mdev_unset_kvm(matrix_mdev, matrix_mdev->kvm); 1198 1216 else if (vfio_ap_mdev_set_kvm(matrix_mdev, data)) 1199 1217 notify_rc = NOTIFY_DONE; 1200 - 1201 - mutex_unlock(&matrix_dev->lock); 1202 1218 1203 1219 return notify_rc; 1204 1220 } ··· 1266 1288 return ret; 1267 1289 } 1268 1290 1269 - static int vfio_ap_mdev_reset_queues(struct mdev_device *mdev) 1291 + static int vfio_ap_mdev_reset_queues(struct ap_matrix_mdev *matrix_mdev) 1270 1292 { 1271 1293 int ret; 1272 1294 int rc = 0; 1273 1295 unsigned long apid, apqi; 1274 1296 struct vfio_ap_queue *q; 1275 - struct ap_matrix_mdev 
*matrix_mdev = mdev_get_drvdata(mdev); 1276 1297 1277 1298 for_each_set_bit_inv(apid, matrix_mdev->matrix.apm, 1278 1299 matrix_mdev->matrix.apm_max + 1) { ··· 1292 1315 return rc; 1293 1316 } 1294 1317 1295 - static int vfio_ap_mdev_open_device(struct mdev_device *mdev) 1318 + static int vfio_ap_mdev_open_device(struct vfio_device *vdev) 1296 1319 { 1297 - struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev); 1320 + struct ap_matrix_mdev *matrix_mdev = 1321 + container_of(vdev, struct ap_matrix_mdev, vdev); 1298 1322 unsigned long events; 1299 1323 int ret; 1300 - 1301 - 1302 - if (!try_module_get(THIS_MODULE)) 1303 - return -ENODEV; 1304 1324 1305 1325 matrix_mdev->group_notifier.notifier_call = vfio_ap_mdev_group_notifier; 1306 1326 events = VFIO_GROUP_NOTIFY_SET_KVM; 1307 1327 1308 - ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, 1328 + ret = vfio_register_notifier(vdev->dev, VFIO_GROUP_NOTIFY, 1309 1329 &events, &matrix_mdev->group_notifier); 1310 - if (ret) { 1311 - module_put(THIS_MODULE); 1330 + if (ret) 1312 1331 return ret; 1313 - } 1314 1332 1315 1333 matrix_mdev->iommu_notifier.notifier_call = vfio_ap_mdev_iommu_notifier; 1316 1334 events = VFIO_IOMMU_NOTIFY_DMA_UNMAP; 1317 - ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, 1335 + ret = vfio_register_notifier(vdev->dev, VFIO_IOMMU_NOTIFY, 1318 1336 &events, &matrix_mdev->iommu_notifier); 1319 - if (!ret) 1320 - return ret; 1337 + if (ret) 1338 + goto out_unregister_group; 1339 + return 0; 1321 1340 1322 - vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, 1341 + out_unregister_group: 1342 + vfio_unregister_notifier(vdev->dev, VFIO_GROUP_NOTIFY, 1323 1343 &matrix_mdev->group_notifier); 1324 - module_put(THIS_MODULE); 1325 1344 return ret; 1326 1345 } 1327 1346 1328 - static void vfio_ap_mdev_close_device(struct mdev_device *mdev) 1347 + static void vfio_ap_mdev_close_device(struct vfio_device *vdev) 1329 1348 { 1330 - struct ap_matrix_mdev *matrix_mdev = 
mdev_get_drvdata(mdev); 1349 + struct ap_matrix_mdev *matrix_mdev = 1350 + container_of(vdev, struct ap_matrix_mdev, vdev); 1331 1351 1332 - mutex_lock(&matrix_dev->lock); 1333 - vfio_ap_mdev_unset_kvm(matrix_mdev); 1334 - mutex_unlock(&matrix_dev->lock); 1335 - 1336 - vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, 1352 + vfio_unregister_notifier(vdev->dev, VFIO_IOMMU_NOTIFY, 1337 1353 &matrix_mdev->iommu_notifier); 1338 - vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, 1354 + vfio_unregister_notifier(vdev->dev, VFIO_GROUP_NOTIFY, 1339 1355 &matrix_mdev->group_notifier); 1340 - module_put(THIS_MODULE); 1356 + vfio_ap_mdev_unset_kvm(matrix_mdev, matrix_mdev->kvm); 1341 1357 } 1342 1358 1343 1359 static int vfio_ap_mdev_get_device_info(unsigned long arg) ··· 1353 1383 return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; 1354 1384 } 1355 1385 1356 - static ssize_t vfio_ap_mdev_ioctl(struct mdev_device *mdev, 1386 + static ssize_t vfio_ap_mdev_ioctl(struct vfio_device *vdev, 1357 1387 unsigned int cmd, unsigned long arg) 1358 1388 { 1389 + struct ap_matrix_mdev *matrix_mdev = 1390 + container_of(vdev, struct ap_matrix_mdev, vdev); 1359 1391 int ret; 1360 - struct ap_matrix_mdev *matrix_mdev; 1361 1392 1362 1393 mutex_lock(&matrix_dev->lock); 1363 1394 switch (cmd) { ··· 1366 1395 ret = vfio_ap_mdev_get_device_info(arg); 1367 1396 break; 1368 1397 case VFIO_DEVICE_RESET: 1369 - matrix_mdev = mdev_get_drvdata(mdev); 1370 - if (WARN(!matrix_mdev, "Driver data missing from mdev!!")) { 1371 - ret = -EINVAL; 1372 - break; 1373 - } 1374 - 1375 - /* 1376 - * If the KVM pointer is in the process of being set, wait until 1377 - * the process has completed. 
1378 - */ 1379 - wait_event_cmd(matrix_mdev->wait_for_kvm, 1380 - !matrix_mdev->kvm_busy, 1381 - mutex_unlock(&matrix_dev->lock), 1382 - mutex_lock(&matrix_dev->lock)); 1383 - 1384 - ret = vfio_ap_mdev_reset_queues(mdev); 1398 + ret = vfio_ap_mdev_reset_queues(matrix_mdev); 1385 1399 break; 1386 1400 default: 1387 1401 ret = -EOPNOTSUPP; ··· 1377 1421 return ret; 1378 1422 } 1379 1423 1424 + static const struct vfio_device_ops vfio_ap_matrix_dev_ops = { 1425 + .open_device = vfio_ap_mdev_open_device, 1426 + .close_device = vfio_ap_mdev_close_device, 1427 + .ioctl = vfio_ap_mdev_ioctl, 1428 + }; 1429 + 1430 + static struct mdev_driver vfio_ap_matrix_driver = { 1431 + .driver = { 1432 + .name = "vfio_ap_mdev", 1433 + .owner = THIS_MODULE, 1434 + .mod_name = KBUILD_MODNAME, 1435 + .dev_groups = vfio_ap_mdev_attr_groups, 1436 + }, 1437 + .probe = vfio_ap_mdev_probe, 1438 + .remove = vfio_ap_mdev_remove, 1439 + }; 1440 + 1380 1441 static const struct mdev_parent_ops vfio_ap_matrix_ops = { 1381 1442 .owner = THIS_MODULE, 1443 + .device_driver = &vfio_ap_matrix_driver, 1382 1444 .supported_type_groups = vfio_ap_mdev_type_groups, 1383 - .mdev_attr_groups = vfio_ap_mdev_attr_groups, 1384 - .create = vfio_ap_mdev_create, 1385 - .remove = vfio_ap_mdev_remove, 1386 - .open_device = vfio_ap_mdev_open_device, 1387 - .close_device = vfio_ap_mdev_close_device, 1388 - .ioctl = vfio_ap_mdev_ioctl, 1389 1445 }; 1390 1446 1391 1447 int vfio_ap_mdev_register(void) 1392 1448 { 1449 + int ret; 1450 + 1393 1451 atomic_set(&matrix_dev->available_instances, MAX_ZDEV_ENTRIES_EXT); 1394 1452 1395 - return mdev_register_device(&matrix_dev->device, &vfio_ap_matrix_ops); 1453 + ret = mdev_register_driver(&vfio_ap_matrix_driver); 1454 + if (ret) 1455 + return ret; 1456 + 1457 + ret = mdev_register_device(&matrix_dev->device, &vfio_ap_matrix_ops); 1458 + if (ret) 1459 + goto err_driver; 1460 + return 0; 1461 + 1462 + err_driver: 1463 + mdev_unregister_driver(&vfio_ap_matrix_driver); 1464 + return 
ret; 1396 1465 } 1397 1466 1398 1467 void vfio_ap_mdev_unregister(void) 1399 1468 { 1400 1469 mdev_unregister_device(&matrix_dev->device); 1470 + mdev_unregister_driver(&vfio_ap_matrix_driver); 1401 1471 }
+3 -3
drivers/s390/crypto/vfio_ap_private.h
··· 18 18 #include <linux/delay.h> 19 19 #include <linux/mutex.h> 20 20 #include <linux/kvm_host.h> 21 + #include <linux/vfio.h> 21 22 22 23 #include "ap_bus.h" 23 24 ··· 80 79 * @kvm: the struct holding guest's state 81 80 */ 82 81 struct ap_matrix_mdev { 82 + struct vfio_device vdev; 83 83 struct list_head node; 84 84 struct ap_matrix matrix; 85 85 struct notifier_block group_notifier; 86 86 struct notifier_block iommu_notifier; 87 - bool kvm_busy; 88 - wait_queue_head_t wait_for_kvm; 89 87 struct kvm *kvm; 90 - struct kvm_s390_module_hook pqap_hook; 88 + crypto_hook pqap_hook; 91 89 struct mdev_device *mdev; 92 90 }; 93 91
+22 -21
drivers/vfio/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 - config VFIO_IOMMU_TYPE1 3 - tristate 4 - depends on VFIO 5 - default n 6 - 7 - config VFIO_IOMMU_SPAPR_TCE 8 - tristate 9 - depends on VFIO && SPAPR_TCE_IOMMU 10 - default VFIO 11 - 12 - config VFIO_SPAPR_EEH 13 - tristate 14 - depends on EEH && VFIO_IOMMU_SPAPR_TCE 15 - default VFIO 16 - 17 - config VFIO_VIRQFD 18 - tristate 19 - depends on VFIO && EVENTFD 20 - default n 21 - 22 2 menuconfig VFIO 23 3 tristate "VFIO Non-Privileged userspace driver framework" 24 4 select IOMMU_API ··· 9 29 10 30 If you don't know what to do here, say N. 11 31 32 + if VFIO 33 + config VFIO_IOMMU_TYPE1 34 + tristate 35 + default n 36 + 37 + config VFIO_IOMMU_SPAPR_TCE 38 + tristate 39 + depends on SPAPR_TCE_IOMMU 40 + default VFIO 41 + 42 + config VFIO_SPAPR_EEH 43 + tristate 44 + depends on EEH && VFIO_IOMMU_SPAPR_TCE 45 + default VFIO 46 + 47 + config VFIO_VIRQFD 48 + tristate 49 + select EVENTFD 50 + default n 51 + 12 52 config VFIO_NOIOMMU 13 53 bool "VFIO No-IOMMU support" 14 - depends on VFIO 15 54 help 16 55 VFIO is built on the ability to isolate devices using the IOMMU. 17 56 Only with an IOMMU can userspace access to DMA capable devices be ··· 47 48 source "drivers/vfio/platform/Kconfig" 48 49 source "drivers/vfio/mdev/Kconfig" 49 50 source "drivers/vfio/fsl-mc/Kconfig" 51 + endif 52 + 50 53 source "virt/lib/Kconfig"
+2 -1
drivers/vfio/fsl-mc/Kconfig
··· 1 1 config VFIO_FSL_MC 2 2 tristate "VFIO support for QorIQ DPAA2 fsl-mc bus devices" 3 - depends on VFIO && FSL_MC_BUS && EVENTFD 3 + depends on FSL_MC_BUS 4 + select EVENTFD 4 5 help 5 6 Driver to enable support for the VFIO QorIQ DPAA2 fsl-mc 6 7 (Management Complex) devices. This is required to passthrough
-1
drivers/vfio/mdev/Kconfig
··· 2 2 3 3 config VFIO_MDEV 4 4 tristate "Mediated device driver framework" 5 - depends on VFIO 6 5 default n 7 6 help 8 7 Provides a framework to virtualize devices.
+22 -18
drivers/vfio/pci/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 - config VFIO_PCI 3 - tristate "VFIO support for PCI devices" 4 - depends on VFIO && PCI && EVENTFD 5 - depends on MMU 2 + if PCI && MMU 3 + config VFIO_PCI_CORE 4 + tristate 6 5 select VFIO_VIRQFD 7 6 select IRQ_BYPASS_MANAGER 7 + 8 + config VFIO_PCI_MMAP 9 + def_bool y if !S390 10 + 11 + config VFIO_PCI_INTX 12 + def_bool y if !S390 13 + 14 + config VFIO_PCI 15 + tristate "Generic VFIO support for any PCI device" 16 + select VFIO_PCI_CORE 8 17 help 9 - Support for the PCI VFIO bus driver. This is required to make 10 - use of PCI drivers using the VFIO framework. 18 + Support for the generic PCI VFIO bus driver which can connect any 19 + PCI device to the VFIO framework. 11 20 12 21 If you don't know what to do here, say N. 13 22 23 + if VFIO_PCI 14 24 config VFIO_PCI_VGA 15 - bool "VFIO PCI support for VGA devices" 16 - depends on VFIO_PCI && X86 && VGA_ARB 25 + bool "Generic VFIO PCI support for VGA devices" 26 + depends on X86 && VGA_ARB 17 27 help 18 28 Support for VGA extension to VFIO PCI. This exposes an additional 19 29 region on VGA devices for accessing legacy VGA addresses used by ··· 31 21 32 22 If you don't know what to do here, say N. 33 23 34 - config VFIO_PCI_MMAP 35 - depends on VFIO_PCI 36 - def_bool y if !S390 37 - 38 - config VFIO_PCI_INTX 39 - depends on VFIO_PCI 40 - def_bool y if !S390 41 - 42 24 config VFIO_PCI_IGD 43 - bool "VFIO PCI extensions for Intel graphics (GVT-d)" 44 - depends on VFIO_PCI && X86 25 + bool "Generic VFIO PCI extensions for Intel graphics (GVT-d)" 26 + depends on X86 45 27 default y 46 28 help 47 29 Support for Intel IGD specific extensions to enable direct ··· 42 40 and LPC bridge config space. 43 41 44 42 To enable Intel IGD assignment through vfio-pci, say Y. 43 + endif 44 + endif
+5 -3
drivers/vfio/pci/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 3 - vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o 4 - vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o 5 - vfio-pci-$(CONFIG_S390) += vfio_pci_zdev.o 3 + vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o 4 + vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o 5 + obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o 6 6 7 + vfio-pci-y := vfio_pci.o 8 + vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o 7 9 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
+47 -2095
drivers/vfio/pci/vfio_pci.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-only 2 2 /* 3 + * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved 4 + * 3 5 * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 4 6 * Author: Alex Williamson <alex.williamson@redhat.com> 5 7 * ··· 20 18 #include <linux/module.h> 21 19 #include <linux/mutex.h> 22 20 #include <linux/notifier.h> 23 - #include <linux/pci.h> 24 21 #include <linux/pm_runtime.h> 25 22 #include <linux/slab.h> 26 23 #include <linux/types.h> 27 24 #include <linux/uaccess.h> 28 - #include <linux/vfio.h> 29 - #include <linux/vgaarb.h> 30 - #include <linux/nospec.h> 31 - #include <linux/sched/mm.h> 32 25 33 - #include "vfio_pci_private.h" 26 + #include <linux/vfio_pci_core.h> 34 27 35 - #define DRIVER_VERSION "0.2" 36 28 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" 37 29 #define DRIVER_DESC "VFIO PCI - User Level meta-driver" 38 30 ··· 59 63 static bool disable_denylist; 60 64 module_param(disable_denylist, bool, 0444); 61 65 MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users."); 62 - 63 - static inline bool vfio_vga_disabled(void) 64 - { 65 - #ifdef CONFIG_VFIO_PCI_VGA 66 - return disable_vga; 67 - #else 68 - return true; 69 - #endif 70 - } 71 66 72 67 static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev) 73 68 { ··· 98 111 return true; 99 112 } 100 113 101 - /* 102 - * Our VGA arbiter participation is limited since we don't know anything 103 - * about the device itself. However, if the device is the only VGA device 104 - * downstream of a bridge and VFIO VGA support is disabled, then we can 105 - * safely return legacy VGA IO and memory as not decoded since the user 106 - * has no way to get to it and routing can be disabled externally at the 107 - * bridge. 
108 - */ 109 - static unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga) 114 + static int vfio_pci_open_device(struct vfio_device *core_vdev) 110 115 { 111 - struct vfio_pci_device *vdev = opaque; 112 - struct pci_dev *tmp = NULL, *pdev = vdev->pdev; 113 - unsigned char max_busnr; 114 - unsigned int decodes; 115 - 116 - if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus)) 117 - return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | 118 - VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; 119 - 120 - max_busnr = pci_bus_max_busnr(pdev->bus); 121 - decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 122 - 123 - while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) { 124 - if (tmp == pdev || 125 - pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) || 126 - pci_is_root_bus(tmp->bus)) 127 - continue; 128 - 129 - if (tmp->bus->number >= pdev->bus->number && 130 - tmp->bus->number <= max_busnr) { 131 - pci_dev_put(tmp); 132 - decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; 133 - break; 134 - } 135 - } 136 - 137 - return decodes; 138 - } 139 - 140 - static inline bool vfio_pci_is_vga(struct pci_dev *pdev) 141 - { 142 - return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; 143 - } 144 - 145 - static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev) 146 - { 147 - struct resource *res; 148 - int i; 149 - struct vfio_pci_dummy_resource *dummy_res; 150 - 151 - for (i = 0; i < PCI_STD_NUM_BARS; i++) { 152 - int bar = i + PCI_STD_RESOURCES; 153 - 154 - res = &vdev->pdev->resource[bar]; 155 - 156 - if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP)) 157 - goto no_mmap; 158 - 159 - if (!(res->flags & IORESOURCE_MEM)) 160 - goto no_mmap; 161 - 162 - /* 163 - * The PCI core shouldn't set up a resource with a 164 - * type but zero size. But there may be bugs that 165 - * cause us to do that. 
166 - */ 167 - if (!resource_size(res)) 168 - goto no_mmap; 169 - 170 - if (resource_size(res) >= PAGE_SIZE) { 171 - vdev->bar_mmap_supported[bar] = true; 172 - continue; 173 - } 174 - 175 - if (!(res->start & ~PAGE_MASK)) { 176 - /* 177 - * Add a dummy resource to reserve the remainder 178 - * of the exclusive page in case that hot-add 179 - * device's bar is assigned into it. 180 - */ 181 - dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL); 182 - if (dummy_res == NULL) 183 - goto no_mmap; 184 - 185 - dummy_res->resource.name = "vfio sub-page reserved"; 186 - dummy_res->resource.start = res->end + 1; 187 - dummy_res->resource.end = res->start + PAGE_SIZE - 1; 188 - dummy_res->resource.flags = res->flags; 189 - if (request_resource(res->parent, 190 - &dummy_res->resource)) { 191 - kfree(dummy_res); 192 - goto no_mmap; 193 - } 194 - dummy_res->index = bar; 195 - list_add(&dummy_res->res_next, 196 - &vdev->dummy_resources_list); 197 - vdev->bar_mmap_supported[bar] = true; 198 - continue; 199 - } 200 - /* 201 - * Here we don't handle the case when the BAR is not page 202 - * aligned because we can't expect the BAR will be 203 - * assigned into the same location in a page in guest 204 - * when we passthrough the BAR. And it's hard to access 205 - * this BAR in userspace because we have no way to get 206 - * the BAR's location in a page. 207 - */ 208 - no_mmap: 209 - vdev->bar_mmap_supported[bar] = false; 210 - } 211 - } 212 - 213 - struct vfio_pci_group_info; 214 - static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set); 215 - static void vfio_pci_disable(struct vfio_pci_device *vdev); 216 - static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, 217 - struct vfio_pci_group_info *groups); 218 - 219 - /* 220 - * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND 221 - * _and_ the ability detect when the device is asserting INTx via PCI_STATUS. 
222 - * If a device implements the former but not the latter we would typically 223 - * expect broken_intx_masking be set and require an exclusive interrupt. 224 - * However since we do have control of the device's ability to assert INTx, 225 - * we can instead pretend that the device does not implement INTx, virtualizing 226 - * the pin register to report zero and maintaining DisINTx set on the host. 227 - */ 228 - static bool vfio_pci_nointx(struct pci_dev *pdev) 229 - { 230 - switch (pdev->vendor) { 231 - case PCI_VENDOR_ID_INTEL: 232 - switch (pdev->device) { 233 - /* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */ 234 - case 0x1572: 235 - case 0x1574: 236 - case 0x1580 ... 0x1581: 237 - case 0x1583 ... 0x158b: 238 - case 0x37d0 ... 0x37d2: 239 - /* X550 */ 240 - case 0x1563: 241 - return true; 242 - default: 243 - return false; 244 - } 245 - } 246 - 247 - return false; 248 - } 249 - 250 - static void vfio_pci_probe_power_state(struct vfio_pci_device *vdev) 251 - { 252 - struct pci_dev *pdev = vdev->pdev; 253 - u16 pmcsr; 254 - 255 - if (!pdev->pm_cap) 256 - return; 257 - 258 - pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr); 259 - 260 - vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET); 261 - } 262 - 263 - /* 264 - * pci_set_power_state() wrapper handling devices which perform a soft reset on 265 - * D3->D0 transition. Save state prior to D0/1/2->D3, stash it on the vdev, 266 - * restore when returned to D0. Saved separately from pci_saved_state for use 267 - * by PM capability emulation and separately from pci_dev internal saved state 268 - * to avoid it being overwritten and consumed around other resets. 
269 - */ 270 - int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state) 271 - { 272 - struct pci_dev *pdev = vdev->pdev; 273 - bool needs_restore = false, needs_save = false; 274 - int ret; 275 - 276 - if (vdev->needs_pm_restore) { 277 - if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) { 278 - pci_save_state(pdev); 279 - needs_save = true; 280 - } 281 - 282 - if (pdev->current_state >= PCI_D3hot && state <= PCI_D0) 283 - needs_restore = true; 284 - } 285 - 286 - ret = pci_set_power_state(pdev, state); 287 - 288 - if (!ret) { 289 - /* D3 might be unsupported via quirk, skip unless in D3 */ 290 - if (needs_save && pdev->current_state >= PCI_D3hot) { 291 - vdev->pm_save = pci_store_saved_state(pdev); 292 - } else if (needs_restore) { 293 - pci_load_and_free_saved_state(pdev, &vdev->pm_save); 294 - pci_restore_state(pdev); 295 - } 296 - } 297 - 298 - return ret; 299 - } 300 - 301 - static int vfio_pci_enable(struct vfio_pci_device *vdev) 302 - { 116 + struct vfio_pci_core_device *vdev = 117 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 303 118 struct pci_dev *pdev = vdev->pdev; 304 119 int ret; 305 - u16 cmd; 306 - u8 msix_pos; 307 120 308 - vfio_pci_set_power_state(vdev, PCI_D0); 309 - 310 - /* Don't allow our initial saved state to include busmaster */ 311 - pci_clear_master(pdev); 312 - 313 - ret = pci_enable_device(pdev); 121 + ret = vfio_pci_core_enable(vdev); 314 122 if (ret) 315 123 return ret; 316 - 317 - /* If reset fails because of the device lock, fail this path entirely */ 318 - ret = pci_try_reset_function(pdev); 319 - if (ret == -EAGAIN) { 320 - pci_disable_device(pdev); 321 - return ret; 322 - } 323 - 324 - vdev->reset_works = !ret; 325 - pci_save_state(pdev); 326 - vdev->pci_saved_state = pci_store_saved_state(pdev); 327 - if (!vdev->pci_saved_state) 328 - pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__); 329 - 330 - if (likely(!nointxmask)) { 331 - if (vfio_pci_nointx(pdev)) { 332 - 
pci_info(pdev, "Masking broken INTx support\n"); 333 - vdev->nointx = true; 334 - pci_intx(pdev, 0); 335 - } else 336 - vdev->pci_2_3 = pci_intx_mask_supported(pdev); 337 - } 338 - 339 - pci_read_config_word(pdev, PCI_COMMAND, &cmd); 340 - if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) { 341 - cmd &= ~PCI_COMMAND_INTX_DISABLE; 342 - pci_write_config_word(pdev, PCI_COMMAND, cmd); 343 - } 344 - 345 - ret = vfio_config_init(vdev); 346 - if (ret) { 347 - kfree(vdev->pci_saved_state); 348 - vdev->pci_saved_state = NULL; 349 - pci_disable_device(pdev); 350 - return ret; 351 - } 352 - 353 - msix_pos = pdev->msix_cap; 354 - if (msix_pos) { 355 - u16 flags; 356 - u32 table; 357 - 358 - pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags); 359 - pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table); 360 - 361 - vdev->msix_bar = table & PCI_MSIX_TABLE_BIR; 362 - vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET; 363 - vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; 364 - } else 365 - vdev->msix_bar = 0xFF; 366 - 367 - if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) 368 - vdev->has_vga = true; 369 124 370 125 if (vfio_pci_is_vga(pdev) && 371 126 pdev->vendor == PCI_VENDOR_ID_INTEL && ··· 115 386 ret = vfio_pci_igd_init(vdev); 116 387 if (ret && ret != -ENODEV) { 117 388 pci_warn(pdev, "Failed to setup Intel IGD regions\n"); 118 - goto disable_exit; 119 - } 120 - } 121 - 122 - vfio_pci_probe_mmaps(vdev); 123 - 124 - return 0; 125 - 126 - disable_exit: 127 - vfio_pci_disable(vdev); 128 - return ret; 129 - } 130 - 131 - static void vfio_pci_disable(struct vfio_pci_device *vdev) 132 - { 133 - struct pci_dev *pdev = vdev->pdev; 134 - struct vfio_pci_dummy_resource *dummy_res, *tmp; 135 - struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; 136 - int i, bar; 137 - 138 - /* For needs_reset */ 139 - lockdep_assert_held(&vdev->vdev.dev_set->lock); 140 - 141 - /* Stop the device from further DMA */ 142 - pci_clear_master(pdev); 143 - 144 - 
vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | 145 - VFIO_IRQ_SET_ACTION_TRIGGER, 146 - vdev->irq_type, 0, 0, NULL); 147 - 148 - /* Device closed, don't need mutex here */ 149 - list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, 150 - &vdev->ioeventfds_list, next) { 151 - vfio_virqfd_disable(&ioeventfd->virqfd); 152 - list_del(&ioeventfd->next); 153 - kfree(ioeventfd); 154 - } 155 - vdev->ioeventfds_nr = 0; 156 - 157 - vdev->virq_disabled = false; 158 - 159 - for (i = 0; i < vdev->num_regions; i++) 160 - vdev->region[i].ops->release(vdev, &vdev->region[i]); 161 - 162 - vdev->num_regions = 0; 163 - kfree(vdev->region); 164 - vdev->region = NULL; /* don't krealloc a freed pointer */ 165 - 166 - vfio_config_free(vdev); 167 - 168 - for (i = 0; i < PCI_STD_NUM_BARS; i++) { 169 - bar = i + PCI_STD_RESOURCES; 170 - if (!vdev->barmap[bar]) 171 - continue; 172 - pci_iounmap(pdev, vdev->barmap[bar]); 173 - pci_release_selected_regions(pdev, 1 << bar); 174 - vdev->barmap[bar] = NULL; 175 - } 176 - 177 - list_for_each_entry_safe(dummy_res, tmp, 178 - &vdev->dummy_resources_list, res_next) { 179 - list_del(&dummy_res->res_next); 180 - release_resource(&dummy_res->resource); 181 - kfree(dummy_res); 182 - } 183 - 184 - vdev->needs_reset = true; 185 - 186 - /* 187 - * If we have saved state, restore it. If we can reset the device, 188 - * even better. Resetting with current state seems better than 189 - * nothing, but saving and restoring current state without reset 190 - * is just busy work. 191 - */ 192 - if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) { 193 - pci_info(pdev, "%s: Couldn't reload saved state\n", __func__); 194 - 195 - if (!vdev->reset_works) 196 - goto out; 197 - 198 - pci_save_state(pdev); 199 - } 200 - 201 - /* 202 - * Disable INTx and MSI, presumably to avoid spurious interrupts 203 - * during reset. 
Stolen from pci_reset_function() 204 - */ 205 - pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); 206 - 207 - /* 208 - * Try to get the locks ourselves to prevent a deadlock. The 209 - * success of this is dependent on being able to lock the device, 210 - * which is not always possible. 211 - * We can not use the "try" reset interface here, which will 212 - * overwrite the previously restored configuration information. 213 - */ 214 - if (vdev->reset_works && pci_dev_trylock(pdev)) { 215 - if (!__pci_reset_function_locked(pdev)) 216 - vdev->needs_reset = false; 217 - pci_dev_unlock(pdev); 218 - } 219 - 220 - pci_restore_state(pdev); 221 - out: 222 - pci_disable_device(pdev); 223 - 224 - if (!vfio_pci_dev_set_try_reset(vdev->vdev.dev_set) && !disable_idle_d3) 225 - vfio_pci_set_power_state(vdev, PCI_D3hot); 226 - } 227 - 228 - static struct pci_driver vfio_pci_driver; 229 - 230 - static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev) 231 - { 232 - struct pci_dev *physfn = pci_physfn(vdev->pdev); 233 - struct vfio_device *pf_dev; 234 - 235 - if (!vdev->pdev->is_virtfn) 236 - return NULL; 237 - 238 - pf_dev = vfio_device_get_from_dev(&physfn->dev); 239 - if (!pf_dev) 240 - return NULL; 241 - 242 - if (pci_dev_driver(physfn) != &vfio_pci_driver) { 243 - vfio_device_put(pf_dev); 244 - return NULL; 245 - } 246 - 247 - return container_of(pf_dev, struct vfio_pci_device, vdev); 248 - } 249 - 250 - static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) 251 - { 252 - struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev); 253 - 254 - if (!pf_vdev) 255 - return; 256 - 257 - mutex_lock(&pf_vdev->vf_token->lock); 258 - pf_vdev->vf_token->users += val; 259 - WARN_ON(pf_vdev->vf_token->users < 0); 260 - mutex_unlock(&pf_vdev->vf_token->lock); 261 - 262 - vfio_device_put(&pf_vdev->vdev); 263 - } 264 - 265 - static void vfio_pci_close_device(struct vfio_device *core_vdev) 266 - { 267 - struct vfio_pci_device *vdev = 268 - 
container_of(core_vdev, struct vfio_pci_device, vdev); 269 - 270 - vfio_pci_vf_token_user_add(vdev, -1); 271 - vfio_spapr_pci_eeh_release(vdev->pdev); 272 - vfio_pci_disable(vdev); 273 - 274 - mutex_lock(&vdev->igate); 275 - if (vdev->err_trigger) { 276 - eventfd_ctx_put(vdev->err_trigger); 277 - vdev->err_trigger = NULL; 278 - } 279 - if (vdev->req_trigger) { 280 - eventfd_ctx_put(vdev->req_trigger); 281 - vdev->req_trigger = NULL; 282 - } 283 - mutex_unlock(&vdev->igate); 284 - } 285 - 286 - static int vfio_pci_open_device(struct vfio_device *core_vdev) 287 - { 288 - struct vfio_pci_device *vdev = 289 - container_of(core_vdev, struct vfio_pci_device, vdev); 290 - int ret = 0; 291 - 292 - ret = vfio_pci_enable(vdev); 293 - if (ret) 294 - return ret; 295 - 296 - vfio_spapr_pci_eeh_open(vdev->pdev); 297 - vfio_pci_vf_token_user_add(vdev, 1); 298 - return 0; 299 - } 300 - 301 - static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) 302 - { 303 - if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { 304 - u8 pin; 305 - 306 - if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || 307 - vdev->nointx || vdev->pdev->is_virtfn) 308 - return 0; 309 - 310 - pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); 311 - 312 - return pin ? 
1 : 0; 313 - } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { 314 - u8 pos; 315 - u16 flags; 316 - 317 - pos = vdev->pdev->msi_cap; 318 - if (pos) { 319 - pci_read_config_word(vdev->pdev, 320 - pos + PCI_MSI_FLAGS, &flags); 321 - return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1); 322 - } 323 - } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { 324 - u8 pos; 325 - u16 flags; 326 - 327 - pos = vdev->pdev->msix_cap; 328 - if (pos) { 329 - pci_read_config_word(vdev->pdev, 330 - pos + PCI_MSIX_FLAGS, &flags); 331 - 332 - return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; 333 - } 334 - } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { 335 - if (pci_is_pcie(vdev->pdev)) 336 - return 1; 337 - } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { 338 - return 1; 339 - } 340 - 341 - return 0; 342 - } 343 - 344 - static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) 345 - { 346 - (*(int *)data)++; 347 - return 0; 348 - } 349 - 350 - struct vfio_pci_fill_info { 351 - int max; 352 - int cur; 353 - struct vfio_pci_dependent_device *devices; 354 - }; 355 - 356 - static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) 357 - { 358 - struct vfio_pci_fill_info *fill = data; 359 - struct iommu_group *iommu_group; 360 - 361 - if (fill->cur == fill->max) 362 - return -EAGAIN; /* Something changed, try again */ 363 - 364 - iommu_group = iommu_group_get(&pdev->dev); 365 - if (!iommu_group) 366 - return -EPERM; /* Cannot reset non-isolated devices */ 367 - 368 - fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); 369 - fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); 370 - fill->devices[fill->cur].bus = pdev->bus->number; 371 - fill->devices[fill->cur].devfn = pdev->devfn; 372 - fill->cur++; 373 - iommu_group_put(iommu_group); 374 - return 0; 375 - } 376 - 377 - struct vfio_pci_group_info { 378 - int count; 379 - struct vfio_group **groups; 380 - }; 381 - 382 - static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) 383 - { 384 - for (; 
pdev; pdev = pdev->bus->self) 385 - if (pdev->bus == slot->bus) 386 - return (pdev->slot == slot); 387 - return false; 388 - } 389 - 390 - struct vfio_pci_walk_info { 391 - int (*fn)(struct pci_dev *, void *data); 392 - void *data; 393 - struct pci_dev *pdev; 394 - bool slot; 395 - int ret; 396 - }; 397 - 398 - static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data) 399 - { 400 - struct vfio_pci_walk_info *walk = data; 401 - 402 - if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot)) 403 - walk->ret = walk->fn(pdev, walk->data); 404 - 405 - return walk->ret; 406 - } 407 - 408 - static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, 409 - int (*fn)(struct pci_dev *, 410 - void *data), void *data, 411 - bool slot) 412 - { 413 - struct vfio_pci_walk_info walk = { 414 - .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0, 415 - }; 416 - 417 - pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk); 418 - 419 - return walk.ret; 420 - } 421 - 422 - static int msix_mmappable_cap(struct vfio_pci_device *vdev, 423 - struct vfio_info_cap *caps) 424 - { 425 - struct vfio_info_cap_header header = { 426 - .id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, 427 - .version = 1 428 - }; 429 - 430 - return vfio_info_add_capability(caps, &header, sizeof(header)); 431 - } 432 - 433 - int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, 434 - unsigned int type, unsigned int subtype, 435 - const struct vfio_pci_regops *ops, 436 - size_t size, u32 flags, void *data) 437 - { 438 - struct vfio_pci_region *region; 439 - 440 - region = krealloc(vdev->region, 441 - (vdev->num_regions + 1) * sizeof(*region), 442 - GFP_KERNEL); 443 - if (!region) 444 - return -ENOMEM; 445 - 446 - vdev->region = region; 447 - vdev->region[vdev->num_regions].type = type; 448 - vdev->region[vdev->num_regions].subtype = subtype; 449 - vdev->region[vdev->num_regions].ops = ops; 450 - vdev->region[vdev->num_regions].size = size; 451 - vdev->region[vdev->num_regions].flags = 
flags; 452 - vdev->region[vdev->num_regions].data = data; 453 - 454 - vdev->num_regions++; 455 - 456 - return 0; 457 - } 458 - 459 - static long vfio_pci_ioctl(struct vfio_device *core_vdev, 460 - unsigned int cmd, unsigned long arg) 461 - { 462 - struct vfio_pci_device *vdev = 463 - container_of(core_vdev, struct vfio_pci_device, vdev); 464 - unsigned long minsz; 465 - 466 - if (cmd == VFIO_DEVICE_GET_INFO) { 467 - struct vfio_device_info info; 468 - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 469 - unsigned long capsz; 470 - int ret; 471 - 472 - minsz = offsetofend(struct vfio_device_info, num_irqs); 473 - 474 - /* For backward compatibility, cannot require this */ 475 - capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); 476 - 477 - if (copy_from_user(&info, (void __user *)arg, minsz)) 478 - return -EFAULT; 479 - 480 - if (info.argsz < minsz) 481 - return -EINVAL; 482 - 483 - if (info.argsz >= capsz) { 484 - minsz = capsz; 485 - info.cap_offset = 0; 486 - } 487 - 488 - info.flags = VFIO_DEVICE_FLAGS_PCI; 489 - 490 - if (vdev->reset_works) 491 - info.flags |= VFIO_DEVICE_FLAGS_RESET; 492 - 493 - info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; 494 - info.num_irqs = VFIO_PCI_NUM_IRQS; 495 - 496 - ret = vfio_pci_info_zdev_add_caps(vdev, &caps); 497 - if (ret && ret != -ENODEV) { 498 - pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n"); 389 + vfio_pci_core_disable(vdev); 499 390 return ret; 500 391 } 501 - 502 - if (caps.size) { 503 - info.flags |= VFIO_DEVICE_FLAGS_CAPS; 504 - if (info.argsz < sizeof(info) + caps.size) { 505 - info.argsz = sizeof(info) + caps.size; 506 - } else { 507 - vfio_info_cap_shift(&caps, sizeof(info)); 508 - if (copy_to_user((void __user *)arg + 509 - sizeof(info), caps.buf, 510 - caps.size)) { 511 - kfree(caps.buf); 512 - return -EFAULT; 513 - } 514 - info.cap_offset = sizeof(info); 515 - } 516 - 517 - kfree(caps.buf); 518 - } 519 - 520 - return copy_to_user((void __user *)arg, &info, minsz) 
? 521 - -EFAULT : 0; 522 - 523 - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { 524 - struct pci_dev *pdev = vdev->pdev; 525 - struct vfio_region_info info; 526 - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 527 - int i, ret; 528 - 529 - minsz = offsetofend(struct vfio_region_info, offset); 530 - 531 - if (copy_from_user(&info, (void __user *)arg, minsz)) 532 - return -EFAULT; 533 - 534 - if (info.argsz < minsz) 535 - return -EINVAL; 536 - 537 - switch (info.index) { 538 - case VFIO_PCI_CONFIG_REGION_INDEX: 539 - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 540 - info.size = pdev->cfg_size; 541 - info.flags = VFIO_REGION_INFO_FLAG_READ | 542 - VFIO_REGION_INFO_FLAG_WRITE; 543 - break; 544 - case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: 545 - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 546 - info.size = pci_resource_len(pdev, info.index); 547 - if (!info.size) { 548 - info.flags = 0; 549 - break; 550 - } 551 - 552 - info.flags = VFIO_REGION_INFO_FLAG_READ | 553 - VFIO_REGION_INFO_FLAG_WRITE; 554 - if (vdev->bar_mmap_supported[info.index]) { 555 - info.flags |= VFIO_REGION_INFO_FLAG_MMAP; 556 - if (info.index == vdev->msix_bar) { 557 - ret = msix_mmappable_cap(vdev, &caps); 558 - if (ret) 559 - return ret; 560 - } 561 - } 562 - 563 - break; 564 - case VFIO_PCI_ROM_REGION_INDEX: 565 - { 566 - void __iomem *io; 567 - size_t size; 568 - u16 cmd; 569 - 570 - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 571 - info.flags = 0; 572 - 573 - /* Report the BAR size, not the ROM size */ 574 - info.size = pci_resource_len(pdev, info.index); 575 - if (!info.size) { 576 - /* Shadow ROMs appear as PCI option ROMs */ 577 - if (pdev->resource[PCI_ROM_RESOURCE].flags & 578 - IORESOURCE_ROM_SHADOW) 579 - info.size = 0x20000; 580 - else 581 - break; 582 - } 583 - 584 - /* 585 - * Is it really there? Enable memory decode for 586 - * implicit access in pci_map_rom(). 
587 - */ 588 - cmd = vfio_pci_memory_lock_and_enable(vdev); 589 - io = pci_map_rom(pdev, &size); 590 - if (io) { 591 - info.flags = VFIO_REGION_INFO_FLAG_READ; 592 - pci_unmap_rom(pdev, io); 593 - } else { 594 - info.size = 0; 595 - } 596 - vfio_pci_memory_unlock_and_restore(vdev, cmd); 597 - 598 - break; 599 - } 600 - case VFIO_PCI_VGA_REGION_INDEX: 601 - if (!vdev->has_vga) 602 - return -EINVAL; 603 - 604 - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 605 - info.size = 0xc0000; 606 - info.flags = VFIO_REGION_INFO_FLAG_READ | 607 - VFIO_REGION_INFO_FLAG_WRITE; 608 - 609 - break; 610 - default: 611 - { 612 - struct vfio_region_info_cap_type cap_type = { 613 - .header.id = VFIO_REGION_INFO_CAP_TYPE, 614 - .header.version = 1 }; 615 - 616 - if (info.index >= 617 - VFIO_PCI_NUM_REGIONS + vdev->num_regions) 618 - return -EINVAL; 619 - info.index = array_index_nospec(info.index, 620 - VFIO_PCI_NUM_REGIONS + 621 - vdev->num_regions); 622 - 623 - i = info.index - VFIO_PCI_NUM_REGIONS; 624 - 625 - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 626 - info.size = vdev->region[i].size; 627 - info.flags = vdev->region[i].flags; 628 - 629 - cap_type.type = vdev->region[i].type; 630 - cap_type.subtype = vdev->region[i].subtype; 631 - 632 - ret = vfio_info_add_capability(&caps, &cap_type.header, 633 - sizeof(cap_type)); 634 - if (ret) 635 - return ret; 636 - 637 - if (vdev->region[i].ops->add_capability) { 638 - ret = vdev->region[i].ops->add_capability(vdev, 639 - &vdev->region[i], &caps); 640 - if (ret) 641 - return ret; 642 - } 643 - } 644 - } 645 - 646 - if (caps.size) { 647 - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; 648 - if (info.argsz < sizeof(info) + caps.size) { 649 - info.argsz = sizeof(info) + caps.size; 650 - info.cap_offset = 0; 651 - } else { 652 - vfio_info_cap_shift(&caps, sizeof(info)); 653 - if (copy_to_user((void __user *)arg + 654 - sizeof(info), caps.buf, 655 - caps.size)) { 656 - kfree(caps.buf); 657 - return -EFAULT; 658 - } 659 - 
info.cap_offset = sizeof(info); 660 - } 661 - 662 - kfree(caps.buf); 663 - } 664 - 665 - return copy_to_user((void __user *)arg, &info, minsz) ? 666 - -EFAULT : 0; 667 - 668 - } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { 669 - struct vfio_irq_info info; 670 - 671 - minsz = offsetofend(struct vfio_irq_info, count); 672 - 673 - if (copy_from_user(&info, (void __user *)arg, minsz)) 674 - return -EFAULT; 675 - 676 - if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) 677 - return -EINVAL; 678 - 679 - switch (info.index) { 680 - case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: 681 - case VFIO_PCI_REQ_IRQ_INDEX: 682 - break; 683 - case VFIO_PCI_ERR_IRQ_INDEX: 684 - if (pci_is_pcie(vdev->pdev)) 685 - break; 686 - fallthrough; 687 - default: 688 - return -EINVAL; 689 - } 690 - 691 - info.flags = VFIO_IRQ_INFO_EVENTFD; 692 - 693 - info.count = vfio_pci_get_irq_count(vdev, info.index); 694 - 695 - if (info.index == VFIO_PCI_INTX_IRQ_INDEX) 696 - info.flags |= (VFIO_IRQ_INFO_MASKABLE | 697 - VFIO_IRQ_INFO_AUTOMASKED); 698 - else 699 - info.flags |= VFIO_IRQ_INFO_NORESIZE; 700 - 701 - return copy_to_user((void __user *)arg, &info, minsz) ? 
702 - -EFAULT : 0; 703 - 704 - } else if (cmd == VFIO_DEVICE_SET_IRQS) { 705 - struct vfio_irq_set hdr; 706 - u8 *data = NULL; 707 - int max, ret = 0; 708 - size_t data_size = 0; 709 - 710 - minsz = offsetofend(struct vfio_irq_set, count); 711 - 712 - if (copy_from_user(&hdr, (void __user *)arg, minsz)) 713 - return -EFAULT; 714 - 715 - max = vfio_pci_get_irq_count(vdev, hdr.index); 716 - 717 - ret = vfio_set_irqs_validate_and_prepare(&hdr, max, 718 - VFIO_PCI_NUM_IRQS, &data_size); 719 - if (ret) 720 - return ret; 721 - 722 - if (data_size) { 723 - data = memdup_user((void __user *)(arg + minsz), 724 - data_size); 725 - if (IS_ERR(data)) 726 - return PTR_ERR(data); 727 - } 728 - 729 - mutex_lock(&vdev->igate); 730 - 731 - ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, 732 - hdr.start, hdr.count, data); 733 - 734 - mutex_unlock(&vdev->igate); 735 - kfree(data); 736 - 737 - return ret; 738 - 739 - } else if (cmd == VFIO_DEVICE_RESET) { 740 - int ret; 741 - 742 - if (!vdev->reset_works) 743 - return -EINVAL; 744 - 745 - vfio_pci_zap_and_down_write_memory_lock(vdev); 746 - ret = pci_try_reset_function(vdev->pdev); 747 - up_write(&vdev->memory_lock); 748 - 749 - return ret; 750 - 751 - } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { 752 - struct vfio_pci_hot_reset_info hdr; 753 - struct vfio_pci_fill_info fill = { 0 }; 754 - struct vfio_pci_dependent_device *devices = NULL; 755 - bool slot = false; 756 - int ret = 0; 757 - 758 - minsz = offsetofend(struct vfio_pci_hot_reset_info, count); 759 - 760 - if (copy_from_user(&hdr, (void __user *)arg, minsz)) 761 - return -EFAULT; 762 - 763 - if (hdr.argsz < minsz) 764 - return -EINVAL; 765 - 766 - hdr.flags = 0; 767 - 768 - /* Can we do a slot or bus reset or neither? */ 769 - if (!pci_probe_reset_slot(vdev->pdev->slot)) 770 - slot = true; 771 - else if (pci_probe_reset_bus(vdev->pdev->bus)) 772 - return -ENODEV; 773 - 774 - /* How many devices are affected? 
*/ 775 - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 776 - vfio_pci_count_devs, 777 - &fill.max, slot); 778 - if (ret) 779 - return ret; 780 - 781 - WARN_ON(!fill.max); /* Should always be at least one */ 782 - 783 - /* 784 - * If there's enough space, fill it now, otherwise return 785 - * -ENOSPC and the number of devices affected. 786 - */ 787 - if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { 788 - ret = -ENOSPC; 789 - hdr.count = fill.max; 790 - goto reset_info_exit; 791 - } 792 - 793 - devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); 794 - if (!devices) 795 - return -ENOMEM; 796 - 797 - fill.devices = devices; 798 - 799 - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 800 - vfio_pci_fill_devs, 801 - &fill, slot); 802 - 803 - /* 804 - * If a device was removed between counting and filling, 805 - * we may come up short of fill.max. If a device was 806 - * added, we'll have a return of -EAGAIN above. 807 - */ 808 - if (!ret) 809 - hdr.count = fill.cur; 810 - 811 - reset_info_exit: 812 - if (copy_to_user((void __user *)arg, &hdr, minsz)) 813 - ret = -EFAULT; 814 - 815 - if (!ret) { 816 - if (copy_to_user((void __user *)(arg + minsz), devices, 817 - hdr.count * sizeof(*devices))) 818 - ret = -EFAULT; 819 - } 820 - 821 - kfree(devices); 822 - return ret; 823 - 824 - } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { 825 - struct vfio_pci_hot_reset hdr; 826 - int32_t *group_fds; 827 - struct vfio_group **groups; 828 - struct vfio_pci_group_info info; 829 - bool slot = false; 830 - int group_idx, count = 0, ret = 0; 831 - 832 - minsz = offsetofend(struct vfio_pci_hot_reset, count); 833 - 834 - if (copy_from_user(&hdr, (void __user *)arg, minsz)) 835 - return -EFAULT; 836 - 837 - if (hdr.argsz < minsz || hdr.flags) 838 - return -EINVAL; 839 - 840 - /* Can we do a slot or bus reset or neither? 
*/ 841 - if (!pci_probe_reset_slot(vdev->pdev->slot)) 842 - slot = true; 843 - else if (pci_probe_reset_bus(vdev->pdev->bus)) 844 - return -ENODEV; 845 - 846 - /* 847 - * We can't let userspace give us an arbitrarily large 848 - * buffer to copy, so verify how many we think there 849 - * could be. Note groups can have multiple devices so 850 - * one group per device is the max. 851 - */ 852 - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 853 - vfio_pci_count_devs, 854 - &count, slot); 855 - if (ret) 856 - return ret; 857 - 858 - /* Somewhere between 1 and count is OK */ 859 - if (!hdr.count || hdr.count > count) 860 - return -EINVAL; 861 - 862 - group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); 863 - groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL); 864 - if (!group_fds || !groups) { 865 - kfree(group_fds); 866 - kfree(groups); 867 - return -ENOMEM; 868 - } 869 - 870 - if (copy_from_user(group_fds, (void __user *)(arg + minsz), 871 - hdr.count * sizeof(*group_fds))) { 872 - kfree(group_fds); 873 - kfree(groups); 874 - return -EFAULT; 875 - } 876 - 877 - /* 878 - * For each group_fd, get the group through the vfio external 879 - * user interface and store the group and iommu ID. This 880 - * ensures the group is held across the reset. 
881 - */ 882 - for (group_idx = 0; group_idx < hdr.count; group_idx++) { 883 - struct vfio_group *group; 884 - struct fd f = fdget(group_fds[group_idx]); 885 - if (!f.file) { 886 - ret = -EBADF; 887 - break; 888 - } 889 - 890 - group = vfio_group_get_external_user(f.file); 891 - fdput(f); 892 - if (IS_ERR(group)) { 893 - ret = PTR_ERR(group); 894 - break; 895 - } 896 - 897 - groups[group_idx] = group; 898 - } 899 - 900 - kfree(group_fds); 901 - 902 - /* release reference to groups on error */ 903 - if (ret) 904 - goto hot_reset_release; 905 - 906 - info.count = hdr.count; 907 - info.groups = groups; 908 - 909 - ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); 910 - 911 - hot_reset_release: 912 - for (group_idx--; group_idx >= 0; group_idx--) 913 - vfio_group_put_external_user(groups[group_idx]); 914 - 915 - kfree(groups); 916 - return ret; 917 - } else if (cmd == VFIO_DEVICE_IOEVENTFD) { 918 - struct vfio_device_ioeventfd ioeventfd; 919 - int count; 920 - 921 - minsz = offsetofend(struct vfio_device_ioeventfd, fd); 922 - 923 - if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) 924 - return -EFAULT; 925 - 926 - if (ioeventfd.argsz < minsz) 927 - return -EINVAL; 928 - 929 - if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) 930 - return -EINVAL; 931 - 932 - count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; 933 - 934 - if (hweight8(count) != 1 || ioeventfd.fd < -1) 935 - return -EINVAL; 936 - 937 - return vfio_pci_ioeventfd(vdev, ioeventfd.offset, 938 - ioeventfd.data, count, ioeventfd.fd); 939 - } else if (cmd == VFIO_DEVICE_FEATURE) { 940 - struct vfio_device_feature feature; 941 - uuid_t uuid; 942 - 943 - minsz = offsetofend(struct vfio_device_feature, flags); 944 - 945 - if (copy_from_user(&feature, (void __user *)arg, minsz)) 946 - return -EFAULT; 947 - 948 - if (feature.argsz < minsz) 949 - return -EINVAL; 950 - 951 - /* Check unknown flags */ 952 - if (feature.flags & ~(VFIO_DEVICE_FEATURE_MASK | 953 - VFIO_DEVICE_FEATURE_SET | 
954 - VFIO_DEVICE_FEATURE_GET | 955 - VFIO_DEVICE_FEATURE_PROBE)) 956 - return -EINVAL; 957 - 958 - /* GET & SET are mutually exclusive except with PROBE */ 959 - if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) && 960 - (feature.flags & VFIO_DEVICE_FEATURE_SET) && 961 - (feature.flags & VFIO_DEVICE_FEATURE_GET)) 962 - return -EINVAL; 963 - 964 - switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) { 965 - case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: 966 - if (!vdev->vf_token) 967 - return -ENOTTY; 968 - 969 - /* 970 - * We do not support GET of the VF Token UUID as this 971 - * could expose the token of the previous device user. 972 - */ 973 - if (feature.flags & VFIO_DEVICE_FEATURE_GET) 974 - return -EINVAL; 975 - 976 - if (feature.flags & VFIO_DEVICE_FEATURE_PROBE) 977 - return 0; 978 - 979 - /* Don't SET unless told to do so */ 980 - if (!(feature.flags & VFIO_DEVICE_FEATURE_SET)) 981 - return -EINVAL; 982 - 983 - if (feature.argsz < minsz + sizeof(uuid)) 984 - return -EINVAL; 985 - 986 - if (copy_from_user(&uuid, (void __user *)(arg + minsz), 987 - sizeof(uuid))) 988 - return -EFAULT; 989 - 990 - mutex_lock(&vdev->vf_token->lock); 991 - uuid_copy(&vdev->vf_token->uuid, &uuid); 992 - mutex_unlock(&vdev->vf_token->lock); 993 - 994 - return 0; 995 - default: 996 - return -ENOTTY; 997 - } 998 392 } 999 393 1000 - return -ENOTTY; 1001 - } 1002 - 1003 - static ssize_t vfio_pci_rw(struct vfio_pci_device *vdev, char __user *buf, 1004 - size_t count, loff_t *ppos, bool iswrite) 1005 - { 1006 - unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 1007 - 1008 - if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) 1009 - return -EINVAL; 1010 - 1011 - switch (index) { 1012 - case VFIO_PCI_CONFIG_REGION_INDEX: 1013 - return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); 1014 - 1015 - case VFIO_PCI_ROM_REGION_INDEX: 1016 - if (iswrite) 1017 - return -EINVAL; 1018 - return vfio_pci_bar_rw(vdev, buf, count, ppos, false); 1019 - 1020 - case VFIO_PCI_BAR0_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: 1021 - return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); 1022 - 1023 - case VFIO_PCI_VGA_REGION_INDEX: 1024 - return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); 1025 - default: 1026 - index -= VFIO_PCI_NUM_REGIONS; 1027 - return vdev->region[index].ops->rw(vdev, buf, 1028 - count, ppos, iswrite); 1029 - } 1030 - 1031 - return -EINVAL; 1032 - } 1033 - 1034 - static ssize_t vfio_pci_read(struct vfio_device *core_vdev, char __user *buf, 1035 - size_t count, loff_t *ppos) 1036 - { 1037 - struct vfio_pci_device *vdev = 1038 - container_of(core_vdev, struct vfio_pci_device, vdev); 1039 - 1040 - if (!count) 1041 - return 0; 1042 - 1043 - return vfio_pci_rw(vdev, buf, count, ppos, false); 1044 - } 1045 - 1046 - static ssize_t vfio_pci_write(struct vfio_device *core_vdev, const char __user *buf, 1047 - size_t count, loff_t *ppos) 1048 - { 1049 - struct vfio_pci_device *vdev = 1050 - container_of(core_vdev, struct vfio_pci_device, vdev); 1051 - 1052 - if (!count) 1053 - return 0; 1054 - 1055 - return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true); 1056 - } 1057 - 1058 - /* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ 1059 - static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try) 1060 - { 1061 - struct vfio_pci_mmap_vma *mmap_vma, *tmp; 1062 - 1063 - /* 1064 - * Lock ordering: 1065 - * vma_lock is nested under mmap_lock for vm_ops callback paths. 1066 - * The memory_lock semaphore is used by both code paths calling 1067 - * into this function to zap vmas and the vm_ops.fault callback 1068 - * to protect the memory enable state of the device. 1069 - * 1070 - * When zapping vmas we need to maintain the mmap_lock => vma_lock 1071 - * ordering, which requires using vma_lock to walk vma_list to 1072 - * acquire an mm, then dropping vma_lock to get the mmap_lock and 1073 - * reacquiring vma_lock. 
This logic is derived from similar 1074 - * requirements in uverbs_user_mmap_disassociate(). 1075 - * 1076 - * mmap_lock must always be the top-level lock when it is taken. 1077 - * Therefore we can only hold the memory_lock write lock when 1078 - * vma_list is empty, as we'd need to take mmap_lock to clear 1079 - * entries. vma_list can only be guaranteed empty when holding 1080 - * vma_lock, thus memory_lock is nested under vma_lock. 1081 - * 1082 - * This enables the vm_ops.fault callback to acquire vma_lock, 1083 - * followed by memory_lock read lock, while already holding 1084 - * mmap_lock without risk of deadlock. 1085 - */ 1086 - while (1) { 1087 - struct mm_struct *mm = NULL; 1088 - 1089 - if (try) { 1090 - if (!mutex_trylock(&vdev->vma_lock)) 1091 - return 0; 1092 - } else { 1093 - mutex_lock(&vdev->vma_lock); 1094 - } 1095 - while (!list_empty(&vdev->vma_list)) { 1096 - mmap_vma = list_first_entry(&vdev->vma_list, 1097 - struct vfio_pci_mmap_vma, 1098 - vma_next); 1099 - mm = mmap_vma->vma->vm_mm; 1100 - if (mmget_not_zero(mm)) 1101 - break; 1102 - 1103 - list_del(&mmap_vma->vma_next); 1104 - kfree(mmap_vma); 1105 - mm = NULL; 1106 - } 1107 - if (!mm) 1108 - return 1; 1109 - mutex_unlock(&vdev->vma_lock); 1110 - 1111 - if (try) { 1112 - if (!mmap_read_trylock(mm)) { 1113 - mmput(mm); 1114 - return 0; 1115 - } 1116 - } else { 1117 - mmap_read_lock(mm); 1118 - } 1119 - if (try) { 1120 - if (!mutex_trylock(&vdev->vma_lock)) { 1121 - mmap_read_unlock(mm); 1122 - mmput(mm); 1123 - return 0; 1124 - } 1125 - } else { 1126 - mutex_lock(&vdev->vma_lock); 1127 - } 1128 - list_for_each_entry_safe(mmap_vma, tmp, 1129 - &vdev->vma_list, vma_next) { 1130 - struct vm_area_struct *vma = mmap_vma->vma; 1131 - 1132 - if (vma->vm_mm != mm) 1133 - continue; 1134 - 1135 - list_del(&mmap_vma->vma_next); 1136 - kfree(mmap_vma); 1137 - 1138 - zap_vma_ptes(vma, vma->vm_start, 1139 - vma->vm_end - vma->vm_start); 1140 - } 1141 - mutex_unlock(&vdev->vma_lock); 1142 - 
mmap_read_unlock(mm); 1143 - mmput(mm); 1144 - } 1145 - } 1146 - 1147 - void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_device *vdev) 1148 - { 1149 - vfio_pci_zap_and_vma_lock(vdev, false); 1150 - down_write(&vdev->memory_lock); 1151 - mutex_unlock(&vdev->vma_lock); 1152 - } 1153 - 1154 - u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev) 1155 - { 1156 - u16 cmd; 1157 - 1158 - down_write(&vdev->memory_lock); 1159 - pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd); 1160 - if (!(cmd & PCI_COMMAND_MEMORY)) 1161 - pci_write_config_word(vdev->pdev, PCI_COMMAND, 1162 - cmd | PCI_COMMAND_MEMORY); 1163 - 1164 - return cmd; 1165 - } 1166 - 1167 - void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, u16 cmd) 1168 - { 1169 - pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd); 1170 - up_write(&vdev->memory_lock); 1171 - } 1172 - 1173 - /* Caller holds vma_lock */ 1174 - static int __vfio_pci_add_vma(struct vfio_pci_device *vdev, 1175 - struct vm_area_struct *vma) 1176 - { 1177 - struct vfio_pci_mmap_vma *mmap_vma; 1178 - 1179 - mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL); 1180 - if (!mmap_vma) 1181 - return -ENOMEM; 1182 - 1183 - mmap_vma->vma = vma; 1184 - list_add(&mmap_vma->vma_next, &vdev->vma_list); 394 + vfio_pci_core_finish_enable(vdev); 1185 395 1186 396 return 0; 1187 - } 1188 - 1189 - /* 1190 - * Zap mmaps on open so that we can fault them in on access and therefore 1191 - * our vma_list only tracks mappings accessed since last zap. 
1192 - */ 1193 - static void vfio_pci_mmap_open(struct vm_area_struct *vma) 1194 - { 1195 - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1196 - } 1197 - 1198 - static void vfio_pci_mmap_close(struct vm_area_struct *vma) 1199 - { 1200 - struct vfio_pci_device *vdev = vma->vm_private_data; 1201 - struct vfio_pci_mmap_vma *mmap_vma; 1202 - 1203 - mutex_lock(&vdev->vma_lock); 1204 - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { 1205 - if (mmap_vma->vma == vma) { 1206 - list_del(&mmap_vma->vma_next); 1207 - kfree(mmap_vma); 1208 - break; 1209 - } 1210 - } 1211 - mutex_unlock(&vdev->vma_lock); 1212 - } 1213 - 1214 - static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) 1215 - { 1216 - struct vm_area_struct *vma = vmf->vma; 1217 - struct vfio_pci_device *vdev = vma->vm_private_data; 1218 - struct vfio_pci_mmap_vma *mmap_vma; 1219 - vm_fault_t ret = VM_FAULT_NOPAGE; 1220 - 1221 - mutex_lock(&vdev->vma_lock); 1222 - down_read(&vdev->memory_lock); 1223 - 1224 - if (!__vfio_pci_memory_enabled(vdev)) { 1225 - ret = VM_FAULT_SIGBUS; 1226 - goto up_out; 1227 - } 1228 - 1229 - /* 1230 - * We populate the whole vma on fault, so we need to test whether 1231 - * the vma has already been mapped, such as for concurrent faults 1232 - * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if 1233 - * we ask it to fill the same range again. 
1234 - */ 1235 - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { 1236 - if (mmap_vma->vma == vma) 1237 - goto up_out; 1238 - } 1239 - 1240 - if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, 1241 - vma->vm_end - vma->vm_start, 1242 - vma->vm_page_prot)) { 1243 - ret = VM_FAULT_SIGBUS; 1244 - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1245 - goto up_out; 1246 - } 1247 - 1248 - if (__vfio_pci_add_vma(vdev, vma)) { 1249 - ret = VM_FAULT_OOM; 1250 - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1251 - } 1252 - 1253 - up_out: 1254 - up_read(&vdev->memory_lock); 1255 - mutex_unlock(&vdev->vma_lock); 1256 - return ret; 1257 - } 1258 - 1259 - static const struct vm_operations_struct vfio_pci_mmap_ops = { 1260 - .open = vfio_pci_mmap_open, 1261 - .close = vfio_pci_mmap_close, 1262 - .fault = vfio_pci_mmap_fault, 1263 - }; 1264 - 1265 - static int vfio_pci_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) 1266 - { 1267 - struct vfio_pci_device *vdev = 1268 - container_of(core_vdev, struct vfio_pci_device, vdev); 1269 - struct pci_dev *pdev = vdev->pdev; 1270 - unsigned int index; 1271 - u64 phys_len, req_len, pgoff, req_start; 1272 - int ret; 1273 - 1274 - index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 1275 - 1276 - if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) 1277 - return -EINVAL; 1278 - if (vma->vm_end < vma->vm_start) 1279 - return -EINVAL; 1280 - if ((vma->vm_flags & VM_SHARED) == 0) 1281 - return -EINVAL; 1282 - if (index >= VFIO_PCI_NUM_REGIONS) { 1283 - int regnum = index - VFIO_PCI_NUM_REGIONS; 1284 - struct vfio_pci_region *region = vdev->region + regnum; 1285 - 1286 - if (region->ops && region->ops->mmap && 1287 - (region->flags & VFIO_REGION_INFO_FLAG_MMAP)) 1288 - return region->ops->mmap(vdev, region, vma); 1289 - return -EINVAL; 1290 - } 1291 - if (index >= VFIO_PCI_ROM_REGION_INDEX) 1292 - return -EINVAL; 1293 - if (!vdev->bar_mmap_supported[index]) 1294 - return 
-EINVAL; 1295 - 1296 - phys_len = PAGE_ALIGN(pci_resource_len(pdev, index)); 1297 - req_len = vma->vm_end - vma->vm_start; 1298 - pgoff = vma->vm_pgoff & 1299 - ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 1300 - req_start = pgoff << PAGE_SHIFT; 1301 - 1302 - if (req_start + req_len > phys_len) 1303 - return -EINVAL; 1304 - 1305 - /* 1306 - * Even though we don't make use of the barmap for the mmap, 1307 - * we need to request the region and the barmap tracks that. 1308 - */ 1309 - if (!vdev->barmap[index]) { 1310 - ret = pci_request_selected_regions(pdev, 1311 - 1 << index, "vfio-pci"); 1312 - if (ret) 1313 - return ret; 1314 - 1315 - vdev->barmap[index] = pci_iomap(pdev, index, 0); 1316 - if (!vdev->barmap[index]) { 1317 - pci_release_selected_regions(pdev, 1 << index); 1318 - return -ENOMEM; 1319 - } 1320 - } 1321 - 1322 - vma->vm_private_data = vdev; 1323 - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 1324 - vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; 1325 - 1326 - /* 1327 - * See remap_pfn_range(), called from vfio_pci_fault() but we can't 1328 - * change vm_flags within the fault handler. Set them now. 
1329 - */ 1330 - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; 1331 - vma->vm_ops = &vfio_pci_mmap_ops; 1332 - 1333 - return 0; 1334 - } 1335 - 1336 - static void vfio_pci_request(struct vfio_device *core_vdev, unsigned int count) 1337 - { 1338 - struct vfio_pci_device *vdev = 1339 - container_of(core_vdev, struct vfio_pci_device, vdev); 1340 - struct pci_dev *pdev = vdev->pdev; 1341 - 1342 - mutex_lock(&vdev->igate); 1343 - 1344 - if (vdev->req_trigger) { 1345 - if (!(count % 10)) 1346 - pci_notice_ratelimited(pdev, 1347 - "Relaying device request to user (#%u)\n", 1348 - count); 1349 - eventfd_signal(vdev->req_trigger, 1); 1350 - } else if (count == 0) { 1351 - pci_warn(pdev, 1352 - "No device request channel registered, blocked until released by user\n"); 1353 - } 1354 - 1355 - mutex_unlock(&vdev->igate); 1356 - } 1357 - 1358 - static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, 1359 - bool vf_token, uuid_t *uuid) 1360 - { 1361 - /* 1362 - * There's always some degree of trust or collaboration between SR-IOV 1363 - * PF and VFs, even if just that the PF hosts the SR-IOV capability and 1364 - * can disrupt VFs with a reset, but often the PF has more explicit 1365 - * access to deny service to the VF or access data passed through the 1366 - * VF. We therefore require an opt-in via a shared VF token (UUID) to 1367 - * represent this trust. This both prevents that a VF driver might 1368 - * assume the PF driver is a trusted, in-kernel driver, and also that 1369 - * a PF driver might be replaced with a rogue driver, unknown to in-use 1370 - * VF drivers. 
1371 - * 1372 - * Therefore when presented with a VF, if the PF is a vfio device and 1373 - * it is bound to the vfio-pci driver, the user needs to provide a VF 1374 - * token to access the device, in the form of appending a vf_token to 1375 - * the device name, for example: 1376 - * 1377 - * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3" 1378 - * 1379 - * When presented with a PF which has VFs in use, the user must also 1380 - * provide the current VF token to prove collaboration with existing 1381 - * VF users. If VFs are not in use, the VF token provided for the PF 1382 - * device will act to set the VF token. 1383 - * 1384 - * If the VF token is provided but unused, an error is generated. 1385 - */ 1386 - if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token) 1387 - return 0; /* No VF token provided or required */ 1388 - 1389 - if (vdev->pdev->is_virtfn) { 1390 - struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev); 1391 - bool match; 1392 - 1393 - if (!pf_vdev) { 1394 - if (!vf_token) 1395 - return 0; /* PF is not vfio-pci, no VF token */ 1396 - 1397 - pci_info_ratelimited(vdev->pdev, 1398 - "VF token incorrectly provided, PF not bound to vfio-pci\n"); 1399 - return -EINVAL; 1400 - } 1401 - 1402 - if (!vf_token) { 1403 - vfio_device_put(&pf_vdev->vdev); 1404 - pci_info_ratelimited(vdev->pdev, 1405 - "VF token required to access device\n"); 1406 - return -EACCES; 1407 - } 1408 - 1409 - mutex_lock(&pf_vdev->vf_token->lock); 1410 - match = uuid_equal(uuid, &pf_vdev->vf_token->uuid); 1411 - mutex_unlock(&pf_vdev->vf_token->lock); 1412 - 1413 - vfio_device_put(&pf_vdev->vdev); 1414 - 1415 - if (!match) { 1416 - pci_info_ratelimited(vdev->pdev, 1417 - "Incorrect VF token provided for device\n"); 1418 - return -EACCES; 1419 - } 1420 - } else if (vdev->vf_token) { 1421 - mutex_lock(&vdev->vf_token->lock); 1422 - if (vdev->vf_token->users) { 1423 - if (!vf_token) { 1424 - mutex_unlock(&vdev->vf_token->lock); 1425 - pci_info_ratelimited(vdev->pdev, 
1426 - "VF token required to access device\n"); 1427 - return -EACCES; 1428 - } 1429 - 1430 - if (!uuid_equal(uuid, &vdev->vf_token->uuid)) { 1431 - mutex_unlock(&vdev->vf_token->lock); 1432 - pci_info_ratelimited(vdev->pdev, 1433 - "Incorrect VF token provided for device\n"); 1434 - return -EACCES; 1435 - } 1436 - } else if (vf_token) { 1437 - uuid_copy(&vdev->vf_token->uuid, uuid); 1438 - } 1439 - 1440 - mutex_unlock(&vdev->vf_token->lock); 1441 - } else if (vf_token) { 1442 - pci_info_ratelimited(vdev->pdev, 1443 - "VF token incorrectly provided, not a PF or VF\n"); 1444 - return -EINVAL; 1445 - } 1446 - 1447 - return 0; 1448 - } 1449 - 1450 - #define VF_TOKEN_ARG "vf_token=" 1451 - 1452 - static int vfio_pci_match(struct vfio_device *core_vdev, char *buf) 1453 - { 1454 - struct vfio_pci_device *vdev = 1455 - container_of(core_vdev, struct vfio_pci_device, vdev); 1456 - bool vf_token = false; 1457 - uuid_t uuid; 1458 - int ret; 1459 - 1460 - if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev)))) 1461 - return 0; /* No match */ 1462 - 1463 - if (strlen(buf) > strlen(pci_name(vdev->pdev))) { 1464 - buf += strlen(pci_name(vdev->pdev)); 1465 - 1466 - if (*buf != ' ') 1467 - return 0; /* No match: non-whitespace after name */ 1468 - 1469 - while (*buf) { 1470 - if (*buf == ' ') { 1471 - buf++; 1472 - continue; 1473 - } 1474 - 1475 - if (!vf_token && !strncmp(buf, VF_TOKEN_ARG, 1476 - strlen(VF_TOKEN_ARG))) { 1477 - buf += strlen(VF_TOKEN_ARG); 1478 - 1479 - if (strlen(buf) < UUID_STRING_LEN) 1480 - return -EINVAL; 1481 - 1482 - ret = uuid_parse(buf, &uuid); 1483 - if (ret) 1484 - return ret; 1485 - 1486 - vf_token = true; 1487 - buf += UUID_STRING_LEN; 1488 - } else { 1489 - /* Unknown/duplicate option */ 1490 - return -EINVAL; 1491 - } 1492 - } 1493 - } 1494 - 1495 - ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid); 1496 - if (ret) 1497 - return ret; 1498 - 1499 - return 1; /* Match */ 1500 397 } 1501 398 1502 399 static const struct 
vfio_device_ops vfio_pci_ops = { 1503 400 .name = "vfio-pci", 1504 401 .open_device = vfio_pci_open_device, 1505 - .close_device = vfio_pci_close_device, 1506 - .ioctl = vfio_pci_ioctl, 1507 - .read = vfio_pci_read, 1508 - .write = vfio_pci_write, 1509 - .mmap = vfio_pci_mmap, 1510 - .request = vfio_pci_request, 1511 - .match = vfio_pci_match, 402 + .close_device = vfio_pci_core_close_device, 403 + .ioctl = vfio_pci_core_ioctl, 404 + .read = vfio_pci_core_read, 405 + .write = vfio_pci_core_write, 406 + .mmap = vfio_pci_core_mmap, 407 + .request = vfio_pci_core_request, 408 + .match = vfio_pci_core_match, 1512 409 }; 1513 - 1514 - static int vfio_pci_bus_notifier(struct notifier_block *nb, 1515 - unsigned long action, void *data) 1516 - { 1517 - struct vfio_pci_device *vdev = container_of(nb, 1518 - struct vfio_pci_device, nb); 1519 - struct device *dev = data; 1520 - struct pci_dev *pdev = to_pci_dev(dev); 1521 - struct pci_dev *physfn = pci_physfn(pdev); 1522 - 1523 - if (action == BUS_NOTIFY_ADD_DEVICE && 1524 - pdev->is_virtfn && physfn == vdev->pdev) { 1525 - pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n", 1526 - pci_name(pdev)); 1527 - pdev->driver_override = kasprintf(GFP_KERNEL, "%s", 1528 - vfio_pci_ops.name); 1529 - } else if (action == BUS_NOTIFY_BOUND_DRIVER && 1530 - pdev->is_virtfn && physfn == vdev->pdev) { 1531 - struct pci_driver *drv = pci_dev_driver(pdev); 1532 - 1533 - if (drv && drv != &vfio_pci_driver) 1534 - pci_warn(vdev->pdev, 1535 - "VF %s bound to driver %s while PF bound to vfio-pci\n", 1536 - pci_name(pdev), drv->name); 1537 - } 1538 - 1539 - return 0; 1540 - } 1541 - 1542 - static int vfio_pci_vf_init(struct vfio_pci_device *vdev) 1543 - { 1544 - struct pci_dev *pdev = vdev->pdev; 1545 - int ret; 1546 - 1547 - if (!pdev->is_physfn) 1548 - return 0; 1549 - 1550 - vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL); 1551 - if (!vdev->vf_token) 1552 - return -ENOMEM; 1553 - 1554 - 
mutex_init(&vdev->vf_token->lock); 1555 - uuid_gen(&vdev->vf_token->uuid); 1556 - 1557 - vdev->nb.notifier_call = vfio_pci_bus_notifier; 1558 - ret = bus_register_notifier(&pci_bus_type, &vdev->nb); 1559 - if (ret) { 1560 - kfree(vdev->vf_token); 1561 - return ret; 1562 - } 1563 - return 0; 1564 - } 1565 - 1566 - static void vfio_pci_vf_uninit(struct vfio_pci_device *vdev) 1567 - { 1568 - if (!vdev->vf_token) 1569 - return; 1570 - 1571 - bus_unregister_notifier(&pci_bus_type, &vdev->nb); 1572 - WARN_ON(vdev->vf_token->users); 1573 - mutex_destroy(&vdev->vf_token->lock); 1574 - kfree(vdev->vf_token); 1575 - } 1576 - 1577 - static int vfio_pci_vga_init(struct vfio_pci_device *vdev) 1578 - { 1579 - struct pci_dev *pdev = vdev->pdev; 1580 - int ret; 1581 - 1582 - if (!vfio_pci_is_vga(pdev)) 1583 - return 0; 1584 - 1585 - ret = vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode); 1586 - if (ret) 1587 - return ret; 1588 - vga_set_legacy_decoding(pdev, vfio_pci_set_vga_decode(vdev, false)); 1589 - return 0; 1590 - } 1591 - 1592 - static void vfio_pci_vga_uninit(struct vfio_pci_device *vdev) 1593 - { 1594 - struct pci_dev *pdev = vdev->pdev; 1595 - 1596 - if (!vfio_pci_is_vga(pdev)) 1597 - return; 1598 - vga_client_register(pdev, NULL, NULL, NULL); 1599 - vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | 1600 - VGA_RSRC_LEGACY_IO | 1601 - VGA_RSRC_LEGACY_MEM); 1602 - } 1603 410 1604 411 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) 1605 412 { 1606 - struct vfio_pci_device *vdev; 1607 - struct iommu_group *group; 413 + struct vfio_pci_core_device *vdev; 1608 414 int ret; 1609 415 1610 416 if (vfio_pci_is_denylisted(pdev)) 1611 417 return -EINVAL; 1612 418 1613 - if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) 1614 - return -EINVAL; 1615 - 1616 - /* 1617 - * Prevent binding to PFs with VFs enabled, the VFs might be in use 1618 - * by the host or other users. 
We cannot capture the VFs if they 1619 - * already exist, nor can we track VF users. Disabling SR-IOV here 1620 - * would initiate removing the VFs, which would unbind the driver, 1621 - * which is prone to blocking if that VF is also in use by vfio-pci. 1622 - * Just reject these PFs and let the user sort it out. 1623 - */ 1624 - if (pci_num_vf(pdev)) { 1625 - pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n"); 1626 - return -EBUSY; 1627 - } 1628 - 1629 - group = vfio_iommu_group_get(&pdev->dev); 1630 - if (!group) 1631 - return -EINVAL; 1632 - 1633 419 vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); 1634 - if (!vdev) { 1635 - ret = -ENOMEM; 1636 - goto out_group_put; 1637 - } 420 + if (!vdev) 421 + return -ENOMEM; 422 + vfio_pci_core_init_device(vdev, pdev, &vfio_pci_ops); 1638 423 1639 - vfio_init_group_dev(&vdev->vdev, &pdev->dev, &vfio_pci_ops); 1640 - vdev->pdev = pdev; 1641 - vdev->irq_type = VFIO_PCI_NUM_IRQS; 1642 - mutex_init(&vdev->igate); 1643 - spin_lock_init(&vdev->irqlock); 1644 - mutex_init(&vdev->ioeventfds_lock); 1645 - INIT_LIST_HEAD(&vdev->dummy_resources_list); 1646 - INIT_LIST_HEAD(&vdev->ioeventfds_list); 1647 - mutex_init(&vdev->vma_lock); 1648 - INIT_LIST_HEAD(&vdev->vma_list); 1649 - init_rwsem(&vdev->memory_lock); 1650 - 1651 - if (pci_is_root_bus(pdev->bus)) { 1652 - ret = vfio_assign_device_set(&vdev->vdev, vdev); 1653 - } else if (!pci_probe_reset_slot(pdev->slot)) { 1654 - ret = vfio_assign_device_set(&vdev->vdev, pdev->slot); 1655 - } else { 1656 - /* 1657 - * If there is no slot reset support for this device, the whole 1658 - * bus needs to be grouped together to support bus-wide resets. 
1659 - */ 1660 - ret = vfio_assign_device_set(&vdev->vdev, pdev->bus); 1661 - } 1662 - 424 + ret = vfio_pci_core_register_device(vdev); 1663 425 if (ret) 1664 - goto out_uninit; 1665 - ret = vfio_pci_vf_init(vdev); 1666 - if (ret) 1667 - goto out_uninit; 1668 - ret = vfio_pci_vga_init(vdev); 1669 - if (ret) 1670 - goto out_vf; 1671 - 1672 - vfio_pci_probe_power_state(vdev); 1673 - 1674 - if (!disable_idle_d3) { 1675 - /* 1676 - * pci-core sets the device power state to an unknown value at 1677 - * bootup and after being removed from a driver. The only 1678 - * transition it allows from this unknown state is to D0, which 1679 - * typically happens when a driver calls pci_enable_device(). 1680 - * We're not ready to enable the device yet, but we do want to 1681 - * be able to get to D3. Therefore first do a D0 transition 1682 - * before going to D3. 1683 - */ 1684 - vfio_pci_set_power_state(vdev, PCI_D0); 1685 - vfio_pci_set_power_state(vdev, PCI_D3hot); 1686 - } 1687 - 1688 - ret = vfio_register_group_dev(&vdev->vdev); 1689 - if (ret) 1690 - goto out_power; 426 + goto out_free; 1691 427 dev_set_drvdata(&pdev->dev, vdev); 1692 428 return 0; 1693 429 1694 - out_power: 1695 - if (!disable_idle_d3) 1696 - vfio_pci_set_power_state(vdev, PCI_D0); 1697 - out_vf: 1698 - vfio_pci_vf_uninit(vdev); 1699 - out_uninit: 1700 - vfio_uninit_group_dev(&vdev->vdev); 1701 - kfree(vdev->pm_save); 430 + out_free: 431 + vfio_pci_core_uninit_device(vdev); 1702 432 kfree(vdev); 1703 - out_group_put: 1704 - vfio_iommu_group_put(group, &pdev->dev); 1705 433 return ret; 1706 434 } 1707 435 1708 436 static void vfio_pci_remove(struct pci_dev *pdev) 1709 437 { 1710 - struct vfio_pci_device *vdev = dev_get_drvdata(&pdev->dev); 438 + struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); 1711 439 1712 - pci_disable_sriov(pdev); 1713 - 1714 - vfio_unregister_group_dev(&vdev->vdev); 1715 - 1716 - vfio_pci_vf_uninit(vdev); 1717 - vfio_uninit_group_dev(&vdev->vdev); 1718 - 
vfio_pci_vga_uninit(vdev); 1719 - 1720 - vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); 1721 - 1722 - if (!disable_idle_d3) 1723 - vfio_pci_set_power_state(vdev, PCI_D0); 1724 - 1725 - mutex_destroy(&vdev->ioeventfds_lock); 1726 - kfree(vdev->region); 1727 - kfree(vdev->pm_save); 440 + vfio_pci_core_unregister_device(vdev); 441 + vfio_pci_core_uninit_device(vdev); 1728 442 kfree(vdev); 1729 - } 1730 - 1731 - static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, 1732 - pci_channel_state_t state) 1733 - { 1734 - struct vfio_pci_device *vdev; 1735 - struct vfio_device *device; 1736 - 1737 - device = vfio_device_get_from_dev(&pdev->dev); 1738 - if (device == NULL) 1739 - return PCI_ERS_RESULT_DISCONNECT; 1740 - 1741 - vdev = container_of(device, struct vfio_pci_device, vdev); 1742 - 1743 - mutex_lock(&vdev->igate); 1744 - 1745 - if (vdev->err_trigger) 1746 - eventfd_signal(vdev->err_trigger, 1); 1747 - 1748 - mutex_unlock(&vdev->igate); 1749 - 1750 - vfio_device_put(device); 1751 - 1752 - return PCI_ERS_RESULT_CAN_RECOVER; 1753 443 } 1754 444 1755 445 static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) 1756 446 { 1757 - struct vfio_device *device; 1758 - int ret = 0; 1759 - 1760 - might_sleep(); 1761 - 1762 447 if (!enable_sriov) 1763 448 return -ENOENT; 1764 449 1765 - device = vfio_device_get_from_dev(&pdev->dev); 1766 - if (!device) 1767 - return -ENODEV; 1768 - 1769 - if (nr_virtfn == 0) 1770 - pci_disable_sriov(pdev); 1771 - else 1772 - ret = pci_enable_sriov(pdev, nr_virtfn); 1773 - 1774 - vfio_device_put(device); 1775 - 1776 - return ret < 0 ? 
ret : nr_virtfn; 450 + return vfio_pci_core_sriov_configure(pdev, nr_virtfn); 1777 451 } 1778 452 1779 - static const struct pci_error_handlers vfio_err_handlers = { 1780 - .error_detected = vfio_pci_aer_err_detected, 453 + static const struct pci_device_id vfio_pci_table[] = { 454 + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_ANY_ID, PCI_ANY_ID) }, /* match all by default */ 455 + {} 1781 456 }; 457 + 458 + MODULE_DEVICE_TABLE(pci, vfio_pci_table); 1782 459 1783 460 static struct pci_driver vfio_pci_driver = { 1784 461 .name = "vfio-pci", 1785 - .id_table = NULL, /* only dynamic ids */ 462 + .id_table = vfio_pci_table, 1786 463 .probe = vfio_pci_probe, 1787 464 .remove = vfio_pci_remove, 1788 465 .sriov_configure = vfio_pci_sriov_configure, 1789 - .err_handler = &vfio_err_handlers, 466 + .err_handler = &vfio_pci_core_err_handlers, 1790 467 }; 1791 - 1792 - static bool vfio_dev_in_groups(struct vfio_pci_device *vdev, 1793 - struct vfio_pci_group_info *groups) 1794 - { 1795 - unsigned int i; 1796 - 1797 - for (i = 0; i < groups->count; i++) 1798 - if (groups->groups[i] == vdev->vdev.group) 1799 - return true; 1800 - return false; 1801 - } 1802 - 1803 - static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data) 1804 - { 1805 - struct vfio_device_set *dev_set = data; 1806 - struct vfio_device *cur; 1807 - 1808 - list_for_each_entry(cur, &dev_set->device_list, dev_set_list) 1809 - if (cur->dev == &pdev->dev) 1810 - return 0; 1811 - return -EBUSY; 1812 - } 1813 - 1814 - /* 1815 - * vfio-core considers a group to be viable and will create a vfio_device even 1816 - * if some devices are bound to drivers like pci-stub or pcieport. Here we 1817 - * require all PCI devices to be inside our dev_set since that ensures they stay 1818 - * put and that every driver controlling the device can co-ordinate with the 1819 - * device reset. 
1820 - * 1821 - * Returns the pci_dev to pass to pci_reset_bus() if every PCI device to be 1822 - * reset is inside the dev_set, and pci_reset_bus() can succeed. NULL otherwise. 1823 - */ 1824 - static struct pci_dev * 1825 - vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set) 1826 - { 1827 - struct pci_dev *pdev; 1828 - 1829 - lockdep_assert_held(&dev_set->lock); 1830 - 1831 - /* 1832 - * By definition all PCI devices in the dev_set share the same PCI 1833 - * reset, so any pci_dev will have the same outcomes for 1834 - * pci_probe_reset_*() and pci_reset_bus(). 1835 - */ 1836 - pdev = list_first_entry(&dev_set->device_list, struct vfio_pci_device, 1837 - vdev.dev_set_list)->pdev; 1838 - 1839 - /* pci_reset_bus() is supported */ 1840 - if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus)) 1841 - return NULL; 1842 - 1843 - if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set, 1844 - dev_set, 1845 - !pci_probe_reset_slot(pdev->slot))) 1846 - return NULL; 1847 - return pdev; 1848 - } 1849 - 1850 - /* 1851 - * We need to get memory_lock for each device, but devices can share mmap_lock, 1852 - * therefore we need to zap and hold the vma_lock for each device, and only then 1853 - * get each memory_lock. 
1854 - */ 1855 - static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, 1856 - struct vfio_pci_group_info *groups) 1857 - { 1858 - struct vfio_pci_device *cur_mem; 1859 - struct vfio_pci_device *cur_vma; 1860 - struct vfio_pci_device *cur; 1861 - struct pci_dev *pdev; 1862 - bool is_mem = true; 1863 - int ret; 1864 - 1865 - mutex_lock(&dev_set->lock); 1866 - cur_mem = list_first_entry(&dev_set->device_list, 1867 - struct vfio_pci_device, vdev.dev_set_list); 1868 - 1869 - pdev = vfio_pci_dev_set_resettable(dev_set); 1870 - if (!pdev) { 1871 - ret = -EINVAL; 1872 - goto err_unlock; 1873 - } 1874 - 1875 - list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { 1876 - /* 1877 - * Test whether all the affected devices are contained by the 1878 - * set of groups provided by the user. 1879 - */ 1880 - if (!vfio_dev_in_groups(cur_vma, groups)) { 1881 - ret = -EINVAL; 1882 - goto err_undo; 1883 - } 1884 - 1885 - /* 1886 - * Locking multiple devices is prone to deadlock, runaway and 1887 - * unwind if we hit contention. 
1888 - */ 1889 - if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) { 1890 - ret = -EBUSY; 1891 - goto err_undo; 1892 - } 1893 - } 1894 - cur_vma = NULL; 1895 - 1896 - list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) { 1897 - if (!down_write_trylock(&cur_mem->memory_lock)) { 1898 - ret = -EBUSY; 1899 - goto err_undo; 1900 - } 1901 - mutex_unlock(&cur_mem->vma_lock); 1902 - } 1903 - cur_mem = NULL; 1904 - 1905 - ret = pci_reset_bus(pdev); 1906 - 1907 - err_undo: 1908 - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { 1909 - if (cur == cur_mem) 1910 - is_mem = false; 1911 - if (cur == cur_vma) 1912 - break; 1913 - if (is_mem) 1914 - up_write(&cur->memory_lock); 1915 - else 1916 - mutex_unlock(&cur->vma_lock); 1917 - } 1918 - err_unlock: 1919 - mutex_unlock(&dev_set->lock); 1920 - return ret; 1921 - } 1922 - 1923 - static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set) 1924 - { 1925 - struct vfio_pci_device *cur; 1926 - bool needs_reset = false; 1927 - 1928 - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { 1929 - /* No VFIO device in the set can have an open device FD */ 1930 - if (cur->vdev.open_count) 1931 - return false; 1932 - needs_reset |= cur->needs_reset; 1933 - } 1934 - return needs_reset; 1935 - } 1936 - 1937 - /* 1938 - * If a bus or slot reset is available for the provided dev_set and: 1939 - * - All of the devices affected by that bus or slot reset are unused 1940 - * - At least one of the affected devices is marked dirty via 1941 - * needs_reset (such as by lack of FLR support) 1942 - * Then attempt to perform that bus or slot reset. 1943 - * Returns true if the dev_set was reset. 
1944 - */ 1945 - static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set) 1946 - { 1947 - struct vfio_pci_device *cur; 1948 - struct pci_dev *pdev; 1949 - int ret; 1950 - 1951 - if (!vfio_pci_dev_set_needs_reset(dev_set)) 1952 - return false; 1953 - 1954 - pdev = vfio_pci_dev_set_resettable(dev_set); 1955 - if (!pdev) 1956 - return false; 1957 - 1958 - ret = pci_reset_bus(pdev); 1959 - if (ret) 1960 - return false; 1961 - 1962 - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { 1963 - cur->needs_reset = false; 1964 - if (!disable_idle_d3) 1965 - vfio_pci_set_power_state(cur, PCI_D3hot); 1966 - } 1967 - return true; 1968 - } 1969 - 1970 - static void __exit vfio_pci_cleanup(void) 1971 - { 1972 - pci_unregister_driver(&vfio_pci_driver); 1973 - vfio_pci_uninit_perm_bits(); 1974 - } 1975 468 1976 469 static void __init vfio_pci_fill_ids(void) 1977 470 { ··· 239 2288 static int __init vfio_pci_init(void) 240 2289 { 241 2290 int ret; 2291 + bool is_disable_vga = true; 242 2292 243 - /* Allocate shared config space permission data used by all devices */ 244 - ret = vfio_pci_init_perm_bits(); 245 - if (ret) 246 - return ret; 2293 + #ifdef CONFIG_VFIO_PCI_VGA 2294 + is_disable_vga = disable_vga; 2295 + #endif 2296 + 2297 + vfio_pci_core_set_params(nointxmask, is_disable_vga, disable_idle_d3); 247 2298 248 2299 /* Register and scan for devices */ 249 2300 ret = pci_register_driver(&vfio_pci_driver); 250 2301 if (ret) 251 - goto out_driver; 2302 + return ret; 252 2303 253 2304 vfio_pci_fill_ids(); 254 2305 ··· 258 2305 pr_warn("device denylist disabled.\n"); 259 2306 260 2307 return 0; 261 - 262 - out_driver: 263 - vfio_pci_uninit_perm_bits(); 264 - return ret; 265 2308 } 266 - 267 2309 module_init(vfio_pci_init); 2310 + 2311 + static void __exit vfio_pci_cleanup(void) 2312 + { 2313 + pci_unregister_driver(&vfio_pci_driver); 2314 + } 268 2315 module_exit(vfio_pci_cleanup); 269 2316 270 - MODULE_VERSION(DRIVER_VERSION); 271 2317 
MODULE_LICENSE("GPL v2"); 272 2318 MODULE_AUTHOR(DRIVER_AUTHOR); 273 2319 MODULE_DESCRIPTION(DRIVER_DESC);
+35 -35
drivers/vfio/pci/vfio_pci_config.c
··· 26 26 #include <linux/vfio.h> 27 27 #include <linux/slab.h> 28 28 29 - #include "vfio_pci_private.h" 29 + #include <linux/vfio_pci_core.h> 30 30 31 31 /* Fake capability ID for standard config space */ 32 32 #define PCI_CAP_ID_BASIC 0 ··· 108 108 struct perm_bits { 109 109 u8 *virt; /* read/write virtual data, not hw */ 110 110 u8 *write; /* writeable bits */ 111 - int (*readfn)(struct vfio_pci_device *vdev, int pos, int count, 111 + int (*readfn)(struct vfio_pci_core_device *vdev, int pos, int count, 112 112 struct perm_bits *perm, int offset, __le32 *val); 113 - int (*writefn)(struct vfio_pci_device *vdev, int pos, int count, 113 + int (*writefn)(struct vfio_pci_core_device *vdev, int pos, int count, 114 114 struct perm_bits *perm, int offset, __le32 val); 115 115 }; 116 116 ··· 171 171 return ret; 172 172 } 173 173 174 - static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos, 174 + static int vfio_default_config_read(struct vfio_pci_core_device *vdev, int pos, 175 175 int count, struct perm_bits *perm, 176 176 int offset, __le32 *val) 177 177 { ··· 197 197 return count; 198 198 } 199 199 200 - static int vfio_default_config_write(struct vfio_pci_device *vdev, int pos, 200 + static int vfio_default_config_write(struct vfio_pci_core_device *vdev, int pos, 201 201 int count, struct perm_bits *perm, 202 202 int offset, __le32 val) 203 203 { ··· 244 244 } 245 245 246 246 /* Allow direct read from hardware, except for capability next pointer */ 247 - static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos, 247 + static int vfio_direct_config_read(struct vfio_pci_core_device *vdev, int pos, 248 248 int count, struct perm_bits *perm, 249 249 int offset, __le32 *val) 250 250 { ··· 269 269 } 270 270 271 271 /* Raw access skips any kind of virtualization */ 272 - static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos, 272 + static int vfio_raw_config_write(struct vfio_pci_core_device *vdev, int pos, 273 273 int count, 
struct perm_bits *perm, 274 274 int offset, __le32 val) 275 275 { ··· 282 282 return count; 283 283 } 284 284 285 - static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos, 285 + static int vfio_raw_config_read(struct vfio_pci_core_device *vdev, int pos, 286 286 int count, struct perm_bits *perm, 287 287 int offset, __le32 *val) 288 288 { ··· 296 296 } 297 297 298 298 /* Virt access uses only virtualization */ 299 - static int vfio_virt_config_write(struct vfio_pci_device *vdev, int pos, 299 + static int vfio_virt_config_write(struct vfio_pci_core_device *vdev, int pos, 300 300 int count, struct perm_bits *perm, 301 301 int offset, __le32 val) 302 302 { ··· 304 304 return count; 305 305 } 306 306 307 - static int vfio_virt_config_read(struct vfio_pci_device *vdev, int pos, 307 + static int vfio_virt_config_read(struct vfio_pci_core_device *vdev, int pos, 308 308 int count, struct perm_bits *perm, 309 309 int offset, __le32 *val) 310 310 { ··· 396 396 } 397 397 398 398 /* Caller should hold memory_lock semaphore */ 399 - bool __vfio_pci_memory_enabled(struct vfio_pci_device *vdev) 399 + bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev) 400 400 { 401 401 struct pci_dev *pdev = vdev->pdev; 402 402 u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]); ··· 413 413 * Restore the *real* BARs after we detect a FLR or backdoor reset. 414 414 * (backdoor = some device specific technique that we didn't catch) 415 415 */ 416 - static void vfio_bar_restore(struct vfio_pci_device *vdev) 416 + static void vfio_bar_restore(struct vfio_pci_core_device *vdev) 417 417 { 418 418 struct pci_dev *pdev = vdev->pdev; 419 419 u32 *rbar = vdev->rbar; ··· 460 460 * Pretend we're hardware and tweak the values of the *virtual* PCI BARs 461 461 * to reflect the hardware capabilities. This implements BAR sizing. 
462 462 */ 463 - static void vfio_bar_fixup(struct vfio_pci_device *vdev) 463 + static void vfio_bar_fixup(struct vfio_pci_core_device *vdev) 464 464 { 465 465 struct pci_dev *pdev = vdev->pdev; 466 466 int i; ··· 514 514 vdev->bardirty = false; 515 515 } 516 516 517 - static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos, 517 + static int vfio_basic_config_read(struct vfio_pci_core_device *vdev, int pos, 518 518 int count, struct perm_bits *perm, 519 519 int offset, __le32 *val) 520 520 { ··· 536 536 } 537 537 538 538 /* Test whether BARs match the value we think they should contain */ 539 - static bool vfio_need_bar_restore(struct vfio_pci_device *vdev) 539 + static bool vfio_need_bar_restore(struct vfio_pci_core_device *vdev) 540 540 { 541 541 int i = 0, pos = PCI_BASE_ADDRESS_0, ret; 542 542 u32 bar; ··· 552 552 return false; 553 553 } 554 554 555 - static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos, 555 + static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, 556 556 int count, struct perm_bits *perm, 557 557 int offset, __le32 val) 558 558 { ··· 692 692 return 0; 693 693 } 694 694 695 - static int vfio_pm_config_write(struct vfio_pci_device *vdev, int pos, 695 + static int vfio_pm_config_write(struct vfio_pci_core_device *vdev, int pos, 696 696 int count, struct perm_bits *perm, 697 697 int offset, __le32 val) 698 698 { ··· 747 747 return 0; 748 748 } 749 749 750 - static int vfio_vpd_config_write(struct vfio_pci_device *vdev, int pos, 750 + static int vfio_vpd_config_write(struct vfio_pci_core_device *vdev, int pos, 751 751 int count, struct perm_bits *perm, 752 752 int offset, __le32 val) 753 753 { ··· 829 829 return 0; 830 830 } 831 831 832 - static int vfio_exp_config_write(struct vfio_pci_device *vdev, int pos, 832 + static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, 833 833 int count, struct perm_bits *perm, 834 834 int offset, __le32 val) 835 835 { ··· 913 913 
return 0; 914 914 } 915 915 916 - static int vfio_af_config_write(struct vfio_pci_device *vdev, int pos, 916 + static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, 917 917 int count, struct perm_bits *perm, 918 918 int offset, __le32 val) 919 919 { ··· 1072 1072 return ret; 1073 1073 } 1074 1074 1075 - static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) 1075 + static int vfio_find_cap_start(struct vfio_pci_core_device *vdev, int pos) 1076 1076 { 1077 1077 u8 cap; 1078 1078 int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE : ··· 1089 1089 return pos; 1090 1090 } 1091 1091 1092 - static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos, 1092 + static int vfio_msi_config_read(struct vfio_pci_core_device *vdev, int pos, 1093 1093 int count, struct perm_bits *perm, 1094 1094 int offset, __le32 *val) 1095 1095 { ··· 1109 1109 return vfio_default_config_read(vdev, pos, count, perm, offset, val); 1110 1110 } 1111 1111 1112 - static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos, 1112 + static int vfio_msi_config_write(struct vfio_pci_core_device *vdev, int pos, 1113 1113 int count, struct perm_bits *perm, 1114 1114 int offset, __le32 val) 1115 1115 { ··· 1189 1189 } 1190 1190 1191 1191 /* Determine MSI CAP field length; initialize msi_perms on 1st call per vdev */ 1192 - static int vfio_msi_cap_len(struct vfio_pci_device *vdev, u8 pos) 1192 + static int vfio_msi_cap_len(struct vfio_pci_core_device *vdev, u8 pos) 1193 1193 { 1194 1194 struct pci_dev *pdev = vdev->pdev; 1195 1195 int len, ret; ··· 1222 1222 } 1223 1223 1224 1224 /* Determine extended capability length for VC (2 & 9) and MFVC */ 1225 - static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos) 1225 + static int vfio_vc_cap_len(struct vfio_pci_core_device *vdev, u16 pos) 1226 1226 { 1227 1227 struct pci_dev *pdev = vdev->pdev; 1228 1228 u32 tmp; ··· 1263 1263 return len; 1264 1264 } 1265 1265 1266 - static int 
vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) 1266 + static int vfio_cap_len(struct vfio_pci_core_device *vdev, u8 cap, u8 pos) 1267 1267 { 1268 1268 struct pci_dev *pdev = vdev->pdev; 1269 1269 u32 dword; ··· 1338 1338 return 0; 1339 1339 } 1340 1340 1341 - static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos) 1341 + static int vfio_ext_cap_len(struct vfio_pci_core_device *vdev, u16 ecap, u16 epos) 1342 1342 { 1343 1343 struct pci_dev *pdev = vdev->pdev; 1344 1344 u8 byte; ··· 1412 1412 return 0; 1413 1413 } 1414 1414 1415 - static int vfio_fill_vconfig_bytes(struct vfio_pci_device *vdev, 1415 + static int vfio_fill_vconfig_bytes(struct vfio_pci_core_device *vdev, 1416 1416 int offset, int size) 1417 1417 { 1418 1418 struct pci_dev *pdev = vdev->pdev; ··· 1459 1459 return ret; 1460 1460 } 1461 1461 1462 - static int vfio_cap_init(struct vfio_pci_device *vdev) 1462 + static int vfio_cap_init(struct vfio_pci_core_device *vdev) 1463 1463 { 1464 1464 struct pci_dev *pdev = vdev->pdev; 1465 1465 u8 *map = vdev->pci_config_map; ··· 1549 1549 return 0; 1550 1550 } 1551 1551 1552 - static int vfio_ecap_init(struct vfio_pci_device *vdev) 1552 + static int vfio_ecap_init(struct vfio_pci_core_device *vdev) 1553 1553 { 1554 1554 struct pci_dev *pdev = vdev->pdev; 1555 1555 u8 *map = vdev->pci_config_map; ··· 1669 1669 * for each area requiring emulated bits, but the array of pointers 1670 1670 * would be comparable in size (at least for standard config space). 
1671 1671 */ 1672 - int vfio_config_init(struct vfio_pci_device *vdev) 1672 + int vfio_config_init(struct vfio_pci_core_device *vdev) 1673 1673 { 1674 1674 struct pci_dev *pdev = vdev->pdev; 1675 1675 u8 *map, *vconfig; ··· 1773 1773 return pcibios_err_to_errno(ret); 1774 1774 } 1775 1775 1776 - void vfio_config_free(struct vfio_pci_device *vdev) 1776 + void vfio_config_free(struct vfio_pci_core_device *vdev) 1777 1777 { 1778 1778 kfree(vdev->vconfig); 1779 1779 vdev->vconfig = NULL; ··· 1790 1790 * Find the remaining number of bytes in a dword that match the given 1791 1791 * position. Stop at either the end of the capability or the dword boundary. 1792 1792 */ 1793 - static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev, 1793 + static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_core_device *vdev, 1794 1794 loff_t pos) 1795 1795 { 1796 1796 u8 cap = vdev->pci_config_map[pos]; ··· 1802 1802 return i; 1803 1803 } 1804 1804 1805 - static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, 1805 + static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user *buf, 1806 1806 size_t count, loff_t *ppos, bool iswrite) 1807 1807 { 1808 1808 struct pci_dev *pdev = vdev->pdev; ··· 1885 1885 return ret; 1886 1886 } 1887 1887 1888 - ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf, 1888 + ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf, 1889 1889 size_t count, loff_t *ppos, bool iswrite) 1890 1890 { 1891 1891 size_t done = 0;
+2158
drivers/vfio/pci/vfio_pci_core.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 4 + * Author: Alex Williamson <alex.williamson@redhat.com> 5 + * 6 + * Derived from original vfio: 7 + * Copyright 2010 Cisco Systems, Inc. All rights reserved. 8 + * Author: Tom Lyon, pugs@cisco.com 9 + */ 10 + 11 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12 + 13 + #include <linux/device.h> 14 + #include <linux/eventfd.h> 15 + #include <linux/file.h> 16 + #include <linux/interrupt.h> 17 + #include <linux/iommu.h> 18 + #include <linux/module.h> 19 + #include <linux/mutex.h> 20 + #include <linux/notifier.h> 21 + #include <linux/pci.h> 22 + #include <linux/pm_runtime.h> 23 + #include <linux/slab.h> 24 + #include <linux/types.h> 25 + #include <linux/uaccess.h> 26 + #include <linux/vgaarb.h> 27 + #include <linux/nospec.h> 28 + #include <linux/sched/mm.h> 29 + 30 + #include <linux/vfio_pci_core.h> 31 + 32 + #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" 33 + #define DRIVER_DESC "core driver for VFIO based PCI devices" 34 + 35 + static bool nointxmask; 36 + static bool disable_vga; 37 + static bool disable_idle_d3; 38 + 39 + static inline bool vfio_vga_disabled(void) 40 + { 41 + #ifdef CONFIG_VFIO_PCI_VGA 42 + return disable_vga; 43 + #else 44 + return true; 45 + #endif 46 + } 47 + 48 + /* 49 + * Our VGA arbiter participation is limited since we don't know anything 50 + * about the device itself. However, if the device is the only VGA device 51 + * downstream of a bridge and VFIO VGA support is disabled, then we can 52 + * safely return legacy VGA IO and memory as not decoded since the user 53 + * has no way to get to it and routing can be disabled externally at the 54 + * bridge. 
55 + */ 56 + static unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga) 57 + { 58 + struct vfio_pci_core_device *vdev = opaque; 59 + struct pci_dev *tmp = NULL, *pdev = vdev->pdev; 60 + unsigned char max_busnr; 61 + unsigned int decodes; 62 + 63 + if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus)) 64 + return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | 65 + VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; 66 + 67 + max_busnr = pci_bus_max_busnr(pdev->bus); 68 + decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 69 + 70 + while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) { 71 + if (tmp == pdev || 72 + pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) || 73 + pci_is_root_bus(tmp->bus)) 74 + continue; 75 + 76 + if (tmp->bus->number >= pdev->bus->number && 77 + tmp->bus->number <= max_busnr) { 78 + pci_dev_put(tmp); 79 + decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; 80 + break; 81 + } 82 + } 83 + 84 + return decodes; 85 + } 86 + 87 + static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev) 88 + { 89 + struct resource *res; 90 + int i; 91 + struct vfio_pci_dummy_resource *dummy_res; 92 + 93 + for (i = 0; i < PCI_STD_NUM_BARS; i++) { 94 + int bar = i + PCI_STD_RESOURCES; 95 + 96 + res = &vdev->pdev->resource[bar]; 97 + 98 + if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP)) 99 + goto no_mmap; 100 + 101 + if (!(res->flags & IORESOURCE_MEM)) 102 + goto no_mmap; 103 + 104 + /* 105 + * The PCI core shouldn't set up a resource with a 106 + * type but zero size. But there may be bugs that 107 + * cause us to do that. 108 + */ 109 + if (!resource_size(res)) 110 + goto no_mmap; 111 + 112 + if (resource_size(res) >= PAGE_SIZE) { 113 + vdev->bar_mmap_supported[bar] = true; 114 + continue; 115 + } 116 + 117 + if (!(res->start & ~PAGE_MASK)) { 118 + /* 119 + * Add a dummy resource to reserve the remainder 120 + * of the exclusive page in case that hot-add 121 + * device's bar is assigned into it. 
122 + */ 123 + dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL); 124 + if (dummy_res == NULL) 125 + goto no_mmap; 126 + 127 + dummy_res->resource.name = "vfio sub-page reserved"; 128 + dummy_res->resource.start = res->end + 1; 129 + dummy_res->resource.end = res->start + PAGE_SIZE - 1; 130 + dummy_res->resource.flags = res->flags; 131 + if (request_resource(res->parent, 132 + &dummy_res->resource)) { 133 + kfree(dummy_res); 134 + goto no_mmap; 135 + } 136 + dummy_res->index = bar; 137 + list_add(&dummy_res->res_next, 138 + &vdev->dummy_resources_list); 139 + vdev->bar_mmap_supported[bar] = true; 140 + continue; 141 + } 142 + /* 143 + * Here we don't handle the case when the BAR is not page 144 + * aligned because we can't expect the BAR will be 145 + * assigned into the same location in a page in guest 146 + * when we passthrough the BAR. And it's hard to access 147 + * this BAR in userspace because we have no way to get 148 + * the BAR's location in a page. 149 + */ 150 + no_mmap: 151 + vdev->bar_mmap_supported[bar] = false; 152 + } 153 + } 154 + 155 + struct vfio_pci_group_info; 156 + static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set); 157 + static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, 158 + struct vfio_pci_group_info *groups); 159 + 160 + /* 161 + * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND 162 + * _and_ the ability detect when the device is asserting INTx via PCI_STATUS. 163 + * If a device implements the former but not the latter we would typically 164 + * expect broken_intx_masking be set and require an exclusive interrupt. 165 + * However since we do have control of the device's ability to assert INTx, 166 + * we can instead pretend that the device does not implement INTx, virtualizing 167 + * the pin register to report zero and maintaining DisINTx set on the host. 
168 + */ 169 + static bool vfio_pci_nointx(struct pci_dev *pdev) 170 + { 171 + switch (pdev->vendor) { 172 + case PCI_VENDOR_ID_INTEL: 173 + switch (pdev->device) { 174 + /* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */ 175 + case 0x1572: 176 + case 0x1574: 177 + case 0x1580 ... 0x1581: 178 + case 0x1583 ... 0x158b: 179 + case 0x37d0 ... 0x37d2: 180 + /* X550 */ 181 + case 0x1563: 182 + return true; 183 + default: 184 + return false; 185 + } 186 + } 187 + 188 + return false; 189 + } 190 + 191 + static void vfio_pci_probe_power_state(struct vfio_pci_core_device *vdev) 192 + { 193 + struct pci_dev *pdev = vdev->pdev; 194 + u16 pmcsr; 195 + 196 + if (!pdev->pm_cap) 197 + return; 198 + 199 + pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr); 200 + 201 + vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET); 202 + } 203 + 204 + /* 205 + * pci_set_power_state() wrapper handling devices which perform a soft reset on 206 + * D3->D0 transition. Save state prior to D0/1/2->D3, stash it on the vdev, 207 + * restore when returned to D0. Saved separately from pci_saved_state for use 208 + * by PM capability emulation and separately from pci_dev internal saved state 209 + * to avoid it being overwritten and consumed around other resets. 
210 + */ 211 + int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state) 212 + { 213 + struct pci_dev *pdev = vdev->pdev; 214 + bool needs_restore = false, needs_save = false; 215 + int ret; 216 + 217 + if (vdev->needs_pm_restore) { 218 + if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) { 219 + pci_save_state(pdev); 220 + needs_save = true; 221 + } 222 + 223 + if (pdev->current_state >= PCI_D3hot && state <= PCI_D0) 224 + needs_restore = true; 225 + } 226 + 227 + ret = pci_set_power_state(pdev, state); 228 + 229 + if (!ret) { 230 + /* D3 might be unsupported via quirk, skip unless in D3 */ 231 + if (needs_save && pdev->current_state >= PCI_D3hot) { 232 + vdev->pm_save = pci_store_saved_state(pdev); 233 + } else if (needs_restore) { 234 + pci_load_and_free_saved_state(pdev, &vdev->pm_save); 235 + pci_restore_state(pdev); 236 + } 237 + } 238 + 239 + return ret; 240 + } 241 + 242 + int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) 243 + { 244 + struct pci_dev *pdev = vdev->pdev; 245 + int ret; 246 + u16 cmd; 247 + u8 msix_pos; 248 + 249 + vfio_pci_set_power_state(vdev, PCI_D0); 250 + 251 + /* Don't allow our initial saved state to include busmaster */ 252 + pci_clear_master(pdev); 253 + 254 + ret = pci_enable_device(pdev); 255 + if (ret) 256 + return ret; 257 + 258 + /* If reset fails because of the device lock, fail this path entirely */ 259 + ret = pci_try_reset_function(pdev); 260 + if (ret == -EAGAIN) { 261 + pci_disable_device(pdev); 262 + return ret; 263 + } 264 + 265 + vdev->reset_works = !ret; 266 + pci_save_state(pdev); 267 + vdev->pci_saved_state = pci_store_saved_state(pdev); 268 + if (!vdev->pci_saved_state) 269 + pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__); 270 + 271 + if (likely(!nointxmask)) { 272 + if (vfio_pci_nointx(pdev)) { 273 + pci_info(pdev, "Masking broken INTx support\n"); 274 + vdev->nointx = true; 275 + pci_intx(pdev, 0); 276 + } else 277 + vdev->pci_2_3 = 
pci_intx_mask_supported(pdev); 278 + } 279 + 280 + pci_read_config_word(pdev, PCI_COMMAND, &cmd); 281 + if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) { 282 + cmd &= ~PCI_COMMAND_INTX_DISABLE; 283 + pci_write_config_word(pdev, PCI_COMMAND, cmd); 284 + } 285 + 286 + ret = vfio_config_init(vdev); 287 + if (ret) { 288 + kfree(vdev->pci_saved_state); 289 + vdev->pci_saved_state = NULL; 290 + pci_disable_device(pdev); 291 + return ret; 292 + } 293 + 294 + msix_pos = pdev->msix_cap; 295 + if (msix_pos) { 296 + u16 flags; 297 + u32 table; 298 + 299 + pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags); 300 + pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table); 301 + 302 + vdev->msix_bar = table & PCI_MSIX_TABLE_BIR; 303 + vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET; 304 + vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; 305 + } else 306 + vdev->msix_bar = 0xFF; 307 + 308 + if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) 309 + vdev->has_vga = true; 310 + 311 + 312 + return 0; 313 + } 314 + EXPORT_SYMBOL_GPL(vfio_pci_core_enable); 315 + 316 + void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) 317 + { 318 + struct pci_dev *pdev = vdev->pdev; 319 + struct vfio_pci_dummy_resource *dummy_res, *tmp; 320 + struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; 321 + int i, bar; 322 + 323 + /* For needs_reset */ 324 + lockdep_assert_held(&vdev->vdev.dev_set->lock); 325 + 326 + /* Stop the device from further DMA */ 327 + pci_clear_master(pdev); 328 + 329 + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | 330 + VFIO_IRQ_SET_ACTION_TRIGGER, 331 + vdev->irq_type, 0, 0, NULL); 332 + 333 + /* Device closed, don't need mutex here */ 334 + list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, 335 + &vdev->ioeventfds_list, next) { 336 + vfio_virqfd_disable(&ioeventfd->virqfd); 337 + list_del(&ioeventfd->next); 338 + kfree(ioeventfd); 339 + } 340 + vdev->ioeventfds_nr = 0; 341 + 342 + vdev->virq_disabled = false; 343 + 344 + 
for (i = 0; i < vdev->num_regions; i++) 345 + vdev->region[i].ops->release(vdev, &vdev->region[i]); 346 + 347 + vdev->num_regions = 0; 348 + kfree(vdev->region); 349 + vdev->region = NULL; /* don't krealloc a freed pointer */ 350 + 351 + vfio_config_free(vdev); 352 + 353 + for (i = 0; i < PCI_STD_NUM_BARS; i++) { 354 + bar = i + PCI_STD_RESOURCES; 355 + if (!vdev->barmap[bar]) 356 + continue; 357 + pci_iounmap(pdev, vdev->barmap[bar]); 358 + pci_release_selected_regions(pdev, 1 << bar); 359 + vdev->barmap[bar] = NULL; 360 + } 361 + 362 + list_for_each_entry_safe(dummy_res, tmp, 363 + &vdev->dummy_resources_list, res_next) { 364 + list_del(&dummy_res->res_next); 365 + release_resource(&dummy_res->resource); 366 + kfree(dummy_res); 367 + } 368 + 369 + vdev->needs_reset = true; 370 + 371 + /* 372 + * If we have saved state, restore it. If we can reset the device, 373 + * even better. Resetting with current state seems better than 374 + * nothing, but saving and restoring current state without reset 375 + * is just busy work. 376 + */ 377 + if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) { 378 + pci_info(pdev, "%s: Couldn't reload saved state\n", __func__); 379 + 380 + if (!vdev->reset_works) 381 + goto out; 382 + 383 + pci_save_state(pdev); 384 + } 385 + 386 + /* 387 + * Disable INTx and MSI, presumably to avoid spurious interrupts 388 + * during reset. Stolen from pci_reset_function() 389 + */ 390 + pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); 391 + 392 + /* 393 + * Try to get the locks ourselves to prevent a deadlock. The 394 + * success of this is dependent on being able to lock the device, 395 + * which is not always possible. 396 + * We can not use the "try" reset interface here, which will 397 + * overwrite the previously restored configuration information. 
398 + */ 399 + if (vdev->reset_works && pci_dev_trylock(pdev)) { 400 + if (!__pci_reset_function_locked(pdev)) 401 + vdev->needs_reset = false; 402 + pci_dev_unlock(pdev); 403 + } 404 + 405 + pci_restore_state(pdev); 406 + out: 407 + pci_disable_device(pdev); 408 + 409 + if (!vfio_pci_dev_set_try_reset(vdev->vdev.dev_set) && !disable_idle_d3) 410 + vfio_pci_set_power_state(vdev, PCI_D3hot); 411 + } 412 + EXPORT_SYMBOL_GPL(vfio_pci_core_disable); 413 + 414 + static struct vfio_pci_core_device *get_pf_vdev(struct vfio_pci_core_device *vdev) 415 + { 416 + struct pci_dev *physfn = pci_physfn(vdev->pdev); 417 + struct vfio_device *pf_dev; 418 + 419 + if (!vdev->pdev->is_virtfn) 420 + return NULL; 421 + 422 + pf_dev = vfio_device_get_from_dev(&physfn->dev); 423 + if (!pf_dev) 424 + return NULL; 425 + 426 + if (pci_dev_driver(physfn) != pci_dev_driver(vdev->pdev)) { 427 + vfio_device_put(pf_dev); 428 + return NULL; 429 + } 430 + 431 + return container_of(pf_dev, struct vfio_pci_core_device, vdev); 432 + } 433 + 434 + static void vfio_pci_vf_token_user_add(struct vfio_pci_core_device *vdev, int val) 435 + { 436 + struct vfio_pci_core_device *pf_vdev = get_pf_vdev(vdev); 437 + 438 + if (!pf_vdev) 439 + return; 440 + 441 + mutex_lock(&pf_vdev->vf_token->lock); 442 + pf_vdev->vf_token->users += val; 443 + WARN_ON(pf_vdev->vf_token->users < 0); 444 + mutex_unlock(&pf_vdev->vf_token->lock); 445 + 446 + vfio_device_put(&pf_vdev->vdev); 447 + } 448 + 449 + void vfio_pci_core_close_device(struct vfio_device *core_vdev) 450 + { 451 + struct vfio_pci_core_device *vdev = 452 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 453 + 454 + vfio_pci_vf_token_user_add(vdev, -1); 455 + vfio_spapr_pci_eeh_release(vdev->pdev); 456 + vfio_pci_core_disable(vdev); 457 + 458 + mutex_lock(&vdev->igate); 459 + if (vdev->err_trigger) { 460 + eventfd_ctx_put(vdev->err_trigger); 461 + vdev->err_trigger = NULL; 462 + } 463 + if (vdev->req_trigger) { 464 + eventfd_ctx_put(vdev->req_trigger); 
465 + vdev->req_trigger = NULL; 466 + } 467 + mutex_unlock(&vdev->igate); 468 + } 469 + EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); 470 + 471 + void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) 472 + { 473 + vfio_pci_probe_mmaps(vdev); 474 + vfio_spapr_pci_eeh_open(vdev->pdev); 475 + vfio_pci_vf_token_user_add(vdev, 1); 476 + } 477 + EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable); 478 + 479 + static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type) 480 + { 481 + if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { 482 + u8 pin; 483 + 484 + if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || 485 + vdev->nointx || vdev->pdev->is_virtfn) 486 + return 0; 487 + 488 + pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); 489 + 490 + return pin ? 1 : 0; 491 + } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { 492 + u8 pos; 493 + u16 flags; 494 + 495 + pos = vdev->pdev->msi_cap; 496 + if (pos) { 497 + pci_read_config_word(vdev->pdev, 498 + pos + PCI_MSI_FLAGS, &flags); 499 + return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1); 500 + } 501 + } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { 502 + u8 pos; 503 + u16 flags; 504 + 505 + pos = vdev->pdev->msix_cap; 506 + if (pos) { 507 + pci_read_config_word(vdev->pdev, 508 + pos + PCI_MSIX_FLAGS, &flags); 509 + 510 + return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; 511 + } 512 + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { 513 + if (pci_is_pcie(vdev->pdev)) 514 + return 1; 515 + } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { 516 + return 1; 517 + } 518 + 519 + return 0; 520 + } 521 + 522 + static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) 523 + { 524 + (*(int *)data)++; 525 + return 0; 526 + } 527 + 528 + struct vfio_pci_fill_info { 529 + int max; 530 + int cur; 531 + struct vfio_pci_dependent_device *devices; 532 + }; 533 + 534 + static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) 535 + { 536 + struct vfio_pci_fill_info *fill = data; 537 + struct iommu_group *iommu_group; 
538 + 539 + if (fill->cur == fill->max) 540 + return -EAGAIN; /* Something changed, try again */ 541 + 542 + iommu_group = iommu_group_get(&pdev->dev); 543 + if (!iommu_group) 544 + return -EPERM; /* Cannot reset non-isolated devices */ 545 + 546 + fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); 547 + fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); 548 + fill->devices[fill->cur].bus = pdev->bus->number; 549 + fill->devices[fill->cur].devfn = pdev->devfn; 550 + fill->cur++; 551 + iommu_group_put(iommu_group); 552 + return 0; 553 + } 554 + 555 + struct vfio_pci_group_info { 556 + int count; 557 + struct vfio_group **groups; 558 + }; 559 + 560 + static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) 561 + { 562 + for (; pdev; pdev = pdev->bus->self) 563 + if (pdev->bus == slot->bus) 564 + return (pdev->slot == slot); 565 + return false; 566 + } 567 + 568 + struct vfio_pci_walk_info { 569 + int (*fn)(struct pci_dev *, void *data); 570 + void *data; 571 + struct pci_dev *pdev; 572 + bool slot; 573 + int ret; 574 + }; 575 + 576 + static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data) 577 + { 578 + struct vfio_pci_walk_info *walk = data; 579 + 580 + if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot)) 581 + walk->ret = walk->fn(pdev, walk->data); 582 + 583 + return walk->ret; 584 + } 585 + 586 + static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, 587 + int (*fn)(struct pci_dev *, 588 + void *data), void *data, 589 + bool slot) 590 + { 591 + struct vfio_pci_walk_info walk = { 592 + .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0, 593 + }; 594 + 595 + pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk); 596 + 597 + return walk.ret; 598 + } 599 + 600 + static int msix_mmappable_cap(struct vfio_pci_core_device *vdev, 601 + struct vfio_info_cap *caps) 602 + { 603 + struct vfio_info_cap_header header = { 604 + .id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, 605 + .version = 1 
606 + }; 607 + 608 + return vfio_info_add_capability(caps, &header, sizeof(header)); 609 + } 610 + 611 + int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, 612 + unsigned int type, unsigned int subtype, 613 + const struct vfio_pci_regops *ops, 614 + size_t size, u32 flags, void *data) 615 + { 616 + struct vfio_pci_region *region; 617 + 618 + region = krealloc(vdev->region, 619 + (vdev->num_regions + 1) * sizeof(*region), 620 + GFP_KERNEL); 621 + if (!region) 622 + return -ENOMEM; 623 + 624 + vdev->region = region; 625 + vdev->region[vdev->num_regions].type = type; 626 + vdev->region[vdev->num_regions].subtype = subtype; 627 + vdev->region[vdev->num_regions].ops = ops; 628 + vdev->region[vdev->num_regions].size = size; 629 + vdev->region[vdev->num_regions].flags = flags; 630 + vdev->region[vdev->num_regions].data = data; 631 + 632 + vdev->num_regions++; 633 + 634 + return 0; 635 + } 636 + EXPORT_SYMBOL_GPL(vfio_pci_register_dev_region); 637 + 638 + long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, 639 + unsigned long arg) 640 + { 641 + struct vfio_pci_core_device *vdev = 642 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 643 + unsigned long minsz; 644 + 645 + if (cmd == VFIO_DEVICE_GET_INFO) { 646 + struct vfio_device_info info; 647 + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 648 + unsigned long capsz; 649 + int ret; 650 + 651 + minsz = offsetofend(struct vfio_device_info, num_irqs); 652 + 653 + /* For backward compatibility, cannot require this */ 654 + capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); 655 + 656 + if (copy_from_user(&info, (void __user *)arg, minsz)) 657 + return -EFAULT; 658 + 659 + if (info.argsz < minsz) 660 + return -EINVAL; 661 + 662 + if (info.argsz >= capsz) { 663 + minsz = capsz; 664 + info.cap_offset = 0; 665 + } 666 + 667 + info.flags = VFIO_DEVICE_FLAGS_PCI; 668 + 669 + if (vdev->reset_works) 670 + info.flags |= VFIO_DEVICE_FLAGS_RESET; 671 + 672 + 
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; 673 + info.num_irqs = VFIO_PCI_NUM_IRQS; 674 + 675 + ret = vfio_pci_info_zdev_add_caps(vdev, &caps); 676 + if (ret && ret != -ENODEV) { 677 + pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n"); 678 + return ret; 679 + } 680 + 681 + if (caps.size) { 682 + info.flags |= VFIO_DEVICE_FLAGS_CAPS; 683 + if (info.argsz < sizeof(info) + caps.size) { 684 + info.argsz = sizeof(info) + caps.size; 685 + } else { 686 + vfio_info_cap_shift(&caps, sizeof(info)); 687 + if (copy_to_user((void __user *)arg + 688 + sizeof(info), caps.buf, 689 + caps.size)) { 690 + kfree(caps.buf); 691 + return -EFAULT; 692 + } 693 + info.cap_offset = sizeof(info); 694 + } 695 + 696 + kfree(caps.buf); 697 + } 698 + 699 + return copy_to_user((void __user *)arg, &info, minsz) ? 700 + -EFAULT : 0; 701 + 702 + } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { 703 + struct pci_dev *pdev = vdev->pdev; 704 + struct vfio_region_info info; 705 + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 706 + int i, ret; 707 + 708 + minsz = offsetofend(struct vfio_region_info, offset); 709 + 710 + if (copy_from_user(&info, (void __user *)arg, minsz)) 711 + return -EFAULT; 712 + 713 + if (info.argsz < minsz) 714 + return -EINVAL; 715 + 716 + switch (info.index) { 717 + case VFIO_PCI_CONFIG_REGION_INDEX: 718 + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 719 + info.size = pdev->cfg_size; 720 + info.flags = VFIO_REGION_INFO_FLAG_READ | 721 + VFIO_REGION_INFO_FLAG_WRITE; 722 + break; 723 + case VFIO_PCI_BAR0_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: 724 + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 725 + info.size = pci_resource_len(pdev, info.index); 726 + if (!info.size) { 727 + info.flags = 0; 728 + break; 729 + } 730 + 731 + info.flags = VFIO_REGION_INFO_FLAG_READ | 732 + VFIO_REGION_INFO_FLAG_WRITE; 733 + if (vdev->bar_mmap_supported[info.index]) { 734 + info.flags |= VFIO_REGION_INFO_FLAG_MMAP; 735 + if (info.index == vdev->msix_bar) { 736 + ret = msix_mmappable_cap(vdev, &caps); 737 + if (ret) 738 + return ret; 739 + } 740 + } 741 + 742 + break; 743 + case VFIO_PCI_ROM_REGION_INDEX: 744 + { 745 + void __iomem *io; 746 + size_t size; 747 + u16 cmd; 748 + 749 + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 750 + info.flags = 0; 751 + 752 + /* Report the BAR size, not the ROM size */ 753 + info.size = pci_resource_len(pdev, info.index); 754 + if (!info.size) { 755 + /* Shadow ROMs appear as PCI option ROMs */ 756 + if (pdev->resource[PCI_ROM_RESOURCE].flags & 757 + IORESOURCE_ROM_SHADOW) 758 + info.size = 0x20000; 759 + else 760 + break; 761 + } 762 + 763 + /* 764 + * Is it really there? Enable memory decode for 765 + * implicit access in pci_map_rom(). 
766 + */ 767 + cmd = vfio_pci_memory_lock_and_enable(vdev); 768 + io = pci_map_rom(pdev, &size); 769 + if (io) { 770 + info.flags = VFIO_REGION_INFO_FLAG_READ; 771 + pci_unmap_rom(pdev, io); 772 + } else { 773 + info.size = 0; 774 + } 775 + vfio_pci_memory_unlock_and_restore(vdev, cmd); 776 + 777 + break; 778 + } 779 + case VFIO_PCI_VGA_REGION_INDEX: 780 + if (!vdev->has_vga) 781 + return -EINVAL; 782 + 783 + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 784 + info.size = 0xc0000; 785 + info.flags = VFIO_REGION_INFO_FLAG_READ | 786 + VFIO_REGION_INFO_FLAG_WRITE; 787 + 788 + break; 789 + default: 790 + { 791 + struct vfio_region_info_cap_type cap_type = { 792 + .header.id = VFIO_REGION_INFO_CAP_TYPE, 793 + .header.version = 1 }; 794 + 795 + if (info.index >= 796 + VFIO_PCI_NUM_REGIONS + vdev->num_regions) 797 + return -EINVAL; 798 + info.index = array_index_nospec(info.index, 799 + VFIO_PCI_NUM_REGIONS + 800 + vdev->num_regions); 801 + 802 + i = info.index - VFIO_PCI_NUM_REGIONS; 803 + 804 + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 805 + info.size = vdev->region[i].size; 806 + info.flags = vdev->region[i].flags; 807 + 808 + cap_type.type = vdev->region[i].type; 809 + cap_type.subtype = vdev->region[i].subtype; 810 + 811 + ret = vfio_info_add_capability(&caps, &cap_type.header, 812 + sizeof(cap_type)); 813 + if (ret) 814 + return ret; 815 + 816 + if (vdev->region[i].ops->add_capability) { 817 + ret = vdev->region[i].ops->add_capability(vdev, 818 + &vdev->region[i], &caps); 819 + if (ret) 820 + return ret; 821 + } 822 + } 823 + } 824 + 825 + if (caps.size) { 826 + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; 827 + if (info.argsz < sizeof(info) + caps.size) { 828 + info.argsz = sizeof(info) + caps.size; 829 + info.cap_offset = 0; 830 + } else { 831 + vfio_info_cap_shift(&caps, sizeof(info)); 832 + if (copy_to_user((void __user *)arg + 833 + sizeof(info), caps.buf, 834 + caps.size)) { 835 + kfree(caps.buf); 836 + return -EFAULT; 837 + } 838 + 
info.cap_offset = sizeof(info); 839 + } 840 + 841 + kfree(caps.buf); 842 + } 843 + 844 + return copy_to_user((void __user *)arg, &info, minsz) ? 845 + -EFAULT : 0; 846 + 847 + } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { 848 + struct vfio_irq_info info; 849 + 850 + minsz = offsetofend(struct vfio_irq_info, count); 851 + 852 + if (copy_from_user(&info, (void __user *)arg, minsz)) 853 + return -EFAULT; 854 + 855 + if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) 856 + return -EINVAL; 857 + 858 + switch (info.index) { 859 + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: 860 + case VFIO_PCI_REQ_IRQ_INDEX: 861 + break; 862 + case VFIO_PCI_ERR_IRQ_INDEX: 863 + if (pci_is_pcie(vdev->pdev)) 864 + break; 865 + fallthrough; 866 + default: 867 + return -EINVAL; 868 + } 869 + 870 + info.flags = VFIO_IRQ_INFO_EVENTFD; 871 + 872 + info.count = vfio_pci_get_irq_count(vdev, info.index); 873 + 874 + if (info.index == VFIO_PCI_INTX_IRQ_INDEX) 875 + info.flags |= (VFIO_IRQ_INFO_MASKABLE | 876 + VFIO_IRQ_INFO_AUTOMASKED); 877 + else 878 + info.flags |= VFIO_IRQ_INFO_NORESIZE; 879 + 880 + return copy_to_user((void __user *)arg, &info, minsz) ? 
881 + -EFAULT : 0; 882 + 883 + } else if (cmd == VFIO_DEVICE_SET_IRQS) { 884 + struct vfio_irq_set hdr; 885 + u8 *data = NULL; 886 + int max, ret = 0; 887 + size_t data_size = 0; 888 + 889 + minsz = offsetofend(struct vfio_irq_set, count); 890 + 891 + if (copy_from_user(&hdr, (void __user *)arg, minsz)) 892 + return -EFAULT; 893 + 894 + max = vfio_pci_get_irq_count(vdev, hdr.index); 895 + 896 + ret = vfio_set_irqs_validate_and_prepare(&hdr, max, 897 + VFIO_PCI_NUM_IRQS, &data_size); 898 + if (ret) 899 + return ret; 900 + 901 + if (data_size) { 902 + data = memdup_user((void __user *)(arg + minsz), 903 + data_size); 904 + if (IS_ERR(data)) 905 + return PTR_ERR(data); 906 + } 907 + 908 + mutex_lock(&vdev->igate); 909 + 910 + ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, 911 + hdr.start, hdr.count, data); 912 + 913 + mutex_unlock(&vdev->igate); 914 + kfree(data); 915 + 916 + return ret; 917 + 918 + } else if (cmd == VFIO_DEVICE_RESET) { 919 + int ret; 920 + 921 + if (!vdev->reset_works) 922 + return -EINVAL; 923 + 924 + vfio_pci_zap_and_down_write_memory_lock(vdev); 925 + ret = pci_try_reset_function(vdev->pdev); 926 + up_write(&vdev->memory_lock); 927 + 928 + return ret; 929 + 930 + } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { 931 + struct vfio_pci_hot_reset_info hdr; 932 + struct vfio_pci_fill_info fill = { 0 }; 933 + struct vfio_pci_dependent_device *devices = NULL; 934 + bool slot = false; 935 + int ret = 0; 936 + 937 + minsz = offsetofend(struct vfio_pci_hot_reset_info, count); 938 + 939 + if (copy_from_user(&hdr, (void __user *)arg, minsz)) 940 + return -EFAULT; 941 + 942 + if (hdr.argsz < minsz) 943 + return -EINVAL; 944 + 945 + hdr.flags = 0; 946 + 947 + /* Can we do a slot or bus reset or neither? */ 948 + if (!pci_probe_reset_slot(vdev->pdev->slot)) 949 + slot = true; 950 + else if (pci_probe_reset_bus(vdev->pdev->bus)) 951 + return -ENODEV; 952 + 953 + /* How many devices are affected? 
*/ 954 + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 955 + vfio_pci_count_devs, 956 + &fill.max, slot); 957 + if (ret) 958 + return ret; 959 + 960 + WARN_ON(!fill.max); /* Should always be at least one */ 961 + 962 + /* 963 + * If there's enough space, fill it now, otherwise return 964 + * -ENOSPC and the number of devices affected. 965 + */ 966 + if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { 967 + ret = -ENOSPC; 968 + hdr.count = fill.max; 969 + goto reset_info_exit; 970 + } 971 + 972 + devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); 973 + if (!devices) 974 + return -ENOMEM; 975 + 976 + fill.devices = devices; 977 + 978 + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 979 + vfio_pci_fill_devs, 980 + &fill, slot); 981 + 982 + /* 983 + * If a device was removed between counting and filling, 984 + * we may come up short of fill.max. If a device was 985 + * added, we'll have a return of -EAGAIN above. 986 + */ 987 + if (!ret) 988 + hdr.count = fill.cur; 989 + 990 + reset_info_exit: 991 + if (copy_to_user((void __user *)arg, &hdr, minsz)) 992 + ret = -EFAULT; 993 + 994 + if (!ret) { 995 + if (copy_to_user((void __user *)(arg + minsz), devices, 996 + hdr.count * sizeof(*devices))) 997 + ret = -EFAULT; 998 + } 999 + 1000 + kfree(devices); 1001 + return ret; 1002 + 1003 + } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { 1004 + struct vfio_pci_hot_reset hdr; 1005 + int32_t *group_fds; 1006 + struct vfio_group **groups; 1007 + struct vfio_pci_group_info info; 1008 + bool slot = false; 1009 + int group_idx, count = 0, ret = 0; 1010 + 1011 + minsz = offsetofend(struct vfio_pci_hot_reset, count); 1012 + 1013 + if (copy_from_user(&hdr, (void __user *)arg, minsz)) 1014 + return -EFAULT; 1015 + 1016 + if (hdr.argsz < minsz || hdr.flags) 1017 + return -EINVAL; 1018 + 1019 + /* Can we do a slot or bus reset or neither? 
*/ 1020 + if (!pci_probe_reset_slot(vdev->pdev->slot)) 1021 + slot = true; 1022 + else if (pci_probe_reset_bus(vdev->pdev->bus)) 1023 + return -ENODEV; 1024 + 1025 + /* 1026 + * We can't let userspace give us an arbitrarily large 1027 + * buffer to copy, so verify how many we think there 1028 + * could be. Note groups can have multiple devices so 1029 + * one group per device is the max. 1030 + */ 1031 + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, 1032 + vfio_pci_count_devs, 1033 + &count, slot); 1034 + if (ret) 1035 + return ret; 1036 + 1037 + /* Somewhere between 1 and count is OK */ 1038 + if (!hdr.count || hdr.count > count) 1039 + return -EINVAL; 1040 + 1041 + group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); 1042 + groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL); 1043 + if (!group_fds || !groups) { 1044 + kfree(group_fds); 1045 + kfree(groups); 1046 + return -ENOMEM; 1047 + } 1048 + 1049 + if (copy_from_user(group_fds, (void __user *)(arg + minsz), 1050 + hdr.count * sizeof(*group_fds))) { 1051 + kfree(group_fds); 1052 + kfree(groups); 1053 + return -EFAULT; 1054 + } 1055 + 1056 + /* 1057 + * For each group_fd, get the group through the vfio external 1058 + * user interface and store the group and iommu ID. This 1059 + * ensures the group is held across the reset. 
1060 + */ 1061 + for (group_idx = 0; group_idx < hdr.count; group_idx++) { 1062 + struct vfio_group *group; 1063 + struct fd f = fdget(group_fds[group_idx]); 1064 + if (!f.file) { 1065 + ret = -EBADF; 1066 + break; 1067 + } 1068 + 1069 + group = vfio_group_get_external_user(f.file); 1070 + fdput(f); 1071 + if (IS_ERR(group)) { 1072 + ret = PTR_ERR(group); 1073 + break; 1074 + } 1075 + 1076 + groups[group_idx] = group; 1077 + } 1078 + 1079 + kfree(group_fds); 1080 + 1081 + /* release reference to groups on error */ 1082 + if (ret) 1083 + goto hot_reset_release; 1084 + 1085 + info.count = hdr.count; 1086 + info.groups = groups; 1087 + 1088 + ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); 1089 + 1090 + hot_reset_release: 1091 + for (group_idx--; group_idx >= 0; group_idx--) 1092 + vfio_group_put_external_user(groups[group_idx]); 1093 + 1094 + kfree(groups); 1095 + return ret; 1096 + } else if (cmd == VFIO_DEVICE_IOEVENTFD) { 1097 + struct vfio_device_ioeventfd ioeventfd; 1098 + int count; 1099 + 1100 + minsz = offsetofend(struct vfio_device_ioeventfd, fd); 1101 + 1102 + if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) 1103 + return -EFAULT; 1104 + 1105 + if (ioeventfd.argsz < minsz) 1106 + return -EINVAL; 1107 + 1108 + if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) 1109 + return -EINVAL; 1110 + 1111 + count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; 1112 + 1113 + if (hweight8(count) != 1 || ioeventfd.fd < -1) 1114 + return -EINVAL; 1115 + 1116 + return vfio_pci_ioeventfd(vdev, ioeventfd.offset, 1117 + ioeventfd.data, count, ioeventfd.fd); 1118 + } else if (cmd == VFIO_DEVICE_FEATURE) { 1119 + struct vfio_device_feature feature; 1120 + uuid_t uuid; 1121 + 1122 + minsz = offsetofend(struct vfio_device_feature, flags); 1123 + 1124 + if (copy_from_user(&feature, (void __user *)arg, minsz)) 1125 + return -EFAULT; 1126 + 1127 + if (feature.argsz < minsz) 1128 + return -EINVAL; 1129 + 1130 + /* Check unknown flags */ 1131 + if 
(feature.flags & ~(VFIO_DEVICE_FEATURE_MASK | 1132 + VFIO_DEVICE_FEATURE_SET | 1133 + VFIO_DEVICE_FEATURE_GET | 1134 + VFIO_DEVICE_FEATURE_PROBE)) 1135 + return -EINVAL; 1136 + 1137 + /* GET & SET are mutually exclusive except with PROBE */ 1138 + if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) && 1139 + (feature.flags & VFIO_DEVICE_FEATURE_SET) && 1140 + (feature.flags & VFIO_DEVICE_FEATURE_GET)) 1141 + return -EINVAL; 1142 + 1143 + switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) { 1144 + case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: 1145 + if (!vdev->vf_token) 1146 + return -ENOTTY; 1147 + 1148 + /* 1149 + * We do not support GET of the VF Token UUID as this 1150 + * could expose the token of the previous device user. 1151 + */ 1152 + if (feature.flags & VFIO_DEVICE_FEATURE_GET) 1153 + return -EINVAL; 1154 + 1155 + if (feature.flags & VFIO_DEVICE_FEATURE_PROBE) 1156 + return 0; 1157 + 1158 + /* Don't SET unless told to do so */ 1159 + if (!(feature.flags & VFIO_DEVICE_FEATURE_SET)) 1160 + return -EINVAL; 1161 + 1162 + if (feature.argsz < minsz + sizeof(uuid)) 1163 + return -EINVAL; 1164 + 1165 + if (copy_from_user(&uuid, (void __user *)(arg + minsz), 1166 + sizeof(uuid))) 1167 + return -EFAULT; 1168 + 1169 + mutex_lock(&vdev->vf_token->lock); 1170 + uuid_copy(&vdev->vf_token->uuid, &uuid); 1171 + mutex_unlock(&vdev->vf_token->lock); 1172 + 1173 + return 0; 1174 + default: 1175 + return -ENOTTY; 1176 + } 1177 + } 1178 + 1179 + return -ENOTTY; 1180 + } 1181 + EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl); 1182 + 1183 + static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf, 1184 + size_t count, loff_t *ppos, bool iswrite) 1185 + { 1186 + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 1187 + 1188 + if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) 1189 + return -EINVAL; 1190 + 1191 + switch (index) { 1192 + case VFIO_PCI_CONFIG_REGION_INDEX: 1193 + return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); 1194 + 1195 + case 
VFIO_PCI_ROM_REGION_INDEX: 1196 + if (iswrite) 1197 + return -EINVAL; 1198 + return vfio_pci_bar_rw(vdev, buf, count, ppos, false); 1199 + 1200 + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: 1201 + return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); 1202 + 1203 + case VFIO_PCI_VGA_REGION_INDEX: 1204 + return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); 1205 + default: 1206 + index -= VFIO_PCI_NUM_REGIONS; 1207 + return vdev->region[index].ops->rw(vdev, buf, 1208 + count, ppos, iswrite); 1209 + } 1210 + 1211 + return -EINVAL; 1212 + } 1213 + 1214 + ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, 1215 + size_t count, loff_t *ppos) 1216 + { 1217 + struct vfio_pci_core_device *vdev = 1218 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 1219 + 1220 + if (!count) 1221 + return 0; 1222 + 1223 + return vfio_pci_rw(vdev, buf, count, ppos, false); 1224 + } 1225 + EXPORT_SYMBOL_GPL(vfio_pci_core_read); 1226 + 1227 + ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, 1228 + size_t count, loff_t *ppos) 1229 + { 1230 + struct vfio_pci_core_device *vdev = 1231 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 1232 + 1233 + if (!count) 1234 + return 0; 1235 + 1236 + return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true); 1237 + } 1238 + EXPORT_SYMBOL_GPL(vfio_pci_core_write); 1239 + 1240 + /* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ 1241 + static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try) 1242 + { 1243 + struct vfio_pci_mmap_vma *mmap_vma, *tmp; 1244 + 1245 + /* 1246 + * Lock ordering: 1247 + * vma_lock is nested under mmap_lock for vm_ops callback paths. 1248 + * The memory_lock semaphore is used by both code paths calling 1249 + * into this function to zap vmas and the vm_ops.fault callback 1250 + * to protect the memory enable state of the device. 
1251 + * 1252 + * When zapping vmas we need to maintain the mmap_lock => vma_lock 1253 + * ordering, which requires using vma_lock to walk vma_list to 1254 + * acquire an mm, then dropping vma_lock to get the mmap_lock and 1255 + * reacquiring vma_lock. This logic is derived from similar 1256 + * requirements in uverbs_user_mmap_disassociate(). 1257 + * 1258 + * mmap_lock must always be the top-level lock when it is taken. 1259 + * Therefore we can only hold the memory_lock write lock when 1260 + * vma_list is empty, as we'd need to take mmap_lock to clear 1261 + * entries. vma_list can only be guaranteed empty when holding 1262 + * vma_lock, thus memory_lock is nested under vma_lock. 1263 + * 1264 + * This enables the vm_ops.fault callback to acquire vma_lock, 1265 + * followed by memory_lock read lock, while already holding 1266 + * mmap_lock without risk of deadlock. 1267 + */ 1268 + while (1) { 1269 + struct mm_struct *mm = NULL; 1270 + 1271 + if (try) { 1272 + if (!mutex_trylock(&vdev->vma_lock)) 1273 + return 0; 1274 + } else { 1275 + mutex_lock(&vdev->vma_lock); 1276 + } 1277 + while (!list_empty(&vdev->vma_list)) { 1278 + mmap_vma = list_first_entry(&vdev->vma_list, 1279 + struct vfio_pci_mmap_vma, 1280 + vma_next); 1281 + mm = mmap_vma->vma->vm_mm; 1282 + if (mmget_not_zero(mm)) 1283 + break; 1284 + 1285 + list_del(&mmap_vma->vma_next); 1286 + kfree(mmap_vma); 1287 + mm = NULL; 1288 + } 1289 + if (!mm) 1290 + return 1; 1291 + mutex_unlock(&vdev->vma_lock); 1292 + 1293 + if (try) { 1294 + if (!mmap_read_trylock(mm)) { 1295 + mmput(mm); 1296 + return 0; 1297 + } 1298 + } else { 1299 + mmap_read_lock(mm); 1300 + } 1301 + if (try) { 1302 + if (!mutex_trylock(&vdev->vma_lock)) { 1303 + mmap_read_unlock(mm); 1304 + mmput(mm); 1305 + return 0; 1306 + } 1307 + } else { 1308 + mutex_lock(&vdev->vma_lock); 1309 + } 1310 + list_for_each_entry_safe(mmap_vma, tmp, 1311 + &vdev->vma_list, vma_next) { 1312 + struct vm_area_struct *vma = mmap_vma->vma; 1313 + 1314 + if 
(vma->vm_mm != mm) 1315 + continue; 1316 + 1317 + list_del(&mmap_vma->vma_next); 1318 + kfree(mmap_vma); 1319 + 1320 + zap_vma_ptes(vma, vma->vm_start, 1321 + vma->vm_end - vma->vm_start); 1322 + } 1323 + mutex_unlock(&vdev->vma_lock); 1324 + mmap_read_unlock(mm); 1325 + mmput(mm); 1326 + } 1327 + } 1328 + 1329 + void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev) 1330 + { 1331 + vfio_pci_zap_and_vma_lock(vdev, false); 1332 + down_write(&vdev->memory_lock); 1333 + mutex_unlock(&vdev->vma_lock); 1334 + } 1335 + 1336 + u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev) 1337 + { 1338 + u16 cmd; 1339 + 1340 + down_write(&vdev->memory_lock); 1341 + pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd); 1342 + if (!(cmd & PCI_COMMAND_MEMORY)) 1343 + pci_write_config_word(vdev->pdev, PCI_COMMAND, 1344 + cmd | PCI_COMMAND_MEMORY); 1345 + 1346 + return cmd; 1347 + } 1348 + 1349 + void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd) 1350 + { 1351 + pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd); 1352 + up_write(&vdev->memory_lock); 1353 + } 1354 + 1355 + /* Caller holds vma_lock */ 1356 + static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev, 1357 + struct vm_area_struct *vma) 1358 + { 1359 + struct vfio_pci_mmap_vma *mmap_vma; 1360 + 1361 + mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL); 1362 + if (!mmap_vma) 1363 + return -ENOMEM; 1364 + 1365 + mmap_vma->vma = vma; 1366 + list_add(&mmap_vma->vma_next, &vdev->vma_list); 1367 + 1368 + return 0; 1369 + } 1370 + 1371 + /* 1372 + * Zap mmaps on open so that we can fault them in on access and therefore 1373 + * our vma_list only tracks mappings accessed since last zap. 
1374 + */ 1375 + static void vfio_pci_mmap_open(struct vm_area_struct *vma) 1376 + { 1377 + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1378 + } 1379 + 1380 + static void vfio_pci_mmap_close(struct vm_area_struct *vma) 1381 + { 1382 + struct vfio_pci_core_device *vdev = vma->vm_private_data; 1383 + struct vfio_pci_mmap_vma *mmap_vma; 1384 + 1385 + mutex_lock(&vdev->vma_lock); 1386 + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { 1387 + if (mmap_vma->vma == vma) { 1388 + list_del(&mmap_vma->vma_next); 1389 + kfree(mmap_vma); 1390 + break; 1391 + } 1392 + } 1393 + mutex_unlock(&vdev->vma_lock); 1394 + } 1395 + 1396 + static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) 1397 + { 1398 + struct vm_area_struct *vma = vmf->vma; 1399 + struct vfio_pci_core_device *vdev = vma->vm_private_data; 1400 + struct vfio_pci_mmap_vma *mmap_vma; 1401 + vm_fault_t ret = VM_FAULT_NOPAGE; 1402 + 1403 + mutex_lock(&vdev->vma_lock); 1404 + down_read(&vdev->memory_lock); 1405 + 1406 + if (!__vfio_pci_memory_enabled(vdev)) { 1407 + ret = VM_FAULT_SIGBUS; 1408 + goto up_out; 1409 + } 1410 + 1411 + /* 1412 + * We populate the whole vma on fault, so we need to test whether 1413 + * the vma has already been mapped, such as for concurrent faults 1414 + * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if 1415 + * we ask it to fill the same range again. 
1416 + */ 1417 + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { 1418 + if (mmap_vma->vma == vma) 1419 + goto up_out; 1420 + } 1421 + 1422 + if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, 1423 + vma->vm_end - vma->vm_start, 1424 + vma->vm_page_prot)) { 1425 + ret = VM_FAULT_SIGBUS; 1426 + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1427 + goto up_out; 1428 + } 1429 + 1430 + if (__vfio_pci_add_vma(vdev, vma)) { 1431 + ret = VM_FAULT_OOM; 1432 + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); 1433 + } 1434 + 1435 + up_out: 1436 + up_read(&vdev->memory_lock); 1437 + mutex_unlock(&vdev->vma_lock); 1438 + return ret; 1439 + } 1440 + 1441 + static const struct vm_operations_struct vfio_pci_mmap_ops = { 1442 + .open = vfio_pci_mmap_open, 1443 + .close = vfio_pci_mmap_close, 1444 + .fault = vfio_pci_mmap_fault, 1445 + }; 1446 + 1447 + int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) 1448 + { 1449 + struct vfio_pci_core_device *vdev = 1450 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 1451 + struct pci_dev *pdev = vdev->pdev; 1452 + unsigned int index; 1453 + u64 phys_len, req_len, pgoff, req_start; 1454 + int ret; 1455 + 1456 + index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 1457 + 1458 + if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) 1459 + return -EINVAL; 1460 + if (vma->vm_end < vma->vm_start) 1461 + return -EINVAL; 1462 + if ((vma->vm_flags & VM_SHARED) == 0) 1463 + return -EINVAL; 1464 + if (index >= VFIO_PCI_NUM_REGIONS) { 1465 + int regnum = index - VFIO_PCI_NUM_REGIONS; 1466 + struct vfio_pci_region *region = vdev->region + regnum; 1467 + 1468 + if (region->ops && region->ops->mmap && 1469 + (region->flags & VFIO_REGION_INFO_FLAG_MMAP)) 1470 + return region->ops->mmap(vdev, region, vma); 1471 + return -EINVAL; 1472 + } 1473 + if (index >= VFIO_PCI_ROM_REGION_INDEX) 1474 + return -EINVAL; 1475 + if (!vdev->bar_mmap_supported[index]) 1476 + 
return -EINVAL; 1477 + 1478 + phys_len = PAGE_ALIGN(pci_resource_len(pdev, index)); 1479 + req_len = vma->vm_end - vma->vm_start; 1480 + pgoff = vma->vm_pgoff & 1481 + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 1482 + req_start = pgoff << PAGE_SHIFT; 1483 + 1484 + if (req_start + req_len > phys_len) 1485 + return -EINVAL; 1486 + 1487 + /* 1488 + * Even though we don't make use of the barmap for the mmap, 1489 + * we need to request the region and the barmap tracks that. 1490 + */ 1491 + if (!vdev->barmap[index]) { 1492 + ret = pci_request_selected_regions(pdev, 1493 + 1 << index, "vfio-pci"); 1494 + if (ret) 1495 + return ret; 1496 + 1497 + vdev->barmap[index] = pci_iomap(pdev, index, 0); 1498 + if (!vdev->barmap[index]) { 1499 + pci_release_selected_regions(pdev, 1 << index); 1500 + return -ENOMEM; 1501 + } 1502 + } 1503 + 1504 + vma->vm_private_data = vdev; 1505 + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 1506 + vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; 1507 + 1508 + /* 1509 + * See remap_pfn_range(), called from vfio_pci_fault() but we can't 1510 + * change vm_flags within the fault handler. Set them now. 
1511 + */ 1512 + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; 1513 + vma->vm_ops = &vfio_pci_mmap_ops; 1514 + 1515 + return 0; 1516 + } 1517 + EXPORT_SYMBOL_GPL(vfio_pci_core_mmap); 1518 + 1519 + void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) 1520 + { 1521 + struct vfio_pci_core_device *vdev = 1522 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 1523 + struct pci_dev *pdev = vdev->pdev; 1524 + 1525 + mutex_lock(&vdev->igate); 1526 + 1527 + if (vdev->req_trigger) { 1528 + if (!(count % 10)) 1529 + pci_notice_ratelimited(pdev, 1530 + "Relaying device request to user (#%u)\n", 1531 + count); 1532 + eventfd_signal(vdev->req_trigger, 1); 1533 + } else if (count == 0) { 1534 + pci_warn(pdev, 1535 + "No device request channel registered, blocked until released by user\n"); 1536 + } 1537 + 1538 + mutex_unlock(&vdev->igate); 1539 + } 1540 + EXPORT_SYMBOL_GPL(vfio_pci_core_request); 1541 + 1542 + static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, 1543 + bool vf_token, uuid_t *uuid) 1544 + { 1545 + /* 1546 + * There's always some degree of trust or collaboration between SR-IOV 1547 + * PF and VFs, even if just that the PF hosts the SR-IOV capability and 1548 + * can disrupt VFs with a reset, but often the PF has more explicit 1549 + * access to deny service to the VF or access data passed through the 1550 + * VF. We therefore require an opt-in via a shared VF token (UUID) to 1551 + * represent this trust. This both prevents that a VF driver might 1552 + * assume the PF driver is a trusted, in-kernel driver, and also that 1553 + * a PF driver might be replaced with a rogue driver, unknown to in-use 1554 + * VF drivers. 
1555 + * 1556 + * Therefore when presented with a VF, if the PF is a vfio device and 1557 + * it is bound to the vfio-pci driver, the user needs to provide a VF 1558 + * token to access the device, in the form of appending a vf_token to 1559 + * the device name, for example: 1560 + * 1561 + * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3" 1562 + * 1563 + * When presented with a PF which has VFs in use, the user must also 1564 + * provide the current VF token to prove collaboration with existing 1565 + * VF users. If VFs are not in use, the VF token provided for the PF 1566 + * device will act to set the VF token. 1567 + * 1568 + * If the VF token is provided but unused, an error is generated. 1569 + */ 1570 + if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token) 1571 + return 0; /* No VF token provided or required */ 1572 + 1573 + if (vdev->pdev->is_virtfn) { 1574 + struct vfio_pci_core_device *pf_vdev = get_pf_vdev(vdev); 1575 + bool match; 1576 + 1577 + if (!pf_vdev) { 1578 + if (!vf_token) 1579 + return 0; /* PF is not vfio-pci, no VF token */ 1580 + 1581 + pci_info_ratelimited(vdev->pdev, 1582 + "VF token incorrectly provided, PF not bound to vfio-pci\n"); 1583 + return -EINVAL; 1584 + } 1585 + 1586 + if (!vf_token) { 1587 + vfio_device_put(&pf_vdev->vdev); 1588 + pci_info_ratelimited(vdev->pdev, 1589 + "VF token required to access device\n"); 1590 + return -EACCES; 1591 + } 1592 + 1593 + mutex_lock(&pf_vdev->vf_token->lock); 1594 + match = uuid_equal(uuid, &pf_vdev->vf_token->uuid); 1595 + mutex_unlock(&pf_vdev->vf_token->lock); 1596 + 1597 + vfio_device_put(&pf_vdev->vdev); 1598 + 1599 + if (!match) { 1600 + pci_info_ratelimited(vdev->pdev, 1601 + "Incorrect VF token provided for device\n"); 1602 + return -EACCES; 1603 + } 1604 + } else if (vdev->vf_token) { 1605 + mutex_lock(&vdev->vf_token->lock); 1606 + if (vdev->vf_token->users) { 1607 + if (!vf_token) { 1608 + mutex_unlock(&vdev->vf_token->lock); 1609 + 
pci_info_ratelimited(vdev->pdev, 1610 + "VF token required to access device\n"); 1611 + return -EACCES; 1612 + } 1613 + 1614 + if (!uuid_equal(uuid, &vdev->vf_token->uuid)) { 1615 + mutex_unlock(&vdev->vf_token->lock); 1616 + pci_info_ratelimited(vdev->pdev, 1617 + "Incorrect VF token provided for device\n"); 1618 + return -EACCES; 1619 + } 1620 + } else if (vf_token) { 1621 + uuid_copy(&vdev->vf_token->uuid, uuid); 1622 + } 1623 + 1624 + mutex_unlock(&vdev->vf_token->lock); 1625 + } else if (vf_token) { 1626 + pci_info_ratelimited(vdev->pdev, 1627 + "VF token incorrectly provided, not a PF or VF\n"); 1628 + return -EINVAL; 1629 + } 1630 + 1631 + return 0; 1632 + } 1633 + 1634 + #define VF_TOKEN_ARG "vf_token=" 1635 + 1636 + int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf) 1637 + { 1638 + struct vfio_pci_core_device *vdev = 1639 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 1640 + bool vf_token = false; 1641 + uuid_t uuid; 1642 + int ret; 1643 + 1644 + if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev)))) 1645 + return 0; /* No match */ 1646 + 1647 + if (strlen(buf) > strlen(pci_name(vdev->pdev))) { 1648 + buf += strlen(pci_name(vdev->pdev)); 1649 + 1650 + if (*buf != ' ') 1651 + return 0; /* No match: non-whitespace after name */ 1652 + 1653 + while (*buf) { 1654 + if (*buf == ' ') { 1655 + buf++; 1656 + continue; 1657 + } 1658 + 1659 + if (!vf_token && !strncmp(buf, VF_TOKEN_ARG, 1660 + strlen(VF_TOKEN_ARG))) { 1661 + buf += strlen(VF_TOKEN_ARG); 1662 + 1663 + if (strlen(buf) < UUID_STRING_LEN) 1664 + return -EINVAL; 1665 + 1666 + ret = uuid_parse(buf, &uuid); 1667 + if (ret) 1668 + return ret; 1669 + 1670 + vf_token = true; 1671 + buf += UUID_STRING_LEN; 1672 + } else { 1673 + /* Unknown/duplicate option */ 1674 + return -EINVAL; 1675 + } 1676 + } 1677 + } 1678 + 1679 + ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid); 1680 + if (ret) 1681 + return ret; 1682 + 1683 + return 1; /* Match */ 1684 + } 1685 + 
EXPORT_SYMBOL_GPL(vfio_pci_core_match); 1686 + 1687 + static int vfio_pci_bus_notifier(struct notifier_block *nb, 1688 + unsigned long action, void *data) 1689 + { 1690 + struct vfio_pci_core_device *vdev = container_of(nb, 1691 + struct vfio_pci_core_device, nb); 1692 + struct device *dev = data; 1693 + struct pci_dev *pdev = to_pci_dev(dev); 1694 + struct pci_dev *physfn = pci_physfn(pdev); 1695 + 1696 + if (action == BUS_NOTIFY_ADD_DEVICE && 1697 + pdev->is_virtfn && physfn == vdev->pdev) { 1698 + pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n", 1699 + pci_name(pdev)); 1700 + pdev->driver_override = kasprintf(GFP_KERNEL, "%s", 1701 + vdev->vdev.ops->name); 1702 + } else if (action == BUS_NOTIFY_BOUND_DRIVER && 1703 + pdev->is_virtfn && physfn == vdev->pdev) { 1704 + struct pci_driver *drv = pci_dev_driver(pdev); 1705 + 1706 + if (drv && drv != pci_dev_driver(vdev->pdev)) 1707 + pci_warn(vdev->pdev, 1708 + "VF %s bound to driver %s while PF bound to driver %s\n", 1709 + pci_name(pdev), drv->name, 1710 + pci_dev_driver(vdev->pdev)->name); 1711 + } 1712 + 1713 + return 0; 1714 + } 1715 + 1716 + static int vfio_pci_vf_init(struct vfio_pci_core_device *vdev) 1717 + { 1718 + struct pci_dev *pdev = vdev->pdev; 1719 + int ret; 1720 + 1721 + if (!pdev->is_physfn) 1722 + return 0; 1723 + 1724 + vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL); 1725 + if (!vdev->vf_token) 1726 + return -ENOMEM; 1727 + 1728 + mutex_init(&vdev->vf_token->lock); 1729 + uuid_gen(&vdev->vf_token->uuid); 1730 + 1731 + vdev->nb.notifier_call = vfio_pci_bus_notifier; 1732 + ret = bus_register_notifier(&pci_bus_type, &vdev->nb); 1733 + if (ret) { 1734 + kfree(vdev->vf_token); 1735 + return ret; 1736 + } 1737 + return 0; 1738 + } 1739 + 1740 + static void vfio_pci_vf_uninit(struct vfio_pci_core_device *vdev) 1741 + { 1742 + if (!vdev->vf_token) 1743 + return; 1744 + 1745 + bus_unregister_notifier(&pci_bus_type, &vdev->nb); 1746 + WARN_ON(vdev->vf_token->users); 1747 + 
mutex_destroy(&vdev->vf_token->lock); 1748 + kfree(vdev->vf_token); 1749 + } 1750 + 1751 + static int vfio_pci_vga_init(struct vfio_pci_core_device *vdev) 1752 + { 1753 + struct pci_dev *pdev = vdev->pdev; 1754 + int ret; 1755 + 1756 + if (!vfio_pci_is_vga(pdev)) 1757 + return 0; 1758 + 1759 + ret = vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode); 1760 + if (ret) 1761 + return ret; 1762 + vga_set_legacy_decoding(pdev, vfio_pci_set_vga_decode(vdev, false)); 1763 + return 0; 1764 + } 1765 + 1766 + static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev) 1767 + { 1768 + struct pci_dev *pdev = vdev->pdev; 1769 + 1770 + if (!vfio_pci_is_vga(pdev)) 1771 + return; 1772 + vga_client_register(pdev, NULL, NULL, NULL); 1773 + vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | 1774 + VGA_RSRC_LEGACY_IO | 1775 + VGA_RSRC_LEGACY_MEM); 1776 + } 1777 + 1778 + void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, 1779 + struct pci_dev *pdev, 1780 + const struct vfio_device_ops *vfio_pci_ops) 1781 + { 1782 + vfio_init_group_dev(&vdev->vdev, &pdev->dev, vfio_pci_ops); 1783 + vdev->pdev = pdev; 1784 + vdev->irq_type = VFIO_PCI_NUM_IRQS; 1785 + mutex_init(&vdev->igate); 1786 + spin_lock_init(&vdev->irqlock); 1787 + mutex_init(&vdev->ioeventfds_lock); 1788 + INIT_LIST_HEAD(&vdev->dummy_resources_list); 1789 + INIT_LIST_HEAD(&vdev->ioeventfds_list); 1790 + mutex_init(&vdev->vma_lock); 1791 + INIT_LIST_HEAD(&vdev->vma_list); 1792 + init_rwsem(&vdev->memory_lock); 1793 + } 1794 + EXPORT_SYMBOL_GPL(vfio_pci_core_init_device); 1795 + 1796 + void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev) 1797 + { 1798 + mutex_destroy(&vdev->igate); 1799 + mutex_destroy(&vdev->ioeventfds_lock); 1800 + mutex_destroy(&vdev->vma_lock); 1801 + vfio_uninit_group_dev(&vdev->vdev); 1802 + kfree(vdev->region); 1803 + kfree(vdev->pm_save); 1804 + } 1805 + EXPORT_SYMBOL_GPL(vfio_pci_core_uninit_device); 1806 + 1807 + int 
vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) 1808 + { 1809 + struct pci_dev *pdev = vdev->pdev; 1810 + struct iommu_group *group; 1811 + int ret; 1812 + 1813 + if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) 1814 + return -EINVAL; 1815 + 1816 + /* 1817 + * Prevent binding to PFs with VFs enabled, the VFs might be in use 1818 + * by the host or other users. We cannot capture the VFs if they 1819 + * already exist, nor can we track VF users. Disabling SR-IOV here 1820 + * would initiate removing the VFs, which would unbind the driver, 1821 + * which is prone to blocking if that VF is also in use by vfio-pci. 1822 + * Just reject these PFs and let the user sort it out. 1823 + */ 1824 + if (pci_num_vf(pdev)) { 1825 + pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n"); 1826 + return -EBUSY; 1827 + } 1828 + 1829 + group = vfio_iommu_group_get(&pdev->dev); 1830 + if (!group) 1831 + return -EINVAL; 1832 + 1833 + if (pci_is_root_bus(pdev->bus)) { 1834 + ret = vfio_assign_device_set(&vdev->vdev, vdev); 1835 + } else if (!pci_probe_reset_slot(pdev->slot)) { 1836 + ret = vfio_assign_device_set(&vdev->vdev, pdev->slot); 1837 + } else { 1838 + /* 1839 + * If there is no slot reset support for this device, the whole 1840 + * bus needs to be grouped together to support bus-wide resets. 1841 + */ 1842 + ret = vfio_assign_device_set(&vdev->vdev, pdev->bus); 1843 + } 1844 + 1845 + if (ret) 1846 + goto out_group_put; 1847 + ret = vfio_pci_vf_init(vdev); 1848 + if (ret) 1849 + goto out_group_put; 1850 + ret = vfio_pci_vga_init(vdev); 1851 + if (ret) 1852 + goto out_vf; 1853 + 1854 + vfio_pci_probe_power_state(vdev); 1855 + 1856 + if (!disable_idle_d3) { 1857 + /* 1858 + * pci-core sets the device power state to an unknown value at 1859 + * bootup and after being removed from a driver. The only 1860 + * transition it allows from this unknown state is to D0, which 1861 + * typically happens when a driver calls pci_enable_device(). 
1862 + * We're not ready to enable the device yet, but we do want to 1863 + * be able to get to D3. Therefore first do a D0 transition 1864 + * before going to D3. 1865 + */ 1866 + vfio_pci_set_power_state(vdev, PCI_D0); 1867 + vfio_pci_set_power_state(vdev, PCI_D3hot); 1868 + } 1869 + 1870 + ret = vfio_register_group_dev(&vdev->vdev); 1871 + if (ret) 1872 + goto out_power; 1873 + return 0; 1874 + 1875 + out_power: 1876 + if (!disable_idle_d3) 1877 + vfio_pci_set_power_state(vdev, PCI_D0); 1878 + out_vf: 1879 + vfio_pci_vf_uninit(vdev); 1880 + out_group_put: 1881 + vfio_iommu_group_put(group, &pdev->dev); 1882 + return ret; 1883 + } 1884 + EXPORT_SYMBOL_GPL(vfio_pci_core_register_device); 1885 + 1886 + void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) 1887 + { 1888 + struct pci_dev *pdev = vdev->pdev; 1889 + 1890 + pci_disable_sriov(pdev); 1891 + 1892 + vfio_unregister_group_dev(&vdev->vdev); 1893 + 1894 + vfio_pci_vf_uninit(vdev); 1895 + vfio_pci_vga_uninit(vdev); 1896 + 1897 + vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); 1898 + 1899 + if (!disable_idle_d3) 1900 + vfio_pci_set_power_state(vdev, PCI_D0); 1901 + } 1902 + EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device); 1903 + 1904 + static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, 1905 + pci_channel_state_t state) 1906 + { 1907 + struct vfio_pci_core_device *vdev; 1908 + struct vfio_device *device; 1909 + 1910 + device = vfio_device_get_from_dev(&pdev->dev); 1911 + if (device == NULL) 1912 + return PCI_ERS_RESULT_DISCONNECT; 1913 + 1914 + vdev = container_of(device, struct vfio_pci_core_device, vdev); 1915 + 1916 + mutex_lock(&vdev->igate); 1917 + 1918 + if (vdev->err_trigger) 1919 + eventfd_signal(vdev->err_trigger, 1); 1920 + 1921 + mutex_unlock(&vdev->igate); 1922 + 1923 + vfio_device_put(device); 1924 + 1925 + return PCI_ERS_RESULT_CAN_RECOVER; 1926 + } 1927 + 1928 + int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn) 1929 + { 1930 + 
struct vfio_device *device; 1931 + int ret = 0; 1932 + 1933 + device = vfio_device_get_from_dev(&pdev->dev); 1934 + if (!device) 1935 + return -ENODEV; 1936 + 1937 + if (nr_virtfn == 0) 1938 + pci_disable_sriov(pdev); 1939 + else 1940 + ret = pci_enable_sriov(pdev, nr_virtfn); 1941 + 1942 + vfio_device_put(device); 1943 + 1944 + return ret < 0 ? ret : nr_virtfn; 1945 + } 1946 + EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure); 1947 + 1948 + const struct pci_error_handlers vfio_pci_core_err_handlers = { 1949 + .error_detected = vfio_pci_aer_err_detected, 1950 + }; 1951 + EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers); 1952 + 1953 + static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev, 1954 + struct vfio_pci_group_info *groups) 1955 + { 1956 + unsigned int i; 1957 + 1958 + for (i = 0; i < groups->count; i++) 1959 + if (groups->groups[i] == vdev->vdev.group) 1960 + return true; 1961 + return false; 1962 + } 1963 + 1964 + static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data) 1965 + { 1966 + struct vfio_device_set *dev_set = data; 1967 + struct vfio_device *cur; 1968 + 1969 + list_for_each_entry(cur, &dev_set->device_list, dev_set_list) 1970 + if (cur->dev == &pdev->dev) 1971 + return 0; 1972 + return -EBUSY; 1973 + } 1974 + 1975 + /* 1976 + * vfio-core considers a group to be viable and will create a vfio_device even 1977 + * if some devices are bound to drivers like pci-stub or pcieport. Here we 1978 + * require all PCI devices to be inside our dev_set since that ensures they stay 1979 + * put and that every driver controlling the device can co-ordinate with the 1980 + * device reset. 1981 + * 1982 + * Returns the pci_dev to pass to pci_reset_bus() if every PCI device to be 1983 + * reset is inside the dev_set, and pci_reset_bus() can succeed. NULL otherwise. 
1984 + */ 1985 + static struct pci_dev * 1986 + vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set) 1987 + { 1988 + struct pci_dev *pdev; 1989 + 1990 + lockdep_assert_held(&dev_set->lock); 1991 + 1992 + /* 1993 + * By definition all PCI devices in the dev_set share the same PCI 1994 + * reset, so any pci_dev will have the same outcomes for 1995 + * pci_probe_reset_*() and pci_reset_bus(). 1996 + */ 1997 + pdev = list_first_entry(&dev_set->device_list, 1998 + struct vfio_pci_core_device, 1999 + vdev.dev_set_list)->pdev; 2000 + 2001 + /* pci_reset_bus() is supported */ 2002 + if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus)) 2003 + return NULL; 2004 + 2005 + if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set, 2006 + dev_set, 2007 + !pci_probe_reset_slot(pdev->slot))) 2008 + return NULL; 2009 + return pdev; 2010 + } 2011 + 2012 + /* 2013 + * We need to get memory_lock for each device, but devices can share mmap_lock, 2014 + * therefore we need to zap and hold the vma_lock for each device, and only then 2015 + * get each memory_lock. 2016 + */ 2017 + static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, 2018 + struct vfio_pci_group_info *groups) 2019 + { 2020 + struct vfio_pci_core_device *cur_mem; 2021 + struct vfio_pci_core_device *cur_vma; 2022 + struct vfio_pci_core_device *cur; 2023 + struct pci_dev *pdev; 2024 + bool is_mem = true; 2025 + int ret; 2026 + 2027 + mutex_lock(&dev_set->lock); 2028 + cur_mem = list_first_entry(&dev_set->device_list, 2029 + struct vfio_pci_core_device, 2030 + vdev.dev_set_list); 2031 + 2032 + pdev = vfio_pci_dev_set_resettable(dev_set); 2033 + if (!pdev) { 2034 + ret = -EINVAL; 2035 + goto err_unlock; 2036 + } 2037 + 2038 + list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { 2039 + /* 2040 + * Test whether all the affected devices are contained by the 2041 + * set of groups provided by the user. 
2042 + */ 2043 + if (!vfio_dev_in_groups(cur_vma, groups)) { 2044 + ret = -EINVAL; 2045 + goto err_undo; 2046 + } 2047 + 2048 + /* 2049 + * Locking multiple devices is prone to deadlock, runaway and 2050 + * unwind if we hit contention. 2051 + */ 2052 + if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) { 2053 + ret = -EBUSY; 2054 + goto err_undo; 2055 + } 2056 + } 2057 + cur_vma = NULL; 2058 + 2059 + list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) { 2060 + if (!down_write_trylock(&cur_mem->memory_lock)) { 2061 + ret = -EBUSY; 2062 + goto err_undo; 2063 + } 2064 + mutex_unlock(&cur_mem->vma_lock); 2065 + } 2066 + cur_mem = NULL; 2067 + 2068 + ret = pci_reset_bus(pdev); 2069 + 2070 + err_undo: 2071 + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { 2072 + if (cur == cur_mem) 2073 + is_mem = false; 2074 + if (cur == cur_vma) 2075 + break; 2076 + if (is_mem) 2077 + up_write(&cur->memory_lock); 2078 + else 2079 + mutex_unlock(&cur->vma_lock); 2080 + } 2081 + err_unlock: 2082 + mutex_unlock(&dev_set->lock); 2083 + return ret; 2084 + } 2085 + 2086 + static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set) 2087 + { 2088 + struct vfio_pci_core_device *cur; 2089 + bool needs_reset = false; 2090 + 2091 + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { 2092 + /* No VFIO device in the set can have an open device FD */ 2093 + if (cur->vdev.open_count) 2094 + return false; 2095 + needs_reset |= cur->needs_reset; 2096 + } 2097 + return needs_reset; 2098 + } 2099 + 2100 + /* 2101 + * If a bus or slot reset is available for the provided dev_set and: 2102 + * - All of the devices affected by that bus or slot reset are unused 2103 + * - At least one of the affected devices is marked dirty via 2104 + * needs_reset (such as by lack of FLR support) 2105 + * Then attempt to perform that bus or slot reset. 2106 + * Returns true if the dev_set was reset. 
2107 + */ 2108 + static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set) 2109 + { 2110 + struct vfio_pci_core_device *cur; 2111 + struct pci_dev *pdev; 2112 + int ret; 2113 + 2114 + if (!vfio_pci_dev_set_needs_reset(dev_set)) 2115 + return false; 2116 + 2117 + pdev = vfio_pci_dev_set_resettable(dev_set); 2118 + if (!pdev) 2119 + return false; 2120 + 2121 + ret = pci_reset_bus(pdev); 2122 + if (ret) 2123 + return false; 2124 + 2125 + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { 2126 + cur->needs_reset = false; 2127 + if (!disable_idle_d3) 2128 + vfio_pci_set_power_state(cur, PCI_D3hot); 2129 + } 2130 + return true; 2131 + } 2132 + 2133 + void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga, 2134 + bool is_disable_idle_d3) 2135 + { 2136 + nointxmask = is_nointxmask; 2137 + disable_vga = is_disable_vga; 2138 + disable_idle_d3 = is_disable_idle_d3; 2139 + } 2140 + EXPORT_SYMBOL_GPL(vfio_pci_core_set_params); 2141 + 2142 + static void vfio_pci_core_cleanup(void) 2143 + { 2144 + vfio_pci_uninit_perm_bits(); 2145 + } 2146 + 2147 + static int __init vfio_pci_core_init(void) 2148 + { 2149 + /* Allocate shared config space permission data used by all devices */ 2150 + return vfio_pci_init_perm_bits(); 2151 + } 2152 + 2153 + module_init(vfio_pci_core_init); 2154 + module_exit(vfio_pci_core_cleanup); 2155 + 2156 + MODULE_LICENSE("GPL v2"); 2157 + MODULE_AUTHOR(DRIVER_AUTHOR); 2158 + MODULE_DESCRIPTION(DRIVER_DESC);
+10 -9
drivers/vfio/pci/vfio_pci_igd.c
··· 15 15 #include <linux/uaccess.h> 16 16 #include <linux/vfio.h> 17 17 18 - #include "vfio_pci_private.h" 18 + #include <linux/vfio_pci_core.h> 19 19 20 20 #define OPREGION_SIGNATURE "IntelGraphicsMem" 21 21 #define OPREGION_SIZE (8 * 1024) ··· 25 25 #define OPREGION_RVDS 0x3c2 26 26 #define OPREGION_VERSION 0x16 27 27 28 - static ssize_t vfio_pci_igd_rw(struct vfio_pci_device *vdev, char __user *buf, 29 - size_t count, loff_t *ppos, bool iswrite) 28 + static ssize_t vfio_pci_igd_rw(struct vfio_pci_core_device *vdev, 29 + char __user *buf, size_t count, loff_t *ppos, 30 + bool iswrite) 30 31 { 31 32 unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; 32 33 void *base = vdev->region[i].data; ··· 46 45 return count; 47 46 } 48 47 49 - static void vfio_pci_igd_release(struct vfio_pci_device *vdev, 48 + static void vfio_pci_igd_release(struct vfio_pci_core_device *vdev, 50 49 struct vfio_pci_region *region) 51 50 { 52 51 memunmap(region->data); ··· 57 56 .release = vfio_pci_igd_release, 58 57 }; 59 58 60 - static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev) 59 + static int vfio_pci_igd_opregion_init(struct vfio_pci_core_device *vdev) 61 60 { 62 61 __le32 *dwordp = (__le32 *)(vdev->vconfig + OPREGION_PCI_ADDR); 63 62 u32 addr, size; ··· 161 160 return ret; 162 161 } 163 162 164 - static ssize_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, 163 + static ssize_t vfio_pci_igd_cfg_rw(struct vfio_pci_core_device *vdev, 165 164 char __user *buf, size_t count, loff_t *ppos, 166 165 bool iswrite) 167 166 { ··· 254 253 return count; 255 254 } 256 255 257 - static void vfio_pci_igd_cfg_release(struct vfio_pci_device *vdev, 256 + static void vfio_pci_igd_cfg_release(struct vfio_pci_core_device *vdev, 258 257 struct vfio_pci_region *region) 259 258 { 260 259 struct pci_dev *pdev = region->data; ··· 267 266 .release = vfio_pci_igd_cfg_release, 268 267 }; 269 268 270 - static int vfio_pci_igd_cfg_init(struct vfio_pci_device *vdev) 269 + 
static int vfio_pci_igd_cfg_init(struct vfio_pci_core_device *vdev) 271 270 { 272 271 struct pci_dev *host_bridge, *lpc_bridge; 273 272 int ret; ··· 315 314 return 0; 316 315 } 317 316 318 - int vfio_pci_igd_init(struct vfio_pci_device *vdev) 317 + int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) 319 318 { 320 319 int ret; 321 320
+21 -21
drivers/vfio/pci/vfio_pci_intrs.c
··· 20 20 #include <linux/wait.h> 21 21 #include <linux/slab.h> 22 22 23 - #include "vfio_pci_private.h" 23 + #include <linux/vfio_pci_core.h> 24 24 25 25 /* 26 26 * INTx 27 27 */ 28 28 static void vfio_send_intx_eventfd(void *opaque, void *unused) 29 29 { 30 - struct vfio_pci_device *vdev = opaque; 30 + struct vfio_pci_core_device *vdev = opaque; 31 31 32 32 if (likely(is_intx(vdev) && !vdev->virq_disabled)) 33 33 eventfd_signal(vdev->ctx[0].trigger, 1); 34 34 } 35 35 36 - void vfio_pci_intx_mask(struct vfio_pci_device *vdev) 36 + void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) 37 37 { 38 38 struct pci_dev *pdev = vdev->pdev; 39 39 unsigned long flags; ··· 73 73 */ 74 74 static int vfio_pci_intx_unmask_handler(void *opaque, void *unused) 75 75 { 76 - struct vfio_pci_device *vdev = opaque; 76 + struct vfio_pci_core_device *vdev = opaque; 77 77 struct pci_dev *pdev = vdev->pdev; 78 78 unsigned long flags; 79 79 int ret = 0; ··· 107 107 return ret; 108 108 } 109 109 110 - void vfio_pci_intx_unmask(struct vfio_pci_device *vdev) 110 + void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) 111 111 { 112 112 if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0) 113 113 vfio_send_intx_eventfd(vdev, NULL); ··· 115 115 116 116 static irqreturn_t vfio_intx_handler(int irq, void *dev_id) 117 117 { 118 - struct vfio_pci_device *vdev = dev_id; 118 + struct vfio_pci_core_device *vdev = dev_id; 119 119 unsigned long flags; 120 120 int ret = IRQ_NONE; 121 121 ··· 139 139 return ret; 140 140 } 141 141 142 - static int vfio_intx_enable(struct vfio_pci_device *vdev) 142 + static int vfio_intx_enable(struct vfio_pci_core_device *vdev) 143 143 { 144 144 if (!is_irq_none(vdev)) 145 145 return -EINVAL; ··· 168 168 return 0; 169 169 } 170 170 171 - static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd) 171 + static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd) 172 172 { 173 173 struct pci_dev *pdev = vdev->pdev; 174 174 unsigned long 
irqflags = IRQF_SHARED; ··· 223 223 return 0; 224 224 } 225 225 226 - static void vfio_intx_disable(struct vfio_pci_device *vdev) 226 + static void vfio_intx_disable(struct vfio_pci_core_device *vdev) 227 227 { 228 228 vfio_virqfd_disable(&vdev->ctx[0].unmask); 229 229 vfio_virqfd_disable(&vdev->ctx[0].mask); ··· 244 244 return IRQ_HANDLED; 245 245 } 246 246 247 - static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix) 247 + static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msix) 248 248 { 249 249 struct pci_dev *pdev = vdev->pdev; 250 250 unsigned int flag = msix ? PCI_IRQ_MSIX : PCI_IRQ_MSI; ··· 285 285 return 0; 286 286 } 287 287 288 - static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, 288 + static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, 289 289 int vector, int fd, bool msix) 290 290 { 291 291 struct pci_dev *pdev = vdev->pdev; ··· 364 364 return 0; 365 365 } 366 366 367 - static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start, 367 + static int vfio_msi_set_block(struct vfio_pci_core_device *vdev, unsigned start, 368 368 unsigned count, int32_t *fds, bool msix) 369 369 { 370 370 int i, j, ret = 0; ··· 385 385 return ret; 386 386 } 387 387 388 - static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix) 388 + static void vfio_msi_disable(struct vfio_pci_core_device *vdev, bool msix) 389 389 { 390 390 struct pci_dev *pdev = vdev->pdev; 391 391 int i; ··· 417 417 /* 418 418 * IOCTL support 419 419 */ 420 - static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev, 420 + static int vfio_pci_set_intx_unmask(struct vfio_pci_core_device *vdev, 421 421 unsigned index, unsigned start, 422 422 unsigned count, uint32_t flags, void *data) 423 423 { ··· 444 444 return 0; 445 445 } 446 446 447 - static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev, 447 + static int vfio_pci_set_intx_mask(struct vfio_pci_core_device *vdev, 448 448 unsigned 
index, unsigned start, 449 449 unsigned count, uint32_t flags, void *data) 450 450 { ··· 464 464 return 0; 465 465 } 466 466 467 - static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev, 467 + static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev, 468 468 unsigned index, unsigned start, 469 469 unsigned count, uint32_t flags, void *data) 470 470 { ··· 507 507 return 0; 508 508 } 509 509 510 - static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev, 510 + static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, 511 511 unsigned index, unsigned start, 512 512 unsigned count, uint32_t flags, void *data) 513 513 { ··· 613 613 return -EINVAL; 614 614 } 615 615 616 - static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, 616 + static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev, 617 617 unsigned index, unsigned start, 618 618 unsigned count, uint32_t flags, void *data) 619 619 { ··· 624 624 count, flags, data); 625 625 } 626 626 627 - static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, 627 + static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev, 628 628 unsigned index, unsigned start, 629 629 unsigned count, uint32_t flags, void *data) 630 630 { ··· 635 635 count, flags, data); 636 636 } 637 637 638 - int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, 638 + int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, 639 639 unsigned index, unsigned start, unsigned count, 640 640 void *data) 641 641 { 642 - int (*func)(struct vfio_pci_device *vdev, unsigned index, 642 + int (*func)(struct vfio_pci_core_device *vdev, unsigned index, 643 643 unsigned start, unsigned count, uint32_t flags, 644 644 void *data) = NULL; 645 645
+60 -29
drivers/vfio/pci/vfio_pci_private.h include/linux/vfio_pci_core.h
··· 10 10 11 11 #include <linux/mutex.h> 12 12 #include <linux/pci.h> 13 + #include <linux/vfio.h> 13 14 #include <linux/irqbypass.h> 14 15 #include <linux/types.h> 15 16 #include <linux/uuid.h> 16 17 #include <linux/notifier.h> 17 18 18 - #ifndef VFIO_PCI_PRIVATE_H 19 - #define VFIO_PCI_PRIVATE_H 19 + #ifndef VFIO_PCI_CORE_H 20 + #define VFIO_PCI_CORE_H 20 21 21 22 #define VFIO_PCI_OFFSET_SHIFT 40 22 23 ··· 34 33 35 34 struct vfio_pci_ioeventfd { 36 35 struct list_head next; 37 - struct vfio_pci_device *vdev; 36 + struct vfio_pci_core_device *vdev; 38 37 struct virqfd *virqfd; 39 38 void __iomem *addr; 40 39 uint64_t data; ··· 53 52 struct irq_bypass_producer producer; 54 53 }; 55 54 56 - struct vfio_pci_device; 55 + struct vfio_pci_core_device; 57 56 struct vfio_pci_region; 58 57 59 58 struct vfio_pci_regops { 60 - ssize_t (*rw)(struct vfio_pci_device *vdev, char __user *buf, 59 + ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, 61 60 size_t count, loff_t *ppos, bool iswrite); 62 - void (*release)(struct vfio_pci_device *vdev, 61 + void (*release)(struct vfio_pci_core_device *vdev, 63 62 struct vfio_pci_region *region); 64 - int (*mmap)(struct vfio_pci_device *vdev, 63 + int (*mmap)(struct vfio_pci_core_device *vdev, 65 64 struct vfio_pci_region *region, 66 65 struct vm_area_struct *vma); 67 - int (*add_capability)(struct vfio_pci_device *vdev, 66 + int (*add_capability)(struct vfio_pci_core_device *vdev, 68 67 struct vfio_pci_region *region, 69 68 struct vfio_info_cap *caps); 70 69 }; ··· 95 94 struct list_head vma_next; 96 95 }; 97 96 98 - struct vfio_pci_device { 97 + struct vfio_pci_core_device { 99 98 struct vfio_device vdev; 100 99 struct pci_dev *pdev; 101 100 void __iomem *barmap[PCI_STD_NUM_BARS]; ··· 145 144 #define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) 146 145 #define irq_is(vdev, type) (vdev->irq_type == type) 147 146 148 - extern void vfio_pci_intx_mask(struct vfio_pci_device *vdev); 149 - extern void 
vfio_pci_intx_unmask(struct vfio_pci_device *vdev); 147 + extern void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); 148 + extern void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); 150 149 151 - extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, 150 + extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, 152 151 uint32_t flags, unsigned index, 153 152 unsigned start, unsigned count, void *data); 154 153 155 - extern ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, 154 + extern ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, 156 155 char __user *buf, size_t count, 157 156 loff_t *ppos, bool iswrite); 158 157 159 - extern ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, 158 + extern ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, 160 159 size_t count, loff_t *ppos, bool iswrite); 161 160 162 - extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, 161 + extern ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, 163 162 size_t count, loff_t *ppos, bool iswrite); 164 163 165 - extern long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, 164 + extern long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, 166 165 uint64_t data, int count, int fd); 167 166 168 167 extern int vfio_pci_init_perm_bits(void); 169 168 extern void vfio_pci_uninit_perm_bits(void); 170 169 171 - extern int vfio_config_init(struct vfio_pci_device *vdev); 172 - extern void vfio_config_free(struct vfio_pci_device *vdev); 170 + extern int vfio_config_init(struct vfio_pci_core_device *vdev); 171 + extern void vfio_config_free(struct vfio_pci_core_device *vdev); 173 172 174 - extern int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, 173 + extern int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, 175 174 unsigned int type, unsigned int subtype, 176 175 const struct 
vfio_pci_regops *ops, 177 176 size_t size, u32 flags, void *data); 178 177 179 - extern int vfio_pci_set_power_state(struct vfio_pci_device *vdev, 178 + extern int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, 180 179 pci_power_t state); 181 180 182 - extern bool __vfio_pci_memory_enabled(struct vfio_pci_device *vdev); 183 - extern void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_device 181 + extern bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); 182 + extern void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device 184 183 *vdev); 185 - extern u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev); 186 - extern void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, 184 + extern u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); 185 + extern void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, 187 186 u16 cmd); 188 187 189 188 #ifdef CONFIG_VFIO_PCI_IGD 190 - extern int vfio_pci_igd_init(struct vfio_pci_device *vdev); 189 + extern int vfio_pci_igd_init(struct vfio_pci_core_device *vdev); 191 190 #else 192 - static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev) 191 + static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) 193 192 { 194 193 return -ENODEV; 195 194 } 196 195 #endif 197 196 198 197 #ifdef CONFIG_S390 199 - extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, 198 + extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, 200 199 struct vfio_info_cap *caps); 201 200 #else 202 - static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, 201 + static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, 203 202 struct vfio_info_cap *caps) 204 203 { 205 204 return -ENODEV; 206 205 } 207 206 #endif 208 207 209 - #endif /* VFIO_PCI_PRIVATE_H */ 208 + /* Will be exported for vfio pci drivers usage */ 209 + void 
vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, 210 + bool is_disable_idle_d3); 211 + void vfio_pci_core_close_device(struct vfio_device *core_vdev); 212 + void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, 213 + struct pci_dev *pdev, 214 + const struct vfio_device_ops *vfio_pci_ops); 215 + int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); 216 + void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); 217 + void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev); 218 + int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn); 219 + extern const struct pci_error_handlers vfio_pci_core_err_handlers; 220 + long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, 221 + unsigned long arg); 222 + ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, 223 + size_t count, loff_t *ppos); 224 + ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, 225 + size_t count, loff_t *ppos); 226 + int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma); 227 + void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count); 228 + int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); 229 + int vfio_pci_core_enable(struct vfio_pci_core_device *vdev); 230 + void vfio_pci_core_disable(struct vfio_pci_core_device *vdev); 231 + void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); 232 + 233 + static inline bool vfio_pci_is_vga(struct pci_dev *pdev) 234 + { 235 + return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; 236 + } 237 + 238 + #endif /* VFIO_PCI_CORE_H */
+9 -9
drivers/vfio/pci/vfio_pci_rdwr.c
··· 17 17 #include <linux/vfio.h> 18 18 #include <linux/vgaarb.h> 19 19 20 - #include "vfio_pci_private.h" 20 + #include <linux/vfio_pci_core.h> 21 21 22 22 #ifdef __LITTLE_ENDIAN 23 23 #define vfio_ioread64 ioread64 ··· 38 38 #define vfio_iowrite8 iowrite8 39 39 40 40 #define VFIO_IOWRITE(size) \ 41 - static int vfio_pci_iowrite##size(struct vfio_pci_device *vdev, \ 41 + static int vfio_pci_iowrite##size(struct vfio_pci_core_device *vdev, \ 42 42 bool test_mem, u##size val, void __iomem *io) \ 43 43 { \ 44 44 if (test_mem) { \ ··· 65 65 #endif 66 66 67 67 #define VFIO_IOREAD(size) \ 68 - static int vfio_pci_ioread##size(struct vfio_pci_device *vdev, \ 68 + static int vfio_pci_ioread##size(struct vfio_pci_core_device *vdev, \ 69 69 bool test_mem, u##size *val, void __iomem *io) \ 70 70 { \ 71 71 if (test_mem) { \ ··· 94 94 * reads with -1. This is intended for handling MSI-X vector tables and 95 95 * leftover space for ROM BARs. 96 96 */ 97 - static ssize_t do_io_rw(struct vfio_pci_device *vdev, bool test_mem, 97 + static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, 98 98 void __iomem *io, char __user *buf, 99 99 loff_t off, size_t count, size_t x_start, 100 100 size_t x_end, bool iswrite) ··· 200 200 return done; 201 201 } 202 202 203 - static int vfio_pci_setup_barmap(struct vfio_pci_device *vdev, int bar) 203 + static int vfio_pci_setup_barmap(struct vfio_pci_core_device *vdev, int bar) 204 204 { 205 205 struct pci_dev *pdev = vdev->pdev; 206 206 int ret; ··· 224 224 return 0; 225 225 } 226 226 227 - ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, 227 + ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, 228 228 size_t count, loff_t *ppos, bool iswrite) 229 229 { 230 230 struct pci_dev *pdev = vdev->pdev; ··· 288 288 return done; 289 289 } 290 290 291 - ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, 291 + ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char 
__user *buf, 292 292 size_t count, loff_t *ppos, bool iswrite) 293 293 { 294 294 int ret; ··· 384 384 static int vfio_pci_ioeventfd_handler(void *opaque, void *unused) 385 385 { 386 386 struct vfio_pci_ioeventfd *ioeventfd = opaque; 387 - struct vfio_pci_device *vdev = ioeventfd->vdev; 387 + struct vfio_pci_core_device *vdev = ioeventfd->vdev; 388 388 389 389 if (ioeventfd->test_mem) { 390 390 if (!down_read_trylock(&vdev->memory_lock)) ··· 410 410 vfio_pci_ioeventfd_do_write(ioeventfd, ioeventfd->test_mem); 411 411 } 412 412 413 - long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, 413 + long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, 414 414 uint64_t data, int count, int fd) 415 415 { 416 416 struct pci_dev *pdev = vdev->pdev;
+2 -2
drivers/vfio/pci/vfio_pci_zdev.c
··· 14 14 #include <asm/pci_clp.h> 15 15 #include <asm/pci_io.h> 16 16 17 - #include "vfio_pci_private.h" 17 + #include <linux/vfio_pci_core.h> 18 18 19 19 /* 20 20 * Add the Base PCI Function information to the device info region. ··· 109 109 /* 110 110 * Add all supported capabilities to the VFIO_DEVICE_GET_INFO capability chain. 111 111 */ 112 - int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, 112 + int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, 113 113 struct vfio_info_cap *caps) 114 114 { 115 115 struct zpci_dev *zdev = to_zpci(vdev->pdev);
+4 -2
drivers/vfio/platform/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config VFIO_PLATFORM 3 3 tristate "VFIO support for platform devices" 4 - depends on VFIO && EVENTFD && (ARM || ARM64 || COMPILE_TEST) 4 + depends on ARM || ARM64 || COMPILE_TEST 5 5 select VFIO_VIRQFD 6 6 help 7 7 Support for platform devices with VFIO. This is required to make ··· 10 10 11 11 If you don't know what to do here, say N. 12 12 13 + if VFIO_PLATFORM 13 14 config VFIO_AMBA 14 15 tristate "VFIO support for AMBA devices" 15 - depends on VFIO_PLATFORM && (ARM_AMBA || COMPILE_TEST) 16 + depends on ARM_AMBA || COMPILE_TEST 16 17 help 17 18 Support for ARM AMBA devices with VFIO. This is required to make 18 19 use of ARM AMBA devices present on the system using the VFIO ··· 22 21 If you don't know what to do here, say N. 23 22 24 23 source "drivers/vfio/platform/reset/Kconfig" 24 + endif
+1 -3
drivers/vfio/platform/reset/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config VFIO_PLATFORM_CALXEDAXGMAC_RESET 3 3 tristate "VFIO support for calxeda xgmac reset" 4 - depends on VFIO_PLATFORM 5 4 help 6 5 Enables the VFIO platform driver to handle reset for Calxeda xgmac 7 6 ··· 8 9 9 10 config VFIO_PLATFORM_AMDXGBE_RESET 10 11 tristate "VFIO support for AMD XGBE reset" 11 - depends on VFIO_PLATFORM 12 12 help 13 13 Enables the VFIO platform driver to handle reset for AMD XGBE 14 14 ··· 15 17 16 18 config VFIO_PLATFORM_BCMFLEXRM_RESET 17 19 tristate "VFIO support for Broadcom FlexRM reset" 18 - depends on VFIO_PLATFORM && (ARCH_BCM_IPROC || COMPILE_TEST) 20 + depends on ARCH_BCM_IPROC || COMPILE_TEST 19 21 default ARCH_BCM_IPROC 20 22 help 21 23 Enables the VFIO platform driver to handle reset for Broadcom FlexRM
+4 -4
drivers/vfio/vfio_iommu_type1.c
··· 612 612 static int vfio_find_dma_valid(struct vfio_iommu *iommu, dma_addr_t start, 613 613 size_t size, struct vfio_dma **dma_p) 614 614 { 615 - int ret; 615 + int ret = 0; 616 616 617 617 do { 618 618 *dma_p = vfio_find_dma(iommu, start, size); 619 619 if (!*dma_p) 620 - ret = -EINVAL; 620 + return -EINVAL; 621 621 else if (!(*dma_p)->vaddr_invalid) 622 - ret = 0; 622 + return ret; 623 623 else 624 624 ret = vfio_wait(iommu); 625 - } while (ret > 0); 625 + } while (ret == WAITED); 626 626 627 627 return ret; 628 628 }
+6
include/linux/mod_devicetable.h
··· 16 16 17 17 #define PCI_ANY_ID (~0) 18 18 19 + enum { 20 + PCI_ID_F_VFIO_DRIVER_OVERRIDE = 1, 21 + }; 22 + 19 23 /** 20 24 * struct pci_device_id - PCI device ID structure 21 25 * @vendor: Vendor ID to match (or PCI_ANY_ID) ··· 38 34 * Best practice is to use driver_data as an index 39 35 * into a static list of equivalent device types, 40 36 * instead of using it as a pointer. 37 + * @override_only: Match only when dev->driver_override is this driver. 41 38 */ 42 39 struct pci_device_id { 43 40 __u32 vendor, device; /* Vendor and device ID or PCI_ANY_ID*/ 44 41 __u32 subvendor, subdevice; /* Subsystem ID's or PCI_ANY_ID */ 45 42 __u32 class, class_mask; /* (class,subclass,prog-if) triplet */ 46 43 kernel_ulong_t driver_data; /* Data private to the driver */ 44 + __u32 override_only; 47 45 }; 48 46 49 47
+29
include/linux/pci.h
··· 902 902 .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID 903 903 904 904 /** 905 + * PCI_DEVICE_DRIVER_OVERRIDE - macro used to describe a PCI device with 906 + * override_only flags. 907 + * @vend: the 16 bit PCI Vendor ID 908 + * @dev: the 16 bit PCI Device ID 909 + * @driver_override: the 32 bit value for the override_only field 910 + * 911 + * This macro is used to create a struct pci_device_id that matches only a 912 + * driver_override device. The subvendor and subdevice fields will be set to 913 + * PCI_ANY_ID. 914 + */ 915 + #define PCI_DEVICE_DRIVER_OVERRIDE(vend, dev, driver_override) \ 916 + .vendor = (vend), .device = (dev), .subvendor = PCI_ANY_ID, \ 917 + .subdevice = PCI_ANY_ID, .override_only = (driver_override) 918 + 919 + /** 920 + * PCI_DRIVER_OVERRIDE_DEVICE_VFIO - macro used to describe a VFIO 921 + * "driver_override" PCI device. 922 + * @vend: the 16 bit PCI Vendor ID 923 + * @dev: the 16 bit PCI Device ID 924 + * 925 + * This macro is used to create a struct pci_device_id that matches a 926 + * specific device. The subvendor and subdevice fields will be set to 927 + * PCI_ANY_ID and the override_only field will be set to 928 + * PCI_ID_F_VFIO_DRIVER_OVERRIDE. 929 + */ 930 + #define PCI_DRIVER_OVERRIDE_DEVICE_VFIO(vend, dev) \ 931 + PCI_DEVICE_DRIVER_OVERRIDE(vend, dev, PCI_ID_F_VFIO_DRIVER_OVERRIDE) 932 + 933 + /** 905 934 * PCI_DEVICE_SUB - macro used to describe a specific PCI device with subsystem 906 935 * @vend: the 16 bit PCI Vendor ID 907 936 * @dev: the 16 bit PCI Device ID
+1
scripts/mod/devicetable-offsets.c
··· 42 42 DEVID_FIELD(pci_device_id, subdevice); 43 43 DEVID_FIELD(pci_device_id, class); 44 44 DEVID_FIELD(pci_device_id, class_mask); 45 + DEVID_FIELD(pci_device_id, override_only); 45 46 46 47 DEVID(ccw_device_id); 47 48 DEVID_FIELD(ccw_device_id, match_flags);
+15 -2
scripts/mod/file2alias.c
··· 426 426 return 1; 427 427 } 428 428 429 - /* Looks like: pci:vNdNsvNsdNbcNscNiN. */ 429 + /* Looks like: pci:vNdNsvNsdNbcNscNiN or <prefix>_pci:vNdNsvNsdNbcNscNiN. */ 430 430 static int do_pci_entry(const char *filename, 431 431 void *symval, char *alias) 432 432 { ··· 440 440 DEF_FIELD(symval, pci_device_id, subdevice); 441 441 DEF_FIELD(symval, pci_device_id, class); 442 442 DEF_FIELD(symval, pci_device_id, class_mask); 443 + DEF_FIELD(symval, pci_device_id, override_only); 443 444 444 - strcpy(alias, "pci:"); 445 + switch (override_only) { 446 + case 0: 447 + strcpy(alias, "pci:"); 448 + break; 449 + case PCI_ID_F_VFIO_DRIVER_OVERRIDE: 450 + strcpy(alias, "vfio_pci:"); 451 + break; 452 + default: 453 + warn("Unknown PCI driver_override alias %08X\n", 454 + override_only); 455 + return 0; 456 + } 457 + 445 458 ADD(alias, "v", vendor != PCI_ANY_ID, vendor); 446 459 ADD(alias, "d", device != PCI_ANY_ID, device); 447 460 ADD(alias, "sv", subvendor != PCI_ANY_ID, subvendor);