Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'vfio-v6.9-rc1' of https://github.com/awilliam/linux-vfio

Pull VFIO updates from Alex Williamson:

- Add warning in unlikely case that device is not captured with
driver_override (Kunwu Chan)

- Error handling improvements in mlx5-vfio-pci to detect firmware
tracking object error states, logging of firmware error syndrome, and
releasing of firmware resources in aborted migration sequence (Yishai
Hadas)

- Correct an un-alphabetized VFIO MAINTAINERS entry (Alex Williamson)

- Make the mdev_bus_type const and also make the class struct const for
a couple of the vfio-mdev sample drivers (Ricardo B. Marliere)

- Addition of a new vfio-pci variant driver for the GPU of NVIDIA's
Grace-Hopper superchip. During initialization of the chip-to-chip
interconnect in this hardware module, the PCI BARs of the device
become unused in favor of a faster, coherent mechanism for exposing
device memory. This driver primarily changes the VFIO representation
of the device to masquerade this coherent aperture to replace the
physical PCI BARs for userspace drivers. This also incorporates use
of a new vma flag allowing KVM to use write combining attributes for
uncached device memory (Ankit Agrawal)

- Reset fixes and cleanups for the pds-vfio-pci driver. Save and
restore files were previously leaked if the device didn't pass
through an error state, this is resolved and later re-fixed to
prevent access to the now freed files. Reset handling is also
refactored to remove the complicated deferred reset mechanism (Brett
Creeley)

- Remove some references to pl330 in the vfio-platform amba driver
(Geert Uytterhoeven)

- Remove twice redundant and ugly code to unpin incidental pins of the
zero-page (Alex Williamson)

- Deferred reset logic is also removed from the hisi-acc-vfio-pci
driver as a simplification (Shameer Kolothum)

- Enforce that mlx5-vfio-pci devices must support PRE_COPY and remove
resulting unnecessary code. There is no device firmware that has been
available publicly without this support (Yishai Hadas)

- Switch over to using the .remove_new callback for vfio-platform in
support of the broader transition for a void remove function (Uwe
Kleine-König)

- Resolve multiple issues in interrupt code for VFIO bus drivers that
allow calling eventfd_signal() on a NULL context. This also removes a
potential race in INTx setup on certain hardware for vfio-pci, races
with various mechanisms to mask INTx, and leaked virqfds in
vfio-platform (Alex Williamson)

* tag 'vfio-v6.9-rc1' of https://github.com/awilliam/linux-vfio: (29 commits)
vfio/fsl-mc: Block calling interrupt handler without trigger
vfio/platform: Create persistent IRQ handlers
vfio/platform: Disable virqfds on cleanup
vfio/pci: Create persistent INTx handler
vfio: Introduce interface to flush virqfd inject workqueue
vfio/pci: Lock external INTx masking ops
vfio/pci: Disable auto-enable of exclusive INTx IRQ
vfio/pds: Refactor/simplify reset logic
vfio/pds: Make sure migration file isn't accessed after reset
vfio/platform: Convert to platform remove callback returning void
vfio/mlx5: Enforce PRE_COPY support
vfio/mbochs: make mbochs_class constant
vfio/mdpy: make mdpy_class constant
hisi_acc_vfio_pci: Remove the deferred_reset logic
Revert "vfio/type1: Unpin zero pages"
vfio/nvgrace-gpu: Convey kvm to map device memory region as noncached
vfio: amba: Rename pl330_ids[] to vfio_amba_ids[]
vfio/pds: Always clear the save/restore FDs on reset
vfio/nvgrace-gpu: Add vfio pci variant module for grace hopper
vfio/pci: rename and export range_intersect_range
...

+1461 -454
+11 -5
MAINTAINERS
··· 23164 23164 S: Maintained 23165 23165 F: drivers/vfio/pci/mlx5/ 23166 23166 23167 - VFIO VIRTIO PCI DRIVER 23168 - M: Yishai Hadas <yishaih@nvidia.com> 23167 + VFIO NVIDIA GRACE GPU DRIVER 23168 + M: Ankit Agrawal <ankita@nvidia.com> 23169 23169 L: kvm@vger.kernel.org 23170 - L: virtualization@lists.linux.dev 23171 - S: Maintained 23172 - F: drivers/vfio/pci/virtio 23170 + S: Supported 23171 + F: drivers/vfio/pci/nvgrace-gpu/ 23173 23172 23174 23173 VFIO PCI DEVICE SPECIFIC DRIVERS 23175 23174 R: Jason Gunthorpe <jgg@nvidia.com> ··· 23192 23193 L: kvm@vger.kernel.org 23193 23194 S: Maintained 23194 23195 F: drivers/vfio/platform/ 23196 + 23197 + VFIO VIRTIO PCI DRIVER 23198 + M: Yishai Hadas <yishaih@nvidia.com> 23199 + L: kvm@vger.kernel.org 23200 + L: virtualization@lists.linux.dev 23201 + S: Maintained 23202 + F: drivers/vfio/pci/virtio 23195 23203 23196 23204 VGA_SWITCHEROO 23197 23205 R: Lukas Wunner <lukas@wunner.de>
+4 -3
drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
··· 141 141 irq = &vdev->mc_irqs[index]; 142 142 143 143 if (flags & VFIO_IRQ_SET_DATA_NONE) { 144 - vfio_fsl_mc_irq_handler(hwirq, irq); 144 + if (irq->trigger) 145 + eventfd_signal(irq->trigger); 145 146 146 147 } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { 147 148 u8 trigger = *(u8 *)data; 148 149 149 - if (trigger) 150 - vfio_fsl_mc_irq_handler(hwirq, irq); 150 + if (trigger && irq->trigger) 151 + eventfd_signal(irq->trigger); 151 152 } 152 153 153 154 return 0;
+1 -1
drivers/vfio/mdev/mdev_driver.c
··· 40 40 return 0; 41 41 } 42 42 43 - struct bus_type mdev_bus_type = { 43 + const struct bus_type mdev_bus_type = { 44 44 .name = "mdev", 45 45 .probe = mdev_probe, 46 46 .remove = mdev_remove,
+1 -1
drivers/vfio/mdev/mdev_private.h
··· 13 13 int mdev_bus_register(void); 14 14 void mdev_bus_unregister(void); 15 15 16 - extern struct bus_type mdev_bus_type; 16 + extern const struct bus_type mdev_bus_type; 17 17 extern const struct attribute_group *mdev_device_groups[]; 18 18 19 19 #define to_mdev_type_attr(_attr) \
+2
drivers/vfio/pci/Kconfig
··· 67 67 68 68 source "drivers/vfio/pci/virtio/Kconfig" 69 69 70 + source "drivers/vfio/pci/nvgrace-gpu/Kconfig" 71 + 70 72 endmenu
+2
drivers/vfio/pci/Makefile
··· 15 15 obj-$(CONFIG_PDS_VFIO_PCI) += pds/ 16 16 17 17 obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/ 18 + 19 + obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/
+12 -36
drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
··· 630 630 } 631 631 } 632 632 633 - /* 634 - * This function is called in all state_mutex unlock cases to 635 - * handle a 'deferred_reset' if exists. 636 - */ 637 - static void 638 - hisi_acc_vf_state_mutex_unlock(struct hisi_acc_vf_core_device *hisi_acc_vdev) 633 + static void hisi_acc_vf_reset(struct hisi_acc_vf_core_device *hisi_acc_vdev) 639 634 { 640 - again: 641 - spin_lock(&hisi_acc_vdev->reset_lock); 642 - if (hisi_acc_vdev->deferred_reset) { 643 - hisi_acc_vdev->deferred_reset = false; 644 - spin_unlock(&hisi_acc_vdev->reset_lock); 645 - hisi_acc_vdev->vf_qm_state = QM_NOT_READY; 646 - hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 647 - hisi_acc_vf_disable_fds(hisi_acc_vdev); 648 - goto again; 649 - } 650 - mutex_unlock(&hisi_acc_vdev->state_mutex); 651 - spin_unlock(&hisi_acc_vdev->reset_lock); 635 + hisi_acc_vdev->vf_qm_state = QM_NOT_READY; 636 + hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 637 + hisi_acc_vf_disable_fds(hisi_acc_vdev); 652 638 } 653 639 654 640 static void hisi_acc_vf_start_device(struct hisi_acc_vf_core_device *hisi_acc_vdev) ··· 790 804 791 805 info.dirty_bytes = 0; 792 806 info.initial_bytes = migf->total_length - *pos; 807 + mutex_unlock(&migf->lock); 808 + mutex_unlock(&hisi_acc_vdev->state_mutex); 793 809 794 - ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; 810 + return copy_to_user((void __user *)arg, &info, minsz) ? 
-EFAULT : 0; 795 811 out: 796 812 mutex_unlock(&migf->lock); 797 813 mutex_unlock(&hisi_acc_vdev->state_mutex); ··· 1059 1071 break; 1060 1072 } 1061 1073 } 1062 - hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev); 1074 + mutex_unlock(&hisi_acc_vdev->state_mutex); 1063 1075 return res; 1064 1076 } 1065 1077 ··· 1080 1092 1081 1093 mutex_lock(&hisi_acc_vdev->state_mutex); 1082 1094 *curr_state = hisi_acc_vdev->mig_state; 1083 - hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev); 1095 + mutex_unlock(&hisi_acc_vdev->state_mutex); 1084 1096 return 0; 1085 1097 } 1086 1098 ··· 1092 1104 VFIO_MIGRATION_STOP_COPY) 1093 1105 return; 1094 1106 1095 - /* 1096 - * As the higher VFIO layers are holding locks across reset and using 1097 - * those same locks with the mm_lock we need to prevent ABBA deadlock 1098 - * with the state_mutex and mm_lock. 1099 - * In case the state_mutex was taken already we defer the cleanup work 1100 - * to the unlock flow of the other running context. 1101 - */ 1102 - spin_lock(&hisi_acc_vdev->reset_lock); 1103 - hisi_acc_vdev->deferred_reset = true; 1104 - if (!mutex_trylock(&hisi_acc_vdev->state_mutex)) { 1105 - spin_unlock(&hisi_acc_vdev->reset_lock); 1106 - return; 1107 - } 1108 - spin_unlock(&hisi_acc_vdev->reset_lock); 1109 - hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev); 1107 + mutex_lock(&hisi_acc_vdev->state_mutex); 1108 + hisi_acc_vf_reset(hisi_acc_vdev); 1109 + mutex_unlock(&hisi_acc_vdev->state_mutex); 1110 1110 } 1111 1111 1112 1112 static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+2 -4
drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
··· 98 98 99 99 struct hisi_acc_vf_core_device { 100 100 struct vfio_pci_core_device core_device; 101 - u8 match_done:1; 102 - u8 deferred_reset:1; 101 + u8 match_done; 102 + 103 103 /* For migration state */ 104 104 struct mutex state_mutex; 105 105 enum vfio_device_mig_state mig_state; ··· 109 109 struct hisi_qm vf_qm; 110 110 u32 vf_qm_state; 111 111 int vf_id; 112 - /* For reset handler */ 113 - spinlock_t reset_lock; 114 112 struct hisi_acc_vf_migration_file *resuming_migf; 115 113 struct hisi_acc_vf_migration_file *saving_migf; 116 114 };
+132 -25
drivers/vfio/pci/mlx5/cmd.c
··· 108 108 ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); 109 109 if (ret) 110 110 return ret; 111 - if (mvdev->saving_migf->state == 112 - MLX5_MIGF_STATE_PRE_COPY_ERROR) { 111 + /* Upon cleanup, ignore previous pre_copy error state */ 112 + if (mvdev->saving_migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR && 113 + !(query_flags & MLX5VF_QUERY_CLEANUP)) { 113 114 /* 114 115 * In case we had a PRE_COPY error, only query full 115 116 * image for final image ··· 121 120 return 0; 122 121 } 123 122 query_flags &= ~MLX5VF_QUERY_INC; 123 + } 124 + /* Block incremental query which is state-dependent */ 125 + if (mvdev->saving_migf->state == MLX5_MIGF_STATE_ERROR) { 126 + complete(&mvdev->saving_migf->save_comp); 127 + return -ENODEV; 124 128 } 125 129 } 126 130 ··· 153 147 remaining_total_size) : *state_size; 154 148 155 149 return 0; 150 + } 151 + 152 + static void set_tracker_change_event(struct mlx5vf_pci_core_device *mvdev) 153 + { 154 + mvdev->tracker.object_changed = true; 155 + complete(&mvdev->tracker_comp); 156 156 } 157 157 158 158 static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev) ··· 201 189 /* Must be done outside the lock to let it progress */ 202 190 set_tracker_error(mvdev); 203 191 mutex_lock(&mvdev->state_mutex); 204 - mlx5vf_disable_fds(mvdev); 192 + mlx5vf_disable_fds(mvdev, NULL); 205 193 _mlx5vf_free_page_tracker_resources(mvdev); 206 194 mlx5vf_state_mutex_unlock(mvdev); 207 195 } ··· 233 221 if (!MLX5_CAP_GEN(mvdev->mdev, migration)) 234 222 goto end; 235 223 224 + if (!(MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) && 225 + MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))) 226 + goto end; 227 + 236 228 mvdev->vf_id = pci_iov_vf_id(pdev); 237 229 if (mvdev->vf_id < 0) 238 230 goto end; ··· 266 250 mvdev->migrate_cap = 1; 267 251 mvdev->core_device.vdev.migration_flags = 268 252 VFIO_MIGRATION_STOP_COPY | 269 - VFIO_MIGRATION_P2P; 253 + VFIO_MIGRATION_P2P | 254 + VFIO_MIGRATION_PRE_COPY; 255 + 270 
256 mvdev->core_device.vdev.mig_ops = mig_ops; 271 257 init_completion(&mvdev->tracker_comp); 272 258 if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization)) 273 259 mvdev->core_device.vdev.log_ops = log_ops; 274 - 275 - if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) && 276 - MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)) 277 - mvdev->core_device.vdev.migration_flags |= 278 - VFIO_MIGRATION_PRE_COPY; 279 260 280 261 if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks)) 281 262 mvdev->chunk_mode = 1; ··· 413 400 __free_page(sg_page_iter_page(&sg_iter)); 414 401 sg_free_append_table(&buf->table); 415 402 kfree(buf); 403 + } 404 + 405 + static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, 406 + unsigned int npages) 407 + { 408 + unsigned int to_alloc = npages; 409 + struct page **page_list; 410 + unsigned long filled; 411 + unsigned int to_fill; 412 + int ret; 413 + 414 + to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); 415 + page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT); 416 + if (!page_list) 417 + return -ENOMEM; 418 + 419 + do { 420 + filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, 421 + page_list); 422 + if (!filled) { 423 + ret = -ENOMEM; 424 + goto err; 425 + } 426 + to_alloc -= filled; 427 + ret = sg_alloc_append_table_from_pages( 428 + &buf->table, page_list, filled, 0, 429 + filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, 430 + GFP_KERNEL_ACCOUNT); 431 + 432 + if (ret) 433 + goto err; 434 + buf->allocated_length += filled * PAGE_SIZE; 435 + /* clean input for another bulk allocation */ 436 + memset(page_list, 0, filled * sizeof(*page_list)); 437 + to_fill = min_t(unsigned int, to_alloc, 438 + PAGE_SIZE / sizeof(*page_list)); 439 + } while (to_alloc > 0); 440 + 441 + kvfree(page_list); 442 + return 0; 443 + 444 + err: 445 + kvfree(page_list); 446 + return ret; 416 447 } 417 448 418 449 struct mlx5_vhca_data_buffer * ··· 665 608 666 609 err: 667 610 /* The error flow 
can't run from an interrupt context */ 668 - if (status == -EREMOTEIO) 611 + if (status == -EREMOTEIO) { 669 612 status = MLX5_GET(save_vhca_state_out, async_data->out, status); 613 + /* Failed in FW, print cmd out failure details */ 614 + mlx5_cmd_out_err(migf->mvdev->mdev, MLX5_CMD_OP_SAVE_VHCA_STATE, 0, 615 + async_data->out); 616 + } 617 + 670 618 async_data->status = status; 671 619 queue_work(migf->mvdev->cb_wq, &async_data->work); 672 620 } ··· 685 623 u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; 686 624 struct mlx5_vhca_data_buffer *header_buf = NULL; 687 625 struct mlx5vf_async_data *async_data; 626 + bool pre_copy_cleanup = false; 688 627 int err; 689 628 690 629 lockdep_assert_held(&mvdev->state_mutex); ··· 695 632 err = wait_for_completion_interruptible(&migf->save_comp); 696 633 if (err) 697 634 return err; 635 + 636 + if ((migf->state == MLX5_MIGF_STATE_PRE_COPY || 637 + migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) && !track && !inc) 638 + pre_copy_cleanup = true; 698 639 699 640 if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) 700 641 /* ··· 718 651 719 652 async_data = &migf->async_data; 720 653 async_data->buf = buf; 721 - async_data->stop_copy_chunk = !track; 654 + async_data->stop_copy_chunk = (!track && !pre_copy_cleanup); 722 655 async_data->out = kvzalloc(out_size, GFP_KERNEL); 723 656 if (!async_data->out) { 724 657 err = -ENOMEM; 725 658 goto err_out; 726 659 } 727 660 728 - if (MLX5VF_PRE_COPY_SUPP(mvdev)) { 729 - if (async_data->stop_copy_chunk) { 730 - u8 header_idx = buf->stop_copy_chunk_num ? 731 - buf->stop_copy_chunk_num - 1 : 0; 661 + if (async_data->stop_copy_chunk) { 662 + u8 header_idx = buf->stop_copy_chunk_num ? 
663 + buf->stop_copy_chunk_num - 1 : 0; 732 664 733 - header_buf = migf->buf_header[header_idx]; 734 - migf->buf_header[header_idx] = NULL; 735 - } 665 + header_buf = migf->buf_header[header_idx]; 666 + migf->buf_header[header_idx] = NULL; 667 + } 736 668 737 - if (!header_buf) { 738 - header_buf = mlx5vf_get_data_buffer(migf, 739 - sizeof(struct mlx5_vf_migration_header), DMA_NONE); 740 - if (IS_ERR(header_buf)) { 741 - err = PTR_ERR(header_buf); 742 - goto err_free; 743 - } 669 + if (!header_buf) { 670 + header_buf = mlx5vf_get_data_buffer(migf, 671 + sizeof(struct mlx5_vf_migration_header), DMA_NONE); 672 + if (IS_ERR(header_buf)) { 673 + err = PTR_ERR(header_buf); 674 + goto err_free; 744 675 } 745 676 } 746 677 ··· 965 900 return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); 966 901 } 967 902 903 + static int mlx5vf_cmd_query_tracker(struct mlx5_core_dev *mdev, 904 + struct mlx5_vhca_page_tracker *tracker) 905 + { 906 + u32 out[MLX5_ST_SZ_DW(query_page_track_obj_out)] = {}; 907 + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; 908 + void *obj_context; 909 + void *cmd_hdr; 910 + int err; 911 + 912 + cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr); 913 + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT); 914 + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK); 915 + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker->id); 916 + 917 + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); 918 + if (err) 919 + return err; 920 + 921 + obj_context = MLX5_ADDR_OF(query_page_track_obj_out, out, obj_context); 922 + tracker->status = MLX5_GET(page_track, obj_context, state); 923 + return 0; 924 + } 925 + 968 926 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev, 969 927 struct mlx5_vhca_cq_buf *buf, int nent, 970 928 int cqe_size) ··· 1045 957 mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb); 1046 958 struct mlx5vf_pci_core_device *mvdev = 
container_of( 1047 959 tracker, struct mlx5vf_pci_core_device, tracker); 960 + struct mlx5_eqe_obj_change *object; 1048 961 struct mlx5_eqe *eqe = data; 1049 962 u8 event_type = (u8)type; 1050 963 u8 queue_type; 964 + u32 obj_id; 1051 965 int qp_num; 1052 966 1053 967 switch (event_type) { ··· 1064 974 qp_num != tracker->fw_qp->qpn) 1065 975 break; 1066 976 set_tracker_error(mvdev); 977 + break; 978 + case MLX5_EVENT_TYPE_OBJECT_CHANGE: 979 + object = &eqe->data.obj_change; 980 + obj_id = be32_to_cpu(object->obj_id); 981 + if (obj_id == tracker->id) 982 + set_tracker_change_event(mvdev); 1067 983 break; 1068 984 default: 1069 985 break; ··· 1730 1634 goto end; 1731 1635 } 1732 1636 1637 + if (tracker->is_err) { 1638 + err = -EIO; 1639 + goto end; 1640 + } 1641 + 1733 1642 mdev = mvdev->mdev; 1734 1643 err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length, 1735 1644 MLX5_PAGE_TRACK_STATE_REPORTING); ··· 1753 1652 dirty, &tracker->status); 1754 1653 if (poll_err == CQ_EMPTY) { 1755 1654 wait_for_completion(&mvdev->tracker_comp); 1655 + if (tracker->object_changed) { 1656 + tracker->object_changed = false; 1657 + err = mlx5vf_cmd_query_tracker(mdev, tracker); 1658 + if (err) 1659 + goto end; 1660 + } 1756 1661 continue; 1757 1662 } 1758 1663 }
+4 -7
drivers/vfio/pci/mlx5/cmd.h
··· 13 13 #include <linux/mlx5/cq.h> 14 14 #include <linux/mlx5/qp.h> 15 15 16 - #define MLX5VF_PRE_COPY_SUPP(mvdev) \ 17 - ((mvdev)->core_device.vdev.migration_flags & VFIO_MIGRATION_PRE_COPY) 18 - 19 16 enum mlx5_vf_migf_state { 20 17 MLX5_MIGF_STATE_ERROR = 1, 21 18 MLX5_MIGF_STATE_PRE_COPY_ERROR, ··· 22 25 }; 23 26 24 27 enum mlx5_vf_load_state { 25 - MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER, 26 28 MLX5_VF_LOAD_STATE_READ_HEADER, 27 29 MLX5_VF_LOAD_STATE_PREP_HEADER_DATA, 28 30 MLX5_VF_LOAD_STATE_READ_HEADER_DATA, ··· 158 162 u32 id; 159 163 u32 pdn; 160 164 u8 is_err:1; 165 + u8 object_changed:1; 161 166 struct mlx5_uars_page *uar; 162 167 struct mlx5_vhca_cq cq; 163 168 struct mlx5_vhca_qp *host_qp; ··· 193 196 enum { 194 197 MLX5VF_QUERY_INC = (1UL << 0), 195 198 MLX5VF_QUERY_FINAL = (1UL << 1), 199 + MLX5VF_QUERY_CLEANUP = (1UL << 2), 196 200 }; 197 201 198 202 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); ··· 224 226 mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, 225 227 size_t length, enum dma_data_direction dma_dir); 226 228 void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); 227 - int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, 228 - unsigned int npages); 229 229 struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, 230 230 unsigned long offset); 231 231 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); 232 - void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); 232 + void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev, 233 + enum mlx5_vf_migf_state *last_save_state); 233 234 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); 234 235 void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf, 235 236 u8 chunk_num, size_t next_required_umem_size);
+40 -108
drivers/vfio/pci/mlx5/main.c
··· 65 65 return NULL; 66 66 } 67 67 68 - int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, 69 - unsigned int npages) 70 - { 71 - unsigned int to_alloc = npages; 72 - struct page **page_list; 73 - unsigned long filled; 74 - unsigned int to_fill; 75 - int ret; 76 - 77 - to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); 78 - page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT); 79 - if (!page_list) 80 - return -ENOMEM; 81 - 82 - do { 83 - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, 84 - page_list); 85 - if (!filled) { 86 - ret = -ENOMEM; 87 - goto err; 88 - } 89 - to_alloc -= filled; 90 - ret = sg_alloc_append_table_from_pages( 91 - &buf->table, page_list, filled, 0, 92 - filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, 93 - GFP_KERNEL_ACCOUNT); 94 - 95 - if (ret) 96 - goto err; 97 - buf->allocated_length += filled * PAGE_SIZE; 98 - /* clean input for another bulk allocation */ 99 - memset(page_list, 0, filled * sizeof(*page_list)); 100 - to_fill = min_t(unsigned int, to_alloc, 101 - PAGE_SIZE / sizeof(*page_list)); 102 - } while (to_alloc > 0); 103 - 104 - kvfree(page_list); 105 - return 0; 106 - 107 - err: 108 - kvfree(page_list); 109 - return ret; 110 - } 111 - 112 68 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) 113 69 { 114 70 mutex_lock(&migf->lock); ··· 733 777 return 0; 734 778 } 735 779 736 - static int 737 - mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf, 738 - loff_t requested_length, 739 - const char __user **buf, size_t *len, 740 - loff_t *pos, ssize_t *done) 741 - { 742 - int ret; 743 - 744 - if (requested_length > MAX_LOAD_SIZE) 745 - return -ENOMEM; 746 - 747 - if (vhca_buf->allocated_length < requested_length) { 748 - ret = mlx5vf_add_migration_pages( 749 - vhca_buf, 750 - DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, 751 - PAGE_SIZE)); 752 - if (ret) 753 - return ret; 754 - } 755 - 756 - while (*len) { 757 - ret = 
mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos, 758 - done); 759 - if (ret) 760 - return ret; 761 - } 762 - 763 - return 0; 764 - } 765 - 766 780 static ssize_t 767 781 mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf, 768 782 struct mlx5_vhca_data_buffer *vhca_buf, ··· 964 1038 migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE; 965 1039 break; 966 1040 } 967 - case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER: 968 - ret = mlx5vf_resume_read_image_no_header(vhca_buf, 969 - requested_length, 970 - &buf, &len, pos, &done); 971 - if (ret) 972 - goto out_unlock; 973 - break; 974 1041 case MLX5_VF_LOAD_STATE_READ_IMAGE: 975 1042 ret = mlx5vf_resume_read_image(migf, vhca_buf, 976 1043 migf->record_size, ··· 1033 1114 } 1034 1115 1035 1116 migf->buf[0] = buf; 1036 - if (MLX5VF_PRE_COPY_SUPP(mvdev)) { 1037 - buf = mlx5vf_alloc_data_buffer(migf, 1038 - sizeof(struct mlx5_vf_migration_header), DMA_NONE); 1039 - if (IS_ERR(buf)) { 1040 - ret = PTR_ERR(buf); 1041 - goto out_buf; 1042 - } 1043 - 1044 - migf->buf_header[0] = buf; 1045 - migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; 1046 - } else { 1047 - /* Initial state will be to read the image */ 1048 - migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER; 1117 + buf = mlx5vf_alloc_data_buffer(migf, 1118 + sizeof(struct mlx5_vf_migration_header), DMA_NONE); 1119 + if (IS_ERR(buf)) { 1120 + ret = PTR_ERR(buf); 1121 + goto out_buf; 1049 1122 } 1123 + 1124 + migf->buf_header[0] = buf; 1125 + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; 1050 1126 1051 1127 stream_open(migf->filp->f_inode, migf->filp); 1052 1128 mutex_init(&migf->lock); ··· 1060 1146 return ERR_PTR(ret); 1061 1147 } 1062 1148 1063 - void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) 1149 + void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev, 1150 + enum mlx5_vf_migf_state *last_save_state) 1064 1151 { 1065 1152 if (mvdev->resuming_migf) { 1066 1153 mlx5vf_disable_fd(mvdev->resuming_migf); ··· 1072 1157 if 
(mvdev->saving_migf) { 1073 1158 mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); 1074 1159 cancel_work_sync(&mvdev->saving_migf->async_data.work); 1160 + if (last_save_state) 1161 + *last_save_state = mvdev->saving_migf->state; 1075 1162 mlx5vf_disable_fd(mvdev->saving_migf); 1076 1163 wake_up_interruptible(&mvdev->saving_migf->poll_wait); 1077 1164 mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); ··· 1134 1217 return migf->filp; 1135 1218 } 1136 1219 1137 - if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) || 1138 - (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || 1220 + if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) { 1221 + mlx5vf_disable_fds(mvdev, NULL); 1222 + return NULL; 1223 + } 1224 + 1225 + if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || 1139 1226 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && 1140 1227 new == VFIO_DEVICE_STATE_RUNNING_P2P)) { 1141 - mlx5vf_disable_fds(mvdev); 1142 - return NULL; 1228 + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; 1229 + struct mlx5_vhca_data_buffer *buf; 1230 + enum mlx5_vf_migf_state state; 1231 + size_t size; 1232 + 1233 + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL, 1234 + MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP); 1235 + if (ret) 1236 + return ERR_PTR(ret); 1237 + buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE); 1238 + if (IS_ERR(buf)) 1239 + return ERR_CAST(buf); 1240 + /* pre_copy cleanup */ 1241 + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false); 1242 + if (ret) { 1243 + mlx5vf_put_data_buffer(buf); 1244 + return ERR_PTR(ret); 1245 + } 1246 + mlx5vf_disable_fds(mvdev, &state); 1247 + return (state != MLX5_MIGF_STATE_ERROR) ? 
NULL : ERR_PTR(-EIO); 1143 1248 } 1144 1249 1145 1250 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { ··· 1176 1237 } 1177 1238 1178 1239 if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { 1179 - if (!MLX5VF_PRE_COPY_SUPP(mvdev)) { 1180 - ret = mlx5vf_cmd_load_vhca_state(mvdev, 1181 - mvdev->resuming_migf, 1182 - mvdev->resuming_migf->buf[0]); 1183 - if (ret) 1184 - return ERR_PTR(ret); 1185 - } 1186 - mlx5vf_disable_fds(mvdev); 1240 + mlx5vf_disable_fds(mvdev, NULL); 1187 1241 return NULL; 1188 1242 } 1189 1243 ··· 1221 1289 mvdev->deferred_reset = false; 1222 1290 spin_unlock(&mvdev->reset_lock); 1223 1291 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 1224 - mlx5vf_disable_fds(mvdev); 1292 + mlx5vf_disable_fds(mvdev, NULL); 1225 1293 goto again; 1226 1294 } 1227 1295 mutex_unlock(&mvdev->state_mutex);
+10
drivers/vfio/pci/nvgrace-gpu/Kconfig
··· 1 + # SPDX-License-Identifier: GPL-2.0-only 2 + config NVGRACE_GPU_VFIO_PCI 3 + tristate "VFIO support for the GPU in the NVIDIA Grace Hopper Superchip" 4 + depends on ARM64 || (COMPILE_TEST && 64BIT) 5 + select VFIO_PCI_CORE 6 + help 7 + VFIO support for the GPU in the NVIDIA Grace Hopper Superchip is 8 + required to assign the GPU device to userspace using KVM/qemu/etc. 9 + 10 + If you don't know what to do here, say N.
+3
drivers/vfio/pci/nvgrace-gpu/Makefile
··· 1 + # SPDX-License-Identifier: GPL-2.0-only 2 + obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu-vfio-pci.o 3 + nvgrace-gpu-vfio-pci-y := main.o
+888
drivers/vfio/pci/nvgrace-gpu/main.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved 4 + */ 5 + 6 + #include <linux/sizes.h> 7 + #include <linux/vfio_pci_core.h> 8 + 9 + /* 10 + * The device memory usable to the workloads running in the VM is cached 11 + * and showcased as a 64b device BAR (comprising of BAR4 and BAR5 region) 12 + * to the VM and is represented as usemem. 13 + * Moreover, the VM GPU device driver needs a non-cacheable region to 14 + * support the MIG feature. This region is also exposed as a 64b BAR 15 + * (comprising of BAR2 and BAR3 region) and represented as resmem. 16 + */ 17 + #define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX 18 + #define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX 19 + 20 + /* Memory size expected as non cached and reserved by the VM driver */ 21 + #define RESMEM_SIZE SZ_1G 22 + 23 + /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */ 24 + #define MEMBLK_SIZE SZ_512M 25 + 26 + /* 27 + * The state of the two device memory region - resmem and usemem - is 28 + * saved as struct mem_region. 29 + */ 30 + struct mem_region { 31 + phys_addr_t memphys; /* Base physical address of the region */ 32 + size_t memlength; /* Region size */ 33 + size_t bar_size; /* Reported region BAR size */ 34 + __le64 bar_val; /* Emulated BAR offset registers */ 35 + union { 36 + void *memaddr; 37 + void __iomem *ioaddr; 38 + }; /* Base virtual address of the region */ 39 + }; 40 + 41 + struct nvgrace_gpu_pci_core_device { 42 + struct vfio_pci_core_device core_device; 43 + /* Cached and usable memory for the VM. 
*/ 44 + struct mem_region usemem; 45 + /* Non cached memory carved out from the end of device memory */ 46 + struct mem_region resmem; 47 + /* Lock to control device memory kernel mapping */ 48 + struct mutex remap_lock; 49 + }; 50 + 51 + static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) 52 + { 53 + struct nvgrace_gpu_pci_core_device *nvdev = 54 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 55 + core_device.vdev); 56 + 57 + nvdev->resmem.bar_val = 0; 58 + nvdev->usemem.bar_val = 0; 59 + } 60 + 61 + /* Choose the structure corresponding to the fake BAR with a given index. */ 62 + static struct mem_region * 63 + nvgrace_gpu_memregion(int index, 64 + struct nvgrace_gpu_pci_core_device *nvdev) 65 + { 66 + if (index == USEMEM_REGION_INDEX) 67 + return &nvdev->usemem; 68 + 69 + if (index == RESMEM_REGION_INDEX) 70 + return &nvdev->resmem; 71 + 72 + return NULL; 73 + } 74 + 75 + static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) 76 + { 77 + struct vfio_pci_core_device *vdev = 78 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 79 + struct nvgrace_gpu_pci_core_device *nvdev = 80 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 81 + core_device.vdev); 82 + int ret; 83 + 84 + ret = vfio_pci_core_enable(vdev); 85 + if (ret) 86 + return ret; 87 + 88 + if (nvdev->usemem.memlength) { 89 + nvgrace_gpu_init_fake_bar_emu_regs(core_vdev); 90 + mutex_init(&nvdev->remap_lock); 91 + } 92 + 93 + vfio_pci_core_finish_enable(vdev); 94 + 95 + return 0; 96 + } 97 + 98 + static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) 99 + { 100 + struct nvgrace_gpu_pci_core_device *nvdev = 101 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 102 + core_device.vdev); 103 + 104 + /* Unmap the mapping to the device memory cached region */ 105 + if (nvdev->usemem.memaddr) { 106 + memunmap(nvdev->usemem.memaddr); 107 + nvdev->usemem.memaddr = NULL; 108 + } 109 + 110 + /* Unmap the mapping to the 
device memory non-cached region */ 111 + if (nvdev->resmem.ioaddr) { 112 + iounmap(nvdev->resmem.ioaddr); 113 + nvdev->resmem.ioaddr = NULL; 114 + } 115 + 116 + mutex_destroy(&nvdev->remap_lock); 117 + 118 + vfio_pci_core_close_device(core_vdev); 119 + } 120 + 121 + static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, 122 + struct vm_area_struct *vma) 123 + { 124 + struct nvgrace_gpu_pci_core_device *nvdev = 125 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 126 + core_device.vdev); 127 + struct mem_region *memregion; 128 + unsigned long start_pfn; 129 + u64 req_len, pgoff, end; 130 + unsigned int index; 131 + int ret = 0; 132 + 133 + index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 134 + 135 + memregion = nvgrace_gpu_memregion(index, nvdev); 136 + if (!memregion) 137 + return vfio_pci_core_mmap(core_vdev, vma); 138 + 139 + /* 140 + * Request to mmap the BAR. Map to the CPU accessible memory on the 141 + * GPU using the memory information gathered from the system ACPI 142 + * tables. 143 + */ 144 + pgoff = vma->vm_pgoff & 145 + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 146 + 147 + if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || 148 + check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) || 149 + check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) 150 + return -EOVERFLOW; 151 + 152 + /* 153 + * Check that the mapping request does not go beyond available device 154 + * memory size 155 + */ 156 + if (end > memregion->memlength) 157 + return -EINVAL; 158 + 159 + /* 160 + * The carved out region of the device memory needs the NORMAL_NC 161 + * property. Communicate as such to the hypervisor. 162 + */ 163 + if (index == RESMEM_REGION_INDEX) { 164 + /* 165 + * The nvgrace-gpu module has no issues with uncontained 166 + * failures on NORMAL_NC accesses. VM_ALLOW_ANY_UNCACHED is 167 + * set to communicate to the KVM to S2 map as NORMAL_NC. 
168 + * This opens up guest usage of NORMAL_NC for this mapping. 169 + */ 170 + vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED); 171 + 172 + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); 173 + } 174 + 175 + /* 176 + * Perform a PFN map to the memory and back the device BAR by the 177 + * GPU memory. 178 + * 179 + * The available GPU memory size may not be power-of-2 aligned. The 180 + * remainder is only backed by vfio_device_ops read/write handlers. 181 + * 182 + * During device reset, the GPU is safely disconnected to the CPU 183 + * and access to the BAR will be immediately returned preventing 184 + * machine check. 185 + */ 186 + ret = remap_pfn_range(vma, vma->vm_start, start_pfn, 187 + req_len, vma->vm_page_prot); 188 + if (ret) 189 + return ret; 190 + 191 + vma->vm_pgoff = start_pfn; 192 + 193 + return 0; 194 + } 195 + 196 + static long 197 + nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, 198 + unsigned long arg) 199 + { 200 + struct nvgrace_gpu_pci_core_device *nvdev = 201 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 202 + core_device.vdev); 203 + unsigned long minsz = offsetofend(struct vfio_region_info, offset); 204 + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 205 + struct vfio_region_info_cap_sparse_mmap *sparse; 206 + struct vfio_region_info info; 207 + struct mem_region *memregion; 208 + u32 size; 209 + int ret; 210 + 211 + if (copy_from_user(&info, (void __user *)arg, minsz)) 212 + return -EFAULT; 213 + 214 + if (info.argsz < minsz) 215 + return -EINVAL; 216 + 217 + /* 218 + * Request to determine the BAR region information. Send the 219 + * GPU memory information. 220 + */ 221 + memregion = nvgrace_gpu_memregion(info.index, nvdev); 222 + if (!memregion) 223 + return vfio_pci_core_ioctl(core_vdev, 224 + VFIO_DEVICE_GET_REGION_INFO, arg); 225 + 226 + size = struct_size(sparse, areas, 1); 227 + 228 + /* 229 + * Setup for sparse mapping for the device memory. 
Only the 230 + * available device memory on the hardware is shown as a 231 + * mappable region. 232 + */ 233 + sparse = kzalloc(size, GFP_KERNEL); 234 + if (!sparse) 235 + return -ENOMEM; 236 + 237 + sparse->nr_areas = 1; 238 + sparse->areas[0].offset = 0; 239 + sparse->areas[0].size = memregion->memlength; 240 + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; 241 + sparse->header.version = 1; 242 + 243 + ret = vfio_info_add_capability(&caps, &sparse->header, size); 244 + kfree(sparse); 245 + if (ret) 246 + return ret; 247 + 248 + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 249 + /* 250 + * The region memory size may not be power-of-2 aligned. 251 + * Given that the memory as a BAR and may not be 252 + * aligned, roundup to the next power-of-2. 253 + */ 254 + info.size = memregion->bar_size; 255 + info.flags = VFIO_REGION_INFO_FLAG_READ | 256 + VFIO_REGION_INFO_FLAG_WRITE | 257 + VFIO_REGION_INFO_FLAG_MMAP; 258 + 259 + if (caps.size) { 260 + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; 261 + if (info.argsz < sizeof(info) + caps.size) { 262 + info.argsz = sizeof(info) + caps.size; 263 + info.cap_offset = 0; 264 + } else { 265 + vfio_info_cap_shift(&caps, sizeof(info)); 266 + if (copy_to_user((void __user *)arg + 267 + sizeof(info), caps.buf, 268 + caps.size)) { 269 + kfree(caps.buf); 270 + return -EFAULT; 271 + } 272 + info.cap_offset = sizeof(info); 273 + } 274 + kfree(caps.buf); 275 + } 276 + return copy_to_user((void __user *)arg, &info, minsz) ? 
277 + -EFAULT : 0; 278 + } 279 + 280 + static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, 281 + unsigned int cmd, unsigned long arg) 282 + { 283 + switch (cmd) { 284 + case VFIO_DEVICE_GET_REGION_INFO: 285 + return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg); 286 + case VFIO_DEVICE_IOEVENTFD: 287 + return -ENOTTY; 288 + case VFIO_DEVICE_RESET: 289 + nvgrace_gpu_init_fake_bar_emu_regs(core_vdev); 290 + fallthrough; 291 + default: 292 + return vfio_pci_core_ioctl(core_vdev, cmd, arg); 293 + } 294 + } 295 + 296 + static __le64 297 + nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64) 298 + { 299 + u64 tmp_val; 300 + 301 + tmp_val = le64_to_cpu(val64); 302 + tmp_val &= ~(bar_size - 1); 303 + tmp_val |= flags; 304 + 305 + return cpu_to_le64(tmp_val); 306 + } 307 + 308 + /* 309 + * Both the usable (usemem) and the reserved (resmem) device memory region 310 + * are exposed as a 64b fake device BARs in the VM. These fake BARs must 311 + * respond to the accesses on their respective PCI config space offsets. 312 + * 313 + * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3. 314 + * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5. 
315 + */ 316 + static ssize_t 317 + nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev, 318 + char __user *buf, size_t count, loff_t *ppos) 319 + { 320 + struct nvgrace_gpu_pci_core_device *nvdev = 321 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 322 + core_device.vdev); 323 + u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; 324 + struct mem_region *memregion = NULL; 325 + __le64 val64; 326 + size_t register_offset; 327 + loff_t copy_offset; 328 + size_t copy_count; 329 + int ret; 330 + 331 + ret = vfio_pci_core_read(core_vdev, buf, count, ppos); 332 + if (ret < 0) 333 + return ret; 334 + 335 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2, 336 + sizeof(val64), 337 + &copy_offset, &copy_count, 338 + &register_offset)) 339 + memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev); 340 + else if (vfio_pci_core_range_intersect_range(pos, count, 341 + PCI_BASE_ADDRESS_4, 342 + sizeof(val64), 343 + &copy_offset, &copy_count, 344 + &register_offset)) 345 + memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev); 346 + 347 + if (memregion) { 348 + val64 = nvgrace_gpu_get_read_value(memregion->bar_size, 349 + PCI_BASE_ADDRESS_MEM_TYPE_64 | 350 + PCI_BASE_ADDRESS_MEM_PREFETCH, 351 + memregion->bar_val); 352 + if (copy_to_user(buf + copy_offset, 353 + (void *)&val64 + register_offset, copy_count)) { 354 + /* 355 + * The position has been incremented in 356 + * vfio_pci_core_read. Reset the offset back to the 357 + * starting position. 
358 + */ 359 + *ppos -= count; 360 + return -EFAULT; 361 + } 362 + } 363 + 364 + return count; 365 + } 366 + 367 + static ssize_t 368 + nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev, 369 + const char __user *buf, size_t count, loff_t *ppos) 370 + { 371 + struct nvgrace_gpu_pci_core_device *nvdev = 372 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 373 + core_device.vdev); 374 + u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; 375 + struct mem_region *memregion = NULL; 376 + size_t register_offset; 377 + loff_t copy_offset; 378 + size_t copy_count; 379 + 380 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2, 381 + sizeof(u64), &copy_offset, 382 + &copy_count, &register_offset)) 383 + memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev); 384 + else if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_4, 385 + sizeof(u64), &copy_offset, 386 + &copy_count, &register_offset)) 387 + memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev); 388 + 389 + if (memregion) { 390 + if (copy_from_user((void *)&memregion->bar_val + register_offset, 391 + buf + copy_offset, copy_count)) 392 + return -EFAULT; 393 + *ppos += copy_count; 394 + return copy_count; 395 + } 396 + 397 + return vfio_pci_core_write(core_vdev, buf, count, ppos); 398 + } 399 + 400 + /* 401 + * Ad hoc map the device memory in the module kernel VA space. Primarily needed 402 + * as vfio does not require the userspace driver to only perform accesses through 403 + * mmaps of the vfio-pci BAR regions and such accesses should be supported using 404 + * vfio_device_ops read/write implementations. 405 + * 406 + * The usemem region is cacheable memory and hence is memremaped. 407 + * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC). 
408 + */ 409 + static int 410 + nvgrace_gpu_map_device_mem(int index, 411 + struct nvgrace_gpu_pci_core_device *nvdev) 412 + { 413 + struct mem_region *memregion; 414 + int ret = 0; 415 + 416 + memregion = nvgrace_gpu_memregion(index, nvdev); 417 + if (!memregion) 418 + return -EINVAL; 419 + 420 + mutex_lock(&nvdev->remap_lock); 421 + 422 + if (memregion->memaddr) 423 + goto unlock; 424 + 425 + if (index == USEMEM_REGION_INDEX) 426 + memregion->memaddr = memremap(memregion->memphys, 427 + memregion->memlength, 428 + MEMREMAP_WB); 429 + else 430 + memregion->ioaddr = ioremap_wc(memregion->memphys, 431 + memregion->memlength); 432 + 433 + if (!memregion->memaddr) 434 + ret = -ENOMEM; 435 + 436 + unlock: 437 + mutex_unlock(&nvdev->remap_lock); 438 + 439 + return ret; 440 + } 441 + 442 + /* 443 + * Read the data from the device memory (mapped either through ioremap 444 + * or memremap) into the user buffer. 445 + */ 446 + static int 447 + nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev, 448 + char __user *buf, size_t mem_count, loff_t *ppos) 449 + { 450 + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 451 + u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; 452 + int ret; 453 + 454 + if (!mem_count) 455 + return 0; 456 + 457 + /* 458 + * Handle read on the BAR regions. Map to the target device memory 459 + * physical address and copy to the request read buffer. 460 + */ 461 + ret = nvgrace_gpu_map_device_mem(index, nvdev); 462 + if (ret) 463 + return ret; 464 + 465 + if (index == USEMEM_REGION_INDEX) { 466 + if (copy_to_user(buf, 467 + (u8 *)nvdev->usemem.memaddr + offset, 468 + mem_count)) 469 + ret = -EFAULT; 470 + } else { 471 + /* 472 + * The hardware ensures that the system does not crash when 473 + * the device memory is accessed with the memory enable 474 + * turned off. It synthesizes ~0 on such read. So there is 475 + * no need to check or support the disablement/enablement of 476 + * BAR through PCI_COMMAND config space register. 
Pass 477 + * test_mem flag as false. 478 + */ 479 + ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false, 480 + nvdev->resmem.ioaddr, 481 + buf, offset, mem_count, 482 + 0, 0, false); 483 + } 484 + 485 + return ret; 486 + } 487 + 488 + /* 489 + * Read count bytes from the device memory at an offset. The actual device 490 + * memory size (available) may not be a power-of-2. So the driver fakes 491 + * the size to a power-of-2 (reported) when exposing to a user space driver. 492 + * 493 + * Reads starting beyond the reported size generate -EINVAL; reads extending 494 + * beyond the actual device size is filled with ~0; reads extending beyond 495 + * the reported size are truncated. 496 + */ 497 + static ssize_t 498 + nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, 499 + char __user *buf, size_t count, loff_t *ppos) 500 + { 501 + u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; 502 + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 503 + struct mem_region *memregion; 504 + size_t mem_count, i; 505 + u8 val = 0xFF; 506 + int ret; 507 + 508 + /* No need to do NULL check as caller does. */ 509 + memregion = nvgrace_gpu_memregion(index, nvdev); 510 + 511 + if (offset >= memregion->bar_size) 512 + return -EINVAL; 513 + 514 + /* Clip short the read request beyond reported BAR size */ 515 + count = min(count, memregion->bar_size - (size_t)offset); 516 + 517 + /* 518 + * Determine how many bytes to be actually read from the device memory. 519 + * Read request beyond the actual device memory size is filled with ~0, 520 + * while those beyond the actual reported size is skipped. 521 + */ 522 + if (offset >= memregion->memlength) 523 + mem_count = 0; 524 + else 525 + mem_count = min(count, memregion->memlength - (size_t)offset); 526 + 527 + ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); 528 + if (ret) 529 + return ret; 530 + 531 + /* 532 + * Only the device memory present on the hardware is mapped, which may 533 + * not be power-of-2 aligned. 
A read to an offset beyond the device memory 534 + * size is filled with ~0. 535 + */ 536 + for (i = mem_count; i < count; i++) { 537 + ret = put_user(val, (unsigned char __user *)(buf + i)); 538 + if (ret) 539 + return ret; 540 + } 541 + 542 + *ppos += count; 543 + return count; 544 + } 545 + 546 + static ssize_t 547 + nvgrace_gpu_read(struct vfio_device *core_vdev, 548 + char __user *buf, size_t count, loff_t *ppos) 549 + { 550 + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 551 + struct nvgrace_gpu_pci_core_device *nvdev = 552 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 553 + core_device.vdev); 554 + 555 + if (nvgrace_gpu_memregion(index, nvdev)) 556 + return nvgrace_gpu_read_mem(nvdev, buf, count, ppos); 557 + 558 + if (index == VFIO_PCI_CONFIG_REGION_INDEX) 559 + return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos); 560 + 561 + return vfio_pci_core_read(core_vdev, buf, count, ppos); 562 + } 563 + 564 + /* 565 + * Write the data to the device memory (mapped either through ioremap 566 + * or memremap) from the user buffer. 567 + */ 568 + static int 569 + nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev, 570 + const char __user *buf, size_t mem_count, 571 + loff_t *ppos) 572 + { 573 + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 574 + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; 575 + int ret; 576 + 577 + if (!mem_count) 578 + return 0; 579 + 580 + ret = nvgrace_gpu_map_device_mem(index, nvdev); 581 + if (ret) 582 + return ret; 583 + 584 + if (index == USEMEM_REGION_INDEX) { 585 + if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos, 586 + buf, mem_count)) 587 + return -EFAULT; 588 + } else { 589 + /* 590 + * The hardware ensures that the system does not crash when 591 + * the device memory is accessed with the memory enable 592 + * turned off. It drops such writes. So there is no need to 593 + * check or support the disablement/enablement of BAR 594 + * through PCI_COMMAND config space register. 
Pass test_mem 595 + * flag as false. 596 + */ 597 + ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false, 598 + nvdev->resmem.ioaddr, 599 + (char __user *)buf, pos, mem_count, 600 + 0, 0, true); 601 + } 602 + 603 + return ret; 604 + } 605 + 606 + /* 607 + * Write count bytes to the device memory at a given offset. The actual device 608 + * memory size (available) may not be a power-of-2. So the driver fakes the 609 + * size to a power-of-2 (reported) when exposing to a user space driver. 610 + * 611 + * Writes extending beyond the reported size are truncated; writes starting 612 + * beyond the reported size generate -EINVAL. 613 + */ 614 + static ssize_t 615 + nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, 616 + size_t count, loff_t *ppos, const char __user *buf) 617 + { 618 + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 619 + u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; 620 + struct mem_region *memregion; 621 + size_t mem_count; 622 + int ret = 0; 623 + 624 + /* No need to do NULL check as caller does. */ 625 + memregion = nvgrace_gpu_memregion(index, nvdev); 626 + 627 + if (offset >= memregion->bar_size) 628 + return -EINVAL; 629 + 630 + /* Clip short the write request beyond reported BAR size */ 631 + count = min(count, memregion->bar_size - (size_t)offset); 632 + 633 + /* 634 + * Determine how many bytes to be actually written to the device memory. 635 + * Do not write to the offset beyond available size. 636 + */ 637 + if (offset >= memregion->memlength) 638 + goto exitfn; 639 + 640 + /* 641 + * Only the device memory present on the hardware is mapped, which may 642 + * not be power-of-2 aligned. Drop access outside the available device 643 + * memory on the hardware. 
644 + */ 645 + mem_count = min(count, memregion->memlength - (size_t)offset); 646 + 647 + ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); 648 + if (ret) 649 + return ret; 650 + 651 + exitfn: 652 + *ppos += count; 653 + return count; 654 + } 655 + 656 + static ssize_t 657 + nvgrace_gpu_write(struct vfio_device *core_vdev, 658 + const char __user *buf, size_t count, loff_t *ppos) 659 + { 660 + struct nvgrace_gpu_pci_core_device *nvdev = 661 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 662 + core_device.vdev); 663 + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 664 + 665 + if (nvgrace_gpu_memregion(index, nvdev)) 666 + return nvgrace_gpu_write_mem(nvdev, count, ppos, buf); 667 + 668 + if (index == VFIO_PCI_CONFIG_REGION_INDEX) 669 + return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos); 670 + 671 + return vfio_pci_core_write(core_vdev, buf, count, ppos); 672 + } 673 + 674 + static const struct vfio_device_ops nvgrace_gpu_pci_ops = { 675 + .name = "nvgrace-gpu-vfio-pci", 676 + .init = vfio_pci_core_init_dev, 677 + .release = vfio_pci_core_release_dev, 678 + .open_device = nvgrace_gpu_open_device, 679 + .close_device = nvgrace_gpu_close_device, 680 + .ioctl = nvgrace_gpu_ioctl, 681 + .device_feature = vfio_pci_core_ioctl_feature, 682 + .read = nvgrace_gpu_read, 683 + .write = nvgrace_gpu_write, 684 + .mmap = nvgrace_gpu_mmap, 685 + .request = vfio_pci_core_request, 686 + .match = vfio_pci_core_match, 687 + .bind_iommufd = vfio_iommufd_physical_bind, 688 + .unbind_iommufd = vfio_iommufd_physical_unbind, 689 + .attach_ioas = vfio_iommufd_physical_attach_ioas, 690 + .detach_ioas = vfio_iommufd_physical_detach_ioas, 691 + }; 692 + 693 + static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { 694 + .name = "nvgrace-gpu-vfio-pci-core", 695 + .init = vfio_pci_core_init_dev, 696 + .release = vfio_pci_core_release_dev, 697 + .open_device = nvgrace_gpu_open_device, 698 + .close_device = vfio_pci_core_close_device, 699 + 
.ioctl = vfio_pci_core_ioctl, 700 + .device_feature = vfio_pci_core_ioctl_feature, 701 + .read = vfio_pci_core_read, 702 + .write = vfio_pci_core_write, 703 + .mmap = vfio_pci_core_mmap, 704 + .request = vfio_pci_core_request, 705 + .match = vfio_pci_core_match, 706 + .bind_iommufd = vfio_iommufd_physical_bind, 707 + .unbind_iommufd = vfio_iommufd_physical_unbind, 708 + .attach_ioas = vfio_iommufd_physical_attach_ioas, 709 + .detach_ioas = vfio_iommufd_physical_detach_ioas, 710 + }; 711 + 712 + static int 713 + nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev, 714 + u64 *pmemphys, u64 *pmemlength) 715 + { 716 + int ret; 717 + 718 + /* 719 + * The memory information is present in the system ACPI tables as DSD 720 + * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size. 721 + */ 722 + ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa", 723 + pmemphys); 724 + if (ret) 725 + return ret; 726 + 727 + if (*pmemphys > type_max(phys_addr_t)) 728 + return -EOVERFLOW; 729 + 730 + ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size", 731 + pmemlength); 732 + if (ret) 733 + return ret; 734 + 735 + if (*pmemlength > type_max(size_t)) 736 + return -EOVERFLOW; 737 + 738 + /* 739 + * If the C2C link is not up due to an error, the coherent device 740 + * memory size is returned as 0. Fail in such case. 741 + */ 742 + if (*pmemlength == 0) 743 + return -ENOMEM; 744 + 745 + return ret; 746 + } 747 + 748 + static int 749 + nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, 750 + struct nvgrace_gpu_pci_core_device *nvdev, 751 + u64 memphys, u64 memlength) 752 + { 753 + int ret = 0; 754 + 755 + /* 756 + * The VM GPU device driver needs a non-cacheable region to support 757 + * the MIG feature. Since the device memory is mapped as NORMAL cached, 758 + * carve out a region from the end with a different NORMAL_NC 759 + * property (called as reserved memory and represented as resmem). 
This 760 + * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while 761 + * exposing the rest (termed as usable memory and represented using usemem) 762 + * as cacheable 64b BAR (region 4 and 5). 763 + * 764 + * devmem (memlength) 765 + * |-------------------------------------------------| 766 + * | | 767 + * usemem.memphys resmem.memphys 768 + */ 769 + nvdev->usemem.memphys = memphys; 770 + 771 + /* 772 + * The device memory exposed to the VM is added to the kernel by the 773 + * VM driver module in chunks of memory block size. Only the usable 774 + * memory (usemem) is added to the kernel for usage by the VM 775 + * workloads. Make the usable memory size memblock aligned. 776 + */ 777 + if (check_sub_overflow(memlength, RESMEM_SIZE, 778 + &nvdev->usemem.memlength)) { 779 + ret = -EOVERFLOW; 780 + goto done; 781 + } 782 + 783 + /* 784 + * The USEMEM part of the device memory has to be MEMBLK_SIZE 785 + * aligned. This is a hardwired ABI value between the GPU FW and 786 + * VFIO driver. The VM device driver is also aware of it and make 787 + * use of the value for its calculation to determine USEMEM size. 788 + */ 789 + nvdev->usemem.memlength = round_down(nvdev->usemem.memlength, 790 + MEMBLK_SIZE); 791 + if (nvdev->usemem.memlength == 0) { 792 + ret = -EINVAL; 793 + goto done; 794 + } 795 + 796 + if ((check_add_overflow(nvdev->usemem.memphys, 797 + nvdev->usemem.memlength, 798 + &nvdev->resmem.memphys)) || 799 + (check_sub_overflow(memlength, nvdev->usemem.memlength, 800 + &nvdev->resmem.memlength))) { 801 + ret = -EOVERFLOW; 802 + goto done; 803 + } 804 + 805 + /* 806 + * The memory regions are exposed as BARs. Calculate and save 807 + * the BAR size for them. 
808 + */ 809 + nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); 810 + nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength); 811 + done: 812 + return ret; 813 + } 814 + 815 + static int nvgrace_gpu_probe(struct pci_dev *pdev, 816 + const struct pci_device_id *id) 817 + { 818 + const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops; 819 + struct nvgrace_gpu_pci_core_device *nvdev; 820 + u64 memphys, memlength; 821 + int ret; 822 + 823 + ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength); 824 + if (!ret) 825 + ops = &nvgrace_gpu_pci_ops; 826 + 827 + nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev, 828 + &pdev->dev, ops); 829 + if (IS_ERR(nvdev)) 830 + return PTR_ERR(nvdev); 831 + 832 + dev_set_drvdata(&pdev->dev, &nvdev->core_device); 833 + 834 + if (ops == &nvgrace_gpu_pci_ops) { 835 + /* 836 + * Device memory properties are identified in the host ACPI 837 + * table. Set the nvgrace_gpu_pci_core_device structure. 
838 + */ 839 + ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev, 840 + memphys, memlength); 841 + if (ret) 842 + goto out_put_vdev; 843 + } 844 + 845 + ret = vfio_pci_core_register_device(&nvdev->core_device); 846 + if (ret) 847 + goto out_put_vdev; 848 + 849 + return ret; 850 + 851 + out_put_vdev: 852 + vfio_put_device(&nvdev->core_device.vdev); 853 + return ret; 854 + } 855 + 856 + static void nvgrace_gpu_remove(struct pci_dev *pdev) 857 + { 858 + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); 859 + 860 + vfio_pci_core_unregister_device(core_device); 861 + vfio_put_device(&core_device->vdev); 862 + } 863 + 864 + static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { 865 + /* GH200 120GB */ 866 + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) }, 867 + /* GH200 480GB */ 868 + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) }, 869 + {} 870 + }; 871 + 872 + MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table); 873 + 874 + static struct pci_driver nvgrace_gpu_vfio_pci_driver = { 875 + .name = KBUILD_MODNAME, 876 + .id_table = nvgrace_gpu_vfio_pci_table, 877 + .probe = nvgrace_gpu_probe, 878 + .remove = nvgrace_gpu_remove, 879 + .err_handler = &vfio_pci_core_err_handlers, 880 + .driver_managed_dma = true, 881 + }; 882 + 883 + module_pci_driver(nvgrace_gpu_vfio_pci_driver); 884 + 885 + MODULE_LICENSE("GPL"); 886 + MODULE_AUTHOR("Ankit Agrawal <ankita@nvidia.com>"); 887 + MODULE_AUTHOR("Aniket Agashe <aniketa@nvidia.com>"); 888 + MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory");
+3 -3
drivers/vfio/pci/pds/dirty.c
··· 607 607 608 608 mutex_lock(&pds_vfio->state_mutex); 609 609 err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length); 610 - pds_vfio_state_mutex_unlock(pds_vfio); 610 + mutex_unlock(&pds_vfio->state_mutex); 611 611 612 612 return err; 613 613 } ··· 624 624 mutex_lock(&pds_vfio->state_mutex); 625 625 pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS); 626 626 err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size); 627 - pds_vfio_state_mutex_unlock(pds_vfio); 627 + mutex_unlock(&pds_vfio->state_mutex); 628 628 629 629 return err; 630 630 } ··· 637 637 638 638 mutex_lock(&pds_vfio->state_mutex); 639 639 pds_vfio_dirty_disable(pds_vfio, true); 640 - pds_vfio_state_mutex_unlock(pds_vfio); 640 + mutex_unlock(&pds_vfio->state_mutex); 641 641 642 642 return 0; 643 643 }
+13
drivers/vfio/pci/pds/lm.c
··· 92 92 { 93 93 mutex_lock(&lm_file->lock); 94 94 95 + lm_file->disabled = true; 95 96 lm_file->size = 0; 96 97 lm_file->alloc_size = 0; 98 + lm_file->filep->f_pos = 0; 97 99 98 100 /* Free scatter list of file pages */ 99 101 sg_free_table(&lm_file->sg_table); ··· 185 183 pos = &filp->f_pos; 186 184 187 185 mutex_lock(&lm_file->lock); 186 + 187 + if (lm_file->disabled) { 188 + done = -ENODEV; 189 + goto out_unlock; 190 + } 191 + 188 192 if (*pos > lm_file->size) { 189 193 done = -EINVAL; 190 194 goto out_unlock; ··· 290 282 return -EINVAL; 291 283 292 284 mutex_lock(&lm_file->lock); 285 + 286 + if (lm_file->disabled) { 287 + done = -ENODEV; 288 + goto out_unlock; 289 + } 293 290 294 291 while (len) { 295 292 size_t page_offset;
+1
drivers/vfio/pci/pds/lm.h
··· 27 27 struct scatterlist *last_offset_sg; /* Iterator */ 28 28 unsigned int sg_last_entry; 29 29 unsigned long last_offset; 30 + bool disabled; 30 31 }; 31 32 32 33 struct pds_vfio_pci_device;
+5 -22
drivers/vfio/pci/pds/pci_drv.c
··· 21 21 22 22 static void pds_vfio_recovery(struct pds_vfio_pci_device *pds_vfio) 23 23 { 24 - bool deferred_reset_needed = false; 25 - 26 24 /* 27 25 * Documentation states that the kernel migration driver must not 28 26 * generate asynchronous device state transitions outside of 29 27 * manipulation by the user or the VFIO_DEVICE_RESET ioctl. 30 28 * 31 29 * Since recovery is an asynchronous event received from the device, 32 - * initiate a deferred reset. Issue a deferred reset in the following 33 - * situations: 30 + * initiate a reset in the following situations: 34 31 * 1. Migration is in progress, which will cause the next step of 35 32 * the migration to fail. 36 33 * 2. If the device is in a state that will be set to ··· 39 42 pds_vfio->state != VFIO_DEVICE_STATE_ERROR) || 40 43 (pds_vfio->state == VFIO_DEVICE_STATE_RUNNING && 41 44 pds_vfio_dirty_is_enabled(pds_vfio))) 42 - deferred_reset_needed = true; 45 + pds_vfio_reset(pds_vfio, VFIO_DEVICE_STATE_ERROR); 43 46 mutex_unlock(&pds_vfio->state_mutex); 44 - 45 - /* 46 - * On the next user initiated state transition, the device will 47 - * transition to the VFIO_DEVICE_STATE_ERROR. At this point it's the user's 48 - * responsibility to reset the device. 49 - * 50 - * If a VFIO_DEVICE_RESET is requested post recovery and before the next 51 - * state transition, then the deferred reset state will be set to 52 - * VFIO_DEVICE_STATE_RUNNING. 
53 - */ 54 - if (deferred_reset_needed) { 55 - mutex_lock(&pds_vfio->reset_mutex); 56 - pds_vfio->deferred_reset = true; 57 - pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_ERROR; 58 - mutex_unlock(&pds_vfio->reset_mutex); 59 - } 60 47 } 61 48 62 49 static int pds_vfio_pci_notify_handler(struct notifier_block *nb, ··· 166 185 { 167 186 struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev); 168 187 169 - pds_vfio_reset(pds_vfio); 188 + mutex_lock(&pds_vfio->state_mutex); 189 + pds_vfio_reset(pds_vfio, VFIO_DEVICE_STATE_RUNNING); 190 + mutex_unlock(&pds_vfio->state_mutex); 170 191 } 171 192 172 193 static const struct pci_error_handlers pds_vfio_pci_err_handlers = {
+9 -36
drivers/vfio/pci/pds/vfio_dev.c
··· 26 26 vfio_coredev); 27 27 } 28 28 29 - void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio) 29 + void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio, 30 + enum vfio_device_mig_state state) 30 31 { 31 - again: 32 - mutex_lock(&pds_vfio->reset_mutex); 33 - if (pds_vfio->deferred_reset) { 34 - pds_vfio->deferred_reset = false; 35 - if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) { 36 - pds_vfio_put_restore_file(pds_vfio); 37 - pds_vfio_put_save_file(pds_vfio); 38 - pds_vfio_dirty_disable(pds_vfio, false); 39 - } 40 - pds_vfio->state = pds_vfio->deferred_reset_state; 41 - pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING; 42 - mutex_unlock(&pds_vfio->reset_mutex); 43 - goto again; 44 - } 45 - mutex_unlock(&pds_vfio->state_mutex); 46 - mutex_unlock(&pds_vfio->reset_mutex); 47 - } 48 - 49 - void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio) 50 - { 51 - mutex_lock(&pds_vfio->reset_mutex); 52 - pds_vfio->deferred_reset = true; 53 - pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING; 54 - if (!mutex_trylock(&pds_vfio->state_mutex)) { 55 - mutex_unlock(&pds_vfio->reset_mutex); 56 - return; 57 - } 58 - mutex_unlock(&pds_vfio->reset_mutex); 59 - pds_vfio_state_mutex_unlock(pds_vfio); 32 + pds_vfio_put_restore_file(pds_vfio); 33 + pds_vfio_put_save_file(pds_vfio); 34 + if (state == VFIO_DEVICE_STATE_ERROR) 35 + pds_vfio_dirty_disable(pds_vfio, false); 36 + pds_vfio->state = state; 60 37 } 61 38 62 39 static struct file * ··· 74 97 break; 75 98 } 76 99 } 77 - pds_vfio_state_mutex_unlock(pds_vfio); 78 - /* still waiting on a deferred_reset */ 100 + mutex_unlock(&pds_vfio->state_mutex); 79 101 if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) 80 102 res = ERR_PTR(-EIO); 81 103 ··· 90 114 91 115 mutex_lock(&pds_vfio->state_mutex); 92 116 *current_state = pds_vfio->state; 93 - pds_vfio_state_mutex_unlock(pds_vfio); 117 + mutex_unlock(&pds_vfio->state_mutex); 94 118 return 0; 95 119 } 96 120 ··· 132 156 pds_vfio->vf_id = vf_id; 133 
157 134 158 mutex_init(&pds_vfio->state_mutex); 135 - mutex_init(&pds_vfio->reset_mutex); 136 159 137 160 vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; 138 161 vdev->mig_ops = &pds_vfio_lm_ops; ··· 153 178 vfio_coredev.vdev); 154 179 155 180 mutex_destroy(&pds_vfio->state_mutex); 156 - mutex_destroy(&pds_vfio->reset_mutex); 157 181 vfio_pci_core_release_dev(vdev); 158 182 } 159 183 ··· 168 194 return err; 169 195 170 196 pds_vfio->state = VFIO_DEVICE_STATE_RUNNING; 171 - pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING; 172 197 173 198 vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev); 174 199
+2 -6
drivers/vfio/pci/pds/vfio_dev.h
··· 18 18 struct pds_vfio_dirty dirty; 19 19 struct mutex state_mutex; /* protect migration state */ 20 20 enum vfio_device_mig_state state; 21 - struct mutex reset_mutex; /* protect reset_done flow */ 22 - u8 deferred_reset; 23 - enum vfio_device_mig_state deferred_reset_state; 24 21 struct notifier_block nb; 25 22 26 23 int vf_id; 27 24 u16 client_id; 28 25 }; 29 26 30 - void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio); 31 - 32 27 const struct vfio_device_ops *pds_vfio_ops_info(void); 33 28 struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev); 34 - void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio); 29 + void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio, 30 + enum vfio_device_mig_state state); 35 31 36 32 struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio); 37 33 struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio);
+42
drivers/vfio/pci/vfio_pci_config.c
··· 1966 1966 1967 1967 return done; 1968 1968 } 1969 + 1970 + /** 1971 + * vfio_pci_core_range_intersect_range() - Determine overlap between a buffer 1972 + * and register offset ranges. 1973 + * @buf_start: start offset of the buffer 1974 + * @buf_cnt: number of buffer bytes 1975 + * @reg_start: start register offset 1976 + * @reg_cnt: number of register bytes 1977 + * @buf_offset: start offset of overlap in the buffer 1978 + * @intersect_count: number of overlapping bytes 1979 + * @register_offset: start offset of overlap in register 1980 + * 1981 + * Returns: true if there is overlap, false if not. 1982 + * The overlap start and size is returned through function args. 1983 + */ 1984 + bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, 1985 + loff_t reg_start, size_t reg_cnt, 1986 + loff_t *buf_offset, 1987 + size_t *intersect_count, 1988 + size_t *register_offset) 1989 + { 1990 + if (buf_start <= reg_start && 1991 + buf_start + buf_cnt > reg_start) { 1992 + *buf_offset = reg_start - buf_start; 1993 + *intersect_count = min_t(size_t, reg_cnt, 1994 + buf_start + buf_cnt - reg_start); 1995 + *register_offset = 0; 1996 + return true; 1997 + } 1998 + 1999 + if (buf_start > reg_start && 2000 + buf_start < reg_start + reg_cnt) { 2001 + *buf_offset = 0; 2002 + *intersect_count = min_t(size_t, buf_cnt, 2003 + reg_start + reg_cnt - buf_start); 2004 + *register_offset = buf_start - reg_start; 2005 + return true; 2006 + } 2007 + 2008 + return false; 2009 + } 2010 + EXPORT_SYMBOL_GPL(vfio_pci_core_range_intersect_range);
+1
drivers/vfio/pci/vfio_pci_core.c
··· 2064 2064 pci_name(pdev)); 2065 2065 pdev->driver_override = kasprintf(GFP_KERNEL, "%s", 2066 2066 vdev->vdev.ops->name); 2067 + WARN_ON(!pdev->driver_override); 2067 2068 } else if (action == BUS_NOTIFY_BOUND_DRIVER && 2068 2069 pdev->is_virtfn && physfn == vdev->pdev) { 2069 2070 struct pci_driver *drv = pci_dev_driver(pdev);
+108 -72
drivers/vfio/pci/vfio_pci_intrs.c
··· 90 90 91 91 if (likely(is_intx(vdev) && !vdev->virq_disabled)) { 92 92 struct vfio_pci_irq_ctx *ctx; 93 + struct eventfd_ctx *trigger; 93 94 94 95 ctx = vfio_irq_ctx_get(vdev, 0); 95 96 if (WARN_ON_ONCE(!ctx)) 96 97 return; 97 - eventfd_signal(ctx->trigger); 98 + 99 + trigger = READ_ONCE(ctx->trigger); 100 + if (likely(trigger)) 101 + eventfd_signal(trigger); 98 102 } 99 103 } 100 104 101 105 /* Returns true if the INTx vfio_pci_irq_ctx.masked value is changed. */ 102 - bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) 106 + static bool __vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) 103 107 { 104 108 struct pci_dev *pdev = vdev->pdev; 105 109 struct vfio_pci_irq_ctx *ctx; 106 110 unsigned long flags; 107 111 bool masked_changed = false; 112 + 113 + lockdep_assert_held(&vdev->igate); 108 114 109 115 spin_lock_irqsave(&vdev->irqlock, flags); 110 116 ··· 147 141 out_unlock: 148 142 spin_unlock_irqrestore(&vdev->irqlock, flags); 149 143 return masked_changed; 144 + } 145 + 146 + bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) 147 + { 148 + bool mask_changed; 149 + 150 + mutex_lock(&vdev->igate); 151 + mask_changed = __vfio_pci_intx_mask(vdev); 152 + mutex_unlock(&vdev->igate); 153 + 154 + return mask_changed; 150 155 } 151 156 152 157 /* ··· 211 194 return ret; 212 195 } 213 196 214 - void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) 197 + static void __vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) 215 198 { 199 + lockdep_assert_held(&vdev->igate); 200 + 216 201 if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0) 217 202 vfio_send_intx_eventfd(vdev, NULL); 203 + } 204 + 205 + void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) 206 + { 207 + mutex_lock(&vdev->igate); 208 + __vfio_pci_intx_unmask(vdev); 209 + mutex_unlock(&vdev->igate); 218 210 } 219 211 220 212 static irqreturn_t vfio_intx_handler(int irq, void *dev_id) ··· 257 231 return ret; 258 232 } 259 233 260 - static int vfio_intx_enable(struct 
vfio_pci_core_device *vdev) 234 + static int vfio_intx_enable(struct vfio_pci_core_device *vdev, 235 + struct eventfd_ctx *trigger) 261 236 { 237 + struct pci_dev *pdev = vdev->pdev; 262 238 struct vfio_pci_irq_ctx *ctx; 239 + unsigned long irqflags; 240 + char *name; 241 + int ret; 263 242 264 243 if (!is_irq_none(vdev)) 265 244 return -EINVAL; 266 245 267 - if (!vdev->pdev->irq) 246 + if (!pdev->irq) 268 247 return -ENODEV; 248 + 249 + name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev)); 250 + if (!name) 251 + return -ENOMEM; 269 252 270 253 ctx = vfio_irq_ctx_alloc(vdev, 0); 271 254 if (!ctx) 272 255 return -ENOMEM; 273 256 257 + ctx->name = name; 258 + ctx->trigger = trigger; 259 + 274 260 /* 275 - * If the virtual interrupt is masked, restore it. Devices 276 - * supporting DisINTx can be masked at the hardware level 277 - * here, non-PCI-2.3 devices will have to wait until the 278 - * interrupt is enabled. 261 + * Fill the initial masked state based on virq_disabled. After 262 + * enable, changing the DisINTx bit in vconfig directly changes INTx 263 + * masking. igate prevents races during setup, once running masked 264 + * is protected via irqlock. 265 + * 266 + * Devices supporting DisINTx also reflect the current mask state in 267 + * the physical DisINTx bit, which is not affected during IRQ setup. 268 + * 269 + * Devices without DisINTx support require an exclusive interrupt. 270 + * IRQ masking is performed at the IRQ chip. Again, igate protects 271 + * against races during setup and IRQ handlers and irqfds are not 272 + * yet active, therefore masked is stable and can be used to 273 + * conditionally auto-enable the IRQ. 274 + * 275 + * irq_type must be stable while the IRQ handler is registered, 276 + * therefore it must be set before request_irq(). 
279 277 */ 280 278 ctx->masked = vdev->virq_disabled; 281 - if (vdev->pci_2_3) 282 - pci_intx(vdev->pdev, !ctx->masked); 279 + if (vdev->pci_2_3) { 280 + pci_intx(pdev, !ctx->masked); 281 + irqflags = IRQF_SHARED; 282 + } else { 283 + irqflags = ctx->masked ? IRQF_NO_AUTOEN : 0; 284 + } 283 285 284 286 vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX; 287 + 288 + ret = request_irq(pdev->irq, vfio_intx_handler, 289 + irqflags, ctx->name, vdev); 290 + if (ret) { 291 + vdev->irq_type = VFIO_PCI_NUM_IRQS; 292 + kfree(name); 293 + vfio_irq_ctx_free(vdev, ctx, 0); 294 + return ret; 295 + } 285 296 286 297 return 0; 287 298 } 288 299 289 - static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd) 300 + static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, 301 + struct eventfd_ctx *trigger) 290 302 { 291 303 struct pci_dev *pdev = vdev->pdev; 292 - unsigned long irqflags = IRQF_SHARED; 293 304 struct vfio_pci_irq_ctx *ctx; 294 - struct eventfd_ctx *trigger; 295 - unsigned long flags; 296 - int ret; 305 + struct eventfd_ctx *old; 297 306 298 307 ctx = vfio_irq_ctx_get(vdev, 0); 299 308 if (WARN_ON_ONCE(!ctx)) 300 309 return -EINVAL; 301 310 302 - if (ctx->trigger) { 303 - free_irq(pdev->irq, vdev); 304 - kfree(ctx->name); 305 - eventfd_ctx_put(ctx->trigger); 306 - ctx->trigger = NULL; 311 + old = ctx->trigger; 312 + 313 + WRITE_ONCE(ctx->trigger, trigger); 314 + 315 + /* Releasing an old ctx requires synchronizing in-flight users */ 316 + if (old) { 317 + synchronize_irq(pdev->irq); 318 + vfio_virqfd_flush_thread(&ctx->unmask); 319 + eventfd_ctx_put(old); 307 320 } 308 - 309 - if (fd < 0) /* Disable only */ 310 - return 0; 311 - 312 - ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", 313 - pci_name(pdev)); 314 - if (!ctx->name) 315 - return -ENOMEM; 316 - 317 - trigger = eventfd_ctx_fdget(fd); 318 - if (IS_ERR(trigger)) { 319 - kfree(ctx->name); 320 - return PTR_ERR(trigger); 321 - } 322 - 323 - ctx->trigger = trigger; 324 - 325 - if 
(!vdev->pci_2_3) 326 - irqflags = 0; 327 - 328 - ret = request_irq(pdev->irq, vfio_intx_handler, 329 - irqflags, ctx->name, vdev); 330 - if (ret) { 331 - ctx->trigger = NULL; 332 - kfree(ctx->name); 333 - eventfd_ctx_put(trigger); 334 - return ret; 335 - } 336 - 337 - /* 338 - * INTx disable will stick across the new irq setup, 339 - * disable_irq won't. 340 - */ 341 - spin_lock_irqsave(&vdev->irqlock, flags); 342 - if (!vdev->pci_2_3 && ctx->masked) 343 - disable_irq_nosync(pdev->irq); 344 - spin_unlock_irqrestore(&vdev->irqlock, flags); 345 321 346 322 return 0; 347 323 } 348 324 349 325 static void vfio_intx_disable(struct vfio_pci_core_device *vdev) 350 326 { 327 + struct pci_dev *pdev = vdev->pdev; 351 328 struct vfio_pci_irq_ctx *ctx; 352 329 353 330 ctx = vfio_irq_ctx_get(vdev, 0); ··· 358 329 if (ctx) { 359 330 vfio_virqfd_disable(&ctx->unmask); 360 331 vfio_virqfd_disable(&ctx->mask); 332 + free_irq(pdev->irq, vdev); 333 + if (ctx->trigger) 334 + eventfd_ctx_put(ctx->trigger); 335 + kfree(ctx->name); 336 + vfio_irq_ctx_free(vdev, ctx, 0); 361 337 } 362 - vfio_intx_set_signal(vdev, -1); 363 338 vdev->irq_type = VFIO_PCI_NUM_IRQS; 364 - vfio_irq_ctx_free(vdev, ctx, 0); 365 339 } 366 340 367 341 /* ··· 592 560 return -EINVAL; 593 561 594 562 if (flags & VFIO_IRQ_SET_DATA_NONE) { 595 - vfio_pci_intx_unmask(vdev); 563 + __vfio_pci_intx_unmask(vdev); 596 564 } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { 597 565 uint8_t unmask = *(uint8_t *)data; 598 566 if (unmask) 599 - vfio_pci_intx_unmask(vdev); 567 + __vfio_pci_intx_unmask(vdev); 600 568 } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { 601 569 struct vfio_pci_irq_ctx *ctx = vfio_irq_ctx_get(vdev, 0); 602 570 int32_t fd = *(int32_t *)data; ··· 623 591 return -EINVAL; 624 592 625 593 if (flags & VFIO_IRQ_SET_DATA_NONE) { 626 - vfio_pci_intx_mask(vdev); 594 + __vfio_pci_intx_mask(vdev); 627 595 } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { 628 596 uint8_t mask = *(uint8_t *)data; 629 597 if (mask) 630 - 
vfio_pci_intx_mask(vdev); 598 + __vfio_pci_intx_mask(vdev); 631 599 } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { 632 600 return -ENOTTY; /* XXX implement me */ 633 601 } ··· 648 616 return -EINVAL; 649 617 650 618 if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { 619 + struct eventfd_ctx *trigger = NULL; 651 620 int32_t fd = *(int32_t *)data; 652 621 int ret; 653 622 623 + if (fd >= 0) { 624 + trigger = eventfd_ctx_fdget(fd); 625 + if (IS_ERR(trigger)) 626 + return PTR_ERR(trigger); 627 + } 628 + 654 629 if (is_intx(vdev)) 655 - return vfio_intx_set_signal(vdev, fd); 630 + ret = vfio_intx_set_signal(vdev, trigger); 631 + else 632 + ret = vfio_intx_enable(vdev, trigger); 656 633 657 - ret = vfio_intx_enable(vdev); 658 - if (ret) 659 - return ret; 660 - 661 - ret = vfio_intx_set_signal(vdev, fd); 662 - if (ret) 663 - vfio_intx_disable(vdev); 634 + if (ret && trigger) 635 + eventfd_ctx_put(trigger); 664 636 665 637 return ret; 666 638 }
+9 -7
drivers/vfio/pci/vfio_pci_rdwr.c
··· 96 96 * reads with -1. This is intended for handling MSI-X vector tables and 97 97 * leftover space for ROM BARs. 98 98 */ 99 - static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, 100 - void __iomem *io, char __user *buf, 101 - loff_t off, size_t count, size_t x_start, 102 - size_t x_end, bool iswrite) 99 + ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, 100 + void __iomem *io, char __user *buf, 101 + loff_t off, size_t count, size_t x_start, 102 + size_t x_end, bool iswrite) 103 103 { 104 104 ssize_t done = 0; 105 105 int ret; ··· 201 201 202 202 return done; 203 203 } 204 + EXPORT_SYMBOL_GPL(vfio_pci_core_do_io_rw); 204 205 205 206 int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar) 206 207 { ··· 280 279 x_end = vdev->msix_offset + vdev->msix_size; 281 280 } 282 281 283 - done = do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos, 284 - count, x_start, x_end, iswrite); 282 + done = vfio_pci_core_do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos, 283 + count, x_start, x_end, iswrite); 285 284 286 285 if (done >= 0) 287 286 *ppos += done; ··· 349 348 * probing, so we don't currently worry about access in relation 350 349 * to the memory enable bit in the command register. 351 350 */ 352 - done = do_io_rw(vdev, false, iomem, buf, off, count, 0, 0, iswrite); 351 + done = vfio_pci_core_do_io_rw(vdev, false, iomem, buf, off, count, 352 + 0, 0, iswrite); 353 353 354 354 vga_put(vdev->pdev, rsrc); 355 355
+26 -46
drivers/vfio/pci/virtio/main.c
··· 132 132 return ret ? ret : count; 133 133 } 134 134 135 - static bool range_intersect_range(loff_t range1_start, size_t count1, 136 - loff_t range2_start, size_t count2, 137 - loff_t *start_offset, 138 - size_t *intersect_count, 139 - size_t *register_offset) 140 - { 141 - if (range1_start <= range2_start && 142 - range1_start + count1 > range2_start) { 143 - *start_offset = range2_start - range1_start; 144 - *intersect_count = min_t(size_t, count2, 145 - range1_start + count1 - range2_start); 146 - *register_offset = 0; 147 - return true; 148 - } 149 - 150 - if (range1_start > range2_start && 151 - range1_start < range2_start + count2) { 152 - *start_offset = 0; 153 - *intersect_count = min_t(size_t, count1, 154 - range2_start + count2 - range1_start); 155 - *register_offset = range1_start - range2_start; 156 - return true; 157 - } 158 - 159 - return false; 160 - } 161 - 162 135 static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, 163 136 char __user *buf, size_t count, 164 137 loff_t *ppos) ··· 151 178 if (ret < 0) 152 179 return ret; 153 180 154 - if (range_intersect_range(pos, count, PCI_DEVICE_ID, sizeof(val16), 155 - &copy_offset, &copy_count, &register_offset)) { 181 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_DEVICE_ID, 182 + sizeof(val16), &copy_offset, 183 + &copy_count, &register_offset)) { 156 184 val16 = cpu_to_le16(VIRTIO_TRANS_ID_NET); 157 185 if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count)) 158 186 return -EFAULT; 159 187 } 160 188 161 189 if ((le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO) && 162 - range_intersect_range(pos, count, PCI_COMMAND, sizeof(val16), 163 - &copy_offset, &copy_count, &register_offset)) { 190 + vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND, 191 + sizeof(val16), &copy_offset, 192 + &copy_count, &register_offset)) { 164 193 if (copy_from_user((void *)&val16 + register_offset, buf + copy_offset, 165 194 copy_count)) 166 195 return -EFAULT; ··· 
172 197 return -EFAULT; 173 198 } 174 199 175 - if (range_intersect_range(pos, count, PCI_REVISION_ID, sizeof(val8), 176 - &copy_offset, &copy_count, &register_offset)) { 200 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_REVISION_ID, 201 + sizeof(val8), &copy_offset, 202 + &copy_count, &register_offset)) { 177 203 /* Transional needs to have revision 0 */ 178 204 val8 = 0; 179 205 if (copy_to_user(buf + copy_offset, &val8, copy_count)) 180 206 return -EFAULT; 181 207 } 182 208 183 - if (range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, sizeof(val32), 184 - &copy_offset, &copy_count, &register_offset)) { 209 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, 210 + sizeof(val32), &copy_offset, 211 + &copy_count, &register_offset)) { 185 212 u32 bar_mask = ~(virtvdev->bar0_virtual_buf_size - 1); 186 213 u32 pci_base_addr_0 = le32_to_cpu(virtvdev->pci_base_addr_0); 187 214 ··· 192 215 return -EFAULT; 193 216 } 194 217 195 - if (range_intersect_range(pos, count, PCI_SUBSYSTEM_ID, sizeof(val16), 196 - &copy_offset, &copy_count, &register_offset)) { 218 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_ID, 219 + sizeof(val16), &copy_offset, 220 + &copy_count, &register_offset)) { 197 221 /* 198 222 * Transitional devices use the PCI subsystem device id as 199 223 * virtio device id, same as legacy driver always did. 
··· 205 227 return -EFAULT; 206 228 } 207 229 208 - if (range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID, sizeof(val16), 209 - &copy_offset, &copy_count, &register_offset)) { 230 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID, 231 + sizeof(val16), &copy_offset, 232 + &copy_count, &register_offset)) { 210 233 val16 = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET); 211 234 if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, 212 235 copy_count)) ··· 249 270 loff_t copy_offset; 250 271 size_t copy_count; 251 272 252 - if (range_intersect_range(pos, count, PCI_COMMAND, sizeof(virtvdev->pci_cmd), 253 - &copy_offset, &copy_count, 254 - &register_offset)) { 273 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND, 274 + sizeof(virtvdev->pci_cmd), 275 + &copy_offset, &copy_count, 276 + &register_offset)) { 255 277 if (copy_from_user((void *)&virtvdev->pci_cmd + register_offset, 256 278 buf + copy_offset, 257 279 copy_count)) 258 280 return -EFAULT; 259 281 } 260 282 261 - if (range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, 262 - sizeof(virtvdev->pci_base_addr_0), 263 - &copy_offset, &copy_count, 264 - &register_offset)) { 283 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, 284 + sizeof(virtvdev->pci_base_addr_0), 285 + &copy_offset, &copy_count, 286 + &register_offset)) { 265 287 if (copy_from_user((void *)&virtvdev->pci_base_addr_0 + register_offset, 266 288 buf + copy_offset, 267 289 copy_count))
+3 -3
drivers/vfio/platform/vfio_amba.c
··· 122 122 .detach_ioas = vfio_iommufd_physical_detach_ioas, 123 123 }; 124 124 125 - static const struct amba_id pl330_ids[] = { 125 + static const struct amba_id vfio_amba_ids[] = { 126 126 { 0, 0 }, 127 127 }; 128 128 129 - MODULE_DEVICE_TABLE(amba, pl330_ids); 129 + MODULE_DEVICE_TABLE(amba, vfio_amba_ids); 130 130 131 131 static struct amba_driver vfio_amba_driver = { 132 132 .probe = vfio_amba_probe, 133 133 .remove = vfio_amba_remove, 134 - .id_table = pl330_ids, 134 + .id_table = vfio_amba_ids, 135 135 .drv = { 136 136 .name = "vfio-amba", 137 137 .owner = THIS_MODULE,
+2 -3
drivers/vfio/platform/vfio_platform.c
··· 85 85 vfio_platform_release_common(vdev); 86 86 } 87 87 88 - static int vfio_platform_remove(struct platform_device *pdev) 88 + static void vfio_platform_remove(struct platform_device *pdev) 89 89 { 90 90 struct vfio_platform_device *vdev = dev_get_drvdata(&pdev->dev); 91 91 92 92 vfio_unregister_group_dev(&vdev->vdev); 93 93 pm_runtime_disable(vdev->device); 94 94 vfio_put_device(&vdev->vdev); 95 - return 0; 96 95 } 97 96 98 97 static const struct vfio_device_ops vfio_platform_ops = { ··· 112 113 113 114 static struct platform_driver vfio_platform_driver = { 114 115 .probe = vfio_platform_probe, 115 - .remove = vfio_platform_remove, 116 + .remove_new = vfio_platform_remove, 116 117 .driver = { 117 118 .name = "vfio-platform", 118 119 },
+72 -33
drivers/vfio/platform/vfio_platform_irq.c
··· 136 136 return 0; 137 137 } 138 138 139 + /* 140 + * The trigger eventfd is guaranteed valid in the interrupt path 141 + * and protected by the igate mutex when triggered via ioctl. 142 + */ 143 + static void vfio_send_eventfd(struct vfio_platform_irq *irq_ctx) 144 + { 145 + if (likely(irq_ctx->trigger)) 146 + eventfd_signal(irq_ctx->trigger); 147 + } 148 + 139 149 static irqreturn_t vfio_automasked_irq_handler(int irq, void *dev_id) 140 150 { 141 151 struct vfio_platform_irq *irq_ctx = dev_id; ··· 165 155 spin_unlock_irqrestore(&irq_ctx->lock, flags); 166 156 167 157 if (ret == IRQ_HANDLED) 168 - eventfd_signal(irq_ctx->trigger); 158 + vfio_send_eventfd(irq_ctx); 169 159 170 160 return ret; 171 161 } ··· 174 164 { 175 165 struct vfio_platform_irq *irq_ctx = dev_id; 176 166 177 - eventfd_signal(irq_ctx->trigger); 167 + vfio_send_eventfd(irq_ctx); 178 168 179 169 return IRQ_HANDLED; 180 170 } 181 171 182 172 static int vfio_set_trigger(struct vfio_platform_device *vdev, int index, 183 - int fd, irq_handler_t handler) 173 + int fd) 184 174 { 185 175 struct vfio_platform_irq *irq = &vdev->irqs[index]; 186 176 struct eventfd_ctx *trigger; 187 - int ret; 188 177 189 178 if (irq->trigger) { 190 - irq_clear_status_flags(irq->hwirq, IRQ_NOAUTOEN); 191 - free_irq(irq->hwirq, irq); 192 - kfree(irq->name); 179 + disable_irq(irq->hwirq); 193 180 eventfd_ctx_put(irq->trigger); 194 181 irq->trigger = NULL; 195 182 } 196 183 197 184 if (fd < 0) /* Disable only */ 198 185 return 0; 199 - irq->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-irq[%d](%s)", 200 - irq->hwirq, vdev->name); 201 - if (!irq->name) 202 - return -ENOMEM; 203 186 204 187 trigger = eventfd_ctx_fdget(fd); 205 - if (IS_ERR(trigger)) { 206 - kfree(irq->name); 188 + if (IS_ERR(trigger)) 207 189 return PTR_ERR(trigger); 208 - } 209 190 210 191 irq->trigger = trigger; 211 192 212 - irq_set_status_flags(irq->hwirq, IRQ_NOAUTOEN); 213 - ret = request_irq(irq->hwirq, handler, 0, irq->name, irq); 214 - if (ret) { 215 - 
kfree(irq->name); 216 - eventfd_ctx_put(trigger); 217 - irq->trigger = NULL; 218 - return ret; 219 - } 220 - 221 - if (!irq->masked) 222 - enable_irq(irq->hwirq); 193 + /* 194 + * irq->masked effectively provides nested disables within the overall 195 + * enable relative to trigger. Specifically request_irq() is called 196 + * with NO_AUTOEN, therefore the IRQ is initially disabled. The user 197 + * may only further disable the IRQ with a MASK operations because 198 + * irq->masked is initially false. 199 + */ 200 + enable_irq(irq->hwirq); 223 201 224 202 return 0; 225 203 } ··· 226 228 handler = vfio_irq_handler; 227 229 228 230 if (!count && (flags & VFIO_IRQ_SET_DATA_NONE)) 229 - return vfio_set_trigger(vdev, index, -1, handler); 231 + return vfio_set_trigger(vdev, index, -1); 230 232 231 233 if (start != 0 || count != 1) 232 234 return -EINVAL; ··· 234 236 if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { 235 237 int32_t fd = *(int32_t *)data; 236 238 237 - return vfio_set_trigger(vdev, index, fd, handler); 239 + return vfio_set_trigger(vdev, index, fd); 238 240 } 239 241 240 242 if (flags & VFIO_IRQ_SET_DATA_NONE) { ··· 258 260 unsigned start, unsigned count, uint32_t flags, 259 261 void *data) = NULL; 260 262 263 + /* 264 + * For compatibility, errors from request_irq() are local to the 265 + * SET_IRQS path and reflected in the name pointer. This allows, 266 + * for example, polling mode fallback for an exclusive IRQ failure. 
267 + */ 268 + if (IS_ERR(vdev->irqs[index].name)) 269 + return PTR_ERR(vdev->irqs[index].name); 270 + 261 271 switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { 262 272 case VFIO_IRQ_SET_ACTION_MASK: 263 273 func = vfio_platform_set_irq_mask; ··· 286 280 287 281 int vfio_platform_irq_init(struct vfio_platform_device *vdev) 288 282 { 289 - int cnt = 0, i; 283 + int cnt = 0, i, ret = 0; 290 284 291 285 while (vdev->get_irq(vdev, cnt) >= 0) 292 286 cnt++; ··· 298 292 299 293 for (i = 0; i < cnt; i++) { 300 294 int hwirq = vdev->get_irq(vdev, i); 295 + irq_handler_t handler = vfio_irq_handler; 301 296 302 - if (hwirq < 0) 297 + if (hwirq < 0) { 298 + ret = -EINVAL; 303 299 goto err; 300 + } 304 301 305 302 spin_lock_init(&vdev->irqs[i].lock); 306 303 307 304 vdev->irqs[i].flags = VFIO_IRQ_INFO_EVENTFD; 308 305 309 - if (irq_get_trigger_type(hwirq) & IRQ_TYPE_LEVEL_MASK) 306 + if (irq_get_trigger_type(hwirq) & IRQ_TYPE_LEVEL_MASK) { 310 307 vdev->irqs[i].flags |= VFIO_IRQ_INFO_MASKABLE 311 308 | VFIO_IRQ_INFO_AUTOMASKED; 309 + handler = vfio_automasked_irq_handler; 310 + } 312 311 313 312 vdev->irqs[i].count = 1; 314 313 vdev->irqs[i].hwirq = hwirq; 315 314 vdev->irqs[i].masked = false; 315 + vdev->irqs[i].name = kasprintf(GFP_KERNEL_ACCOUNT, 316 + "vfio-irq[%d](%s)", hwirq, 317 + vdev->name); 318 + if (!vdev->irqs[i].name) { 319 + ret = -ENOMEM; 320 + goto err; 321 + } 322 + 323 + ret = request_irq(hwirq, handler, IRQF_NO_AUTOEN, 324 + vdev->irqs[i].name, &vdev->irqs[i]); 325 + if (ret) { 326 + kfree(vdev->irqs[i].name); 327 + vdev->irqs[i].name = ERR_PTR(ret); 328 + } 316 329 } 317 330 318 331 vdev->num_irqs = cnt; 319 332 320 333 return 0; 321 334 err: 335 + for (--i; i >= 0; i--) { 336 + if (!IS_ERR(vdev->irqs[i].name)) { 337 + free_irq(vdev->irqs[i].hwirq, &vdev->irqs[i]); 338 + kfree(vdev->irqs[i].name); 339 + } 340 + } 322 341 kfree(vdev->irqs); 323 - return -EINVAL; 342 + return ret; 324 343 } 325 344 326 345 void vfio_platform_irq_cleanup(struct 
vfio_platform_device *vdev) 327 346 { 328 347 int i; 329 348 330 - for (i = 0; i < vdev->num_irqs; i++) 331 - vfio_set_trigger(vdev, i, -1, NULL); 349 + for (i = 0; i < vdev->num_irqs; i++) { 350 + vfio_virqfd_disable(&vdev->irqs[i].mask); 351 + vfio_virqfd_disable(&vdev->irqs[i].unmask); 352 + if (!IS_ERR(vdev->irqs[i].name)) { 353 + free_irq(vdev->irqs[i].hwirq, &vdev->irqs[i]); 354 + if (vdev->irqs[i].trigger) 355 + eventfd_ctx_put(vdev->irqs[i].trigger); 356 + kfree(vdev->irqs[i].name); 357 + } 358 + } 332 359 333 360 vdev->num_irqs = 0; 334 361 kfree(vdev->irqs);
-12
drivers/vfio/vfio_iommu_type1.c
··· 567 567 ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM, 568 568 pages, NULL); 569 569 if (ret > 0) { 570 - int i; 571 - 572 - /* 573 - * The zero page is always resident, we don't need to pin it 574 - * and it falls into our invalid/reserved test so we don't 575 - * unpin in put_pfn(). Unpin all zero pages in the batch here. 576 - */ 577 - for (i = 0 ; i < ret; i++) { 578 - if (unlikely(is_zero_pfn(page_to_pfn(pages[i])))) 579 - unpin_user_page(pages[i]); 580 - } 581 - 582 570 *pfn = page_to_pfn(pages[0]); 583 571 goto done; 584 572 }
+21
drivers/vfio/virqfd.c
··· 101 101 virqfd->thread(virqfd->opaque, virqfd->data); 102 102 } 103 103 104 + static void virqfd_flush_inject(struct work_struct *work) 105 + { 106 + struct virqfd *virqfd = container_of(work, struct virqfd, flush_inject); 107 + 108 + flush_work(&virqfd->inject); 109 + } 110 + 104 111 int vfio_virqfd_enable(void *opaque, 105 112 int (*handler)(void *, void *), 106 113 void (*thread)(void *, void *), ··· 131 124 132 125 INIT_WORK(&virqfd->shutdown, virqfd_shutdown); 133 126 INIT_WORK(&virqfd->inject, virqfd_inject); 127 + INIT_WORK(&virqfd->flush_inject, virqfd_flush_inject); 134 128 135 129 irqfd = fdget(fd); 136 130 if (!irqfd.file) { ··· 221 213 flush_workqueue(vfio_irqfd_cleanup_wq); 222 214 } 223 215 EXPORT_SYMBOL_GPL(vfio_virqfd_disable); 216 + 217 + void vfio_virqfd_flush_thread(struct virqfd **pvirqfd) 218 + { 219 + unsigned long flags; 220 + 221 + spin_lock_irqsave(&virqfd_lock, flags); 222 + if (*pvirqfd && (*pvirqfd)->thread) 223 + queue_work(vfio_irqfd_cleanup_wq, &(*pvirqfd)->flush_inject); 224 + spin_unlock_irqrestore(&virqfd_lock, flags); 225 + 226 + flush_workqueue(vfio_irqfd_cleanup_wq); 227 + } 228 + EXPORT_SYMBOL_GPL(vfio_virqfd_flush_thread);
+5
include/linux/mlx5/mlx5_ifc.h
··· 12677 12677 struct mlx5_ifc_page_track_bits obj_context; 12678 12678 }; 12679 12679 12680 + struct mlx5_ifc_query_page_track_obj_out_bits { 12681 + struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; 12682 + struct mlx5_ifc_page_track_bits obj_context; 12683 + }; 12684 + 12680 12685 struct mlx5_ifc_msecq_reg_bits { 12681 12686 u8 reserved_at_0[0x20]; 12682 12687
+2
include/linux/vfio.h
··· 356 356 wait_queue_entry_t wait; 357 357 poll_table pt; 358 358 struct work_struct shutdown; 359 + struct work_struct flush_inject; 359 360 struct virqfd **pvirqfd; 360 361 }; 361 362 ··· 364 363 void (*thread)(void *, void *), void *data, 365 364 struct virqfd **pvirqfd, int fd); 366 365 void vfio_virqfd_disable(struct virqfd **pvirqfd); 366 + void vfio_virqfd_flush_thread(struct virqfd **pvirqfd); 367 367 368 368 #endif /* VFIO_H */
+9 -1
include/linux/vfio_pci_core.h
··· 130 130 int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar); 131 131 pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, 132 132 pci_channel_state_t state); 133 - 133 + ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, 134 + void __iomem *io, char __user *buf, 135 + loff_t off, size_t count, size_t x_start, 136 + size_t x_end, bool iswrite); 137 + bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, 138 + loff_t reg_start, size_t reg_cnt, 139 + loff_t *buf_offset, 140 + size_t *intersect_count, 141 + size_t *register_offset); 134 142 #define VFIO_IOWRITE_DECLATION(size) \ 135 143 int vfio_pci_core_iowrite##size(struct vfio_pci_core_device *vdev, \ 136 144 bool test_mem, u##size val, void __iomem *io);
+8 -10
samples/vfio-mdev/mbochs.c
··· 133 133 }; 134 134 135 135 static dev_t mbochs_devt; 136 - static struct class *mbochs_class; 136 + static const struct class mbochs_class = { 137 + .name = MBOCHS_CLASS_NAME, 138 + }; 137 139 static struct cdev mbochs_cdev; 138 140 static struct device mbochs_dev; 139 141 static struct mdev_parent mbochs_parent; ··· 1424 1422 if (ret) 1425 1423 goto err_cdev; 1426 1424 1427 - mbochs_class = class_create(MBOCHS_CLASS_NAME); 1428 - if (IS_ERR(mbochs_class)) { 1429 - pr_err("Error: failed to register mbochs_dev class\n"); 1430 - ret = PTR_ERR(mbochs_class); 1425 + ret = class_register(&mbochs_class); 1426 + if (ret) 1431 1427 goto err_driver; 1432 - } 1433 - mbochs_dev.class = mbochs_class; 1428 + mbochs_dev.class = &mbochs_class; 1434 1429 mbochs_dev.release = mbochs_device_release; 1435 1430 dev_set_name(&mbochs_dev, "%s", MBOCHS_NAME); 1436 1431 ··· 1447 1448 device_del(&mbochs_dev); 1448 1449 err_put: 1449 1450 put_device(&mbochs_dev); 1450 - class_destroy(mbochs_class); 1451 + class_unregister(&mbochs_class); 1451 1452 err_driver: 1452 1453 mdev_unregister_driver(&mbochs_driver); 1453 1454 err_cdev: ··· 1465 1466 mdev_unregister_driver(&mbochs_driver); 1466 1467 cdev_del(&mbochs_cdev); 1467 1468 unregister_chrdev_region(mbochs_devt, MINORMASK + 1); 1468 - class_destroy(mbochs_class); 1469 - mbochs_class = NULL; 1469 + class_unregister(&mbochs_class); 1470 1470 } 1471 1471 1472 1472 MODULE_IMPORT_NS(DMA_BUF);
+8 -10
samples/vfio-mdev/mdpy.c
··· 84 84 }; 85 85 86 86 static dev_t mdpy_devt; 87 - static struct class *mdpy_class; 87 + static const struct class mdpy_class = { 88 + .name = MDPY_CLASS_NAME, 89 + }; 88 90 static struct cdev mdpy_cdev; 89 91 static struct device mdpy_dev; 90 92 static struct mdev_parent mdpy_parent; ··· 711 709 if (ret) 712 710 goto err_cdev; 713 711 714 - mdpy_class = class_create(MDPY_CLASS_NAME); 715 - if (IS_ERR(mdpy_class)) { 716 - pr_err("Error: failed to register mdpy_dev class\n"); 717 - ret = PTR_ERR(mdpy_class); 712 + ret = class_register(&mdpy_class); 713 + if (ret) 718 714 goto err_driver; 719 - } 720 - mdpy_dev.class = mdpy_class; 715 + mdpy_dev.class = &mdpy_class; 721 716 mdpy_dev.release = mdpy_device_release; 722 717 dev_set_name(&mdpy_dev, "%s", MDPY_NAME); 723 718 ··· 734 735 device_del(&mdpy_dev); 735 736 err_put: 736 737 put_device(&mdpy_dev); 737 - class_destroy(mdpy_class); 738 + class_unregister(&mdpy_class); 738 739 err_driver: 739 740 mdev_unregister_driver(&mdpy_driver); 740 741 err_cdev: ··· 752 753 mdev_unregister_driver(&mdpy_driver); 753 754 cdev_del(&mdpy_cdev); 754 755 unregister_chrdev_region(mdpy_devt, MINORMASK + 1); 755 - class_destroy(mdpy_class); 756 - mdpy_class = NULL; 756 + class_unregister(&mdpy_class); 757 757 } 758 758 759 759 module_param_named(count, mdpy_driver.max_instances, int, 0444);