Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'vfio-v6.9-rc1' of https://github.com/awilliam/linux-vfio

Pull VFIO updates from Alex Williamson:

- Add warning in unlikely case that device is not captured with
driver_override (Kunwu Chan)

- Error handling improvements in mlx5-vfio-pci to detect firmware
tracking object error states, logging of firmware error syndrome, and
releasing of firmware resources in aborted migration sequence (Yishai
Hadas)

- Correct an un-alphabetized VFIO MAINTAINERS entry (Alex Williamson)

- Make the mdev_bus_type const and also make the class struct const for
a couple of the vfio-mdev sample drivers (Ricardo B. Marliere)

- Addition of a new vfio-pci variant driver for the GPU of NVIDIA's
Grace-Hopper superchip. During initialization of the chip-to-chip
interconnect in this hardware module, the PCI BARs of the device
become unused in favor of a faster, coherent mechanism for exposing
device memory. This driver primarily changes the VFIO representation
of the device to masquerade this coherent aperture to replace the
physical PCI BARs for userspace drivers. This also incorporates use
of a new vma flag allowing KVM to use write combining attributes for
uncached device memory (Ankit Agrawal)

- Reset fixes and cleanups for the pds-vfio-pci driver. Save and
restore files were previously leaked if the device didn't pass
through an error state, this is resolved and later re-fixed to
prevent access to the now freed files. Reset handling is also
refactored to remove the complicated deferred reset mechanism (Brett
Creeley)

- Remove some references to pl330 in the vfio-platform amba driver
(Geert Uytterhoeven)

- Remove twice redundant and ugly code to unpin incidental pins of the
zero-page (Alex Williamson)

- Deferred reset logic is also removed from the hisi-acc-vfio-pci
driver as a simplification (Shameer Kolothum)

- Enforce that mlx5-vfio-pci devices must support PRE_COPY and remove
resulting unnecessary code. There is no device firmware that has been
available publicly without this support (Yishai Hadas)

- Switch over to using the .remove_new callback for vfio-platform in
support of the broader transition for a void remove function (Uwe
Kleine-König)

- Resolve multiple issues in interrupt code for VFIO bus drivers that
allow calling eventfd_signal() on a NULL context. This also removes a
potential race in INTx setup on certain hardware for vfio-pci, races
with various mechanisms to mask INTx, and leaked virqfds in
vfio-platform (Alex Williamson)

* tag 'vfio-v6.9-rc1' of https://github.com/awilliam/linux-vfio: (29 commits)
vfio/fsl-mc: Block calling interrupt handler without trigger
vfio/platform: Create persistent IRQ handlers
vfio/platform: Disable virqfds on cleanup
vfio/pci: Create persistent INTx handler
vfio: Introduce interface to flush virqfd inject workqueue
vfio/pci: Lock external INTx masking ops
vfio/pci: Disable auto-enable of exclusive INTx IRQ
vfio/pds: Refactor/simplify reset logic
vfio/pds: Make sure migration file isn't accessed after reset
vfio/platform: Convert to platform remove callback returning void
vfio/mlx5: Enforce PRE_COPY support
vfio/mbochs: make mbochs_class constant
vfio/mdpy: make mdpy_class constant
hisi_acc_vfio_pci: Remove the deferred_reset logic
Revert "vfio/type1: Unpin zero pages"
vfio/nvgrace-gpu: Convey kvm to map device memory region as noncached
vfio: amba: Rename pl330_ids[] to vfio_amba_ids[]
vfio/pds: Always clear the save/restore FDs on reset
vfio/nvgrace-gpu: Add vfio pci variant module for grace hopper
vfio/pci: rename and export range_intersect_range
...

+1461 -454
+11 -5
MAINTAINERS
··· 23164 23164 S: Maintained 23165 23165 F: drivers/vfio/pci/mlx5/ 23166 23166 23167 - VFIO VIRTIO PCI DRIVER 23168 - M: Yishai Hadas <yishaih@nvidia.com> 23167 + VFIO NVIDIA GRACE GPU DRIVER 23168 + M: Ankit Agrawal <ankita@nvidia.com> 23169 23169 L: kvm@vger.kernel.org 23170 - L: virtualization@lists.linux.dev 23171 - S: Maintained 23172 - F: drivers/vfio/pci/virtio 23170 + S: Supported 23171 + F: drivers/vfio/pci/nvgrace-gpu/ 23173 23172 23174 23173 VFIO PCI DEVICE SPECIFIC DRIVERS 23175 23174 R: Jason Gunthorpe <jgg@nvidia.com> ··· 23192 23193 L: kvm@vger.kernel.org 23193 23194 S: Maintained 23194 23195 F: drivers/vfio/platform/ 23196 + 23197 + VFIO VIRTIO PCI DRIVER 23198 + M: Yishai Hadas <yishaih@nvidia.com> 23199 + L: kvm@vger.kernel.org 23200 + L: virtualization@lists.linux.dev 23201 + S: Maintained 23202 + F: drivers/vfio/pci/virtio 23195 23203 23196 23204 VGA_SWITCHEROO 23197 23205 R: Lukas Wunner <lukas@wunner.de>
+4 -3
drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
··· 141 141 irq = &vdev->mc_irqs[index]; 142 142 143 143 if (flags & VFIO_IRQ_SET_DATA_NONE) { 144 - vfio_fsl_mc_irq_handler(hwirq, irq); 144 + if (irq->trigger) 145 + eventfd_signal(irq->trigger); 145 146 146 147 } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { 147 148 u8 trigger = *(u8 *)data; 148 149 149 - if (trigger) 150 - vfio_fsl_mc_irq_handler(hwirq, irq); 150 + if (trigger && irq->trigger) 151 + eventfd_signal(irq->trigger); 151 152 } 152 153 153 154 return 0;
+1 -1
drivers/vfio/mdev/mdev_driver.c
··· 40 40 return 0; 41 41 } 42 42 43 - struct bus_type mdev_bus_type = { 43 + const struct bus_type mdev_bus_type = { 44 44 .name = "mdev", 45 45 .probe = mdev_probe, 46 46 .remove = mdev_remove,
+1 -1
drivers/vfio/mdev/mdev_private.h
··· 13 13 int mdev_bus_register(void); 14 14 void mdev_bus_unregister(void); 15 15 16 - extern struct bus_type mdev_bus_type; 16 + extern const struct bus_type mdev_bus_type; 17 17 extern const struct attribute_group *mdev_device_groups[]; 18 18 19 19 #define to_mdev_type_attr(_attr) \
+2
drivers/vfio/pci/Kconfig
··· 67 67 68 68 source "drivers/vfio/pci/virtio/Kconfig" 69 69 70 + source "drivers/vfio/pci/nvgrace-gpu/Kconfig" 71 + 70 72 endmenu
+2
drivers/vfio/pci/Makefile
··· 15 15 obj-$(CONFIG_PDS_VFIO_PCI) += pds/ 16 16 17 17 obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/ 18 + 19 + obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/
+12 -36
drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
··· 630 630 } 631 631 } 632 632 633 - /* 634 - * This function is called in all state_mutex unlock cases to 635 - * handle a 'deferred_reset' if exists. 636 - */ 637 - static void 638 - hisi_acc_vf_state_mutex_unlock(struct hisi_acc_vf_core_device *hisi_acc_vdev) 633 + static void hisi_acc_vf_reset(struct hisi_acc_vf_core_device *hisi_acc_vdev) 639 634 { 640 - again: 641 - spin_lock(&hisi_acc_vdev->reset_lock); 642 - if (hisi_acc_vdev->deferred_reset) { 643 - hisi_acc_vdev->deferred_reset = false; 644 - spin_unlock(&hisi_acc_vdev->reset_lock); 645 - hisi_acc_vdev->vf_qm_state = QM_NOT_READY; 646 - hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 647 - hisi_acc_vf_disable_fds(hisi_acc_vdev); 648 - goto again; 649 - } 650 - mutex_unlock(&hisi_acc_vdev->state_mutex); 651 - spin_unlock(&hisi_acc_vdev->reset_lock); 635 + hisi_acc_vdev->vf_qm_state = QM_NOT_READY; 636 + hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 637 + hisi_acc_vf_disable_fds(hisi_acc_vdev); 652 638 } 653 639 654 640 static void hisi_acc_vf_start_device(struct hisi_acc_vf_core_device *hisi_acc_vdev) ··· 790 804 791 805 info.dirty_bytes = 0; 792 806 info.initial_bytes = migf->total_length - *pos; 807 + mutex_unlock(&migf->lock); 808 + mutex_unlock(&hisi_acc_vdev->state_mutex); 793 809 794 - ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; 810 + return copy_to_user((void __user *)arg, &info, minsz) ? 
-EFAULT : 0; 795 811 out: 796 812 mutex_unlock(&migf->lock); 797 813 mutex_unlock(&hisi_acc_vdev->state_mutex); ··· 1059 1071 break; 1060 1072 } 1061 1073 } 1062 - hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev); 1074 + mutex_unlock(&hisi_acc_vdev->state_mutex); 1063 1075 return res; 1064 1076 } 1065 1077 ··· 1080 1092 1081 1093 mutex_lock(&hisi_acc_vdev->state_mutex); 1082 1094 *curr_state = hisi_acc_vdev->mig_state; 1083 - hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev); 1095 + mutex_unlock(&hisi_acc_vdev->state_mutex); 1084 1096 return 0; 1085 1097 } 1086 1098 ··· 1092 1104 VFIO_MIGRATION_STOP_COPY) 1093 1105 return; 1094 1106 1095 - /* 1096 - * As the higher VFIO layers are holding locks across reset and using 1097 - * those same locks with the mm_lock we need to prevent ABBA deadlock 1098 - * with the state_mutex and mm_lock. 1099 - * In case the state_mutex was taken already we defer the cleanup work 1100 - * to the unlock flow of the other running context. 1101 - */ 1102 - spin_lock(&hisi_acc_vdev->reset_lock); 1103 - hisi_acc_vdev->deferred_reset = true; 1104 - if (!mutex_trylock(&hisi_acc_vdev->state_mutex)) { 1105 - spin_unlock(&hisi_acc_vdev->reset_lock); 1106 - return; 1107 - } 1108 - spin_unlock(&hisi_acc_vdev->reset_lock); 1109 - hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev); 1107 + mutex_lock(&hisi_acc_vdev->state_mutex); 1108 + hisi_acc_vf_reset(hisi_acc_vdev); 1109 + mutex_unlock(&hisi_acc_vdev->state_mutex); 1110 1110 } 1111 1111 1112 1112 static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+2 -4
drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
··· 98 98 99 99 struct hisi_acc_vf_core_device { 100 100 struct vfio_pci_core_device core_device; 101 - u8 match_done:1; 102 - u8 deferred_reset:1; 101 + u8 match_done; 102 + 103 103 /* For migration state */ 104 104 struct mutex state_mutex; 105 105 enum vfio_device_mig_state mig_state; ··· 109 109 struct hisi_qm vf_qm; 110 110 u32 vf_qm_state; 111 111 int vf_id; 112 - /* For reset handler */ 113 - spinlock_t reset_lock; 114 112 struct hisi_acc_vf_migration_file *resuming_migf; 115 113 struct hisi_acc_vf_migration_file *saving_migf; 116 114 };
+132 -25
drivers/vfio/pci/mlx5/cmd.c
··· 108 108 ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); 109 109 if (ret) 110 110 return ret; 111 - if (mvdev->saving_migf->state == 112 - MLX5_MIGF_STATE_PRE_COPY_ERROR) { 111 + /* Upon cleanup, ignore previous pre_copy error state */ 112 + if (mvdev->saving_migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR && 113 + !(query_flags & MLX5VF_QUERY_CLEANUP)) { 113 114 /* 114 115 * In case we had a PRE_COPY error, only query full 115 116 * image for final image ··· 121 120 return 0; 122 121 } 123 122 query_flags &= ~MLX5VF_QUERY_INC; 123 + } 124 + /* Block incremental query which is state-dependent */ 125 + if (mvdev->saving_migf->state == MLX5_MIGF_STATE_ERROR) { 126 + complete(&mvdev->saving_migf->save_comp); 127 + return -ENODEV; 124 128 } 125 129 } 126 130 ··· 153 147 remaining_total_size) : *state_size; 154 148 155 149 return 0; 150 + } 151 + 152 + static void set_tracker_change_event(struct mlx5vf_pci_core_device *mvdev) 153 + { 154 + mvdev->tracker.object_changed = true; 155 + complete(&mvdev->tracker_comp); 156 156 } 157 157 158 158 static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev) ··· 201 189 /* Must be done outside the lock to let it progress */ 202 190 set_tracker_error(mvdev); 203 191 mutex_lock(&mvdev->state_mutex); 204 - mlx5vf_disable_fds(mvdev); 192 + mlx5vf_disable_fds(mvdev, NULL); 205 193 _mlx5vf_free_page_tracker_resources(mvdev); 206 194 mlx5vf_state_mutex_unlock(mvdev); 207 195 } ··· 233 221 if (!MLX5_CAP_GEN(mvdev->mdev, migration)) 234 222 goto end; 235 223 224 + if (!(MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) && 225 + MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))) 226 + goto end; 227 + 236 228 mvdev->vf_id = pci_iov_vf_id(pdev); 237 229 if (mvdev->vf_id < 0) 238 230 goto end; ··· 266 250 mvdev->migrate_cap = 1; 267 251 mvdev->core_device.vdev.migration_flags = 268 252 VFIO_MIGRATION_STOP_COPY | 269 - VFIO_MIGRATION_P2P; 253 + VFIO_MIGRATION_P2P | 254 + VFIO_MIGRATION_PRE_COPY; 255 + 270 
256 mvdev->core_device.vdev.mig_ops = mig_ops; 271 257 init_completion(&mvdev->tracker_comp); 272 258 if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization)) 273 259 mvdev->core_device.vdev.log_ops = log_ops; 274 - 275 - if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) && 276 - MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)) 277 - mvdev->core_device.vdev.migration_flags |= 278 - VFIO_MIGRATION_PRE_COPY; 279 260 280 261 if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks)) 281 262 mvdev->chunk_mode = 1; ··· 413 400 __free_page(sg_page_iter_page(&sg_iter)); 414 401 sg_free_append_table(&buf->table); 415 402 kfree(buf); 403 + } 404 + 405 + static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, 406 + unsigned int npages) 407 + { 408 + unsigned int to_alloc = npages; 409 + struct page **page_list; 410 + unsigned long filled; 411 + unsigned int to_fill; 412 + int ret; 413 + 414 + to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); 415 + page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT); 416 + if (!page_list) 417 + return -ENOMEM; 418 + 419 + do { 420 + filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, 421 + page_list); 422 + if (!filled) { 423 + ret = -ENOMEM; 424 + goto err; 425 + } 426 + to_alloc -= filled; 427 + ret = sg_alloc_append_table_from_pages( 428 + &buf->table, page_list, filled, 0, 429 + filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, 430 + GFP_KERNEL_ACCOUNT); 431 + 432 + if (ret) 433 + goto err; 434 + buf->allocated_length += filled * PAGE_SIZE; 435 + /* clean input for another bulk allocation */ 436 + memset(page_list, 0, filled * sizeof(*page_list)); 437 + to_fill = min_t(unsigned int, to_alloc, 438 + PAGE_SIZE / sizeof(*page_list)); 439 + } while (to_alloc > 0); 440 + 441 + kvfree(page_list); 442 + return 0; 443 + 444 + err: 445 + kvfree(page_list); 446 + return ret; 416 447 } 417 448 418 449 struct mlx5_vhca_data_buffer * ··· 665 608 666 609 err: 667 610 /* The error flow 
can't run from an interrupt context */ 668 - if (status == -EREMOTEIO) 611 + if (status == -EREMOTEIO) { 669 612 status = MLX5_GET(save_vhca_state_out, async_data->out, status); 613 + /* Failed in FW, print cmd out failure details */ 614 + mlx5_cmd_out_err(migf->mvdev->mdev, MLX5_CMD_OP_SAVE_VHCA_STATE, 0, 615 + async_data->out); 616 + } 617 + 670 618 async_data->status = status; 671 619 queue_work(migf->mvdev->cb_wq, &async_data->work); 672 620 } ··· 685 623 u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; 686 624 struct mlx5_vhca_data_buffer *header_buf = NULL; 687 625 struct mlx5vf_async_data *async_data; 626 + bool pre_copy_cleanup = false; 688 627 int err; 689 628 690 629 lockdep_assert_held(&mvdev->state_mutex); ··· 695 632 err = wait_for_completion_interruptible(&migf->save_comp); 696 633 if (err) 697 634 return err; 635 + 636 + if ((migf->state == MLX5_MIGF_STATE_PRE_COPY || 637 + migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) && !track && !inc) 638 + pre_copy_cleanup = true; 698 639 699 640 if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) 700 641 /* ··· 718 651 719 652 async_data = &migf->async_data; 720 653 async_data->buf = buf; 721 - async_data->stop_copy_chunk = !track; 654 + async_data->stop_copy_chunk = (!track && !pre_copy_cleanup); 722 655 async_data->out = kvzalloc(out_size, GFP_KERNEL); 723 656 if (!async_data->out) { 724 657 err = -ENOMEM; 725 658 goto err_out; 726 659 } 727 660 728 - if (MLX5VF_PRE_COPY_SUPP(mvdev)) { 729 - if (async_data->stop_copy_chunk) { 730 - u8 header_idx = buf->stop_copy_chunk_num ? 731 - buf->stop_copy_chunk_num - 1 : 0; 661 + if (async_data->stop_copy_chunk) { 662 + u8 header_idx = buf->stop_copy_chunk_num ? 
663 + buf->stop_copy_chunk_num - 1 : 0; 732 664 733 - header_buf = migf->buf_header[header_idx]; 734 - migf->buf_header[header_idx] = NULL; 735 - } 665 + header_buf = migf->buf_header[header_idx]; 666 + migf->buf_header[header_idx] = NULL; 667 + } 736 668 737 - if (!header_buf) { 738 - header_buf = mlx5vf_get_data_buffer(migf, 739 - sizeof(struct mlx5_vf_migration_header), DMA_NONE); 740 - if (IS_ERR(header_buf)) { 741 - err = PTR_ERR(header_buf); 742 - goto err_free; 743 - } 669 + if (!header_buf) { 670 + header_buf = mlx5vf_get_data_buffer(migf, 671 + sizeof(struct mlx5_vf_migration_header), DMA_NONE); 672 + if (IS_ERR(header_buf)) { 673 + err = PTR_ERR(header_buf); 674 + goto err_free; 744 675 } 745 676 } 746 677 ··· 965 900 return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); 966 901 } 967 902 903 + static int mlx5vf_cmd_query_tracker(struct mlx5_core_dev *mdev, 904 + struct mlx5_vhca_page_tracker *tracker) 905 + { 906 + u32 out[MLX5_ST_SZ_DW(query_page_track_obj_out)] = {}; 907 + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; 908 + void *obj_context; 909 + void *cmd_hdr; 910 + int err; 911 + 912 + cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr); 913 + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT); 914 + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK); 915 + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker->id); 916 + 917 + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); 918 + if (err) 919 + return err; 920 + 921 + obj_context = MLX5_ADDR_OF(query_page_track_obj_out, out, obj_context); 922 + tracker->status = MLX5_GET(page_track, obj_context, state); 923 + return 0; 924 + } 925 + 968 926 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev, 969 927 struct mlx5_vhca_cq_buf *buf, int nent, 970 928 int cqe_size) ··· 1045 957 mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb); 1046 958 struct mlx5vf_pci_core_device *mvdev = 
container_of( 1047 959 tracker, struct mlx5vf_pci_core_device, tracker); 960 + struct mlx5_eqe_obj_change *object; 1048 961 struct mlx5_eqe *eqe = data; 1049 962 u8 event_type = (u8)type; 1050 963 u8 queue_type; 964 + u32 obj_id; 1051 965 int qp_num; 1052 966 1053 967 switch (event_type) { ··· 1064 974 qp_num != tracker->fw_qp->qpn) 1065 975 break; 1066 976 set_tracker_error(mvdev); 977 + break; 978 + case MLX5_EVENT_TYPE_OBJECT_CHANGE: 979 + object = &eqe->data.obj_change; 980 + obj_id = be32_to_cpu(object->obj_id); 981 + if (obj_id == tracker->id) 982 + set_tracker_change_event(mvdev); 1067 983 break; 1068 984 default: 1069 985 break; ··· 1730 1634 goto end; 1731 1635 } 1732 1636 1637 + if (tracker->is_err) { 1638 + err = -EIO; 1639 + goto end; 1640 + } 1641 + 1733 1642 mdev = mvdev->mdev; 1734 1643 err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length, 1735 1644 MLX5_PAGE_TRACK_STATE_REPORTING); ··· 1753 1652 dirty, &tracker->status); 1754 1653 if (poll_err == CQ_EMPTY) { 1755 1654 wait_for_completion(&mvdev->tracker_comp); 1655 + if (tracker->object_changed) { 1656 + tracker->object_changed = false; 1657 + err = mlx5vf_cmd_query_tracker(mdev, tracker); 1658 + if (err) 1659 + goto end; 1660 + } 1756 1661 continue; 1757 1662 } 1758 1663 }
+4 -7
drivers/vfio/pci/mlx5/cmd.h
··· 13 13 #include <linux/mlx5/cq.h> 14 14 #include <linux/mlx5/qp.h> 15 15 16 - #define MLX5VF_PRE_COPY_SUPP(mvdev) \ 17 - ((mvdev)->core_device.vdev.migration_flags & VFIO_MIGRATION_PRE_COPY) 18 - 19 16 enum mlx5_vf_migf_state { 20 17 MLX5_MIGF_STATE_ERROR = 1, 21 18 MLX5_MIGF_STATE_PRE_COPY_ERROR, ··· 22 25 }; 23 26 24 27 enum mlx5_vf_load_state { 25 - MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER, 26 28 MLX5_VF_LOAD_STATE_READ_HEADER, 27 29 MLX5_VF_LOAD_STATE_PREP_HEADER_DATA, 28 30 MLX5_VF_LOAD_STATE_READ_HEADER_DATA, ··· 158 162 u32 id; 159 163 u32 pdn; 160 164 u8 is_err:1; 165 + u8 object_changed:1; 161 166 struct mlx5_uars_page *uar; 162 167 struct mlx5_vhca_cq cq; 163 168 struct mlx5_vhca_qp *host_qp; ··· 193 196 enum { 194 197 MLX5VF_QUERY_INC = (1UL << 0), 195 198 MLX5VF_QUERY_FINAL = (1UL << 1), 199 + MLX5VF_QUERY_CLEANUP = (1UL << 2), 196 200 }; 197 201 198 202 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); ··· 224 226 mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, 225 227 size_t length, enum dma_data_direction dma_dir); 226 228 void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); 227 - int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, 228 - unsigned int npages); 229 229 struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, 230 230 unsigned long offset); 231 231 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); 232 - void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); 232 + void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev, 233 + enum mlx5_vf_migf_state *last_save_state); 233 234 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); 234 235 void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf, 235 236 u8 chunk_num, size_t next_required_umem_size);
+40 -108
drivers/vfio/pci/mlx5/main.c
··· 65 65 return NULL; 66 66 } 67 67 68 - int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, 69 - unsigned int npages) 70 - { 71 - unsigned int to_alloc = npages; 72 - struct page **page_list; 73 - unsigned long filled; 74 - unsigned int to_fill; 75 - int ret; 76 - 77 - to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); 78 - page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT); 79 - if (!page_list) 80 - return -ENOMEM; 81 - 82 - do { 83 - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, 84 - page_list); 85 - if (!filled) { 86 - ret = -ENOMEM; 87 - goto err; 88 - } 89 - to_alloc -= filled; 90 - ret = sg_alloc_append_table_from_pages( 91 - &buf->table, page_list, filled, 0, 92 - filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, 93 - GFP_KERNEL_ACCOUNT); 94 - 95 - if (ret) 96 - goto err; 97 - buf->allocated_length += filled * PAGE_SIZE; 98 - /* clean input for another bulk allocation */ 99 - memset(page_list, 0, filled * sizeof(*page_list)); 100 - to_fill = min_t(unsigned int, to_alloc, 101 - PAGE_SIZE / sizeof(*page_list)); 102 - } while (to_alloc > 0); 103 - 104 - kvfree(page_list); 105 - return 0; 106 - 107 - err: 108 - kvfree(page_list); 109 - return ret; 110 - } 111 - 112 68 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) 113 69 { 114 70 mutex_lock(&migf->lock); ··· 733 777 return 0; 734 778 } 735 779 736 - static int 737 - mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf, 738 - loff_t requested_length, 739 - const char __user **buf, size_t *len, 740 - loff_t *pos, ssize_t *done) 741 - { 742 - int ret; 743 - 744 - if (requested_length > MAX_LOAD_SIZE) 745 - return -ENOMEM; 746 - 747 - if (vhca_buf->allocated_length < requested_length) { 748 - ret = mlx5vf_add_migration_pages( 749 - vhca_buf, 750 - DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, 751 - PAGE_SIZE)); 752 - if (ret) 753 - return ret; 754 - } 755 - 756 - while (*len) { 757 - ret = 
mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos, 758 - done); 759 - if (ret) 760 - return ret; 761 - } 762 - 763 - return 0; 764 - } 765 - 766 780 static ssize_t 767 781 mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf, 768 782 struct mlx5_vhca_data_buffer *vhca_buf, ··· 964 1038 migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE; 965 1039 break; 966 1040 } 967 - case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER: 968 - ret = mlx5vf_resume_read_image_no_header(vhca_buf, 969 - requested_length, 970 - &buf, &len, pos, &done); 971 - if (ret) 972 - goto out_unlock; 973 - break; 974 1041 case MLX5_VF_LOAD_STATE_READ_IMAGE: 975 1042 ret = mlx5vf_resume_read_image(migf, vhca_buf, 976 1043 migf->record_size, ··· 1033 1114 } 1034 1115 1035 1116 migf->buf[0] = buf; 1036 - if (MLX5VF_PRE_COPY_SUPP(mvdev)) { 1037 - buf = mlx5vf_alloc_data_buffer(migf, 1038 - sizeof(struct mlx5_vf_migration_header), DMA_NONE); 1039 - if (IS_ERR(buf)) { 1040 - ret = PTR_ERR(buf); 1041 - goto out_buf; 1042 - } 1043 - 1044 - migf->buf_header[0] = buf; 1045 - migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; 1046 - } else { 1047 - /* Initial state will be to read the image */ 1048 - migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER; 1117 + buf = mlx5vf_alloc_data_buffer(migf, 1118 + sizeof(struct mlx5_vf_migration_header), DMA_NONE); 1119 + if (IS_ERR(buf)) { 1120 + ret = PTR_ERR(buf); 1121 + goto out_buf; 1049 1122 } 1123 + 1124 + migf->buf_header[0] = buf; 1125 + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; 1050 1126 1051 1127 stream_open(migf->filp->f_inode, migf->filp); 1052 1128 mutex_init(&migf->lock); ··· 1060 1146 return ERR_PTR(ret); 1061 1147 } 1062 1148 1063 - void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) 1149 + void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev, 1150 + enum mlx5_vf_migf_state *last_save_state) 1064 1151 { 1065 1152 if (mvdev->resuming_migf) { 1066 1153 mlx5vf_disable_fd(mvdev->resuming_migf); ··· 1072 1157 if 
(mvdev->saving_migf) { 1073 1158 mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); 1074 1159 cancel_work_sync(&mvdev->saving_migf->async_data.work); 1160 + if (last_save_state) 1161 + *last_save_state = mvdev->saving_migf->state; 1075 1162 mlx5vf_disable_fd(mvdev->saving_migf); 1076 1163 wake_up_interruptible(&mvdev->saving_migf->poll_wait); 1077 1164 mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); ··· 1134 1217 return migf->filp; 1135 1218 } 1136 1219 1137 - if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) || 1138 - (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || 1220 + if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) { 1221 + mlx5vf_disable_fds(mvdev, NULL); 1222 + return NULL; 1223 + } 1224 + 1225 + if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || 1139 1226 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && 1140 1227 new == VFIO_DEVICE_STATE_RUNNING_P2P)) { 1141 - mlx5vf_disable_fds(mvdev); 1142 - return NULL; 1228 + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; 1229 + struct mlx5_vhca_data_buffer *buf; 1230 + enum mlx5_vf_migf_state state; 1231 + size_t size; 1232 + 1233 + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL, 1234 + MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP); 1235 + if (ret) 1236 + return ERR_PTR(ret); 1237 + buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE); 1238 + if (IS_ERR(buf)) 1239 + return ERR_CAST(buf); 1240 + /* pre_copy cleanup */ 1241 + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false); 1242 + if (ret) { 1243 + mlx5vf_put_data_buffer(buf); 1244 + return ERR_PTR(ret); 1245 + } 1246 + mlx5vf_disable_fds(mvdev, &state); 1247 + return (state != MLX5_MIGF_STATE_ERROR) ? 
NULL : ERR_PTR(-EIO); 1143 1248 } 1144 1249 1145 1250 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { ··· 1176 1237 } 1177 1238 1178 1239 if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { 1179 - if (!MLX5VF_PRE_COPY_SUPP(mvdev)) { 1180 - ret = mlx5vf_cmd_load_vhca_state(mvdev, 1181 - mvdev->resuming_migf, 1182 - mvdev->resuming_migf->buf[0]); 1183 - if (ret) 1184 - return ERR_PTR(ret); 1185 - } 1186 - mlx5vf_disable_fds(mvdev); 1240 + mlx5vf_disable_fds(mvdev, NULL); 1187 1241 return NULL; 1188 1242 } 1189 1243 ··· 1221 1289 mvdev->deferred_reset = false; 1222 1290 spin_unlock(&mvdev->reset_lock); 1223 1291 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 1224 - mlx5vf_disable_fds(mvdev); 1292 + mlx5vf_disable_fds(mvdev, NULL); 1225 1293 goto again; 1226 1294 } 1227 1295 mutex_unlock(&mvdev->state_mutex);
+10
drivers/vfio/pci/nvgrace-gpu/Kconfig
··· 1 + # SPDX-License-Identifier: GPL-2.0-only 2 + config NVGRACE_GPU_VFIO_PCI 3 + tristate "VFIO support for the GPU in the NVIDIA Grace Hopper Superchip" 4 + depends on ARM64 || (COMPILE_TEST && 64BIT) 5 + select VFIO_PCI_CORE 6 + help 7 + VFIO support for the GPU in the NVIDIA Grace Hopper Superchip is 8 + required to assign the GPU device to userspace using KVM/qemu/etc. 9 + 10 + If you don't know what to do here, say N.
+3
drivers/vfio/pci/nvgrace-gpu/Makefile
··· 1 + # SPDX-License-Identifier: GPL-2.0-only 2 + obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu-vfio-pci.o 3 + nvgrace-gpu-vfio-pci-y := main.o
+888
drivers/vfio/pci/nvgrace-gpu/main.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved 4 + */ 5 + 6 + #include <linux/sizes.h> 7 + #include <linux/vfio_pci_core.h> 8 + 9 + /* 10 + * The device memory usable to the workloads running in the VM is cached 11 + * and showcased as a 64b device BAR (comprising of BAR4 and BAR5 region) 12 + * to the VM and is represented as usemem. 13 + * Moreover, the VM GPU device driver needs a non-cacheable region to 14 + * support the MIG feature. This region is also exposed as a 64b BAR 15 + * (comprising of BAR2 and BAR3 region) and represented as resmem. 16 + */ 17 + #define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX 18 + #define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX 19 + 20 + /* Memory size expected as non cached and reserved by the VM driver */ 21 + #define RESMEM_SIZE SZ_1G 22 + 23 + /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */ 24 + #define MEMBLK_SIZE SZ_512M 25 + 26 + /* 27 + * The state of the two device memory region - resmem and usemem - is 28 + * saved as struct mem_region. 29 + */ 30 + struct mem_region { 31 + phys_addr_t memphys; /* Base physical address of the region */ 32 + size_t memlength; /* Region size */ 33 + size_t bar_size; /* Reported region BAR size */ 34 + __le64 bar_val; /* Emulated BAR offset registers */ 35 + union { 36 + void *memaddr; 37 + void __iomem *ioaddr; 38 + }; /* Base virtual address of the region */ 39 + }; 40 + 41 + struct nvgrace_gpu_pci_core_device { 42 + struct vfio_pci_core_device core_device; 43 + /* Cached and usable memory for the VM. 
*/ 44 + struct mem_region usemem; 45 + /* Non cached memory carved out from the end of device memory */ 46 + struct mem_region resmem; 47 + /* Lock to control device memory kernel mapping */ 48 + struct mutex remap_lock; 49 + }; 50 + 51 + static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) 52 + { 53 + struct nvgrace_gpu_pci_core_device *nvdev = 54 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 55 + core_device.vdev); 56 + 57 + nvdev->resmem.bar_val = 0; 58 + nvdev->usemem.bar_val = 0; 59 + } 60 + 61 + /* Choose the structure corresponding to the fake BAR with a given index. */ 62 + static struct mem_region * 63 + nvgrace_gpu_memregion(int index, 64 + struct nvgrace_gpu_pci_core_device *nvdev) 65 + { 66 + if (index == USEMEM_REGION_INDEX) 67 + return &nvdev->usemem; 68 + 69 + if (index == RESMEM_REGION_INDEX) 70 + return &nvdev->resmem; 71 + 72 + return NULL; 73 + } 74 + 75 + static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) 76 + { 77 + struct vfio_pci_core_device *vdev = 78 + container_of(core_vdev, struct vfio_pci_core_device, vdev); 79 + struct nvgrace_gpu_pci_core_device *nvdev = 80 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 81 + core_device.vdev); 82 + int ret; 83 + 84 + ret = vfio_pci_core_enable(vdev); 85 + if (ret) 86 + return ret; 87 + 88 + if (nvdev->usemem.memlength) { 89 + nvgrace_gpu_init_fake_bar_emu_regs(core_vdev); 90 + mutex_init(&nvdev->remap_lock); 91 + } 92 + 93 + vfio_pci_core_finish_enable(vdev); 94 + 95 + return 0; 96 + } 97 + 98 + static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) 99 + { 100 + struct nvgrace_gpu_pci_core_device *nvdev = 101 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 102 + core_device.vdev); 103 + 104 + /* Unmap the mapping to the device memory cached region */ 105 + if (nvdev->usemem.memaddr) { 106 + memunmap(nvdev->usemem.memaddr); 107 + nvdev->usemem.memaddr = NULL; 108 + } 109 + 110 + /* Unmap the mapping to the 
device memory non-cached region */ 111 + if (nvdev->resmem.ioaddr) { 112 + iounmap(nvdev->resmem.ioaddr); 113 + nvdev->resmem.ioaddr = NULL; 114 + } 115 + 116 + mutex_destroy(&nvdev->remap_lock); 117 + 118 + vfio_pci_core_close_device(core_vdev); 119 + } 120 + 121 + static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, 122 + struct vm_area_struct *vma) 123 + { 124 + struct nvgrace_gpu_pci_core_device *nvdev = 125 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 126 + core_device.vdev); 127 + struct mem_region *memregion; 128 + unsigned long start_pfn; 129 + u64 req_len, pgoff, end; 130 + unsigned int index; 131 + int ret = 0; 132 + 133 + index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 134 + 135 + memregion = nvgrace_gpu_memregion(index, nvdev); 136 + if (!memregion) 137 + return vfio_pci_core_mmap(core_vdev, vma); 138 + 139 + /* 140 + * Request to mmap the BAR. Map to the CPU accessible memory on the 141 + * GPU using the memory information gathered from the system ACPI 142 + * tables. 143 + */ 144 + pgoff = vma->vm_pgoff & 145 + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 146 + 147 + if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || 148 + check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) || 149 + check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) 150 + return -EOVERFLOW; 151 + 152 + /* 153 + * Check that the mapping request does not go beyond available device 154 + * memory size 155 + */ 156 + if (end > memregion->memlength) 157 + return -EINVAL; 158 + 159 + /* 160 + * The carved out region of the device memory needs the NORMAL_NC 161 + * property. Communicate as such to the hypervisor. 162 + */ 163 + if (index == RESMEM_REGION_INDEX) { 164 + /* 165 + * The nvgrace-gpu module has no issues with uncontained 166 + * failures on NORMAL_NC accesses. VM_ALLOW_ANY_UNCACHED is 167 + * set to communicate to the KVM to S2 map as NORMAL_NC. 
168 + * This opens up guest usage of NORMAL_NC for this mapping. 169 + */ 170 + vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED); 171 + 172 + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); 173 + } 174 + 175 + /* 176 + * Perform a PFN map to the memory and back the device BAR by the 177 + * GPU memory. 178 + * 179 + * The available GPU memory size may not be power-of-2 aligned. The 180 + * remainder is only backed by vfio_device_ops read/write handlers. 181 + * 182 + * During device reset, the GPU is safely disconnected to the CPU 183 + * and access to the BAR will be immediately returned preventing 184 + * machine check. 185 + */ 186 + ret = remap_pfn_range(vma, vma->vm_start, start_pfn, 187 + req_len, vma->vm_page_prot); 188 + if (ret) 189 + return ret; 190 + 191 + vma->vm_pgoff = start_pfn; 192 + 193 + return 0; 194 + } 195 + 196 + static long 197 + nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, 198 + unsigned long arg) 199 + { 200 + struct nvgrace_gpu_pci_core_device *nvdev = 201 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 202 + core_device.vdev); 203 + unsigned long minsz = offsetofend(struct vfio_region_info, offset); 204 + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 205 + struct vfio_region_info_cap_sparse_mmap *sparse; 206 + struct vfio_region_info info; 207 + struct mem_region *memregion; 208 + u32 size; 209 + int ret; 210 + 211 + if (copy_from_user(&info, (void __user *)arg, minsz)) 212 + return -EFAULT; 213 + 214 + if (info.argsz < minsz) 215 + return -EINVAL; 216 + 217 + /* 218 + * Request to determine the BAR region information. Send the 219 + * GPU memory information. 220 + */ 221 + memregion = nvgrace_gpu_memregion(info.index, nvdev); 222 + if (!memregion) 223 + return vfio_pci_core_ioctl(core_vdev, 224 + VFIO_DEVICE_GET_REGION_INFO, arg); 225 + 226 + size = struct_size(sparse, areas, 1); 227 + 228 + /* 229 + * Setup for sparse mapping for the device memory. 
Only the 230 + * available device memory on the hardware is shown as a 231 + * mappable region. 232 + */ 233 + sparse = kzalloc(size, GFP_KERNEL); 234 + if (!sparse) 235 + return -ENOMEM; 236 + 237 + sparse->nr_areas = 1; 238 + sparse->areas[0].offset = 0; 239 + sparse->areas[0].size = memregion->memlength; 240 + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; 241 + sparse->header.version = 1; 242 + 243 + ret = vfio_info_add_capability(&caps, &sparse->header, size); 244 + kfree(sparse); 245 + if (ret) 246 + return ret; 247 + 248 + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 249 + /* 250 + * The region memory size may not be power-of-2 aligned. 251 + * Given that the memory as a BAR and may not be 252 + * aligned, roundup to the next power-of-2. 253 + */ 254 + info.size = memregion->bar_size; 255 + info.flags = VFIO_REGION_INFO_FLAG_READ | 256 + VFIO_REGION_INFO_FLAG_WRITE | 257 + VFIO_REGION_INFO_FLAG_MMAP; 258 + 259 + if (caps.size) { 260 + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; 261 + if (info.argsz < sizeof(info) + caps.size) { 262 + info.argsz = sizeof(info) + caps.size; 263 + info.cap_offset = 0; 264 + } else { 265 + vfio_info_cap_shift(&caps, sizeof(info)); 266 + if (copy_to_user((void __user *)arg + 267 + sizeof(info), caps.buf, 268 + caps.size)) { 269 + kfree(caps.buf); 270 + return -EFAULT; 271 + } 272 + info.cap_offset = sizeof(info); 273 + } 274 + kfree(caps.buf); 275 + } 276 + return copy_to_user((void __user *)arg, &info, minsz) ? 
277 + -EFAULT : 0; 278 + } 279 + 280 + static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, 281 + unsigned int cmd, unsigned long arg) 282 + { 283 + switch (cmd) { 284 + case VFIO_DEVICE_GET_REGION_INFO: 285 + return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg); 286 + case VFIO_DEVICE_IOEVENTFD: 287 + return -ENOTTY; 288 + case VFIO_DEVICE_RESET: 289 + nvgrace_gpu_init_fake_bar_emu_regs(core_vdev); 290 + fallthrough; 291 + default: 292 + return vfio_pci_core_ioctl(core_vdev, cmd, arg); 293 + } 294 + } 295 + 296 + static __le64 297 + nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64) 298 + { 299 + u64 tmp_val; 300 + 301 + tmp_val = le64_to_cpu(val64); 302 + tmp_val &= ~(bar_size - 1); 303 + tmp_val |= flags; 304 + 305 + return cpu_to_le64(tmp_val); 306 + } 307 + 308 + /* 309 + * Both the usable (usemem) and the reserved (resmem) device memory region 310 + * are exposed as a 64b fake device BARs in the VM. These fake BARs must 311 + * respond to the accesses on their respective PCI config space offsets. 312 + * 313 + * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3. 314 + * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5. 
315 + */ 316 + static ssize_t 317 + nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev, 318 + char __user *buf, size_t count, loff_t *ppos) 319 + { 320 + struct nvgrace_gpu_pci_core_device *nvdev = 321 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 322 + core_device.vdev); 323 + u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; 324 + struct mem_region *memregion = NULL; 325 + __le64 val64; 326 + size_t register_offset; 327 + loff_t copy_offset; 328 + size_t copy_count; 329 + int ret; 330 + 331 + ret = vfio_pci_core_read(core_vdev, buf, count, ppos); 332 + if (ret < 0) 333 + return ret; 334 + 335 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2, 336 + sizeof(val64), 337 + &copy_offset, &copy_count, 338 + &register_offset)) 339 + memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev); 340 + else if (vfio_pci_core_range_intersect_range(pos, count, 341 + PCI_BASE_ADDRESS_4, 342 + sizeof(val64), 343 + &copy_offset, &copy_count, 344 + &register_offset)) 345 + memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev); 346 + 347 + if (memregion) { 348 + val64 = nvgrace_gpu_get_read_value(memregion->bar_size, 349 + PCI_BASE_ADDRESS_MEM_TYPE_64 | 350 + PCI_BASE_ADDRESS_MEM_PREFETCH, 351 + memregion->bar_val); 352 + if (copy_to_user(buf + copy_offset, 353 + (void *)&val64 + register_offset, copy_count)) { 354 + /* 355 + * The position has been incremented in 356 + * vfio_pci_core_read. Reset the offset back to the 357 + * starting position. 
358 + */ 359 + *ppos -= count; 360 + return -EFAULT; 361 + } 362 + } 363 + 364 + return count; 365 + } 366 + 367 + static ssize_t 368 + nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev, 369 + const char __user *buf, size_t count, loff_t *ppos) 370 + { 371 + struct nvgrace_gpu_pci_core_device *nvdev = 372 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 373 + core_device.vdev); 374 + u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; 375 + struct mem_region *memregion = NULL; 376 + size_t register_offset; 377 + loff_t copy_offset; 378 + size_t copy_count; 379 + 380 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2, 381 + sizeof(u64), &copy_offset, 382 + &copy_count, &register_offset)) 383 + memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev); 384 + else if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_4, 385 + sizeof(u64), &copy_offset, 386 + &copy_count, &register_offset)) 387 + memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev); 388 + 389 + if (memregion) { 390 + if (copy_from_user((void *)&memregion->bar_val + register_offset, 391 + buf + copy_offset, copy_count)) 392 + return -EFAULT; 393 + *ppos += copy_count; 394 + return copy_count; 395 + } 396 + 397 + return vfio_pci_core_write(core_vdev, buf, count, ppos); 398 + } 399 + 400 + /* 401 + * Ad hoc map the device memory in the module kernel VA space. Primarily needed 402 + * as vfio does not require the userspace driver to only perform accesses through 403 + * mmaps of the vfio-pci BAR regions and such accesses should be supported using 404 + * vfio_device_ops read/write implementations. 405 + * 406 + * The usemem region is cacheable memory and hence is memremaped. 407 + * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC). 
408 + */ 409 + static int 410 + nvgrace_gpu_map_device_mem(int index, 411 + struct nvgrace_gpu_pci_core_device *nvdev) 412 + { 413 + struct mem_region *memregion; 414 + int ret = 0; 415 + 416 + memregion = nvgrace_gpu_memregion(index, nvdev); 417 + if (!memregion) 418 + return -EINVAL; 419 + 420 + mutex_lock(&nvdev->remap_lock); 421 + 422 + if (memregion->memaddr) 423 + goto unlock; 424 + 425 + if (index == USEMEM_REGION_INDEX) 426 + memregion->memaddr = memremap(memregion->memphys, 427 + memregion->memlength, 428 + MEMREMAP_WB); 429 + else 430 + memregion->ioaddr = ioremap_wc(memregion->memphys, 431 + memregion->memlength); 432 + 433 + if (!memregion->memaddr) 434 + ret = -ENOMEM; 435 + 436 + unlock: 437 + mutex_unlock(&nvdev->remap_lock); 438 + 439 + return ret; 440 + } 441 + 442 + /* 443 + * Read the data from the device memory (mapped either through ioremap 444 + * or memremap) into the user buffer. 445 + */ 446 + static int 447 + nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev, 448 + char __user *buf, size_t mem_count, loff_t *ppos) 449 + { 450 + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 451 + u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; 452 + int ret; 453 + 454 + if (!mem_count) 455 + return 0; 456 + 457 + /* 458 + * Handle read on the BAR regions. Map to the target device memory 459 + * physical address and copy to the request read buffer. 460 + */ 461 + ret = nvgrace_gpu_map_device_mem(index, nvdev); 462 + if (ret) 463 + return ret; 464 + 465 + if (index == USEMEM_REGION_INDEX) { 466 + if (copy_to_user(buf, 467 + (u8 *)nvdev->usemem.memaddr + offset, 468 + mem_count)) 469 + ret = -EFAULT; 470 + } else { 471 + /* 472 + * The hardware ensures that the system does not crash when 473 + * the device memory is accessed with the memory enable 474 + * turned off. It synthesizes ~0 on such read. So there is 475 + * no need to check or support the disablement/enablement of 476 + * BAR through PCI_COMMAND config space register. 
Pass 477 + * test_mem flag as false. 478 + */ 479 + ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false, 480 + nvdev->resmem.ioaddr, 481 + buf, offset, mem_count, 482 + 0, 0, false); 483 + } 484 + 485 + return ret; 486 + } 487 + 488 + /* 489 + * Read count bytes from the device memory at an offset. The actual device 490 + * memory size (available) may not be a power-of-2. So the driver fakes 491 + * the size to a power-of-2 (reported) when exposing to a user space driver. 492 + * 493 + * Reads starting beyond the reported size generate -EINVAL; reads extending 494 + * beyond the actual device size is filled with ~0; reads extending beyond 495 + * the reported size are truncated. 496 + */ 497 + static ssize_t 498 + nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, 499 + char __user *buf, size_t count, loff_t *ppos) 500 + { 501 + u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; 502 + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 503 + struct mem_region *memregion; 504 + size_t mem_count, i; 505 + u8 val = 0xFF; 506 + int ret; 507 + 508 + /* No need to do NULL check as caller does. */ 509 + memregion = nvgrace_gpu_memregion(index, nvdev); 510 + 511 + if (offset >= memregion->bar_size) 512 + return -EINVAL; 513 + 514 + /* Clip short the read request beyond reported BAR size */ 515 + count = min(count, memregion->bar_size - (size_t)offset); 516 + 517 + /* 518 + * Determine how many bytes to be actually read from the device memory. 519 + * Read request beyond the actual device memory size is filled with ~0, 520 + * while those beyond the actual reported size is skipped. 521 + */ 522 + if (offset >= memregion->memlength) 523 + mem_count = 0; 524 + else 525 + mem_count = min(count, memregion->memlength - (size_t)offset); 526 + 527 + ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); 528 + if (ret) 529 + return ret; 530 + 531 + /* 532 + * Only the device memory present on the hardware is mapped, which may 533 + * not be power-of-2 aligned. 
A read to an offset beyond the device memory 534 + * size is filled with ~0. 535 + */ 536 + for (i = mem_count; i < count; i++) { 537 + ret = put_user(val, (unsigned char __user *)(buf + i)); 538 + if (ret) 539 + return ret; 540 + } 541 + 542 + *ppos += count; 543 + return count; 544 + } 545 + 546 + static ssize_t 547 + nvgrace_gpu_read(struct vfio_device *core_vdev, 548 + char __user *buf, size_t count, loff_t *ppos) 549 + { 550 + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 551 + struct nvgrace_gpu_pci_core_device *nvdev = 552 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 553 + core_device.vdev); 554 + 555 + if (nvgrace_gpu_memregion(index, nvdev)) 556 + return nvgrace_gpu_read_mem(nvdev, buf, count, ppos); 557 + 558 + if (index == VFIO_PCI_CONFIG_REGION_INDEX) 559 + return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos); 560 + 561 + return vfio_pci_core_read(core_vdev, buf, count, ppos); 562 + } 563 + 564 + /* 565 + * Write the data to the device memory (mapped either through ioremap 566 + * or memremap) from the user buffer. 567 + */ 568 + static int 569 + nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev, 570 + const char __user *buf, size_t mem_count, 571 + loff_t *ppos) 572 + { 573 + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 574 + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; 575 + int ret; 576 + 577 + if (!mem_count) 578 + return 0; 579 + 580 + ret = nvgrace_gpu_map_device_mem(index, nvdev); 581 + if (ret) 582 + return ret; 583 + 584 + if (index == USEMEM_REGION_INDEX) { 585 + if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos, 586 + buf, mem_count)) 587 + return -EFAULT; 588 + } else { 589 + /* 590 + * The hardware ensures that the system does not crash when 591 + * the device memory is accessed with the memory enable 592 + * turned off. It drops such writes. So there is no need to 593 + * check or support the disablement/enablement of BAR 594 + * through PCI_COMMAND config space register. 
Pass test_mem 595 + * flag as false. 596 + */ 597 + ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false, 598 + nvdev->resmem.ioaddr, 599 + (char __user *)buf, pos, mem_count, 600 + 0, 0, true); 601 + } 602 + 603 + return ret; 604 + } 605 + 606 + /* 607 + * Write count bytes to the device memory at a given offset. The actual device 608 + * memory size (available) may not be a power-of-2. So the driver fakes the 609 + * size to a power-of-2 (reported) when exposing to a user space driver. 610 + * 611 + * Writes extending beyond the reported size are truncated; writes starting 612 + * beyond the reported size generate -EINVAL. 613 + */ 614 + static ssize_t 615 + nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, 616 + size_t count, loff_t *ppos, const char __user *buf) 617 + { 618 + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 619 + u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; 620 + struct mem_region *memregion; 621 + size_t mem_count; 622 + int ret = 0; 623 + 624 + /* No need to do NULL check as caller does. */ 625 + memregion = nvgrace_gpu_memregion(index, nvdev); 626 + 627 + if (offset >= memregion->bar_size) 628 + return -EINVAL; 629 + 630 + /* Clip short the write request beyond reported BAR size */ 631 + count = min(count, memregion->bar_size - (size_t)offset); 632 + 633 + /* 634 + * Determine how many bytes to be actually written to the device memory. 635 + * Do not write to the offset beyond available size. 636 + */ 637 + if (offset >= memregion->memlength) 638 + goto exitfn; 639 + 640 + /* 641 + * Only the device memory present on the hardware is mapped, which may 642 + * not be power-of-2 aligned. Drop access outside the available device 643 + * memory on the hardware. 
644 + */ 645 + mem_count = min(count, memregion->memlength - (size_t)offset); 646 + 647 + ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); 648 + if (ret) 649 + return ret; 650 + 651 + exitfn: 652 + *ppos += count; 653 + return count; 654 + } 655 + 656 + static ssize_t 657 + nvgrace_gpu_write(struct vfio_device *core_vdev, 658 + const char __user *buf, size_t count, loff_t *ppos) 659 + { 660 + struct nvgrace_gpu_pci_core_device *nvdev = 661 + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 662 + core_device.vdev); 663 + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 664 + 665 + if (nvgrace_gpu_memregion(index, nvdev)) 666 + return nvgrace_gpu_write_mem(nvdev, count, ppos, buf); 667 + 668 + if (index == VFIO_PCI_CONFIG_REGION_INDEX) 669 + return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos); 670 + 671 + return vfio_pci_core_write(core_vdev, buf, count, ppos); 672 + } 673 + 674 + static const struct vfio_device_ops nvgrace_gpu_pci_ops = { 675 + .name = "nvgrace-gpu-vfio-pci", 676 + .init = vfio_pci_core_init_dev, 677 + .release = vfio_pci_core_release_dev, 678 + .open_device = nvgrace_gpu_open_device, 679 + .close_device = nvgrace_gpu_close_device, 680 + .ioctl = nvgrace_gpu_ioctl, 681 + .device_feature = vfio_pci_core_ioctl_feature, 682 + .read = nvgrace_gpu_read, 683 + .write = nvgrace_gpu_write, 684 + .mmap = nvgrace_gpu_mmap, 685 + .request = vfio_pci_core_request, 686 + .match = vfio_pci_core_match, 687 + .bind_iommufd = vfio_iommufd_physical_bind, 688 + .unbind_iommufd = vfio_iommufd_physical_unbind, 689 + .attach_ioas = vfio_iommufd_physical_attach_ioas, 690 + .detach_ioas = vfio_iommufd_physical_detach_ioas, 691 + }; 692 + 693 + static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { 694 + .name = "nvgrace-gpu-vfio-pci-core", 695 + .init = vfio_pci_core_init_dev, 696 + .release = vfio_pci_core_release_dev, 697 + .open_device = nvgrace_gpu_open_device, 698 + .close_device = vfio_pci_core_close_device, 699 + 
.ioctl = vfio_pci_core_ioctl, 700 + .device_feature = vfio_pci_core_ioctl_feature, 701 + .read = vfio_pci_core_read, 702 + .write = vfio_pci_core_write, 703 + .mmap = vfio_pci_core_mmap, 704 + .request = vfio_pci_core_request, 705 + .match = vfio_pci_core_match, 706 + .bind_iommufd = vfio_iommufd_physical_bind, 707 + .unbind_iommufd = vfio_iommufd_physical_unbind, 708 + .attach_ioas = vfio_iommufd_physical_attach_ioas, 709 + .detach_ioas = vfio_iommufd_physical_detach_ioas, 710 + }; 711 + 712 + static int 713 + nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev, 714 + u64 *pmemphys, u64 *pmemlength) 715 + { 716 + int ret; 717 + 718 + /* 719 + * The memory information is present in the system ACPI tables as DSD 720 + * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size. 721 + */ 722 + ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa", 723 + pmemphys); 724 + if (ret) 725 + return ret; 726 + 727 + if (*pmemphys > type_max(phys_addr_t)) 728 + return -EOVERFLOW; 729 + 730 + ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size", 731 + pmemlength); 732 + if (ret) 733 + return ret; 734 + 735 + if (*pmemlength > type_max(size_t)) 736 + return -EOVERFLOW; 737 + 738 + /* 739 + * If the C2C link is not up due to an error, the coherent device 740 + * memory size is returned as 0. Fail in such case. 741 + */ 742 + if (*pmemlength == 0) 743 + return -ENOMEM; 744 + 745 + return ret; 746 + } 747 + 748 + static int 749 + nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, 750 + struct nvgrace_gpu_pci_core_device *nvdev, 751 + u64 memphys, u64 memlength) 752 + { 753 + int ret = 0; 754 + 755 + /* 756 + * The VM GPU device driver needs a non-cacheable region to support 757 + * the MIG feature. Since the device memory is mapped as NORMAL cached, 758 + * carve out a region from the end with a different NORMAL_NC 759 + * property (called as reserved memory and represented as resmem). 
This 760 + * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while 761 + * exposing the rest (termed as usable memory and represented using usemem) 762 + * as cacheable 64b BAR (region 4 and 5). 763 + * 764 + * devmem (memlength) 765 + * |-------------------------------------------------| 766 + * | | 767 + * usemem.memphys resmem.memphys 768 + */ 769 + nvdev->usemem.memphys = memphys; 770 + 771 + /* 772 + * The device memory exposed to the VM is added to the kernel by the 773 + * VM driver module in chunks of memory block size. Only the usable 774 + * memory (usemem) is added to the kernel for usage by the VM 775 + * workloads. Make the usable memory size memblock aligned. 776 + */ 777 + if (check_sub_overflow(memlength, RESMEM_SIZE, 778 + &nvdev->usemem.memlength)) { 779 + ret = -EOVERFLOW; 780 + goto done; 781 + } 782 + 783 + /* 784 + * The USEMEM part of the device memory has to be MEMBLK_SIZE 785 + * aligned. This is a hardwired ABI value between the GPU FW and 786 + * VFIO driver. The VM device driver is also aware of it and make 787 + * use of the value for its calculation to determine USEMEM size. 788 + */ 789 + nvdev->usemem.memlength = round_down(nvdev->usemem.memlength, 790 + MEMBLK_SIZE); 791 + if (nvdev->usemem.memlength == 0) { 792 + ret = -EINVAL; 793 + goto done; 794 + } 795 + 796 + if ((check_add_overflow(nvdev->usemem.memphys, 797 + nvdev->usemem.memlength, 798 + &nvdev->resmem.memphys)) || 799 + (check_sub_overflow(memlength, nvdev->usemem.memlength, 800 + &nvdev->resmem.memlength))) { 801 + ret = -EOVERFLOW; 802 + goto done; 803 + } 804 + 805 + /* 806 + * The memory regions are exposed as BARs. Calculate and save 807 + * the BAR size for them. 
808 + */ 809 + nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); 810 + nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength); 811 + done: 812 + return ret; 813 + } 814 + 815 + static int nvgrace_gpu_probe(struct pci_dev *pdev, 816 + const struct pci_device_id *id) 817 + { 818 + const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops; 819 + struct nvgrace_gpu_pci_core_device *nvdev; 820 + u64 memphys, memlength; 821 + int ret; 822 + 823 + ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength); 824 + if (!ret) 825 + ops = &nvgrace_gpu_pci_ops; 826 + 827 + nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev, 828 + &pdev->dev, ops); 829 + if (IS_ERR(nvdev)) 830 + return PTR_ERR(nvdev); 831 + 832 + dev_set_drvdata(&pdev->dev, &nvdev->core_device); 833 + 834 + if (ops == &nvgrace_gpu_pci_ops) { 835 + /* 836 + * Device memory properties are identified in the host ACPI 837 + * table. Set the nvgrace_gpu_pci_core_device structure. 
838 + */ 839 + ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev, 840 + memphys, memlength); 841 + if (ret) 842 + goto out_put_vdev; 843 + } 844 + 845 + ret = vfio_pci_core_register_device(&nvdev->core_device); 846 + if (ret) 847 + goto out_put_vdev; 848 + 849 + return ret; 850 + 851 + out_put_vdev: 852 + vfio_put_device(&nvdev->core_device.vdev); 853 + return ret; 854 + } 855 + 856 + static void nvgrace_gpu_remove(struct pci_dev *pdev) 857 + { 858 + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); 859 + 860 + vfio_pci_core_unregister_device(core_device); 861 + vfio_put_device(&core_device->vdev); 862 + } 863 + 864 + static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { 865 + /* GH200 120GB */ 866 + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) }, 867 + /* GH200 480GB */ 868 + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) }, 869 + {} 870 + }; 871 + 872 + MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table); 873 + 874 + static struct pci_driver nvgrace_gpu_vfio_pci_driver = { 875 + .name = KBUILD_MODNAME, 876 + .id_table = nvgrace_gpu_vfio_pci_table, 877 + .probe = nvgrace_gpu_probe, 878 + .remove = nvgrace_gpu_remove, 879 + .err_handler = &vfio_pci_core_err_handlers, 880 + .driver_managed_dma = true, 881 + }; 882 + 883 + module_pci_driver(nvgrace_gpu_vfio_pci_driver); 884 + 885 + MODULE_LICENSE("GPL"); 886 + MODULE_AUTHOR("Ankit Agrawal <ankita@nvidia.com>"); 887 + MODULE_AUTHOR("Aniket Agashe <aniketa@nvidia.com>"); 888 + MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory");
+3 -3
drivers/vfio/pci/pds/dirty.c
··· 607 607 608 608 mutex_lock(&pds_vfio->state_mutex); 609 609 err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length); 610 - pds_vfio_state_mutex_unlock(pds_vfio); 610 + mutex_unlock(&pds_vfio->state_mutex); 611 611 612 612 return err; 613 613 } ··· 624 624 mutex_lock(&pds_vfio->state_mutex); 625 625 pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS); 626 626 err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size); 627 - pds_vfio_state_mutex_unlock(pds_vfio); 627 + mutex_unlock(&pds_vfio->state_mutex); 628 628 629 629 return err; 630 630 } ··· 637 637 638 638 mutex_lock(&pds_vfio->state_mutex); 639 639 pds_vfio_dirty_disable(pds_vfio, true); 640 - pds_vfio_state_mutex_unlock(pds_vfio); 640 + mutex_unlock(&pds_vfio->state_mutex); 641 641 642 642 return 0; 643 643 }
+13
drivers/vfio/pci/pds/lm.c
··· 92 92 { 93 93 mutex_lock(&lm_file->lock); 94 94 95 + lm_file->disabled = true; 95 96 lm_file->size = 0; 96 97 lm_file->alloc_size = 0; 98 + lm_file->filep->f_pos = 0; 97 99 98 100 /* Free scatter list of file pages */ 99 101 sg_free_table(&lm_file->sg_table); ··· 185 183 pos = &filp->f_pos; 186 184 187 185 mutex_lock(&lm_file->lock); 186 + 187 + if (lm_file->disabled) { 188 + done = -ENODEV; 189 + goto out_unlock; 190 + } 191 + 188 192 if (*pos > lm_file->size) { 189 193 done = -EINVAL; 190 194 goto out_unlock; ··· 290 282 return -EINVAL; 291 283 292 284 mutex_lock(&lm_file->lock); 285 + 286 + if (lm_file->disabled) { 287 + done = -ENODEV; 288 + goto out_unlock; 289 + } 293 290 294 291 while (len) { 295 292 size_t page_offset;
+1
drivers/vfio/pci/pds/lm.h
··· 27 27 struct scatterlist *last_offset_sg; /* Iterator */ 28 28 unsigned int sg_last_entry; 29 29 unsigned long last_offset; 30 + bool disabled; 30 31 }; 31 32 32 33 struct pds_vfio_pci_device;
+5 -22
drivers/vfio/pci/pds/pci_drv.c
··· 21 21 22 22 static void pds_vfio_recovery(struct pds_vfio_pci_device *pds_vfio) 23 23 { 24 - bool deferred_reset_needed = false; 25 - 26 24 /* 27 25 * Documentation states that the kernel migration driver must not 28 26 * generate asynchronous device state transitions outside of 29 27 * manipulation by the user or the VFIO_DEVICE_RESET ioctl. 30 28 * 31 29 * Since recovery is an asynchronous event received from the device, 32 - * initiate a deferred reset. Issue a deferred reset in the following 33 - * situations: 30 + * initiate a reset in the following situations: 34 31 * 1. Migration is in progress, which will cause the next step of 35 32 * the migration to fail. 36 33 * 2. If the device is in a state that will be set to ··· 39 42 pds_vfio->state != VFIO_DEVICE_STATE_ERROR) || 40 43 (pds_vfio->state == VFIO_DEVICE_STATE_RUNNING && 41 44 pds_vfio_dirty_is_enabled(pds_vfio))) 42 - deferred_reset_needed = true; 45 + pds_vfio_reset(pds_vfio, VFIO_DEVICE_STATE_ERROR); 43 46 mutex_unlock(&pds_vfio->state_mutex); 44 - 45 - /* 46 - * On the next user initiated state transition, the device will 47 - * transition to the VFIO_DEVICE_STATE_ERROR. At this point it's the user's 48 - * responsibility to reset the device. 49 - * 50 - * If a VFIO_DEVICE_RESET is requested post recovery and before the next 51 - * state transition, then the deferred reset state will be set to 52 - * VFIO_DEVICE_STATE_RUNNING. 
53 - */ 54 - if (deferred_reset_needed) { 55 - mutex_lock(&pds_vfio->reset_mutex); 56 - pds_vfio->deferred_reset = true; 57 - pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_ERROR; 58 - mutex_unlock(&pds_vfio->reset_mutex); 59 - } 60 47 } 61 48 62 49 static int pds_vfio_pci_notify_handler(struct notifier_block *nb, ··· 166 185 { 167 186 struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev); 168 187 169 - pds_vfio_reset(pds_vfio); 188 + mutex_lock(&pds_vfio->state_mutex); 189 + pds_vfio_reset(pds_vfio, VFIO_DEVICE_STATE_RUNNING); 190 + mutex_unlock(&pds_vfio->state_mutex); 170 191 } 171 192 172 193 static const struct pci_error_handlers pds_vfio_pci_err_handlers = {
+9 -36
drivers/vfio/pci/pds/vfio_dev.c
··· 26 26 vfio_coredev); 27 27 } 28 28 29 - void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio) 29 + void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio, 30 + enum vfio_device_mig_state state) 30 31 { 31 - again: 32 - mutex_lock(&pds_vfio->reset_mutex); 33 - if (pds_vfio->deferred_reset) { 34 - pds_vfio->deferred_reset = false; 35 - if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) { 36 - pds_vfio_put_restore_file(pds_vfio); 37 - pds_vfio_put_save_file(pds_vfio); 38 - pds_vfio_dirty_disable(pds_vfio, false); 39 - } 40 - pds_vfio->state = pds_vfio->deferred_reset_state; 41 - pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING; 42 - mutex_unlock(&pds_vfio->reset_mutex); 43 - goto again; 44 - } 45 - mutex_unlock(&pds_vfio->state_mutex); 46 - mutex_unlock(&pds_vfio->reset_mutex); 47 - } 48 - 49 - void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio) 50 - { 51 - mutex_lock(&pds_vfio->reset_mutex); 52 - pds_vfio->deferred_reset = true; 53 - pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING; 54 - if (!mutex_trylock(&pds_vfio->state_mutex)) { 55 - mutex_unlock(&pds_vfio->reset_mutex); 56 - return; 57 - } 58 - mutex_unlock(&pds_vfio->reset_mutex); 59 - pds_vfio_state_mutex_unlock(pds_vfio); 32 + pds_vfio_put_restore_file(pds_vfio); 33 + pds_vfio_put_save_file(pds_vfio); 34 + if (state == VFIO_DEVICE_STATE_ERROR) 35 + pds_vfio_dirty_disable(pds_vfio, false); 36 + pds_vfio->state = state; 60 37 } 61 38 62 39 static struct file * ··· 74 97 break; 75 98 } 76 99 } 77 - pds_vfio_state_mutex_unlock(pds_vfio); 78 - /* still waiting on a deferred_reset */ 100 + mutex_unlock(&pds_vfio->state_mutex); 79 101 if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) 80 102 res = ERR_PTR(-EIO); 81 103 ··· 90 114 91 115 mutex_lock(&pds_vfio->state_mutex); 92 116 *current_state = pds_vfio->state; 93 - pds_vfio_state_mutex_unlock(pds_vfio); 117 + mutex_unlock(&pds_vfio->state_mutex); 94 118 return 0; 95 119 } 96 120 ··· 132 156 pds_vfio->vf_id = vf_id; 133 
157 134 158 mutex_init(&pds_vfio->state_mutex); 135 - mutex_init(&pds_vfio->reset_mutex); 136 159 137 160 vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; 138 161 vdev->mig_ops = &pds_vfio_lm_ops; ··· 153 178 vfio_coredev.vdev); 154 179 155 180 mutex_destroy(&pds_vfio->state_mutex); 156 - mutex_destroy(&pds_vfio->reset_mutex); 157 181 vfio_pci_core_release_dev(vdev); 158 182 } 159 183 ··· 168 194 return err; 169 195 170 196 pds_vfio->state = VFIO_DEVICE_STATE_RUNNING; 171 - pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING; 172 197 173 198 vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev); 174 199
+2 -6
drivers/vfio/pci/pds/vfio_dev.h
··· 18 18 struct pds_vfio_dirty dirty; 19 19 struct mutex state_mutex; /* protect migration state */ 20 20 enum vfio_device_mig_state state; 21 - struct mutex reset_mutex; /* protect reset_done flow */ 22 - u8 deferred_reset; 23 - enum vfio_device_mig_state deferred_reset_state; 24 21 struct notifier_block nb; 25 22 26 23 int vf_id; 27 24 u16 client_id; 28 25 }; 29 26 30 - void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio); 31 - 32 27 const struct vfio_device_ops *pds_vfio_ops_info(void); 33 28 struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev); 34 - void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio); 29 + void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio, 30 + enum vfio_device_mig_state state); 35 31 36 32 struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio); 37 33 struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio);
+42
drivers/vfio/pci/vfio_pci_config.c
··· 1966 1966 1967 1967 return done; 1968 1968 } 1969 + 1970 + /** 1971 + * vfio_pci_core_range_intersect_range() - Determine overlap between a buffer 1972 + * and register offset ranges. 1973 + * @buf_start: start offset of the buffer 1974 + * @buf_cnt: number of buffer bytes 1975 + * @reg_start: start register offset 1976 + * @reg_cnt: number of register bytes 1977 + * @buf_offset: start offset of overlap in the buffer 1978 + * @intersect_count: number of overlapping bytes 1979 + * @register_offset: start offset of overlap in register 1980 + * 1981 + * Returns: true if there is overlap, false if not. 1982 + * The overlap start and size is returned through function args. 1983 + */ 1984 + bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, 1985 + loff_t reg_start, size_t reg_cnt, 1986 + loff_t *buf_offset, 1987 + size_t *intersect_count, 1988 + size_t *register_offset) 1989 + { 1990 + if (buf_start <= reg_start && 1991 + buf_start + buf_cnt > reg_start) { 1992 + *buf_offset = reg_start - buf_start; 1993 + *intersect_count = min_t(size_t, reg_cnt, 1994 + buf_start + buf_cnt - reg_start); 1995 + *register_offset = 0; 1996 + return true; 1997 + } 1998 + 1999 + if (buf_start > reg_start && 2000 + buf_start < reg_start + reg_cnt) { 2001 + *buf_offset = 0; 2002 + *intersect_count = min_t(size_t, buf_cnt, 2003 + reg_start + reg_cnt - buf_start); 2004 + *register_offset = buf_start - reg_start; 2005 + return true; 2006 + } 2007 + 2008 + return false; 2009 + } 2010 + EXPORT_SYMBOL_GPL(vfio_pci_core_range_intersect_range);
+1
drivers/vfio/pci/vfio_pci_core.c
··· 2064 2064 pci_name(pdev)); 2065 2065 pdev->driver_override = kasprintf(GFP_KERNEL, "%s", 2066 2066 vdev->vdev.ops->name); 2067 + WARN_ON(!pdev->driver_override); 2067 2068 } else if (action == BUS_NOTIFY_BOUND_DRIVER && 2068 2069 pdev->is_virtfn && physfn == vdev->pdev) { 2069 2070 struct pci_driver *drv = pci_dev_driver(pdev);
+108 -72
drivers/vfio/pci/vfio_pci_intrs.c
··· 90 90 91 91 if (likely(is_intx(vdev) && !vdev->virq_disabled)) { 92 92 struct vfio_pci_irq_ctx *ctx; 93 + struct eventfd_ctx *trigger; 93 94 94 95 ctx = vfio_irq_ctx_get(vdev, 0); 95 96 if (WARN_ON_ONCE(!ctx)) 96 97 return; 97 - eventfd_signal(ctx->trigger); 98 + 99 + trigger = READ_ONCE(ctx->trigger); 100 + if (likely(trigger)) 101 + eventfd_signal(trigger); 98 102 } 99 103 } 100 104 101 105 /* Returns true if the INTx vfio_pci_irq_ctx.masked value is changed. */ 102 - bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) 106 + static bool __vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) 103 107 { 104 108 struct pci_dev *pdev = vdev->pdev; 105 109 struct vfio_pci_irq_ctx *ctx; 106 110 unsigned long flags; 107 111 bool masked_changed = false; 112 + 113 + lockdep_assert_held(&vdev->igate); 108 114 109 115 spin_lock_irqsave(&vdev->irqlock, flags); 110 116 ··· 147 141 out_unlock: 148 142 spin_unlock_irqrestore(&vdev->irqlock, flags); 149 143 return masked_changed; 144 + } 145 + 146 + bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) 147 + { 148 + bool mask_changed; 149 + 150 + mutex_lock(&vdev->igate); 151 + mask_changed = __vfio_pci_intx_mask(vdev); 152 + mutex_unlock(&vdev->igate); 153 + 154 + return mask_changed; 150 155 } 151 156 152 157 /* ··· 211 194 return ret; 212 195 } 213 196 214 - void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) 197 + static void __vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) 215 198 { 199 + lockdep_assert_held(&vdev->igate); 200 + 216 201 if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0) 217 202 vfio_send_intx_eventfd(vdev, NULL); 203 + } 204 + 205 + void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) 206 + { 207 + mutex_lock(&vdev->igate); 208 + __vfio_pci_intx_unmask(vdev); 209 + mutex_unlock(&vdev->igate); 218 210 } 219 211 220 212 static irqreturn_t vfio_intx_handler(int irq, void *dev_id) ··· 257 231 return ret; 258 232 } 259 233 260 - static int vfio_intx_enable(struct 
vfio_pci_core_device *vdev) 234 + static int vfio_intx_enable(struct vfio_pci_core_device *vdev, 235 + struct eventfd_ctx *trigger) 261 236 { 237 + struct pci_dev *pdev = vdev->pdev; 262 238 struct vfio_pci_irq_ctx *ctx; 239 + unsigned long irqflags; 240 + char *name; 241 + int ret; 263 242 264 243 if (!is_irq_none(vdev)) 265 244 return -EINVAL; 266 245 267 - if (!vdev->pdev->irq) 246 + if (!pdev->irq) 268 247 return -ENODEV; 248 + 249 + name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev)); 250 + if (!name) 251 + return -ENOMEM; 269 252 270 253 ctx = vfio_irq_ctx_alloc(vdev, 0); 271 254 if (!ctx) 272 255 return -ENOMEM; 273 256 257 + ctx->name = name; 258 + ctx->trigger = trigger; 259 + 274 260 /* 275 - * If the virtual interrupt is masked, restore it. Devices 276 - * supporting DisINTx can be masked at the hardware level 277 - * here, non-PCI-2.3 devices will have to wait until the 278 - * interrupt is enabled. 261 + * Fill the initial masked state based on virq_disabled. After 262 + * enable, changing the DisINTx bit in vconfig directly changes INTx 263 + * masking. igate prevents races during setup, once running masked 264 + * is protected via irqlock. 265 + * 266 + * Devices supporting DisINTx also reflect the current mask state in 267 + * the physical DisINTx bit, which is not affected during IRQ setup. 268 + * 269 + * Devices without DisINTx support require an exclusive interrupt. 270 + * IRQ masking is performed at the IRQ chip. Again, igate protects 271 + * against races during setup and IRQ handlers and irqfds are not 272 + * yet active, therefore masked is stable and can be used to 273 + * conditionally auto-enable the IRQ. 274 + * 275 + * irq_type must be stable while the IRQ handler is registered, 276 + * therefore it must be set before request_irq(). 
279 277 */ 280 278 ctx->masked = vdev->virq_disabled; 281 - if (vdev->pci_2_3) 282 - pci_intx(vdev->pdev, !ctx->masked); 279 + if (vdev->pci_2_3) { 280 + pci_intx(pdev, !ctx->masked); 281 + irqflags = IRQF_SHARED; 282 + } else { 283 + irqflags = ctx->masked ? IRQF_NO_AUTOEN : 0; 284 + } 283 285 284 286 vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX; 287 + 288 + ret = request_irq(pdev->irq, vfio_intx_handler, 289 + irqflags, ctx->name, vdev); 290 + if (ret) { 291 + vdev->irq_type = VFIO_PCI_NUM_IRQS; 292 + kfree(name); 293 + vfio_irq_ctx_free(vdev, ctx, 0); 294 + return ret; 295 + } 285 296 286 297 return 0; 287 298 } 288 299 289 - static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd) 300 + static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, 301 + struct eventfd_ctx *trigger) 290 302 { 291 303 struct pci_dev *pdev = vdev->pdev; 292 - unsigned long irqflags = IRQF_SHARED; 293 304 struct vfio_pci_irq_ctx *ctx; 294 - struct eventfd_ctx *trigger; 295 - unsigned long flags; 296 - int ret; 305 + struct eventfd_ctx *old; 297 306 298 307 ctx = vfio_irq_ctx_get(vdev, 0); 299 308 if (WARN_ON_ONCE(!ctx)) 300 309 return -EINVAL; 301 310 302 - if (ctx->trigger) { 303 - free_irq(pdev->irq, vdev); 304 - kfree(ctx->name); 305 - eventfd_ctx_put(ctx->trigger); 306 - ctx->trigger = NULL; 311 + old = ctx->trigger; 312 + 313 + WRITE_ONCE(ctx->trigger, trigger); 314 + 315 + /* Releasing an old ctx requires synchronizing in-flight users */ 316 + if (old) { 317 + synchronize_irq(pdev->irq); 318 + vfio_virqfd_flush_thread(&ctx->unmask); 319 + eventfd_ctx_put(old); 307 320 } 308 - 309 - if (fd < 0) /* Disable only */ 310 - return 0; 311 - 312 - ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", 313 - pci_name(pdev)); 314 - if (!ctx->name) 315 - return -ENOMEM; 316 - 317 - trigger = eventfd_ctx_fdget(fd); 318 - if (IS_ERR(trigger)) { 319 - kfree(ctx->name); 320 - return PTR_ERR(trigger); 321 - } 322 - 323 - ctx->trigger = trigger; 324 - 325 - if 
(!vdev->pci_2_3) 326 - irqflags = 0; 327 - 328 - ret = request_irq(pdev->irq, vfio_intx_handler, 329 - irqflags, ctx->name, vdev); 330 - if (ret) { 331 - ctx->trigger = NULL; 332 - kfree(ctx->name); 333 - eventfd_ctx_put(trigger); 334 - return ret; 335 - } 336 - 337 - /* 338 - * INTx disable will stick across the new irq setup, 339 - * disable_irq won't. 340 - */ 341 - spin_lock_irqsave(&vdev->irqlock, flags); 342 - if (!vdev->pci_2_3 && ctx->masked) 343 - disable_irq_nosync(pdev->irq); 344 - spin_unlock_irqrestore(&vdev->irqlock, flags); 345 321 346 322 return 0; 347 323 } 348 324 349 325 static void vfio_intx_disable(struct vfio_pci_core_device *vdev) 350 326 { 327 + struct pci_dev *pdev = vdev->pdev; 351 328 struct vfio_pci_irq_ctx *ctx; 352 329 353 330 ctx = vfio_irq_ctx_get(vdev, 0); ··· 358 329 if (ctx) { 359 330 vfio_virqfd_disable(&ctx->unmask); 360 331 vfio_virqfd_disable(&ctx->mask); 332 + free_irq(pdev->irq, vdev); 333 + if (ctx->trigger) 334 + eventfd_ctx_put(ctx->trigger); 335 + kfree(ctx->name); 336 + vfio_irq_ctx_free(vdev, ctx, 0); 361 337 } 362 - vfio_intx_set_signal(vdev, -1); 363 338 vdev->irq_type = VFIO_PCI_NUM_IRQS; 364 - vfio_irq_ctx_free(vdev, ctx, 0); 365 339 } 366 340 367 341 /* ··· 592 560 return -EINVAL; 593 561 594 562 if (flags & VFIO_IRQ_SET_DATA_NONE) { 595 - vfio_pci_intx_unmask(vdev); 563 + __vfio_pci_intx_unmask(vdev); 596 564 } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { 597 565 uint8_t unmask = *(uint8_t *)data; 598 566 if (unmask) 599 - vfio_pci_intx_unmask(vdev); 567 + __vfio_pci_intx_unmask(vdev); 600 568 } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { 601 569 struct vfio_pci_irq_ctx *ctx = vfio_irq_ctx_get(vdev, 0); 602 570 int32_t fd = *(int32_t *)data; ··· 623 591 return -EINVAL; 624 592 625 593 if (flags & VFIO_IRQ_SET_DATA_NONE) { 626 - vfio_pci_intx_mask(vdev); 594 + __vfio_pci_intx_mask(vdev); 627 595 } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { 628 596 uint8_t mask = *(uint8_t *)data; 629 597 if (mask) 630 - 
vfio_pci_intx_mask(vdev); 598 + __vfio_pci_intx_mask(vdev); 631 599 } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { 632 600 return -ENOTTY; /* XXX implement me */ 633 601 } ··· 648 616 return -EINVAL; 649 617 650 618 if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { 619 + struct eventfd_ctx *trigger = NULL; 651 620 int32_t fd = *(int32_t *)data; 652 621 int ret; 653 622 623 + if (fd >= 0) { 624 + trigger = eventfd_ctx_fdget(fd); 625 + if (IS_ERR(trigger)) 626 + return PTR_ERR(trigger); 627 + } 628 + 654 629 if (is_intx(vdev)) 655 - return vfio_intx_set_signal(vdev, fd); 630 + ret = vfio_intx_set_signal(vdev, trigger); 631 + else 632 + ret = vfio_intx_enable(vdev, trigger); 656 633 657 - ret = vfio_intx_enable(vdev); 658 - if (ret) 659 - return ret; 660 - 661 - ret = vfio_intx_set_signal(vdev, fd); 662 - if (ret) 663 - vfio_intx_disable(vdev); 634 + if (ret && trigger) 635 + eventfd_ctx_put(trigger); 664 636 665 637 return ret; 666 638 }
+9 -7
drivers/vfio/pci/vfio_pci_rdwr.c
··· 96 96 * reads with -1. This is intended for handling MSI-X vector tables and 97 97 * leftover space for ROM BARs. 98 98 */ 99 - static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, 100 - void __iomem *io, char __user *buf, 101 - loff_t off, size_t count, size_t x_start, 102 - size_t x_end, bool iswrite) 99 + ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, 100 + void __iomem *io, char __user *buf, 101 + loff_t off, size_t count, size_t x_start, 102 + size_t x_end, bool iswrite) 103 103 { 104 104 ssize_t done = 0; 105 105 int ret; ··· 201 201 202 202 return done; 203 203 } 204 + EXPORT_SYMBOL_GPL(vfio_pci_core_do_io_rw); 204 205 205 206 int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar) 206 207 { ··· 280 279 x_end = vdev->msix_offset + vdev->msix_size; 281 280 } 282 281 283 - done = do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos, 284 - count, x_start, x_end, iswrite); 282 + done = vfio_pci_core_do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos, 283 + count, x_start, x_end, iswrite); 285 284 286 285 if (done >= 0) 287 286 *ppos += done; ··· 349 348 * probing, so we don't currently worry about access in relation 350 349 * to the memory enable bit in the command register. 351 350 */ 352 - done = do_io_rw(vdev, false, iomem, buf, off, count, 0, 0, iswrite); 351 + done = vfio_pci_core_do_io_rw(vdev, false, iomem, buf, off, count, 352 + 0, 0, iswrite); 353 353 354 354 vga_put(vdev->pdev, rsrc); 355 355
+26 -46
drivers/vfio/pci/virtio/main.c
··· 132 132 return ret ? ret : count; 133 133 } 134 134 135 - static bool range_intersect_range(loff_t range1_start, size_t count1, 136 - loff_t range2_start, size_t count2, 137 - loff_t *start_offset, 138 - size_t *intersect_count, 139 - size_t *register_offset) 140 - { 141 - if (range1_start <= range2_start && 142 - range1_start + count1 > range2_start) { 143 - *start_offset = range2_start - range1_start; 144 - *intersect_count = min_t(size_t, count2, 145 - range1_start + count1 - range2_start); 146 - *register_offset = 0; 147 - return true; 148 - } 149 - 150 - if (range1_start > range2_start && 151 - range1_start < range2_start + count2) { 152 - *start_offset = 0; 153 - *intersect_count = min_t(size_t, count1, 154 - range2_start + count2 - range1_start); 155 - *register_offset = range1_start - range2_start; 156 - return true; 157 - } 158 - 159 - return false; 160 - } 161 - 162 135 static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, 163 136 char __user *buf, size_t count, 164 137 loff_t *ppos) ··· 151 178 if (ret < 0) 152 179 return ret; 153 180 154 - if (range_intersect_range(pos, count, PCI_DEVICE_ID, sizeof(val16), 155 - &copy_offset, &copy_count, &register_offset)) { 181 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_DEVICE_ID, 182 + sizeof(val16), &copy_offset, 183 + &copy_count, &register_offset)) { 156 184 val16 = cpu_to_le16(VIRTIO_TRANS_ID_NET); 157 185 if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count)) 158 186 return -EFAULT; 159 187 } 160 188 161 189 if ((le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO) && 162 - range_intersect_range(pos, count, PCI_COMMAND, sizeof(val16), 163 - &copy_offset, &copy_count, &register_offset)) { 190 + vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND, 191 + sizeof(val16), &copy_offset, 192 + &copy_count, &register_offset)) { 164 193 if (copy_from_user((void *)&val16 + register_offset, buf + copy_offset, 165 194 copy_count)) 166 195 return -EFAULT; ··· 
172 197 return -EFAULT; 173 198 } 174 199 175 - if (range_intersect_range(pos, count, PCI_REVISION_ID, sizeof(val8), 176 - &copy_offset, &copy_count, &register_offset)) { 200 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_REVISION_ID, 201 + sizeof(val8), &copy_offset, 202 + &copy_count, &register_offset)) { 177 203 /* Transional needs to have revision 0 */ 178 204 val8 = 0; 179 205 if (copy_to_user(buf + copy_offset, &val8, copy_count)) 180 206 return -EFAULT; 181 207 } 182 208 183 - if (range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, sizeof(val32), 184 - &copy_offset, &copy_count, &register_offset)) { 209 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, 210 + sizeof(val32), &copy_offset, 211 + &copy_count, &register_offset)) { 185 212 u32 bar_mask = ~(virtvdev->bar0_virtual_buf_size - 1); 186 213 u32 pci_base_addr_0 = le32_to_cpu(virtvdev->pci_base_addr_0); 187 214 ··· 192 215 return -EFAULT; 193 216 } 194 217 195 - if (range_intersect_range(pos, count, PCI_SUBSYSTEM_ID, sizeof(val16), 196 - &copy_offset, &copy_count, &register_offset)) { 218 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_ID, 219 + sizeof(val16), &copy_offset, 220 + &copy_count, &register_offset)) { 197 221 /* 198 222 * Transitional devices use the PCI subsystem device id as 199 223 * virtio device id, same as legacy driver always did. 
··· 205 227 return -EFAULT; 206 228 } 207 229 208 - if (range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID, sizeof(val16), 209 - &copy_offset, &copy_count, &register_offset)) { 230 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID, 231 + sizeof(val16), &copy_offset, 232 + &copy_count, &register_offset)) { 210 233 val16 = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET); 211 234 if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, 212 235 copy_count)) ··· 249 270 loff_t copy_offset; 250 271 size_t copy_count; 251 272 252 - if (range_intersect_range(pos, count, PCI_COMMAND, sizeof(virtvdev->pci_cmd), 253 - &copy_offset, &copy_count, 254 - &register_offset)) { 273 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND, 274 + sizeof(virtvdev->pci_cmd), 275 + &copy_offset, &copy_count, 276 + &register_offset)) { 255 277 if (copy_from_user((void *)&virtvdev->pci_cmd + register_offset, 256 278 buf + copy_offset, 257 279 copy_count)) 258 280 return -EFAULT; 259 281 } 260 282 261 - if (range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, 262 - sizeof(virtvdev->pci_base_addr_0), 263 - &copy_offset, &copy_count, 264 - &register_offset)) { 283 + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, 284 + sizeof(virtvdev->pci_base_addr_0), 285 + &copy_offset, &copy_count, 286 + &register_offset)) { 265 287 if (copy_from_user((void *)&virtvdev->pci_base_addr_0 + register_offset, 266 288 buf + copy_offset, 267 289 copy_count))
+3 -3
drivers/vfio/platform/vfio_amba.c
··· 122 122 .detach_ioas = vfio_iommufd_physical_detach_ioas, 123 123 }; 124 124 125 - static const struct amba_id pl330_ids[] = { 125 + static const struct amba_id vfio_amba_ids[] = { 126 126 { 0, 0 }, 127 127 }; 128 128 129 - MODULE_DEVICE_TABLE(amba, pl330_ids); 129 + MODULE_DEVICE_TABLE(amba, vfio_amba_ids); 130 130 131 131 static struct amba_driver vfio_amba_driver = { 132 132 .probe = vfio_amba_probe, 133 133 .remove = vfio_amba_remove, 134 - .id_table = pl330_ids, 134 + .id_table = vfio_amba_ids, 135 135 .drv = { 136 136 .name = "vfio-amba", 137 137 .owner = THIS_MODULE,
+2 -3
drivers/vfio/platform/vfio_platform.c
··· 85 85 vfio_platform_release_common(vdev); 86 86 } 87 87 88 - static int vfio_platform_remove(struct platform_device *pdev) 88 + static void vfio_platform_remove(struct platform_device *pdev) 89 89 { 90 90 struct vfio_platform_device *vdev = dev_get_drvdata(&pdev->dev); 91 91 92 92 vfio_unregister_group_dev(&vdev->vdev); 93 93 pm_runtime_disable(vdev->device); 94 94 vfio_put_device(&vdev->vdev); 95 - return 0; 96 95 } 97 96 98 97 static const struct vfio_device_ops vfio_platform_ops = { ··· 112 113 113 114 static struct platform_driver vfio_platform_driver = { 114 115 .probe = vfio_platform_probe, 115 - .remove = vfio_platform_remove, 116 + .remove_new = vfio_platform_remove, 116 117 .driver = { 117 118 .name = "vfio-platform", 118 119 },
+72 -33
drivers/vfio/platform/vfio_platform_irq.c
··· 136 136 return 0; 137 137 } 138 138 139 + /* 140 + * The trigger eventfd is guaranteed valid in the interrupt path 141 + * and protected by the igate mutex when triggered via ioctl. 142 + */ 143 + static void vfio_send_eventfd(struct vfio_platform_irq *irq_ctx) 144 + { 145 + if (likely(irq_ctx->trigger)) 146 + eventfd_signal(irq_ctx->trigger); 147 + } 148 + 139 149 static irqreturn_t vfio_automasked_irq_handler(int irq, void *dev_id) 140 150 { 141 151 struct vfio_platform_irq *irq_ctx = dev_id; ··· 165 155 spin_unlock_irqrestore(&irq_ctx->lock, flags); 166 156 167 157 if (ret == IRQ_HANDLED) 168 - eventfd_signal(irq_ctx->trigger); 158 + vfio_send_eventfd(irq_ctx); 169 159 170 160 return ret; 171 161 } ··· 174 164 { 175 165 struct vfio_platform_irq *irq_ctx = dev_id; 176 166 177 - eventfd_signal(irq_ctx->trigger); 167 + vfio_send_eventfd(irq_ctx); 178 168 179 169 return IRQ_HANDLED; 180 170 } 181 171 182 172 static int vfio_set_trigger(struct vfio_platform_device *vdev, int index, 183 - int fd, irq_handler_t handler) 173 + int fd) 184 174 { 185 175 struct vfio_platform_irq *irq = &vdev->irqs[index]; 186 176 struct eventfd_ctx *trigger; 187 - int ret; 188 177 189 178 if (irq->trigger) { 190 - irq_clear_status_flags(irq->hwirq, IRQ_NOAUTOEN); 191 - free_irq(irq->hwirq, irq); 192 - kfree(irq->name); 179 + disable_irq(irq->hwirq); 193 180 eventfd_ctx_put(irq->trigger); 194 181 irq->trigger = NULL; 195 182 } 196 183 197 184 if (fd < 0) /* Disable only */ 198 185 return 0; 199 - irq->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-irq[%d](%s)", 200 - irq->hwirq, vdev->name); 201 - if (!irq->name) 202 - return -ENOMEM; 203 186 204 187 trigger = eventfd_ctx_fdget(fd); 205 - if (IS_ERR(trigger)) { 206 - kfree(irq->name); 188 + if (IS_ERR(trigger)) 207 189 return PTR_ERR(trigger); 208 - } 209 190 210 191 irq->trigger = trigger; 211 192 212 - irq_set_status_flags(irq->hwirq, IRQ_NOAUTOEN); 213 - ret = request_irq(irq->hwirq, handler, 0, irq->name, irq); 214 - if (ret) { 215 - 
kfree(irq->name); 216 - eventfd_ctx_put(trigger); 217 - irq->trigger = NULL; 218 - return ret; 219 - } 220 - 221 - if (!irq->masked) 222 - enable_irq(irq->hwirq); 193 + /* 194 + * irq->masked effectively provides nested disables within the overall 195 + * enable relative to trigger. Specifically request_irq() is called 196 + * with NO_AUTOEN, therefore the IRQ is initially disabled. The user 197 + * may only further disable the IRQ with a MASK operations because 198 + * irq->masked is initially false. 199 + */ 200 + enable_irq(irq->hwirq); 223 201 224 202 return 0; 225 203 } ··· 226 228 handler = vfio_irq_handler; 227 229 228 230 if (!count && (flags & VFIO_IRQ_SET_DATA_NONE)) 229 - return vfio_set_trigger(vdev, index, -1, handler); 231 + return vfio_set_trigger(vdev, index, -1); 230 232 231 233 if (start != 0 || count != 1) 232 234 return -EINVAL; ··· 234 236 if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { 235 237 int32_t fd = *(int32_t *)data; 236 238 237 - return vfio_set_trigger(vdev, index, fd, handler); 239 + return vfio_set_trigger(vdev, index, fd); 238 240 } 239 241 240 242 if (flags & VFIO_IRQ_SET_DATA_NONE) { ··· 258 260 unsigned start, unsigned count, uint32_t flags, 259 261 void *data) = NULL; 260 262 263 + /* 264 + * For compatibility, errors from request_irq() are local to the 265 + * SET_IRQS path and reflected in the name pointer. This allows, 266 + * for example, polling mode fallback for an exclusive IRQ failure. 
267 + */ 268 + if (IS_ERR(vdev->irqs[index].name)) 269 + return PTR_ERR(vdev->irqs[index].name); 270 + 261 271 switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { 262 272 case VFIO_IRQ_SET_ACTION_MASK: 263 273 func = vfio_platform_set_irq_mask; ··· 286 280 287 281 int vfio_platform_irq_init(struct vfio_platform_device *vdev) 288 282 { 289 - int cnt = 0, i; 283 + int cnt = 0, i, ret = 0; 290 284 291 285 while (vdev->get_irq(vdev, cnt) >= 0) 292 286 cnt++; ··· 298 292 299 293 for (i = 0; i < cnt; i++) { 300 294 int hwirq = vdev->get_irq(vdev, i); 295 + irq_handler_t handler = vfio_irq_handler; 301 296 302 - if (hwirq < 0) 297 + if (hwirq < 0) { 298 + ret = -EINVAL; 303 299 goto err; 300 + } 304 301 305 302 spin_lock_init(&vdev->irqs[i].lock); 306 303 307 304 vdev->irqs[i].flags = VFIO_IRQ_INFO_EVENTFD; 308 305 309 - if (irq_get_trigger_type(hwirq) & IRQ_TYPE_LEVEL_MASK) 306 + if (irq_get_trigger_type(hwirq) & IRQ_TYPE_LEVEL_MASK) { 310 307 vdev->irqs[i].flags |= VFIO_IRQ_INFO_MASKABLE 311 308 | VFIO_IRQ_INFO_AUTOMASKED; 309 + handler = vfio_automasked_irq_handler; 310 + } 312 311 313 312 vdev->irqs[i].count = 1; 314 313 vdev->irqs[i].hwirq = hwirq; 315 314 vdev->irqs[i].masked = false; 315 + vdev->irqs[i].name = kasprintf(GFP_KERNEL_ACCOUNT, 316 + "vfio-irq[%d](%s)", hwirq, 317 + vdev->name); 318 + if (!vdev->irqs[i].name) { 319 + ret = -ENOMEM; 320 + goto err; 321 + } 322 + 323 + ret = request_irq(hwirq, handler, IRQF_NO_AUTOEN, 324 + vdev->irqs[i].name, &vdev->irqs[i]); 325 + if (ret) { 326 + kfree(vdev->irqs[i].name); 327 + vdev->irqs[i].name = ERR_PTR(ret); 328 + } 316 329 } 317 330 318 331 vdev->num_irqs = cnt; 319 332 320 333 return 0; 321 334 err: 335 + for (--i; i >= 0; i--) { 336 + if (!IS_ERR(vdev->irqs[i].name)) { 337 + free_irq(vdev->irqs[i].hwirq, &vdev->irqs[i]); 338 + kfree(vdev->irqs[i].name); 339 + } 340 + } 322 341 kfree(vdev->irqs); 323 - return -EINVAL; 342 + return ret; 324 343 } 325 344 326 345 void vfio_platform_irq_cleanup(struct 
vfio_platform_device *vdev) 327 346 { 328 347 int i; 329 348 330 - for (i = 0; i < vdev->num_irqs; i++) 331 - vfio_set_trigger(vdev, i, -1, NULL); 349 + for (i = 0; i < vdev->num_irqs; i++) { 350 + vfio_virqfd_disable(&vdev->irqs[i].mask); 351 + vfio_virqfd_disable(&vdev->irqs[i].unmask); 352 + if (!IS_ERR(vdev->irqs[i].name)) { 353 + free_irq(vdev->irqs[i].hwirq, &vdev->irqs[i]); 354 + if (vdev->irqs[i].trigger) 355 + eventfd_ctx_put(vdev->irqs[i].trigger); 356 + kfree(vdev->irqs[i].name); 357 + } 358 + } 332 359 333 360 vdev->num_irqs = 0; 334 361 kfree(vdev->irqs);
-12
drivers/vfio/vfio_iommu_type1.c
··· 567 567 ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM, 568 568 pages, NULL); 569 569 if (ret > 0) { 570 - int i; 571 - 572 - /* 573 - * The zero page is always resident, we don't need to pin it 574 - * and it falls into our invalid/reserved test so we don't 575 - * unpin in put_pfn(). Unpin all zero pages in the batch here. 576 - */ 577 - for (i = 0 ; i < ret; i++) { 578 - if (unlikely(is_zero_pfn(page_to_pfn(pages[i])))) 579 - unpin_user_page(pages[i]); 580 - } 581 - 582 570 *pfn = page_to_pfn(pages[0]); 583 571 goto done; 584 572 }
+21
drivers/vfio/virqfd.c
··· 101 101 virqfd->thread(virqfd->opaque, virqfd->data); 102 102 } 103 103 104 + static void virqfd_flush_inject(struct work_struct *work) 105 + { 106 + struct virqfd *virqfd = container_of(work, struct virqfd, flush_inject); 107 + 108 + flush_work(&virqfd->inject); 109 + } 110 + 104 111 int vfio_virqfd_enable(void *opaque, 105 112 int (*handler)(void *, void *), 106 113 void (*thread)(void *, void *), ··· 131 124 132 125 INIT_WORK(&virqfd->shutdown, virqfd_shutdown); 133 126 INIT_WORK(&virqfd->inject, virqfd_inject); 127 + INIT_WORK(&virqfd->flush_inject, virqfd_flush_inject); 134 128 135 129 irqfd = fdget(fd); 136 130 if (!irqfd.file) { ··· 221 213 flush_workqueue(vfio_irqfd_cleanup_wq); 222 214 } 223 215 EXPORT_SYMBOL_GPL(vfio_virqfd_disable); 216 + 217 + void vfio_virqfd_flush_thread(struct virqfd **pvirqfd) 218 + { 219 + unsigned long flags; 220 + 221 + spin_lock_irqsave(&virqfd_lock, flags); 222 + if (*pvirqfd && (*pvirqfd)->thread) 223 + queue_work(vfio_irqfd_cleanup_wq, &(*pvirqfd)->flush_inject); 224 + spin_unlock_irqrestore(&virqfd_lock, flags); 225 + 226 + flush_workqueue(vfio_irqfd_cleanup_wq); 227 + } 228 + EXPORT_SYMBOL_GPL(vfio_virqfd_flush_thread);
+5
include/linux/mlx5/mlx5_ifc.h
··· 12677 12677 struct mlx5_ifc_page_track_bits obj_context; 12678 12678 }; 12679 12679 12680 + struct mlx5_ifc_query_page_track_obj_out_bits { 12681 + struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; 12682 + struct mlx5_ifc_page_track_bits obj_context; 12683 + }; 12684 + 12680 12685 struct mlx5_ifc_msecq_reg_bits { 12681 12686 u8 reserved_at_0[0x20]; 12682 12687
+2
include/linux/vfio.h
··· 356 356 wait_queue_entry_t wait; 357 357 poll_table pt; 358 358 struct work_struct shutdown; 359 + struct work_struct flush_inject; 359 360 struct virqfd **pvirqfd; 360 361 }; 361 362 ··· 364 363 void (*thread)(void *, void *), void *data, 365 364 struct virqfd **pvirqfd, int fd); 366 365 void vfio_virqfd_disable(struct virqfd **pvirqfd); 366 + void vfio_virqfd_flush_thread(struct virqfd **pvirqfd); 367 367 368 368 #endif /* VFIO_H */
+9 -1
include/linux/vfio_pci_core.h
··· 130 130 int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar); 131 131 pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, 132 132 pci_channel_state_t state); 133 - 133 + ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, 134 + void __iomem *io, char __user *buf, 135 + loff_t off, size_t count, size_t x_start, 136 + size_t x_end, bool iswrite); 137 + bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, 138 + loff_t reg_start, size_t reg_cnt, 139 + loff_t *buf_offset, 140 + size_t *intersect_count, 141 + size_t *register_offset); 134 142 #define VFIO_IOWRITE_DECLATION(size) \ 135 143 int vfio_pci_core_iowrite##size(struct vfio_pci_core_device *vdev, \ 136 144 bool test_mem, u##size val, void __iomem *io);
+8 -10
samples/vfio-mdev/mbochs.c
··· 133 133 }; 134 134 135 135 static dev_t mbochs_devt; 136 - static struct class *mbochs_class; 136 + static const struct class mbochs_class = { 137 + .name = MBOCHS_CLASS_NAME, 138 + }; 137 139 static struct cdev mbochs_cdev; 138 140 static struct device mbochs_dev; 139 141 static struct mdev_parent mbochs_parent; ··· 1424 1422 if (ret) 1425 1423 goto err_cdev; 1426 1424 1427 - mbochs_class = class_create(MBOCHS_CLASS_NAME); 1428 - if (IS_ERR(mbochs_class)) { 1429 - pr_err("Error: failed to register mbochs_dev class\n"); 1430 - ret = PTR_ERR(mbochs_class); 1425 + ret = class_register(&mbochs_class); 1426 + if (ret) 1431 1427 goto err_driver; 1432 - } 1433 - mbochs_dev.class = mbochs_class; 1428 + mbochs_dev.class = &mbochs_class; 1434 1429 mbochs_dev.release = mbochs_device_release; 1435 1430 dev_set_name(&mbochs_dev, "%s", MBOCHS_NAME); 1436 1431 ··· 1447 1448 device_del(&mbochs_dev); 1448 1449 err_put: 1449 1450 put_device(&mbochs_dev); 1450 - class_destroy(mbochs_class); 1451 + class_unregister(&mbochs_class); 1451 1452 err_driver: 1452 1453 mdev_unregister_driver(&mbochs_driver); 1453 1454 err_cdev: ··· 1465 1466 mdev_unregister_driver(&mbochs_driver); 1466 1467 cdev_del(&mbochs_cdev); 1467 1468 unregister_chrdev_region(mbochs_devt, MINORMASK + 1); 1468 - class_destroy(mbochs_class); 1469 - mbochs_class = NULL; 1469 + class_unregister(&mbochs_class); 1470 1470 } 1471 1471 1472 1472 MODULE_IMPORT_NS(DMA_BUF);
+8 -10
samples/vfio-mdev/mdpy.c
··· 84 84 }; 85 85 86 86 static dev_t mdpy_devt; 87 - static struct class *mdpy_class; 87 + static const struct class mdpy_class = { 88 + .name = MDPY_CLASS_NAME, 89 + }; 88 90 static struct cdev mdpy_cdev; 89 91 static struct device mdpy_dev; 90 92 static struct mdev_parent mdpy_parent; ··· 711 709 if (ret) 712 710 goto err_cdev; 713 711 714 - mdpy_class = class_create(MDPY_CLASS_NAME); 715 - if (IS_ERR(mdpy_class)) { 716 - pr_err("Error: failed to register mdpy_dev class\n"); 717 - ret = PTR_ERR(mdpy_class); 712 + ret = class_register(&mdpy_class); 713 + if (ret) 718 714 goto err_driver; 719 - } 720 - mdpy_dev.class = mdpy_class; 715 + mdpy_dev.class = &mdpy_class; 721 716 mdpy_dev.release = mdpy_device_release; 722 717 dev_set_name(&mdpy_dev, "%s", MDPY_NAME); 723 718 ··· 734 735 device_del(&mdpy_dev); 735 736 err_put: 736 737 put_device(&mdpy_dev); 737 - class_destroy(mdpy_class); 738 + class_unregister(&mdpy_class); 738 739 err_driver: 739 740 mdev_unregister_driver(&mdpy_driver); 740 741 err_cdev: ··· 752 753 mdev_unregister_driver(&mdpy_driver); 753 754 cdev_del(&mdpy_cdev); 754 755 unregister_chrdev_region(mdpy_devt, MINORMASK + 1); 755 - class_destroy(mdpy_class); 756 - mdpy_class = NULL; 756 + class_unregister(&mdpy_class); 757 757 } 758 758 759 759 module_param_named(count, mdpy_driver.max_instances, int, 0444);