Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'topic/xe-vfio-2025-12-01' of https://gitlab.freedesktop.org/drm/xe/kernel into drm-next

Cross-subsystem Changes:
- Add device specific vfio_pci driver variant for intel graphics (Michal Winiarski)

Driver Changes:
- Add scope-based cleanup helper for runtime PM (Matt Roper)
- Additional xe driver prerequisites and exports (Michal Winiarski)

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Thomas Hellstrom <thomas.hellstrom@linux.intel.com>
Link: https://patch.msgid.link/aS1bNpqeem6PIHrA@fedora

+926 -7
+7
MAINTAINERS
··· 27022 27022 S: Maintained 27023 27023 F: drivers/vfio/pci/virtio 27024 27024 27025 + VFIO XE PCI DRIVER 27026 + M: Michał Winiarski <michal.winiarski@intel.com> 27027 + L: kvm@vger.kernel.org 27028 + L: intel-xe@lists.freedesktop.org 27029 + S: Supported 27030 + F: drivers/vfio/pci/xe 27031 + 27025 27032 VGA_SWITCHEROO 27026 27033 R: Lukas Wunner <lukas@wunner.de> 27027 27034 S: Maintained
+4
drivers/gpu/drm/xe/Makefile
··· 184 184 xe_sriov_pf_sysfs.o \ 185 185 xe_tile_sriov_pf_debugfs.o 186 186 187 + ifeq ($(CONFIG_PCI_IOV),y) 188 + xe-$(CONFIG_XE_VFIO_PCI) += xe_sriov_vfio.o 189 + endif 190 + 187 191 # include helpers for tests even when XE is built-in 188 192 ifdef CONFIG_DRM_XE_KUNIT_TEST 189 193 xe-y += tests/xe_kunit_helpers.o
+9
drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c
··· 17 17 #include "xe_gt_sriov_pf_helpers.h" 18 18 #include "xe_gt_sriov_pf_migration.h" 19 19 #include "xe_gt_sriov_printk.h" 20 + #include "xe_guc.h" 20 21 #include "xe_guc_buf.h" 21 22 #include "xe_guc_ct.h" 22 23 #include "xe_migrate.h" ··· 1024 1023 ptr_ring_cleanup(r, destroy_pf_packet); 1025 1024 } 1026 1025 1026 + static void pf_gt_migration_check_support(struct xe_gt *gt) 1027 + { 1028 + if (GUC_FIRMWARE_VER(&gt->uc.guc) < MAKE_GUC_VER(70, 54, 0)) 1029 + xe_sriov_pf_migration_disable(gt_to_xe(gt), "requires GuC version >= 70.54.0"); 1030 + } 1031 + 1027 1032 /** 1028 1033 * xe_gt_sriov_pf_migration_init() - Initialize support for VF migration. 1029 1034 * @gt: the &xe_gt ··· 1045 1038 int err; 1046 1039 1047 1040 xe_gt_assert(gt, IS_SRIOV_PF(xe)); 1041 + 1042 + pf_gt_migration_check_support(gt); 1048 1043 1049 1044 if (!pf_migration_supported(gt)) 1050 1045 return 0;
+17
drivers/gpu/drm/xe/xe_pci.c
··· 1223 1223 #endif 1224 1224 }; 1225 1225 1226 + /** 1227 + * xe_pci_to_pf_device() - Get PF &xe_device. 1228 + * @pdev: the VF &pci_dev device 1229 + * 1230 + * Return: pointer to PF &xe_device, NULL otherwise. 1231 + */ 1232 + struct xe_device *xe_pci_to_pf_device(struct pci_dev *pdev) 1233 + { 1234 + struct drm_device *drm; 1235 + 1236 + drm = pci_iov_get_pf_drvdata(pdev, &xe_pci_driver); 1237 + if (IS_ERR(drm)) 1238 + return NULL; 1239 + 1240 + return to_xe_device(drm); 1241 + } 1242 + 1226 1243 int xe_register_pci_driver(void) 1227 1244 { 1228 1245 return pci_register_driver(&xe_pci_driver);
+3
drivers/gpu/drm/xe/xe_pci.h
··· 6 6 #ifndef _XE_PCI_H_ 7 7 #define _XE_PCI_H_ 8 8 9 + struct pci_dev; 10 + 9 11 int xe_register_pci_driver(void); 10 12 void xe_unregister_pci_driver(void); 13 + struct xe_device *xe_pci_to_pf_device(struct pci_dev *pdev); 11 14 12 15 #endif
+21
drivers/gpu/drm/xe/xe_pm.c
··· 726 726 /** 727 727 * xe_pm_runtime_get - Get a runtime_pm reference and resume synchronously 728 728 * @xe: xe device instance 729 + * 730 + * When possible, scope-based runtime PM (through guard(xe_pm_runtime)) is 731 + * preferred over direct usage of this function. Manual get/put handling 732 + * should only be used when the function contains goto-based logic which 733 + * can break scope-based handling, or when the lifetime of the runtime PM 734 + * reference does not match a specific scope (e.g., runtime PM obtained in one 735 + * function and released in a different one). 729 736 */ 730 737 void xe_pm_runtime_get(struct xe_device *xe) 731 738 { ··· 764 757 /** 765 758 * xe_pm_runtime_get_ioctl - Get a runtime_pm reference before ioctl 766 759 * @xe: xe device instance 760 + * 761 + * When possible, scope-based runtime PM (through 762 + * ACQUIRE(xe_pm_runtime_ioctl, ...)) is preferred over direct usage of this 763 + * function. Manual get/put handling should only be used when the function 764 + * contains goto-based logic which can break scope-based handling, or when the 765 + * lifetime of the runtime PM reference does not match a specific scope (e.g., 766 + * runtime PM obtained in one function and released in a different one). 767 767 * 768 768 * Returns: Any number greater than or equal to 0 for success, negative error 769 769 * code otherwise. ··· 841 827 * It will warn if not protected. 842 828 * The reference should be put back after this function regardless, since it 843 829 * will always bump the usage counter, regardless. 830 + * 831 + * When possible, scope-based runtime PM (through guard(xe_pm_runtime_noresume)) 832 + * is preferred over direct usage of this function. 
Manual get/put handling 833 + * should only be used when the function contains goto-based logic which can 834 + * break scope-based handling, or when the lifetime of the runtime PM reference 835 + * does not match a specific scope (e.g., runtime PM obtained in one function 836 + * and released in a different one). 844 837 */ 845 838 void xe_pm_runtime_get_noresume(struct xe_device *xe) 846 839 {
+17
drivers/gpu/drm/xe/xe_pm.h
··· 6 6 #ifndef _XE_PM_H_ 7 7 #define _XE_PM_H_ 8 8 9 + #include <linux/cleanup.h> 9 10 #include <linux/pm_runtime.h> 10 11 11 12 #define DEFAULT_VRAM_THRESHOLD 300 /* in MB */ ··· 37 36 int xe_pm_block_on_suspend(struct xe_device *xe); 38 37 void xe_pm_might_block_on_suspend(void); 39 38 int xe_pm_module_init(void); 39 + 40 + static inline void __xe_pm_runtime_noop(struct xe_device *xe) {} 41 + 42 + DEFINE_GUARD(xe_pm_runtime, struct xe_device *, 43 + xe_pm_runtime_get(_T), xe_pm_runtime_put(_T)) 44 + DEFINE_GUARD(xe_pm_runtime_noresume, struct xe_device *, 45 + xe_pm_runtime_get_noresume(_T), xe_pm_runtime_put(_T)) 46 + DEFINE_GUARD_COND(xe_pm_runtime, _ioctl, xe_pm_runtime_get_ioctl(_T), _RET >= 0) 47 + 48 + /* 49 + * Used when a function needs to release runtime PM in all possible cases 50 + * and error paths, but the wakeref was already acquired by a different 51 + * function (i.e., get() has already happened so only a put() is needed). 52 + */ 53 + DEFINE_GUARD(xe_pm_runtime_release_only, struct xe_device *, 54 + __xe_pm_runtime_noop(_T), xe_pm_runtime_put(_T)); 40 55 41 56 #endif
+30 -5
drivers/gpu/drm/xe/xe_sriov_pf_migration.c
··· 46 46 { 47 47 xe_assert(xe, IS_SRIOV_PF(xe)); 48 48 49 - return xe->sriov.pf.migration.supported; 49 + return IS_ENABLED(CONFIG_DRM_XE_DEBUG) || !xe->sriov.pf.migration.disabled; 50 50 } 51 51 52 - static bool pf_check_migration_support(struct xe_device *xe) 52 + /** 53 + * xe_sriov_pf_migration_disable() - Turn off SR-IOV VF migration support on PF. 54 + * @xe: the &xe_device instance. 55 + * @fmt: format string for the log message, to be combined with following VAs. 56 + */ 57 + void xe_sriov_pf_migration_disable(struct xe_device *xe, const char *fmt, ...) 53 58 { 54 - /* XXX: for now this is for feature enabling only */ 55 - return IS_ENABLED(CONFIG_DRM_XE_DEBUG); 59 + struct va_format vaf; 60 + va_list va_args; 61 + 62 + xe_assert(xe, IS_SRIOV_PF(xe)); 63 + 64 + va_start(va_args, fmt); 65 + vaf.fmt = fmt; 66 + vaf.va = &va_args; 67 + xe_sriov_notice(xe, "migration %s: %pV\n", 68 + IS_ENABLED(CONFIG_DRM_XE_DEBUG) ? 69 + "missing prerequisite" : "disabled", 70 + &vaf); 71 + va_end(va_args); 72 + 73 + xe->sriov.pf.migration.disabled = true; 74 + } 75 + 76 + static void pf_migration_check_support(struct xe_device *xe) 77 + { 78 + if (!xe_device_has_memirq(xe)) 79 + xe_sriov_pf_migration_disable(xe, "requires memory-based IRQ support"); 56 80 } 57 81 58 82 static void pf_migration_cleanup(void *arg) ··· 101 77 102 78 xe_assert(xe, IS_SRIOV_PF(xe)); 103 79 104 - xe->sriov.pf.migration.supported = pf_check_migration_support(xe); 80 + pf_migration_check_support(xe); 81 + 105 82 if (!xe_sriov_pf_migration_supported(xe)) 106 83 return 0; 107 84
+1
drivers/gpu/drm/xe/xe_sriov_pf_migration.h
··· 14 14 15 15 int xe_sriov_pf_migration_init(struct xe_device *xe); 16 16 bool xe_sriov_pf_migration_supported(struct xe_device *xe); 17 + void xe_sriov_pf_migration_disable(struct xe_device *xe, const char *fmt, ...); 17 18 int xe_sriov_pf_migration_restore_produce(struct xe_device *xe, unsigned int vfid, 18 19 struct xe_sriov_packet *data); 19 20 struct xe_sriov_packet *
+2 -2
drivers/gpu/drm/xe/xe_sriov_pf_migration_types.h
··· 14 14 * struct xe_sriov_pf_migration - Xe device level VF migration data 15 15 */ 16 16 struct xe_sriov_pf_migration { 17 - /** @supported: indicates whether VF migration feature is supported */ 18 - bool supported; 17 + /** @disabled: indicates whether VF migration feature is disabled */ 18 + bool disabled; 19 19 }; 20 20 21 21 /**
+80
drivers/gpu/drm/xe/xe_sriov_vfio.c
··· 1 + // SPDX-License-Identifier: MIT 2 + /* 3 + * Copyright © 2025 Intel Corporation 4 + */ 5 + 6 + #include <drm/intel/xe_sriov_vfio.h> 7 + #include <linux/cleanup.h> 8 + 9 + #include "xe_pci.h" 10 + #include "xe_pm.h" 11 + #include "xe_sriov_pf_control.h" 12 + #include "xe_sriov_pf_helpers.h" 13 + #include "xe_sriov_pf_migration.h" 14 + 15 + struct xe_device *xe_sriov_vfio_get_pf(struct pci_dev *pdev) 16 + { 17 + return xe_pci_to_pf_device(pdev); 18 + } 19 + EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_get_pf, "xe-vfio-pci"); 20 + 21 + bool xe_sriov_vfio_migration_supported(struct xe_device *xe) 22 + { 23 + if (!IS_SRIOV_PF(xe)) 24 + return -EPERM; 25 + 26 + return xe_sriov_pf_migration_supported(xe); 27 + } 28 + EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_migration_supported, "xe-vfio-pci"); 29 + 30 + #define DEFINE_XE_SRIOV_VFIO_FUNCTION(_type, _func, _impl) \ 31 + _type xe_sriov_vfio_##_func(struct xe_device *xe, unsigned int vfid) \ 32 + { \ 33 + if (!IS_SRIOV_PF(xe)) \ 34 + return -EPERM; \ 35 + if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe)) \ 36 + return -EINVAL; \ 37 + \ 38 + guard(xe_pm_runtime_noresume)(xe); \ 39 + \ 40 + return xe_sriov_pf_##_impl(xe, vfid); \ 41 + } \ 42 + EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_##_func, "xe-vfio-pci") 43 + 44 + DEFINE_XE_SRIOV_VFIO_FUNCTION(int, wait_flr_done, control_wait_flr); 45 + DEFINE_XE_SRIOV_VFIO_FUNCTION(int, suspend_device, control_pause_vf); 46 + DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_device, control_resume_vf); 47 + DEFINE_XE_SRIOV_VFIO_FUNCTION(int, stop_copy_enter, control_trigger_save_vf); 48 + DEFINE_XE_SRIOV_VFIO_FUNCTION(int, stop_copy_exit, control_finish_save_vf); 49 + DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_data_enter, control_trigger_restore_vf); 50 + DEFINE_XE_SRIOV_VFIO_FUNCTION(int, resume_data_exit, control_finish_restore_vf); 51 + DEFINE_XE_SRIOV_VFIO_FUNCTION(int, error, control_stop_vf); 52 + DEFINE_XE_SRIOV_VFIO_FUNCTION(ssize_t, stop_copy_size, migration_size); 53 + 54 + ssize_t 
xe_sriov_vfio_data_read(struct xe_device *xe, unsigned int vfid, 55 + char __user *buf, size_t len) 56 + { 57 + if (!IS_SRIOV_PF(xe)) 58 + return -EPERM; 59 + if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe)) 60 + return -EINVAL; 61 + 62 + guard(xe_pm_runtime_noresume)(xe); 63 + 64 + return xe_sriov_pf_migration_read(xe, vfid, buf, len); 65 + } 66 + EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_data_read, "xe-vfio-pci"); 67 + 68 + ssize_t xe_sriov_vfio_data_write(struct xe_device *xe, unsigned int vfid, 69 + const char __user *buf, size_t len) 70 + { 71 + if (!IS_SRIOV_PF(xe)) 72 + return -EPERM; 73 + if (vfid == PFID || vfid > xe_sriov_pf_num_vfs(xe)) 74 + return -EINVAL; 75 + 76 + guard(xe_pm_runtime_noresume)(xe); 77 + 78 + return xe_sriov_pf_migration_write(xe, vfid, buf, len); 79 + } 80 + EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_data_write, "xe-vfio-pci");
+2
drivers/vfio/pci/Kconfig
··· 67 67 68 68 source "drivers/vfio/pci/qat/Kconfig" 69 69 70 + source "drivers/vfio/pci/xe/Kconfig" 71 + 70 72 endmenu
+2
drivers/vfio/pci/Makefile
··· 19 19 obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/ 20 20 21 21 obj-$(CONFIG_QAT_VFIO_PCI) += qat/ 22 + 23 + obj-$(CONFIG_XE_VFIO_PCI) += xe/
+12
drivers/vfio/pci/xe/Kconfig
··· 1 + # SPDX-License-Identifier: GPL-2.0-only 2 + config XE_VFIO_PCI 3 + tristate "VFIO support for Intel Graphics" 4 + depends on DRM_XE && PCI_IOV 5 + select VFIO_PCI_CORE 6 + help 7 + This option enables a device-specific VFIO driver variant for Intel Graphics. 8 + In addition to generic VFIO PCI functionality, it implements VFIO 9 + migration uAPI allowing userspace to enable migration for 10 + Intel Graphics SR-IOV Virtual Functions supported by the Xe driver. 11 + 12 + If you don't know what to do here, say N.
+3
drivers/vfio/pci/xe/Makefile
··· 1 + # SPDX-License-Identifier: GPL-2.0-only 2 + obj-$(CONFIG_XE_VFIO_PCI) += xe-vfio-pci.o 3 + xe-vfio-pci-y := main.o
+573
drivers/vfio/pci/xe/main.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright © 2025 Intel Corporation 4 + */ 5 + 6 + #include <linux/anon_inodes.h> 7 + #include <linux/delay.h> 8 + #include <linux/file.h> 9 + #include <linux/module.h> 10 + #include <linux/pci.h> 11 + #include <linux/sizes.h> 12 + #include <linux/types.h> 13 + #include <linux/vfio.h> 14 + #include <linux/vfio_pci_core.h> 15 + 16 + #include <drm/intel/xe_sriov_vfio.h> 17 + #include <drm/intel/pciids.h> 18 + 19 + struct xe_vfio_pci_migration_file { 20 + struct file *filp; 21 + /* serializes accesses to migration data */ 22 + struct mutex lock; 23 + struct xe_vfio_pci_core_device *xe_vdev; 24 + u8 disabled:1; 25 + }; 26 + 27 + struct xe_vfio_pci_core_device { 28 + struct vfio_pci_core_device core_device; 29 + struct xe_device *xe; 30 + /* PF internal control uses vfid index starting from 1 */ 31 + unsigned int vfid; 32 + u8 deferred_reset:1; 33 + /* protects migration state */ 34 + struct mutex state_mutex; 35 + enum vfio_device_mig_state mig_state; 36 + /* protects the reset_done flow */ 37 + spinlock_t reset_lock; 38 + struct xe_vfio_pci_migration_file *migf; 39 + }; 40 + 41 + #define xe_vdev_to_dev(xe_vdev) (&(xe_vdev)->core_device.pdev->dev) 42 + 43 + static void xe_vfio_pci_disable_file(struct xe_vfio_pci_migration_file *migf) 44 + { 45 + mutex_lock(&migf->lock); 46 + migf->disabled = true; 47 + mutex_unlock(&migf->lock); 48 + } 49 + 50 + static void xe_vfio_pci_put_file(struct xe_vfio_pci_core_device *xe_vdev) 51 + { 52 + xe_vfio_pci_disable_file(xe_vdev->migf); 53 + fput(xe_vdev->migf->filp); 54 + xe_vdev->migf = NULL; 55 + } 56 + 57 + static void xe_vfio_pci_reset(struct xe_vfio_pci_core_device *xe_vdev) 58 + { 59 + if (xe_vdev->migf) 60 + xe_vfio_pci_put_file(xe_vdev); 61 + 62 + xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 63 + } 64 + 65 + static void xe_vfio_pci_state_mutex_lock(struct xe_vfio_pci_core_device *xe_vdev) 66 + { 67 + mutex_lock(&xe_vdev->state_mutex); 68 + } 69 + 70 + /* 71 + * 
This function is called in all state_mutex unlock cases to 72 + * handle a 'deferred_reset' if exists. 73 + */ 74 + static void xe_vfio_pci_state_mutex_unlock(struct xe_vfio_pci_core_device *xe_vdev) 75 + { 76 + again: 77 + spin_lock(&xe_vdev->reset_lock); 78 + if (xe_vdev->deferred_reset) { 79 + xe_vdev->deferred_reset = false; 80 + spin_unlock(&xe_vdev->reset_lock); 81 + xe_vfio_pci_reset(xe_vdev); 82 + goto again; 83 + } 84 + mutex_unlock(&xe_vdev->state_mutex); 85 + spin_unlock(&xe_vdev->reset_lock); 86 + } 87 + 88 + static void xe_vfio_pci_reset_done(struct pci_dev *pdev) 89 + { 90 + struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev); 91 + int ret; 92 + 93 + if (!pdev->is_virtfn) 94 + return; 95 + 96 + /* 97 + * VF FLR requires additional processing done by PF driver. 98 + * The processing is done after FLR is already finished from PCIe 99 + * perspective. 100 + * In order to avoid a scenario where VF is used while PF processing 101 + * is still in progress, additional synchronization point is needed. 102 + */ 103 + ret = xe_sriov_vfio_wait_flr_done(xe_vdev->xe, xe_vdev->vfid); 104 + if (ret) 105 + dev_err(&pdev->dev, "Failed to wait for FLR: %d\n", ret); 106 + 107 + if (!xe_vdev->vfid) 108 + return; 109 + 110 + /* 111 + * As the higher VFIO layers are holding locks across reset and using 112 + * those same locks with the mm_lock we need to prevent ABBA deadlock 113 + * with the state_mutex and mm_lock. 114 + * In case the state_mutex was taken already we defer the cleanup work 115 + * to the unlock flow of the other running context. 
116 + */ 117 + spin_lock(&xe_vdev->reset_lock); 118 + xe_vdev->deferred_reset = true; 119 + if (!mutex_trylock(&xe_vdev->state_mutex)) { 120 + spin_unlock(&xe_vdev->reset_lock); 121 + return; 122 + } 123 + spin_unlock(&xe_vdev->reset_lock); 124 + xe_vfio_pci_state_mutex_unlock(xe_vdev); 125 + 126 + xe_vfio_pci_reset(xe_vdev); 127 + } 128 + 129 + static const struct pci_error_handlers xe_vfio_pci_err_handlers = { 130 + .reset_done = xe_vfio_pci_reset_done, 131 + .error_detected = vfio_pci_core_aer_err_detected, 132 + }; 133 + 134 + static int xe_vfio_pci_open_device(struct vfio_device *core_vdev) 135 + { 136 + struct xe_vfio_pci_core_device *xe_vdev = 137 + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 138 + struct vfio_pci_core_device *vdev = &xe_vdev->core_device; 139 + int ret; 140 + 141 + ret = vfio_pci_core_enable(vdev); 142 + if (ret) 143 + return ret; 144 + 145 + xe_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; 146 + 147 + vfio_pci_core_finish_enable(vdev); 148 + 149 + return 0; 150 + } 151 + 152 + static void xe_vfio_pci_close_device(struct vfio_device *core_vdev) 153 + { 154 + struct xe_vfio_pci_core_device *xe_vdev = 155 + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 156 + 157 + xe_vfio_pci_state_mutex_lock(xe_vdev); 158 + xe_vfio_pci_reset(xe_vdev); 159 + xe_vfio_pci_state_mutex_unlock(xe_vdev); 160 + vfio_pci_core_close_device(core_vdev); 161 + } 162 + 163 + static int xe_vfio_pci_release_file(struct inode *inode, struct file *filp) 164 + { 165 + struct xe_vfio_pci_migration_file *migf = filp->private_data; 166 + 167 + mutex_destroy(&migf->lock); 168 + kfree(migf); 169 + 170 + return 0; 171 + } 172 + 173 + static ssize_t xe_vfio_pci_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) 174 + { 175 + struct xe_vfio_pci_migration_file *migf = filp->private_data; 176 + ssize_t ret; 177 + 178 + if (pos) 179 + return -ESPIPE; 180 + 181 + mutex_lock(&migf->lock); 182 + if 
(migf->disabled) { 183 + mutex_unlock(&migf->lock); 184 + return -ENODEV; 185 + } 186 + 187 + ret = xe_sriov_vfio_data_read(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len); 188 + mutex_unlock(&migf->lock); 189 + 190 + return ret; 191 + } 192 + 193 + static const struct file_operations xe_vfio_pci_save_fops = { 194 + .owner = THIS_MODULE, 195 + .read = xe_vfio_pci_save_read, 196 + .release = xe_vfio_pci_release_file, 197 + .llseek = noop_llseek, 198 + }; 199 + 200 + static ssize_t xe_vfio_pci_resume_write(struct file *filp, const char __user *buf, 201 + size_t len, loff_t *pos) 202 + { 203 + struct xe_vfio_pci_migration_file *migf = filp->private_data; 204 + ssize_t ret; 205 + 206 + if (pos) 207 + return -ESPIPE; 208 + 209 + mutex_lock(&migf->lock); 210 + if (migf->disabled) { 211 + mutex_unlock(&migf->lock); 212 + return -ENODEV; 213 + } 214 + 215 + ret = xe_sriov_vfio_data_write(migf->xe_vdev->xe, migf->xe_vdev->vfid, buf, len); 216 + mutex_unlock(&migf->lock); 217 + 218 + return ret; 219 + } 220 + 221 + static const struct file_operations xe_vfio_pci_resume_fops = { 222 + .owner = THIS_MODULE, 223 + .write = xe_vfio_pci_resume_write, 224 + .release = xe_vfio_pci_release_file, 225 + .llseek = noop_llseek, 226 + }; 227 + 228 + static const char *vfio_dev_state_str(u32 state) 229 + { 230 + switch (state) { 231 + case VFIO_DEVICE_STATE_RUNNING: return "running"; 232 + case VFIO_DEVICE_STATE_RUNNING_P2P: return "running_p2p"; 233 + case VFIO_DEVICE_STATE_STOP_COPY: return "stopcopy"; 234 + case VFIO_DEVICE_STATE_STOP: return "stop"; 235 + case VFIO_DEVICE_STATE_RESUMING: return "resuming"; 236 + case VFIO_DEVICE_STATE_ERROR: return "error"; 237 + default: return ""; 238 + } 239 + } 240 + 241 + enum xe_vfio_pci_file_type { 242 + XE_VFIO_FILE_SAVE = 0, 243 + XE_VFIO_FILE_RESUME, 244 + }; 245 + 246 + static struct xe_vfio_pci_migration_file * 247 + xe_vfio_pci_alloc_file(struct xe_vfio_pci_core_device *xe_vdev, 248 + enum xe_vfio_pci_file_type type) 249 + { 250 + 
struct xe_vfio_pci_migration_file *migf; 251 + const struct file_operations *fops; 252 + int flags; 253 + 254 + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); 255 + if (!migf) 256 + return ERR_PTR(-ENOMEM); 257 + 258 + fops = type == XE_VFIO_FILE_SAVE ? &xe_vfio_pci_save_fops : &xe_vfio_pci_resume_fops; 259 + flags = type == XE_VFIO_FILE_SAVE ? O_RDONLY : O_WRONLY; 260 + migf->filp = anon_inode_getfile("xe_vfio_mig", fops, migf, flags); 261 + if (IS_ERR(migf->filp)) { 262 + kfree(migf); 263 + return ERR_CAST(migf->filp); 264 + } 265 + 266 + mutex_init(&migf->lock); 267 + migf->xe_vdev = xe_vdev; 268 + xe_vdev->migf = migf; 269 + 270 + stream_open(migf->filp->f_inode, migf->filp); 271 + 272 + return migf; 273 + } 274 + 275 + static struct file * 276 + xe_vfio_set_state(struct xe_vfio_pci_core_device *xe_vdev, u32 new) 277 + { 278 + u32 cur = xe_vdev->mig_state; 279 + int ret; 280 + 281 + dev_dbg(xe_vdev_to_dev(xe_vdev), 282 + "state: %s->%s\n", vfio_dev_state_str(cur), vfio_dev_state_str(new)); 283 + 284 + /* 285 + * "STOP" handling is reused for "RUNNING_P2P", as the device doesn't 286 + * have the capability to selectively block outgoing p2p DMA transfers. 287 + * While the device is allowing BAR accesses when the VF is stopped, it 288 + * is not processing any new workload requests, effectively stopping 289 + * any outgoing DMA transfers (not just p2p). 290 + * Any VRAM / MMIO accesses occurring during "RUNNING_P2P" are kept and 291 + * will be migrated to target VF during stop-copy. 
292 + */ 293 + if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) { 294 + ret = xe_sriov_vfio_suspend_device(xe_vdev->xe, xe_vdev->vfid); 295 + if (ret) 296 + goto err; 297 + 298 + return NULL; 299 + } 300 + 301 + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) || 302 + (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P)) 303 + return NULL; 304 + 305 + if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { 306 + ret = xe_sriov_vfio_resume_device(xe_vdev->xe, xe_vdev->vfid); 307 + if (ret) 308 + goto err; 309 + 310 + return NULL; 311 + } 312 + 313 + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { 314 + struct xe_vfio_pci_migration_file *migf; 315 + 316 + migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_SAVE); 317 + if (IS_ERR(migf)) { 318 + ret = PTR_ERR(migf); 319 + goto err; 320 + } 321 + get_file(migf->filp); 322 + 323 + ret = xe_sriov_vfio_stop_copy_enter(xe_vdev->xe, xe_vdev->vfid); 324 + if (ret) { 325 + fput(migf->filp); 326 + goto err; 327 + } 328 + 329 + return migf->filp; 330 + } 331 + 332 + if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) { 333 + if (xe_vdev->migf) 334 + xe_vfio_pci_put_file(xe_vdev); 335 + 336 + ret = xe_sriov_vfio_stop_copy_exit(xe_vdev->xe, xe_vdev->vfid); 337 + if (ret) 338 + goto err; 339 + 340 + return NULL; 341 + } 342 + 343 + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { 344 + struct xe_vfio_pci_migration_file *migf; 345 + 346 + migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_RESUME); 347 + if (IS_ERR(migf)) { 348 + ret = PTR_ERR(migf); 349 + goto err; 350 + } 351 + get_file(migf->filp); 352 + 353 + ret = xe_sriov_vfio_resume_data_enter(xe_vdev->xe, xe_vdev->vfid); 354 + if (ret) { 355 + fput(migf->filp); 356 + goto err; 357 + } 358 + 359 + return migf->filp; 360 + } 361 + 362 + if (cur == VFIO_DEVICE_STATE_RESUMING && new == 
VFIO_DEVICE_STATE_STOP) { 363 + if (xe_vdev->migf) 364 + xe_vfio_pci_put_file(xe_vdev); 365 + 366 + ret = xe_sriov_vfio_resume_data_exit(xe_vdev->xe, xe_vdev->vfid); 367 + if (ret) 368 + goto err; 369 + 370 + return NULL; 371 + } 372 + 373 + WARN(true, "Unknown state transition %d->%d", cur, new); 374 + return ERR_PTR(-EINVAL); 375 + 376 + err: 377 + dev_dbg(xe_vdev_to_dev(xe_vdev), 378 + "Failed to transition state: %s->%s err=%d\n", 379 + vfio_dev_state_str(cur), vfio_dev_state_str(new), ret); 380 + return ERR_PTR(ret); 381 + } 382 + 383 + static struct file * 384 + xe_vfio_pci_set_device_state(struct vfio_device *core_vdev, 385 + enum vfio_device_mig_state new_state) 386 + { 387 + struct xe_vfio_pci_core_device *xe_vdev = 388 + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 389 + enum vfio_device_mig_state next_state; 390 + struct file *f = NULL; 391 + int ret; 392 + 393 + xe_vfio_pci_state_mutex_lock(xe_vdev); 394 + while (new_state != xe_vdev->mig_state) { 395 + ret = vfio_mig_get_next_state(core_vdev, xe_vdev->mig_state, 396 + new_state, &next_state); 397 + if (ret) { 398 + xe_sriov_vfio_error(xe_vdev->xe, xe_vdev->vfid); 399 + f = ERR_PTR(ret); 400 + break; 401 + } 402 + f = xe_vfio_set_state(xe_vdev, next_state); 403 + if (IS_ERR(f)) 404 + break; 405 + 406 + xe_vdev->mig_state = next_state; 407 + 408 + /* Multiple state transitions with non-NULL file in the middle */ 409 + if (f && new_state != xe_vdev->mig_state) { 410 + fput(f); 411 + f = ERR_PTR(-EINVAL); 412 + break; 413 + } 414 + } 415 + xe_vfio_pci_state_mutex_unlock(xe_vdev); 416 + 417 + return f; 418 + } 419 + 420 + static int xe_vfio_pci_get_device_state(struct vfio_device *core_vdev, 421 + enum vfio_device_mig_state *curr_state) 422 + { 423 + struct xe_vfio_pci_core_device *xe_vdev = 424 + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 425 + 426 + xe_vfio_pci_state_mutex_lock(xe_vdev); 427 + *curr_state = xe_vdev->mig_state; 428 + 
xe_vfio_pci_state_mutex_unlock(xe_vdev); 429 + 430 + return 0; 431 + } 432 + 433 + static int xe_vfio_pci_get_data_size(struct vfio_device *vdev, 434 + unsigned long *stop_copy_length) 435 + { 436 + struct xe_vfio_pci_core_device *xe_vdev = 437 + container_of(vdev, struct xe_vfio_pci_core_device, core_device.vdev); 438 + 439 + xe_vfio_pci_state_mutex_lock(xe_vdev); 440 + *stop_copy_length = xe_sriov_vfio_stop_copy_size(xe_vdev->xe, xe_vdev->vfid); 441 + xe_vfio_pci_state_mutex_unlock(xe_vdev); 442 + 443 + return 0; 444 + } 445 + 446 + static const struct vfio_migration_ops xe_vfio_pci_migration_ops = { 447 + .migration_set_state = xe_vfio_pci_set_device_state, 448 + .migration_get_state = xe_vfio_pci_get_device_state, 449 + .migration_get_data_size = xe_vfio_pci_get_data_size, 450 + }; 451 + 452 + static void xe_vfio_pci_migration_init(struct xe_vfio_pci_core_device *xe_vdev) 453 + { 454 + struct vfio_device *core_vdev = &xe_vdev->core_device.vdev; 455 + struct pci_dev *pdev = to_pci_dev(core_vdev->dev); 456 + struct xe_device *xe = xe_sriov_vfio_get_pf(pdev); 457 + 458 + if (!xe) 459 + return; 460 + if (!xe_sriov_vfio_migration_supported(xe)) 461 + return; 462 + 463 + mutex_init(&xe_vdev->state_mutex); 464 + spin_lock_init(&xe_vdev->reset_lock); 465 + 466 + /* PF internal control uses vfid index starting from 1 */ 467 + xe_vdev->vfid = pci_iov_vf_id(pdev) + 1; 468 + xe_vdev->xe = xe; 469 + 470 + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; 471 + core_vdev->mig_ops = &xe_vfio_pci_migration_ops; 472 + } 473 + 474 + static void xe_vfio_pci_migration_fini(struct xe_vfio_pci_core_device *xe_vdev) 475 + { 476 + if (!xe_vdev->vfid) 477 + return; 478 + 479 + mutex_destroy(&xe_vdev->state_mutex); 480 + } 481 + 482 + static int xe_vfio_pci_init_dev(struct vfio_device *core_vdev) 483 + { 484 + struct xe_vfio_pci_core_device *xe_vdev = 485 + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 486 + 487 + 
xe_vfio_pci_migration_init(xe_vdev); 488 + 489 + return vfio_pci_core_init_dev(core_vdev); 490 + } 491 + 492 + static void xe_vfio_pci_release_dev(struct vfio_device *core_vdev) 493 + { 494 + struct xe_vfio_pci_core_device *xe_vdev = 495 + container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); 496 + 497 + xe_vfio_pci_migration_fini(xe_vdev); 498 + } 499 + 500 + static const struct vfio_device_ops xe_vfio_pci_ops = { 501 + .name = "xe-vfio-pci", 502 + .init = xe_vfio_pci_init_dev, 503 + .release = xe_vfio_pci_release_dev, 504 + .open_device = xe_vfio_pci_open_device, 505 + .close_device = xe_vfio_pci_close_device, 506 + .ioctl = vfio_pci_core_ioctl, 507 + .device_feature = vfio_pci_core_ioctl_feature, 508 + .read = vfio_pci_core_read, 509 + .write = vfio_pci_core_write, 510 + .mmap = vfio_pci_core_mmap, 511 + .request = vfio_pci_core_request, 512 + .match = vfio_pci_core_match, 513 + .match_token_uuid = vfio_pci_core_match_token_uuid, 514 + .bind_iommufd = vfio_iommufd_physical_bind, 515 + .unbind_iommufd = vfio_iommufd_physical_unbind, 516 + .attach_ioas = vfio_iommufd_physical_attach_ioas, 517 + .detach_ioas = vfio_iommufd_physical_detach_ioas, 518 + }; 519 + 520 + static int xe_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) 521 + { 522 + struct xe_vfio_pci_core_device *xe_vdev; 523 + int ret; 524 + 525 + xe_vdev = vfio_alloc_device(xe_vfio_pci_core_device, core_device.vdev, &pdev->dev, 526 + &xe_vfio_pci_ops); 527 + if (IS_ERR(xe_vdev)) 528 + return PTR_ERR(xe_vdev); 529 + 530 + dev_set_drvdata(&pdev->dev, &xe_vdev->core_device); 531 + 532 + ret = vfio_pci_core_register_device(&xe_vdev->core_device); 533 + if (ret) { 534 + vfio_put_device(&xe_vdev->core_device.vdev); 535 + return ret; 536 + } 537 + 538 + return 0; 539 + } 540 + 541 + static void xe_vfio_pci_remove(struct pci_dev *pdev) 542 + { 543 + struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev); 544 + 545 + 
vfio_pci_core_unregister_device(&xe_vdev->core_device); 546 + vfio_put_device(&xe_vdev->core_device.vdev); 547 + } 548 + 549 + #define INTEL_PCI_VFIO_DEVICE(_id) { \ 550 + PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, (_id)) \ 551 + } 552 + 553 + static const struct pci_device_id xe_vfio_pci_table[] = { 554 + INTEL_PTL_IDS(INTEL_PCI_VFIO_DEVICE), 555 + INTEL_WCL_IDS(INTEL_PCI_VFIO_DEVICE), 556 + INTEL_BMG_IDS(INTEL_PCI_VFIO_DEVICE), 557 + {} 558 + }; 559 + MODULE_DEVICE_TABLE(pci, xe_vfio_pci_table); 560 + 561 + static struct pci_driver xe_vfio_pci_driver = { 562 + .name = "xe-vfio-pci", 563 + .id_table = xe_vfio_pci_table, 564 + .probe = xe_vfio_pci_probe, 565 + .remove = xe_vfio_pci_remove, 566 + .err_handler = &xe_vfio_pci_err_handlers, 567 + .driver_managed_dma = true, 568 + }; 569 + module_pci_driver(xe_vfio_pci_driver); 570 + 571 + MODULE_LICENSE("GPL"); 572 + MODULE_AUTHOR("Michał Winiarski <michal.winiarski@intel.com>"); 573 + MODULE_DESCRIPTION("VFIO PCI driver with migration support for Intel Graphics");
+143
include/drm/intel/xe_sriov_vfio.h
/* SPDX-License-Identifier: MIT */
/*
 * Copyright © 2025 Intel Corporation
 */

#ifndef _XE_SRIOV_VFIO_H_
#define _XE_SRIOV_VFIO_H_

#include <linux/types.h>

struct pci_dev;
struct xe_device;

/**
 * xe_sriov_vfio_get_pf() - Get PF &xe_device.
 * @pdev: the VF &pci_dev device
 *
 * Return: pointer to PF &xe_device, NULL otherwise.
 */
struct xe_device *xe_sriov_vfio_get_pf(struct pci_dev *pdev);

/**
 * xe_sriov_vfio_migration_supported() - Check if migration is supported.
 * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
 *
 * Return: true if migration is supported, false otherwise.
 */
bool xe_sriov_vfio_migration_supported(struct xe_device *xe);

/**
 * xe_sriov_vfio_wait_flr_done() - Wait for VF FLR completion.
 * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
 * @vfid: the VF identifier (can't be 0)
 *
 * This function will wait until VF FLR is processed by PF on all tiles (or
 * until timeout occurs).
 *
 * Return: 0 on success or a negative error code on failure.
 */
int xe_sriov_vfio_wait_flr_done(struct xe_device *xe, unsigned int vfid);

/**
 * xe_sriov_vfio_suspend_device() - Suspend VF.
 * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
 * @vfid: the VF identifier (can't be 0)
 *
 * This function will pause VF on all tiles/GTs.
 *
 * Return: 0 on success or a negative error code on failure.
 */
int xe_sriov_vfio_suspend_device(struct xe_device *xe, unsigned int vfid);

/**
 * xe_sriov_vfio_resume_device() - Resume VF.
 * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
 * @vfid: the VF identifier (can't be 0)
 *
 * This function will resume VF on all tiles.
 *
 * Return: 0 on success or a negative error code on failure.
 */
int xe_sriov_vfio_resume_device(struct xe_device *xe, unsigned int vfid);

/**
 * xe_sriov_vfio_stop_copy_enter() - Initiate a VF device migration data save.
 * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
 * @vfid: the VF identifier (can't be 0)
 *
 * Return: 0 on success or a negative error code on failure.
 */
int xe_sriov_vfio_stop_copy_enter(struct xe_device *xe, unsigned int vfid);

/**
 * xe_sriov_vfio_stop_copy_exit() - Finish a VF device migration data save.
 * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
 * @vfid: the VF identifier (can't be 0)
 *
 * Return: 0 on success or a negative error code on failure.
 */
int xe_sriov_vfio_stop_copy_exit(struct xe_device *xe, unsigned int vfid);

/**
 * xe_sriov_vfio_resume_data_enter() - Initiate a VF device migration data restore.
 * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
 * @vfid: the VF identifier (can't be 0)
 *
 * Return: 0 on success or a negative error code on failure.
 */
int xe_sriov_vfio_resume_data_enter(struct xe_device *xe, unsigned int vfid);

/**
 * xe_sriov_vfio_resume_data_exit() - Finish a VF device migration data restore.
 * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
 * @vfid: the VF identifier (can't be 0)
 *
 * Return: 0 on success or a negative error code on failure.
 */
int xe_sriov_vfio_resume_data_exit(struct xe_device *xe, unsigned int vfid);

/**
 * xe_sriov_vfio_error() - Move VF device to error state.
 * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
 * @vfid: the VF identifier (can't be 0)
 *
 * Reset is needed to move it out of error state.
 *
 * Return: 0 on success or a negative error code on failure.
 */
int xe_sriov_vfio_error(struct xe_device *xe, unsigned int vfid);

/**
 * xe_sriov_vfio_data_read() - Read migration data from the VF device.
 * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
 * @vfid: the VF identifier (can't be 0)
 * @buf: start address of userspace buffer
 * @len: requested read size from userspace
 *
 * Return: number of bytes that have been successfully read,
 *	   0 if no more migration data is available, -errno on failure.
 */
ssize_t xe_sriov_vfio_data_read(struct xe_device *xe, unsigned int vfid,
				char __user *buf, size_t len);

/**
 * xe_sriov_vfio_data_write() - Write migration data to the VF device.
 * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
 * @vfid: the VF identifier (can't be 0)
 * @buf: start address of userspace buffer
 * @len: requested write size from userspace
 *
 * Return: number of bytes that have been successfully written, -errno on failure.
 */
ssize_t xe_sriov_vfio_data_write(struct xe_device *xe, unsigned int vfid,
				 const char __user *buf, size_t len);

/**
 * xe_sriov_vfio_stop_copy_size() - Get a size estimate of VF device migration data.
 * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf()
 * @vfid: the VF identifier (can't be 0)
 *
 * Return: migration data size in bytes or a negative error code on failure.
 */
ssize_t xe_sriov_vfio_stop_copy_size(struct xe_device *xe, unsigned int vfid);

#endif