Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'vfio-v6.2-rc1' of https://github.com/awilliam/linux-vfio

Pull VFIO updates from Alex Williamson:

- Replace deprecated git://github.com link in MAINTAINERS (Palmer
Dabbelt)

- Simplify vfio/mlx5 with module_pci_driver() helper (Shang XiaoJing)

- Drop unnecessary buffer from ACPI call (Rafael Mendonca)

- Correct latent missing include issue in iova-bitmap and fix support
for unaligned bitmaps. Follow up with a better fix through a refactor
(Joao Martins)

- Rework ccw mdev driver to split private data from parent structure,
better aligning with the mdev lifecycle and allowing us to remove a
temporary workaround (Eric Farman)

- Add an interface to get an estimated migration data size for a
device, allowing userspace to make informed decisions, e.g. more
accurately predicting VM downtime (Yishai Hadas)

- Fix minor typo in vfio/mlx5 array declaration (Yishai Hadas)

- Simplify module and Kconfig through consolidating SPAPR/EEH code and
config options and folding virqfd module into main vfio module (Jason
Gunthorpe)

- Fix error path from device_register() across all vfio mdev and sample
drivers (Alex Williamson)

- Define migration pre-copy interface and implement for vfio/mlx5
devices, allowing portions of the device state to be saved while the
device continues operation, towards reducing the stop-copy state size
(Jason Gunthorpe, Yishai Hadas, Shay Drory)

- Implement pre-copy for hisi_acc devices (Shameer Kolothum)

- Fixes to mdpy mdev driver remove path and error path on probe (Shang
XiaoJing)

- vfio/mlx5 fixes for incorrect return after copy_to_user() fault and
incorrect buffer freeing (Dan Carpenter)

* tag 'vfio-v6.2-rc1' of https://github.com/awilliam/linux-vfio: (42 commits)
vfio/mlx5: error pointer dereference in error handling
vfio/mlx5: fix error code in mlx5vf_precopy_ioctl()
samples: vfio-mdev: Fix missing pci_disable_device() in mdpy_fb_probe()
hisi_acc_vfio_pci: Enable PRE_COPY flag
hisi_acc_vfio_pci: Move the dev compatibility tests for early check
hisi_acc_vfio_pci: Introduce support for PRE_COPY state transitions
hisi_acc_vfio_pci: Add support for precopy IOCTL
vfio/mlx5: Enable MIGRATION_PRE_COPY flag
vfio/mlx5: Fallback to STOP_COPY upon specific PRE_COPY error
vfio/mlx5: Introduce multiple loads
vfio/mlx5: Consider temporary end of stream as part of PRE_COPY
vfio/mlx5: Introduce vfio precopy ioctl implementation
vfio/mlx5: Introduce SW headers for migration states
vfio/mlx5: Introduce device transitions of PRE_COPY
vfio/mlx5: Refactor to use queue based data chunks
vfio/mlx5: Refactor migration file state
vfio/mlx5: Refactor MKEY usage
vfio/mlx5: Refactor PD usage
vfio/mlx5: Enforce a single SAVE command at a time
vfio: Extend the device migration protocol with PRE_COPY
...

+1797 -651
+1 -1
MAINTAINERS
··· 21781 21781 R: Cornelia Huck <cohuck@redhat.com> 21782 21782 L: kvm@vger.kernel.org 21783 21783 S: Maintained 21784 - T: git git://github.com/awilliam/linux-vfio.git 21784 + T: git https://github.com/awilliam/linux-vfio.git 21785 21785 F: Documentation/ABI/testing/sysfs-devices-vfio-dev 21786 21786 F: Documentation/driver-api/vfio.rst 21787 21787 F: drivers/vfio/
-1
drivers/gpu/drm/i915/gvt/kvmgt.c
··· 1465 1465 struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); 1466 1466 1467 1467 intel_gvt_destroy_vgpu(vgpu); 1468 - vfio_free_device(vfio_dev); 1469 1468 } 1470 1469 1471 1470 static const struct vfio_device_ops intel_vgpu_dev_ops = {
+3 -2
drivers/s390/cio/vfio_ccw_chp.c
··· 16 16 char __user *buf, size_t count, 17 17 loff_t *ppos) 18 18 { 19 + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); 19 20 unsigned int i = VFIO_CCW_OFFSET_TO_INDEX(*ppos) - VFIO_CCW_NUM_REGIONS; 20 21 loff_t pos = *ppos & VFIO_CCW_OFFSET_MASK; 21 22 struct ccw_schib_region *region; ··· 28 27 mutex_lock(&private->io_mutex); 29 28 region = private->region[i].data; 30 29 31 - if (cio_update_schib(private->sch)) { 30 + if (cio_update_schib(sch)) { 32 31 ret = -ENODEV; 33 32 goto out; 34 33 } 35 34 36 - memcpy(region, &private->sch->schib, sizeof(*region)); 35 + memcpy(region, &sch->schib, sizeof(*region)); 37 36 38 37 if (copy_to_user(buf, (void *)region + pos, count)) { 39 38 ret = -EFAULT;
+75 -101
drivers/s390/cio/vfio_ccw_drv.c
··· 23 23 #include "vfio_ccw_private.h" 24 24 25 25 struct workqueue_struct *vfio_ccw_work_q; 26 - static struct kmem_cache *vfio_ccw_io_region; 27 - static struct kmem_cache *vfio_ccw_cmd_region; 28 - static struct kmem_cache *vfio_ccw_schib_region; 29 - static struct kmem_cache *vfio_ccw_crw_region; 26 + struct kmem_cache *vfio_ccw_io_region; 27 + struct kmem_cache *vfio_ccw_cmd_region; 28 + struct kmem_cache *vfio_ccw_schib_region; 29 + struct kmem_cache *vfio_ccw_crw_region; 30 30 31 31 debug_info_t *vfio_ccw_debug_msg_id; 32 32 debug_info_t *vfio_ccw_debug_trace_id; ··· 36 36 */ 37 37 int vfio_ccw_sch_quiesce(struct subchannel *sch) 38 38 { 39 - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); 39 + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); 40 + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); 40 41 DECLARE_COMPLETION_ONSTACK(completion); 41 42 int iretry, ret = 0; 43 + 44 + /* 45 + * Probably an impossible situation, after being called through 46 + * FSM callbacks. But in the event it did, register a warning 47 + * and return as if things were fine. 
48 + */ 49 + if (WARN_ON(!private)) 50 + return 0; 42 51 43 52 iretry = 255; 44 53 do { ··· 79 70 return ret; 80 71 } 81 72 82 - static void vfio_ccw_sch_io_todo(struct work_struct *work) 73 + void vfio_ccw_sch_io_todo(struct work_struct *work) 83 74 { 84 75 struct vfio_ccw_private *private; 85 76 struct irb *irb; ··· 115 106 eventfd_signal(private->io_trigger, 1); 116 107 } 117 108 118 - static void vfio_ccw_crw_todo(struct work_struct *work) 109 + void vfio_ccw_crw_todo(struct work_struct *work) 119 110 { 120 111 struct vfio_ccw_private *private; 121 112 ··· 130 121 */ 131 122 static void vfio_ccw_sch_irq(struct subchannel *sch) 132 123 { 133 - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); 124 + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); 125 + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); 126 + 127 + /* 128 + * The subchannel should still be disabled at this point, 129 + * so an interrupt would be quite surprising. As with an 130 + * interrupt while the FSM is closed, let's attempt to 131 + * disable the subchannel again. 
132 + */ 133 + if (!private) { 134 + VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: unexpected interrupt\n", 135 + sch->schid.cssid, sch->schid.ssid, 136 + sch->schid.sch_no); 137 + 138 + cio_disable_subchannel(sch); 139 + return; 140 + } 134 141 135 142 inc_irq_stat(IRQIO_CIO); 136 143 vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_INTERRUPT); 137 144 } 138 145 139 - static struct vfio_ccw_private *vfio_ccw_alloc_private(struct subchannel *sch) 146 + static void vfio_ccw_free_parent(struct device *dev) 140 147 { 141 - struct vfio_ccw_private *private; 148 + struct vfio_ccw_parent *parent = container_of(dev, struct vfio_ccw_parent, dev); 142 149 143 - private = kzalloc(sizeof(*private), GFP_KERNEL); 144 - if (!private) 145 - return ERR_PTR(-ENOMEM); 146 - 147 - private->sch = sch; 148 - mutex_init(&private->io_mutex); 149 - private->state = VFIO_CCW_STATE_STANDBY; 150 - INIT_LIST_HEAD(&private->crw); 151 - INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo); 152 - INIT_WORK(&private->crw_work, vfio_ccw_crw_todo); 153 - 154 - private->cp.guest_cp = kcalloc(CCWCHAIN_LEN_MAX, sizeof(struct ccw1), 155 - GFP_KERNEL); 156 - if (!private->cp.guest_cp) 157 - goto out_free_private; 158 - 159 - private->io_region = kmem_cache_zalloc(vfio_ccw_io_region, 160 - GFP_KERNEL | GFP_DMA); 161 - if (!private->io_region) 162 - goto out_free_cp; 163 - 164 - private->cmd_region = kmem_cache_zalloc(vfio_ccw_cmd_region, 165 - GFP_KERNEL | GFP_DMA); 166 - if (!private->cmd_region) 167 - goto out_free_io; 168 - 169 - private->schib_region = kmem_cache_zalloc(vfio_ccw_schib_region, 170 - GFP_KERNEL | GFP_DMA); 171 - 172 - if (!private->schib_region) 173 - goto out_free_cmd; 174 - 175 - private->crw_region = kmem_cache_zalloc(vfio_ccw_crw_region, 176 - GFP_KERNEL | GFP_DMA); 177 - 178 - if (!private->crw_region) 179 - goto out_free_schib; 180 - return private; 181 - 182 - out_free_schib: 183 - kmem_cache_free(vfio_ccw_schib_region, private->schib_region); 184 - out_free_cmd: 185 - 
kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); 186 - out_free_io: 187 - kmem_cache_free(vfio_ccw_io_region, private->io_region); 188 - out_free_cp: 189 - kfree(private->cp.guest_cp); 190 - out_free_private: 191 - mutex_destroy(&private->io_mutex); 192 - kfree(private); 193 - return ERR_PTR(-ENOMEM); 150 + kfree(parent); 194 151 } 195 152 196 - static void vfio_ccw_free_private(struct vfio_ccw_private *private) 197 - { 198 - struct vfio_ccw_crw *crw, *temp; 199 - 200 - list_for_each_entry_safe(crw, temp, &private->crw, next) { 201 - list_del(&crw->next); 202 - kfree(crw); 203 - } 204 - 205 - kmem_cache_free(vfio_ccw_crw_region, private->crw_region); 206 - kmem_cache_free(vfio_ccw_schib_region, private->schib_region); 207 - kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); 208 - kmem_cache_free(vfio_ccw_io_region, private->io_region); 209 - kfree(private->cp.guest_cp); 210 - mutex_destroy(&private->io_mutex); 211 - kfree(private); 212 - } 213 153 static int vfio_ccw_sch_probe(struct subchannel *sch) 214 154 { 215 155 struct pmcw *pmcw = &sch->schib.pmcw; 216 - struct vfio_ccw_private *private; 156 + struct vfio_ccw_parent *parent; 217 157 int ret = -ENOMEM; 218 158 219 159 if (pmcw->qf) { ··· 171 213 return -ENODEV; 172 214 } 173 215 174 - private = vfio_ccw_alloc_private(sch); 175 - if (IS_ERR(private)) 176 - return PTR_ERR(private); 216 + parent = kzalloc(sizeof(*parent), GFP_KERNEL); 217 + if (!parent) 218 + return -ENOMEM; 177 219 178 - dev_set_drvdata(&sch->dev, private); 179 - 180 - private->mdev_type.sysfs_name = "io"; 181 - private->mdev_type.pretty_name = "I/O subchannel (Non-QDIO)"; 182 - private->mdev_types[0] = &private->mdev_type; 183 - ret = mdev_register_parent(&private->parent, &sch->dev, 184 - &vfio_ccw_mdev_driver, 185 - private->mdev_types, 1); 220 + dev_set_name(&parent->dev, "parent"); 221 + parent->dev.parent = &sch->dev; 222 + parent->dev.release = &vfio_ccw_free_parent; 223 + ret = device_register(&parent->dev); 186 224 if 
(ret) 187 225 goto out_free; 226 + 227 + dev_set_drvdata(&sch->dev, parent); 228 + 229 + parent->mdev_type.sysfs_name = "io"; 230 + parent->mdev_type.pretty_name = "I/O subchannel (Non-QDIO)"; 231 + parent->mdev_types[0] = &parent->mdev_type; 232 + ret = mdev_register_parent(&parent->parent, &sch->dev, 233 + &vfio_ccw_mdev_driver, 234 + parent->mdev_types, 1); 235 + if (ret) 236 + goto out_unreg; 188 237 189 238 VFIO_CCW_MSG_EVENT(4, "bound to subchannel %x.%x.%04x\n", 190 239 sch->schid.cssid, sch->schid.ssid, 191 240 sch->schid.sch_no); 192 241 return 0; 193 242 243 + out_unreg: 244 + device_del(&parent->dev); 194 245 out_free: 246 + put_device(&parent->dev); 195 247 dev_set_drvdata(&sch->dev, NULL); 196 - vfio_ccw_free_private(private); 197 248 return ret; 198 249 } 199 250 200 251 static void vfio_ccw_sch_remove(struct subchannel *sch) 201 252 { 202 - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); 253 + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); 203 254 204 - mdev_unregister_parent(&private->parent); 255 + mdev_unregister_parent(&parent->parent); 205 256 257 + device_unregister(&parent->dev); 206 258 dev_set_drvdata(&sch->dev, NULL); 207 - 208 - vfio_ccw_free_private(private); 209 259 210 260 VFIO_CCW_MSG_EVENT(4, "unbound from subchannel %x.%x.%04x\n", 211 261 sch->schid.cssid, sch->schid.ssid, ··· 222 256 223 257 static void vfio_ccw_sch_shutdown(struct subchannel *sch) 224 258 { 225 - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); 259 + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); 260 + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); 261 + 262 + if (WARN_ON(!private)) 263 + return; 226 264 227 265 vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_CLOSE); 228 266 vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_NOT_OPER); ··· 244 274 */ 245 275 static int vfio_ccw_sch_event(struct subchannel *sch, int process) 246 276 { 247 - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); 
277 + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); 278 + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); 248 279 unsigned long flags; 249 280 int rc = -EAGAIN; 250 281 ··· 258 287 259 288 rc = 0; 260 289 261 - if (cio_update_schib(sch)) 262 - vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_NOT_OPER); 290 + if (cio_update_schib(sch)) { 291 + if (private) 292 + vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_NOT_OPER); 293 + } 263 294 264 295 out_unlock: 265 296 spin_unlock_irqrestore(sch->lock, flags); ··· 299 326 static int vfio_ccw_chp_event(struct subchannel *sch, 300 327 struct chp_link *link, int event) 301 328 { 302 - struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); 329 + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); 330 + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); 303 331 int mask = chp_ssd_get_mask(&sch->ssd_info, link); 304 332 int retry = 255; 305 333 306 334 if (!private || !mask) 307 335 return 0; 308 336 309 - trace_vfio_ccw_chp_event(private->sch->schid, mask, event); 337 + trace_vfio_ccw_chp_event(sch->schid, mask, event); 310 338 VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: mask=0x%x event=%d\n", 311 339 sch->schid.cssid, 312 340 sch->schid.ssid, sch->schid.sch_no,
+12 -15
drivers/s390/cio/vfio_ccw_fsm.c
··· 18 18 19 19 static int fsm_io_helper(struct vfio_ccw_private *private) 20 20 { 21 - struct subchannel *sch; 21 + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); 22 22 union orb *orb; 23 23 int ccode; 24 24 __u8 lpm; 25 25 unsigned long flags; 26 26 int ret; 27 - 28 - sch = private->sch; 29 27 30 28 spin_lock_irqsave(sch->lock, flags); 31 29 ··· 78 80 79 81 static int fsm_do_halt(struct vfio_ccw_private *private) 80 82 { 81 - struct subchannel *sch; 83 + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); 82 84 unsigned long flags; 83 85 int ccode; 84 86 int ret; 85 - 86 - sch = private->sch; 87 87 88 88 spin_lock_irqsave(sch->lock, flags); 89 89 ··· 117 121 118 122 static int fsm_do_clear(struct vfio_ccw_private *private) 119 123 { 120 - struct subchannel *sch; 124 + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); 121 125 unsigned long flags; 122 126 int ccode; 123 127 int ret; 124 - 125 - sch = private->sch; 126 128 127 129 spin_lock_irqsave(sch->lock, flags); 128 130 ··· 154 160 static void fsm_notoper(struct vfio_ccw_private *private, 155 161 enum vfio_ccw_event event) 156 162 { 157 - struct subchannel *sch = private->sch; 163 + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); 158 164 159 165 VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: notoper event %x state %x\n", 160 166 sch->schid.cssid, ··· 222 228 static void fsm_disabled_irq(struct vfio_ccw_private *private, 223 229 enum vfio_ccw_event event) 224 230 { 225 - struct subchannel *sch = private->sch; 231 + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); 226 232 227 233 /* 228 234 * An interrupt in a disabled state means a previous disable was not ··· 232 238 } 233 239 inline struct subchannel_id get_schid(struct vfio_ccw_private *p) 234 240 { 235 - return p->sch->schid; 241 + struct subchannel *sch = to_subchannel(p->vdev.dev->parent); 242 + 243 + return sch->schid; 236 244 } 237 245 238 246 /* ··· 356 360 static void 
fsm_irq(struct vfio_ccw_private *private, 357 361 enum vfio_ccw_event event) 358 362 { 363 + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); 359 364 struct irb *irb = this_cpu_ptr(&cio_irb); 360 365 361 366 VFIO_CCW_TRACE_EVENT(6, "IRQ"); 362 - VFIO_CCW_TRACE_EVENT(6, dev_name(&private->sch->dev)); 367 + VFIO_CCW_TRACE_EVENT(6, dev_name(&sch->dev)); 363 368 364 369 memcpy(&private->irb, irb, sizeof(*irb)); 365 370 ··· 373 376 static void fsm_open(struct vfio_ccw_private *private, 374 377 enum vfio_ccw_event event) 375 378 { 376 - struct subchannel *sch = private->sch; 379 + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); 377 380 int ret; 378 381 379 382 spin_lock_irq(sch->lock); ··· 394 397 static void fsm_close(struct vfio_ccw_private *private, 395 398 enum vfio_ccw_event event) 396 399 { 397 - struct subchannel *sch = private->sch; 400 + struct subchannel *sch = to_subchannel(private->vdev.dev->parent); 398 401 int ret; 399 402 400 403 spin_lock_irq(sch->lock);
+74 -33
drivers/s390/cio/vfio_ccw_ops.c
··· 49 49 struct vfio_ccw_private *private = 50 50 container_of(vdev, struct vfio_ccw_private, vdev); 51 51 52 - init_completion(&private->release_comp); 52 + mutex_init(&private->io_mutex); 53 + private->state = VFIO_CCW_STATE_STANDBY; 54 + INIT_LIST_HEAD(&private->crw); 55 + INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo); 56 + INIT_WORK(&private->crw_work, vfio_ccw_crw_todo); 57 + 58 + private->cp.guest_cp = kcalloc(CCWCHAIN_LEN_MAX, sizeof(struct ccw1), 59 + GFP_KERNEL); 60 + if (!private->cp.guest_cp) 61 + goto out_free_private; 62 + 63 + private->io_region = kmem_cache_zalloc(vfio_ccw_io_region, 64 + GFP_KERNEL | GFP_DMA); 65 + if (!private->io_region) 66 + goto out_free_cp; 67 + 68 + private->cmd_region = kmem_cache_zalloc(vfio_ccw_cmd_region, 69 + GFP_KERNEL | GFP_DMA); 70 + if (!private->cmd_region) 71 + goto out_free_io; 72 + 73 + private->schib_region = kmem_cache_zalloc(vfio_ccw_schib_region, 74 + GFP_KERNEL | GFP_DMA); 75 + if (!private->schib_region) 76 + goto out_free_cmd; 77 + 78 + private->crw_region = kmem_cache_zalloc(vfio_ccw_crw_region, 79 + GFP_KERNEL | GFP_DMA); 80 + if (!private->crw_region) 81 + goto out_free_schib; 82 + 53 83 return 0; 84 + 85 + out_free_schib: 86 + kmem_cache_free(vfio_ccw_schib_region, private->schib_region); 87 + out_free_cmd: 88 + kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); 89 + out_free_io: 90 + kmem_cache_free(vfio_ccw_io_region, private->io_region); 91 + out_free_cp: 92 + kfree(private->cp.guest_cp); 93 + out_free_private: 94 + mutex_destroy(&private->io_mutex); 95 + return -ENOMEM; 54 96 } 55 97 56 98 static int vfio_ccw_mdev_probe(struct mdev_device *mdev) 57 99 { 58 - struct vfio_ccw_private *private = dev_get_drvdata(mdev->dev.parent); 100 + struct subchannel *sch = to_subchannel(mdev->dev.parent); 101 + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); 102 + struct vfio_ccw_private *private; 59 103 int ret; 60 104 61 - if (private->state == VFIO_CCW_STATE_NOT_OPER) 62 - return 
-ENODEV; 105 + private = vfio_alloc_device(vfio_ccw_private, vdev, &mdev->dev, 106 + &vfio_ccw_dev_ops); 107 + if (IS_ERR(private)) 108 + return PTR_ERR(private); 63 109 64 - ret = vfio_init_device(&private->vdev, &mdev->dev, &vfio_ccw_dev_ops); 65 - if (ret) 66 - return ret; 110 + dev_set_drvdata(&parent->dev, private); 67 111 68 112 VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: create\n", 69 - private->sch->schid.cssid, 70 - private->sch->schid.ssid, 71 - private->sch->schid.sch_no); 113 + sch->schid.cssid, 114 + sch->schid.ssid, 115 + sch->schid.sch_no); 72 116 73 117 ret = vfio_register_emulated_iommu_dev(&private->vdev); 74 118 if (ret) ··· 121 77 return 0; 122 78 123 79 err_put_vdev: 80 + dev_set_drvdata(&parent->dev, NULL); 124 81 vfio_put_device(&private->vdev); 125 82 return ret; 126 83 } ··· 130 85 { 131 86 struct vfio_ccw_private *private = 132 87 container_of(vdev, struct vfio_ccw_private, vdev); 88 + struct vfio_ccw_crw *crw, *temp; 133 89 134 - /* 135 - * We cannot free vfio_ccw_private here because it includes 136 - * parent info which must be free'ed by css driver. 137 - * 138 - * Use a workaround by memset'ing the core device part and 139 - * then notifying the remove path that all active references 140 - * to this device have been released. 
141 - */ 142 - memset(vdev, 0, sizeof(*vdev)); 143 - complete(&private->release_comp); 90 + list_for_each_entry_safe(crw, temp, &private->crw, next) { 91 + list_del(&crw->next); 92 + kfree(crw); 93 + } 94 + 95 + kmem_cache_free(vfio_ccw_crw_region, private->crw_region); 96 + kmem_cache_free(vfio_ccw_schib_region, private->schib_region); 97 + kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); 98 + kmem_cache_free(vfio_ccw_io_region, private->io_region); 99 + kfree(private->cp.guest_cp); 100 + mutex_destroy(&private->io_mutex); 144 101 } 145 102 146 103 static void vfio_ccw_mdev_remove(struct mdev_device *mdev) 147 104 { 148 - struct vfio_ccw_private *private = dev_get_drvdata(mdev->dev.parent); 105 + struct subchannel *sch = to_subchannel(mdev->dev.parent); 106 + struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev); 107 + struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev); 149 108 150 109 VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: remove\n", 151 - private->sch->schid.cssid, 152 - private->sch->schid.ssid, 153 - private->sch->schid.sch_no); 110 + sch->schid.cssid, 111 + sch->schid.ssid, 112 + sch->schid.sch_no); 154 113 155 114 vfio_unregister_group_dev(&private->vdev); 156 115 116 + dev_set_drvdata(&parent->dev, NULL); 157 117 vfio_put_device(&private->vdev); 158 - /* 159 - * Wait for all active references on mdev are released so it 160 - * is safe to defer kfree() to a later point. 161 - * 162 - * TODO: the clean fix is to split parent/mdev info from ccw 163 - * private structure so each can be managed in its own life 164 - * cycle. 165 - */ 166 - wait_for_completion(&private->release_comp); 167 118 } 168 119 169 120 static int vfio_ccw_mdev_open_device(struct vfio_device *vdev)
+25 -12
drivers/s390/cio/vfio_ccw_private.h
··· 68 68 }; 69 69 70 70 /** 71 + * struct vfio_ccw_parent 72 + * 73 + * @dev: embedded device struct 74 + * @parent: parent data structures for mdevs created 75 + * @mdev_type(s): identifying information for mdevs created 76 + */ 77 + struct vfio_ccw_parent { 78 + struct device dev; 79 + 80 + struct mdev_parent parent; 81 + struct mdev_type mdev_type; 82 + struct mdev_type *mdev_types[1]; 83 + }; 84 + 85 + /** 71 86 * struct vfio_ccw_private 72 87 * @vdev: Embedded VFIO device 73 - * @sch: pointer to the subchannel 74 88 * @state: internal state of the device 75 89 * @completion: synchronization helper of the I/O completion 76 90 * @io_region: MMIO region to input/output I/O arguments/results ··· 102 88 * @req_trigger: eventfd ctx for signaling userspace to return device 103 89 * @io_work: work for deferral process of I/O handling 104 90 * @crw_work: work for deferral process of CRW handling 105 - * @release_comp: synchronization helper for vfio device release 106 - * @parent: parent data structures for mdevs created 107 91 */ 108 92 struct vfio_ccw_private { 109 93 struct vfio_device vdev; 110 - struct subchannel *sch; 111 94 int state; 112 95 struct completion *completion; 113 96 struct ccw_io_region *io_region; ··· 125 114 struct eventfd_ctx *req_trigger; 126 115 struct work_struct io_work; 127 116 struct work_struct crw_work; 128 - 129 - struct completion release_comp; 130 - 131 - struct mdev_parent parent; 132 - struct mdev_type mdev_type; 133 - struct mdev_type *mdev_types[1]; 134 117 } __aligned(8); 135 118 136 119 int vfio_ccw_sch_quiesce(struct subchannel *sch); 120 + void vfio_ccw_sch_io_todo(struct work_struct *work); 121 + void vfio_ccw_crw_todo(struct work_struct *work); 137 122 138 123 extern struct mdev_driver vfio_ccw_mdev_driver; 139 124 ··· 169 162 static inline void vfio_ccw_fsm_event(struct vfio_ccw_private *private, 170 163 enum vfio_ccw_event event) 171 164 { 172 - trace_vfio_ccw_fsm_event(private->sch->schid, private->state, event); 165 + 
struct subchannel *sch = to_subchannel(private->vdev.dev->parent); 166 + 167 + if (sch) 168 + trace_vfio_ccw_fsm_event(sch->schid, private->state, event); 173 169 vfio_ccw_jumptable[private->state][event](private, event); 174 170 } 175 171 176 172 extern struct workqueue_struct *vfio_ccw_work_q; 177 - 173 + extern struct kmem_cache *vfio_ccw_io_region; 174 + extern struct kmem_cache *vfio_ccw_cmd_region; 175 + extern struct kmem_cache *vfio_ccw_schib_region; 176 + extern struct kmem_cache *vfio_ccw_crw_region; 178 177 179 178 /* s390 debug feature, similar to base cio */ 180 179 extern debug_info_t *vfio_ccw_debug_msg_id;
+1 -1
drivers/s390/crypto/vfio_ap_drv.c
··· 122 122 return 0; 123 123 124 124 matrix_drv_err: 125 - device_unregister(&matrix_dev->device); 125 + device_del(&matrix_dev->device); 126 126 matrix_reg_err: 127 127 put_device(&matrix_dev->device); 128 128 matrix_alloc_err:
-6
drivers/s390/crypto/vfio_ap_ops.c
··· 765 765 } 766 766 } 767 767 768 - static void vfio_ap_mdev_release_dev(struct vfio_device *vdev) 769 - { 770 - vfio_free_device(vdev); 771 - } 772 - 773 768 static void vfio_ap_mdev_remove(struct mdev_device *mdev) 774 769 { 775 770 struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(&mdev->dev); ··· 1795 1800 1796 1801 static const struct vfio_device_ops vfio_ap_matrix_dev_ops = { 1797 1802 .init = vfio_ap_mdev_init_dev, 1798 - .release = vfio_ap_mdev_release_dev, 1799 1803 .open_device = vfio_ap_mdev_open_device, 1800 1804 .close_device = vfio_ap_mdev_close_device, 1801 1805 .ioctl = vfio_ap_mdev_ioctl,
+1 -6
drivers/vfio/Kconfig
··· 48 48 If you don't know what to do here, say N. 49 49 endif 50 50 51 - config VFIO_SPAPR_EEH 52 - tristate 53 - depends on EEH && VFIO_IOMMU_SPAPR_TCE 54 - default VFIO 55 - 56 51 config VFIO_VIRQFD 57 - tristate 52 + bool 58 53 select EVENTFD 59 54 default n 60 55
+1 -4
drivers/vfio/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0 2 - vfio_virqfd-y := virqfd.o 3 - 4 2 obj-$(CONFIG_VFIO) += vfio.o 5 3 6 4 vfio-y += vfio_main.o \ ··· 6 8 iova_bitmap.o 7 9 vfio-$(CONFIG_IOMMUFD) += iommufd.o 8 10 vfio-$(CONFIG_VFIO_CONTAINER) += container.o 11 + vfio-$(CONFIG_VFIO_VIRQFD) += virqfd.o 9 12 10 - obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o 11 13 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o 12 14 obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o 13 - obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o 14 15 obj-$(CONFIG_VFIO_PCI) += pci/ 15 16 obj-$(CONFIG_VFIO_PLATFORM) += platform/ 16 17 obj-$(CONFIG_VFIO_MDEV) += mdev/
-1
drivers/vfio/fsl-mc/vfio_fsl_mc.c
··· 568 568 569 569 vfio_fsl_uninit_device(vdev); 570 570 mutex_destroy(&vdev->igate); 571 - vfio_free_device(core_vdev); 572 571 } 573 572 574 573 static int vfio_fsl_mc_remove(struct fsl_mc_device *mc_dev)
+17 -16
drivers/vfio/iova_bitmap.c
··· 5 5 */ 6 6 #include <linux/iova_bitmap.h> 7 7 #include <linux/mm.h> 8 + #include <linux/slab.h> 8 9 #include <linux/highmem.h> 9 10 10 11 #define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) ··· 296 295 */ 297 296 static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap) 298 297 { 299 - unsigned long remaining; 298 + unsigned long remaining, bytes; 299 + 300 + bytes = (bitmap->mapped.npages << PAGE_SHIFT) - bitmap->mapped.pgoff; 300 301 301 302 remaining = bitmap->mapped_total_index - bitmap->mapped_base_index; 302 303 remaining = min_t(unsigned long, remaining, 303 - (bitmap->mapped.npages << PAGE_SHIFT) / sizeof(*bitmap->bitmap)); 304 + bytes / sizeof(*bitmap->bitmap)); 304 305 305 306 return remaining; 306 307 } ··· 397 394 * Set the bits corresponding to the range [iova .. iova+length-1] in 398 395 * the user bitmap. 399 396 * 400 - * Return: The number of bits set. 401 397 */ 402 398 void iova_bitmap_set(struct iova_bitmap *bitmap, 403 399 unsigned long iova, size_t length) 404 400 { 405 401 struct iova_bitmap_map *mapped = &bitmap->mapped; 406 - unsigned long offset = (iova - mapped->iova) >> mapped->pgshift; 407 - unsigned long nbits = max_t(unsigned long, 1, length >> mapped->pgshift); 408 - unsigned long page_idx = offset / BITS_PER_PAGE; 409 - unsigned long page_offset = mapped->pgoff; 410 - void *kaddr; 411 - 412 - offset = offset % BITS_PER_PAGE; 402 + unsigned long cur_bit = ((iova - mapped->iova) >> 403 + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; 404 + unsigned long last_bit = (((iova + length - 1) - mapped->iova) >> 405 + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; 413 406 414 407 do { 415 - unsigned long size = min(BITS_PER_PAGE - offset, nbits); 408 + unsigned int page_idx = cur_bit / BITS_PER_PAGE; 409 + unsigned int offset = cur_bit % BITS_PER_PAGE; 410 + unsigned int nbits = min(BITS_PER_PAGE - offset, 411 + last_bit - cur_bit + 1); 412 + void *kaddr; 416 413 417 414 kaddr = 
kmap_local_page(mapped->pages[page_idx]); 418 - bitmap_set(kaddr + page_offset, offset, size); 415 + bitmap_set(kaddr, offset, nbits); 419 416 kunmap_local(kaddr); 420 - page_offset = offset = 0; 421 - nbits -= size; 422 - page_idx++; 423 - } while (nbits > 0); 417 + cur_bit += nbits; 418 + } while (cur_bit <= last_bit); 424 419 } 425 420 EXPORT_SYMBOL_GPL(iova_bitmap_set);
+140 -16
drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
··· 360 360 u32 que_iso_state; 361 361 int ret; 362 362 363 - if (migf->total_length < QM_MATCH_SIZE) 364 - return -EINVAL; 363 + if (migf->total_length < QM_MATCH_SIZE || hisi_acc_vdev->match_done) 364 + return 0; 365 365 366 366 if (vf_data->acc_magic != ACC_DEV_MAGIC) { 367 367 dev_err(dev, "failed to match ACC_DEV_MAGIC\n"); ··· 406 406 } 407 407 408 408 hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; 409 + hisi_acc_vdev->match_done = true; 409 410 return 0; 410 411 } 411 412 ··· 493 492 struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; 494 493 struct device *dev = &vf_qm->pdev->dev; 495 494 int ret; 496 - 497 - ret = vf_qm_get_match_data(hisi_acc_vdev, vf_data); 498 - if (ret) 499 - return ret; 500 495 501 496 if (unlikely(qm_wait_dev_not_ready(vf_qm))) { 502 497 /* Update state and return with match data */ ··· 670 673 struct hisi_acc_vf_migration_file *migf = hisi_acc_vdev->resuming_migf; 671 674 int ret; 672 675 673 - /* Check dev compatibility */ 674 - ret = vf_qm_check_match(hisi_acc_vdev, migf); 675 - if (ret) { 676 - dev_err(dev, "failed to match the VF!\n"); 677 - return ret; 678 - } 679 676 /* Recover data to VF */ 680 677 ret = vf_qm_load_data(hisi_acc_vdev, migf); 681 678 if (ret) { ··· 723 732 *pos += len; 724 733 done = len; 725 734 migf->total_length += len; 735 + 736 + ret = vf_qm_check_match(migf->hisi_acc_vdev, migf); 737 + if (ret) 738 + done = -EFAULT; 726 739 out_unlock: 727 740 mutex_unlock(&migf->lock); 728 741 return done; ··· 759 764 760 765 stream_open(migf->filp->f_inode, migf->filp); 761 766 mutex_init(&migf->lock); 767 + migf->hisi_acc_vdev = hisi_acc_vdev; 762 768 return migf; 769 + } 770 + 771 + static long hisi_acc_vf_precopy_ioctl(struct file *filp, 772 + unsigned int cmd, unsigned long arg) 773 + { 774 + struct hisi_acc_vf_migration_file *migf = filp->private_data; 775 + struct hisi_acc_vf_core_device *hisi_acc_vdev = migf->hisi_acc_vdev; 776 + loff_t *pos = &filp->f_pos; 777 + struct vfio_precopy_info info; 778 + unsigned long 
minsz; 779 + int ret; 780 + 781 + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) 782 + return -ENOTTY; 783 + 784 + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); 785 + 786 + if (copy_from_user(&info, (void __user *)arg, minsz)) 787 + return -EFAULT; 788 + if (info.argsz < minsz) 789 + return -EINVAL; 790 + 791 + mutex_lock(&hisi_acc_vdev->state_mutex); 792 + if (hisi_acc_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY) { 793 + mutex_unlock(&hisi_acc_vdev->state_mutex); 794 + return -EINVAL; 795 + } 796 + 797 + mutex_lock(&migf->lock); 798 + 799 + if (migf->disabled) { 800 + ret = -ENODEV; 801 + goto out; 802 + } 803 + 804 + if (*pos > migf->total_length) { 805 + ret = -EINVAL; 806 + goto out; 807 + } 808 + 809 + info.dirty_bytes = 0; 810 + info.initial_bytes = migf->total_length - *pos; 811 + 812 + ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; 813 + out: 814 + mutex_unlock(&migf->lock); 815 + mutex_unlock(&hisi_acc_vdev->state_mutex); 816 + return ret; 763 817 } 764 818 765 819 static ssize_t hisi_acc_vf_save_read(struct file *filp, char __user *buf, size_t len, ··· 851 807 static const struct file_operations hisi_acc_vf_save_fops = { 852 808 .owner = THIS_MODULE, 853 809 .read = hisi_acc_vf_save_read, 810 + .unlocked_ioctl = hisi_acc_vf_precopy_ioctl, 811 + .compat_ioctl = compat_ptr_ioctl, 854 812 .release = hisi_acc_vf_release_file, 855 813 .llseek = no_llseek, 856 814 }; 857 815 858 816 static struct hisi_acc_vf_migration_file * 859 - hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) 817 + hisi_acc_open_saving_migf(struct hisi_acc_vf_core_device *hisi_acc_vdev) 860 818 { 861 819 struct hisi_acc_vf_migration_file *migf; 862 820 int ret; ··· 878 832 879 833 stream_open(migf->filp->f_inode, migf->filp); 880 834 mutex_init(&migf->lock); 835 + migf->hisi_acc_vdev = hisi_acc_vdev; 881 836 882 - ret = vf_qm_state_save(hisi_acc_vdev, migf); 837 + ret = vf_qm_get_match_data(hisi_acc_vdev, &migf->vf_data); 883 838 if (ret) { 884 
839 fput(migf->filp); 885 840 return ERR_PTR(ret); 886 841 } 887 842 888 843 return migf; 844 + } 845 + 846 + static struct hisi_acc_vf_migration_file * 847 + hisi_acc_vf_pre_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev) 848 + { 849 + struct hisi_acc_vf_migration_file *migf; 850 + 851 + migf = hisi_acc_open_saving_migf(hisi_acc_vdev); 852 + if (IS_ERR(migf)) 853 + return migf; 854 + 855 + migf->total_length = QM_MATCH_SIZE; 856 + return migf; 857 + } 858 + 859 + static struct hisi_acc_vf_migration_file * 860 + hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev, bool open) 861 + { 862 + int ret; 863 + struct hisi_acc_vf_migration_file *migf = NULL; 864 + 865 + if (open) { 866 + /* 867 + * Userspace didn't use PRECOPY support. Hence saving_migf 868 + * is not opened yet. 869 + */ 870 + migf = hisi_acc_open_saving_migf(hisi_acc_vdev); 871 + if (IS_ERR(migf)) 872 + return migf; 873 + } else { 874 + migf = hisi_acc_vdev->saving_migf; 875 + } 876 + 877 + ret = vf_qm_state_save(hisi_acc_vdev, migf); 878 + if (ret) 879 + return ERR_PTR(ret); 880 + 881 + return open ? 
migf : NULL; 889 882 } 890 883 891 884 static int hisi_acc_vf_stop_device(struct hisi_acc_vf_core_device *hisi_acc_vdev) ··· 954 869 u32 cur = hisi_acc_vdev->mig_state; 955 870 int ret; 956 871 872 + if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) { 873 + struct hisi_acc_vf_migration_file *migf; 874 + 875 + migf = hisi_acc_vf_pre_copy(hisi_acc_vdev); 876 + if (IS_ERR(migf)) 877 + return ERR_CAST(migf); 878 + get_file(migf->filp); 879 + hisi_acc_vdev->saving_migf = migf; 880 + return migf->filp; 881 + } 882 + 883 + if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_STOP_COPY) { 884 + struct hisi_acc_vf_migration_file *migf; 885 + 886 + ret = hisi_acc_vf_stop_device(hisi_acc_vdev); 887 + if (ret) 888 + return ERR_PTR(ret); 889 + 890 + migf = hisi_acc_vf_stop_copy(hisi_acc_vdev, false); 891 + if (IS_ERR(migf)) 892 + return ERR_CAST(migf); 893 + 894 + return NULL; 895 + } 896 + 957 897 if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_STOP) { 958 898 ret = hisi_acc_vf_stop_device(hisi_acc_vdev); 959 899 if (ret) ··· 989 879 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { 990 880 struct hisi_acc_vf_migration_file *migf; 991 881 992 - migf = hisi_acc_vf_stop_copy(hisi_acc_vdev); 882 + migf = hisi_acc_vf_stop_copy(hisi_acc_vdev, true); 993 883 if (IS_ERR(migf)) 994 884 return ERR_CAST(migf); 995 885 get_file(migf->filp); ··· 1017 907 ret = hisi_acc_vf_load_state(hisi_acc_vdev); 1018 908 if (ret) 1019 909 return ERR_PTR(ret); 910 + hisi_acc_vf_disable_fds(hisi_acc_vdev); 911 + return NULL; 912 + } 913 + 914 + if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) { 1020 915 hisi_acc_vf_disable_fds(hisi_acc_vdev); 1021 916 return NULL; 1022 917 } ··· 1070 955 } 1071 956 hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev); 1072 957 return res; 958 + } 959 + 960 + static int 961 + hisi_acc_vfio_pci_get_data_size(struct vfio_device *vdev, 962 + unsigned long *stop_copy_length) 963 
+ { 964 + *stop_copy_length = sizeof(struct acc_vf_data); 965 + return 0; 1073 966 } 1074 967 1075 968 static int ··· 1336 1213 static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = { 1337 1214 .migration_set_state = hisi_acc_vfio_pci_set_device_state, 1338 1215 .migration_get_state = hisi_acc_vfio_pci_get_device_state, 1216 + .migration_get_data_size = hisi_acc_vfio_pci_get_data_size, 1339 1217 }; 1340 1218 1341 1219 static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) ··· 1351 1227 hisi_acc_vdev->vf_dev = pdev; 1352 1228 mutex_init(&hisi_acc_vdev->state_mutex); 1353 1229 1354 - core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY; 1230 + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY; 1355 1231 core_vdev->mig_ops = &hisi_acc_vfio_pci_migrn_state_ops; 1356 1232 1357 1233 return vfio_pci_core_init_dev(core_vdev);
+2
drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
··· 91 91 struct mutex lock; 92 92 bool disabled; 93 93 94 + struct hisi_acc_vf_core_device *hisi_acc_vdev; 94 95 struct acc_vf_data vf_data; 95 96 size_t total_length; 96 97 }; 97 98 98 99 struct hisi_acc_vf_core_device { 99 100 struct vfio_pci_core_device core_device; 101 + u8 match_done:1; 100 102 u8 deferred_reset:1; 101 103 /* For migration state */ 102 104 struct mutex state_mutex;
+340 -73
drivers/vfio/pci/mlx5/cmd.c
··· 14 14 15 15 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) 16 16 { 17 + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; 17 18 u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {}; 18 19 u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {}; 20 + int err; 19 21 20 22 lockdep_assert_held(&mvdev->state_mutex); 21 23 if (mvdev->mdev_detach) 22 24 return -ENOTCONN; 23 25 26 + /* 27 + * In case PRE_COPY is used, saving_migf is exposed while the device is 28 + * running. Make sure to run only once there is no active save command. 29 + * Running both in parallel, might end-up with a failure in the save 30 + * command once it will try to turn on 'tracking' on a suspended device. 31 + */ 32 + if (migf) { 33 + err = wait_for_completion_interruptible(&migf->save_comp); 34 + if (err) 35 + return err; 36 + } 37 + 24 38 MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA); 25 39 MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id); 26 40 MLX5_SET(suspend_vhca_in, in, op_mod, op_mod); 27 41 28 - return mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); 42 + err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); 43 + if (migf) 44 + complete(&migf->save_comp); 45 + 46 + return err; 29 47 } 30 48 31 49 int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) ··· 63 45 } 64 46 65 47 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, 66 - size_t *state_size) 48 + size_t *state_size, u8 query_flags) 67 49 { 68 50 u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; 69 51 u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; 52 + bool inc = query_flags & MLX5VF_QUERY_INC; 70 53 int ret; 71 54 72 55 lockdep_assert_held(&mvdev->state_mutex); 73 56 if (mvdev->mdev_detach) 74 57 return -ENOTCONN; 75 58 59 + /* 60 + * In case PRE_COPY is used, saving_migf is exposed while device is 61 + * running. Make sure to run only once there is no active save command. 
62 + * Running both in parallel, might end-up with a failure in the 63 + * incremental query command on un-tracked vhca. 64 + */ 65 + if (inc) { 66 + ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); 67 + if (ret) 68 + return ret; 69 + if (mvdev->saving_migf->state == 70 + MLX5_MIGF_STATE_PRE_COPY_ERROR) { 71 + /* 72 + * In case we had a PRE_COPY error, only query full 73 + * image for final image 74 + */ 75 + if (!(query_flags & MLX5VF_QUERY_FINAL)) { 76 + *state_size = 0; 77 + complete(&mvdev->saving_migf->save_comp); 78 + return 0; 79 + } 80 + query_flags &= ~MLX5VF_QUERY_INC; 81 + } 82 + } 83 + 76 84 MLX5_SET(query_vhca_migration_state_in, in, opcode, 77 85 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); 78 86 MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); 79 87 MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0); 88 + MLX5_SET(query_vhca_migration_state_in, in, incremental, 89 + query_flags & MLX5VF_QUERY_INC); 80 90 81 91 ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, 82 92 out); 93 + if (inc) 94 + complete(&mvdev->saving_migf->save_comp); 95 + 83 96 if (ret) 84 97 return ret; 85 98 ··· 222 173 if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization)) 223 174 mvdev->core_device.vdev.log_ops = log_ops; 224 175 176 + if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) && 177 + MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)) 178 + mvdev->core_device.vdev.migration_flags |= 179 + VFIO_MIGRATION_PRE_COPY; 180 + 225 181 end: 226 182 mlx5_vf_put_core_dev(mvdev->mdev); 227 183 } ··· 264 210 } 265 211 266 212 static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, 267 - struct mlx5_vf_migration_file *migf, 213 + struct mlx5_vhca_data_buffer *buf, 268 214 struct mlx5_vhca_recv_buf *recv_buf, 269 215 u32 *mkey) 270 216 { 271 - size_t npages = migf ? DIV_ROUND_UP(migf->total_length, PAGE_SIZE) : 217 + size_t npages = buf ? 
DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) : 272 218 recv_buf->npages; 273 219 int err = 0, inlen; 274 220 __be64 *mtt; ··· 286 232 DIV_ROUND_UP(npages, 2)); 287 233 mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); 288 234 289 - if (migf) { 235 + if (buf) { 290 236 struct sg_dma_page_iter dma_iter; 291 237 292 - for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0) 238 + for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) 293 239 *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); 294 240 } else { 295 241 int i; ··· 309 255 MLX5_SET(mkc, mkc, qpn, 0xffffff); 310 256 MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); 311 257 MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); 312 - MLX5_SET64(mkc, mkc, len, 313 - migf ? migf->total_length : (npages * PAGE_SIZE)); 258 + MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE); 314 259 err = mlx5_core_create_mkey(mdev, mkey, in, inlen); 315 260 kvfree(in); 316 261 return err; 262 + } 263 + 264 + static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) 265 + { 266 + struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev; 267 + struct mlx5_core_dev *mdev = mvdev->mdev; 268 + int ret; 269 + 270 + lockdep_assert_held(&mvdev->state_mutex); 271 + if (mvdev->mdev_detach) 272 + return -ENOTCONN; 273 + 274 + if (buf->dmaed || !buf->allocated_length) 275 + return -EINVAL; 276 + 277 + ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); 278 + if (ret) 279 + return ret; 280 + 281 + ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey); 282 + if (ret) 283 + goto err; 284 + 285 + buf->dmaed = true; 286 + 287 + return 0; 288 + err: 289 + dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); 290 + return ret; 291 + } 292 + 293 + void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) 294 + { 295 + struct mlx5_vf_migration_file *migf = buf->migf; 296 + struct sg_page_iter sg_iter; 297 + 298 + 
lockdep_assert_held(&migf->mvdev->state_mutex); 299 + WARN_ON(migf->mvdev->mdev_detach); 300 + 301 + if (buf->dmaed) { 302 + mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey); 303 + dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt, 304 + buf->dma_dir, 0); 305 + } 306 + 307 + /* Undo alloc_pages_bulk_array() */ 308 + for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) 309 + __free_page(sg_page_iter_page(&sg_iter)); 310 + sg_free_append_table(&buf->table); 311 + kfree(buf); 312 + } 313 + 314 + struct mlx5_vhca_data_buffer * 315 + mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, 316 + size_t length, 317 + enum dma_data_direction dma_dir) 318 + { 319 + struct mlx5_vhca_data_buffer *buf; 320 + int ret; 321 + 322 + buf = kzalloc(sizeof(*buf), GFP_KERNEL); 323 + if (!buf) 324 + return ERR_PTR(-ENOMEM); 325 + 326 + buf->dma_dir = dma_dir; 327 + buf->migf = migf; 328 + if (length) { 329 + ret = mlx5vf_add_migration_pages(buf, 330 + DIV_ROUND_UP_ULL(length, PAGE_SIZE)); 331 + if (ret) 332 + goto end; 333 + 334 + if (dma_dir != DMA_NONE) { 335 + ret = mlx5vf_dma_data_buffer(buf); 336 + if (ret) 337 + goto end; 338 + } 339 + } 340 + 341 + return buf; 342 + end: 343 + mlx5vf_free_data_buffer(buf); 344 + return ERR_PTR(ret); 345 + } 346 + 347 + void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) 348 + { 349 + spin_lock_irq(&buf->migf->list_lock); 350 + list_add_tail(&buf->buf_elm, &buf->migf->avail_list); 351 + spin_unlock_irq(&buf->migf->list_lock); 352 + } 353 + 354 + struct mlx5_vhca_data_buffer * 355 + mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, 356 + size_t length, enum dma_data_direction dma_dir) 357 + { 358 + struct mlx5_vhca_data_buffer *buf, *temp_buf; 359 + struct list_head free_list; 360 + 361 + lockdep_assert_held(&migf->mvdev->state_mutex); 362 + if (migf->mvdev->mdev_detach) 363 + return ERR_PTR(-ENOTCONN); 364 + 365 + INIT_LIST_HEAD(&free_list); 366 + 367 + spin_lock_irq(&migf->list_lock); 368 + 
list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { 369 + if (buf->dma_dir == dma_dir) { 370 + list_del_init(&buf->buf_elm); 371 + if (buf->allocated_length >= length) { 372 + spin_unlock_irq(&migf->list_lock); 373 + goto found; 374 + } 375 + /* 376 + * Prevent holding redundant buffers. Put in a free 377 + * list and call at the end not under the spin lock 378 + * (&migf->list_lock) to mlx5vf_free_data_buffer which 379 + * might sleep. 380 + */ 381 + list_add(&buf->buf_elm, &free_list); 382 + } 383 + } 384 + spin_unlock_irq(&migf->list_lock); 385 + buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir); 386 + 387 + found: 388 + while ((temp_buf = list_first_entry_or_null(&free_list, 389 + struct mlx5_vhca_data_buffer, buf_elm))) { 390 + list_del(&temp_buf->buf_elm); 391 + mlx5vf_free_data_buffer(temp_buf); 392 + } 393 + 394 + return buf; 317 395 } 318 396 319 397 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) ··· 454 268 struct mlx5vf_async_data, work); 455 269 struct mlx5_vf_migration_file *migf = container_of(async_data, 456 270 struct mlx5_vf_migration_file, async_data); 457 - struct mlx5_core_dev *mdev = migf->mvdev->mdev; 458 271 459 272 mutex_lock(&migf->lock); 460 273 if (async_data->status) { 461 - migf->is_err = true; 274 + mlx5vf_put_data_buffer(async_data->buf); 275 + if (async_data->header_buf) 276 + mlx5vf_put_data_buffer(async_data->header_buf); 277 + if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR) 278 + migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR; 279 + else 280 + migf->state = MLX5_MIGF_STATE_ERROR; 462 281 wake_up_interruptible(&migf->poll_wait); 463 282 } 464 283 mutex_unlock(&migf->lock); 465 - 466 - mlx5_core_destroy_mkey(mdev, async_data->mkey); 467 - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); 468 - mlx5_core_dealloc_pd(mdev, async_data->pdn); 469 284 kvfree(async_data->out); 285 + complete(&migf->save_comp); 470 286 fput(migf->filp); 287 + } 288 + 289 + static int 
add_buf_header(struct mlx5_vhca_data_buffer *header_buf, 290 + size_t image_size) 291 + { 292 + struct mlx5_vf_migration_file *migf = header_buf->migf; 293 + struct mlx5_vf_migration_header header = {}; 294 + unsigned long flags; 295 + struct page *page; 296 + u8 *to_buff; 297 + 298 + header.image_size = cpu_to_le64(image_size); 299 + page = mlx5vf_get_migration_page(header_buf, 0); 300 + if (!page) 301 + return -EINVAL; 302 + to_buff = kmap_local_page(page); 303 + memcpy(to_buff, &header, sizeof(header)); 304 + kunmap_local(to_buff); 305 + header_buf->length = sizeof(header); 306 + header_buf->header_image_size = image_size; 307 + header_buf->start_pos = header_buf->migf->max_pos; 308 + migf->max_pos += header_buf->length; 309 + spin_lock_irqsave(&migf->list_lock, flags); 310 + list_add_tail(&header_buf->buf_elm, &migf->buf_list); 311 + spin_unlock_irqrestore(&migf->list_lock, flags); 312 + return 0; 471 313 } 472 314 473 315 static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) ··· 506 292 struct mlx5_vf_migration_file, async_data); 507 293 508 294 if (!status) { 509 - WRITE_ONCE(migf->total_length, 510 - MLX5_GET(save_vhca_state_out, async_data->out, 511 - actual_image_size)); 295 + size_t image_size; 296 + unsigned long flags; 297 + 298 + image_size = MLX5_GET(save_vhca_state_out, async_data->out, 299 + actual_image_size); 300 + if (async_data->header_buf) { 301 + status = add_buf_header(async_data->header_buf, image_size); 302 + if (status) 303 + goto err; 304 + } 305 + async_data->buf->length = image_size; 306 + async_data->buf->start_pos = migf->max_pos; 307 + migf->max_pos += async_data->buf->length; 308 + spin_lock_irqsave(&migf->list_lock, flags); 309 + list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); 310 + spin_unlock_irqrestore(&migf->list_lock, flags); 311 + migf->state = async_data->last_chunk ? 
312 + MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY; 512 313 wake_up_interruptible(&migf->poll_wait); 513 314 } 514 315 316 + err: 515 317 /* 516 318 * The error and the cleanup flows can't run from an 517 319 * interrupt context 518 320 */ 321 + if (status == -EREMOTEIO) 322 + status = MLX5_GET(save_vhca_state_out, async_data->out, status); 519 323 async_data->status = status; 520 324 queue_work(migf->mvdev->cb_wq, &async_data->work); 521 325 } 522 326 523 327 int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, 524 - struct mlx5_vf_migration_file *migf) 328 + struct mlx5_vf_migration_file *migf, 329 + struct mlx5_vhca_data_buffer *buf, bool inc, 330 + bool track) 525 331 { 526 332 u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); 527 333 u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; 334 + struct mlx5_vhca_data_buffer *header_buf = NULL; 528 335 struct mlx5vf_async_data *async_data; 529 - struct mlx5_core_dev *mdev; 530 - u32 pdn, mkey; 531 336 int err; 532 337 533 338 lockdep_assert_held(&mvdev->state_mutex); 534 339 if (mvdev->mdev_detach) 535 340 return -ENOTCONN; 536 341 537 - mdev = mvdev->mdev; 538 - err = mlx5_core_alloc_pd(mdev, &pdn); 342 + err = wait_for_completion_interruptible(&migf->save_comp); 539 343 if (err) 540 344 return err; 541 345 542 - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 543 - 0); 544 - if (err) 545 - goto err_dma_map; 546 - 547 - err = _create_mkey(mdev, pdn, migf, NULL, &mkey); 548 - if (err) 549 - goto err_create_mkey; 346 + if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) 347 + /* 348 + * In case we had a PRE_COPY error, SAVE is triggered only for 349 + * the final image, read device full image. 
350 + */ 351 + inc = false; 550 352 551 353 MLX5_SET(save_vhca_state_in, in, opcode, 552 354 MLX5_CMD_OP_SAVE_VHCA_STATE); 553 355 MLX5_SET(save_vhca_state_in, in, op_mod, 0); 554 356 MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); 555 - MLX5_SET(save_vhca_state_in, in, mkey, mkey); 556 - MLX5_SET(save_vhca_state_in, in, size, migf->total_length); 357 + MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); 358 + MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); 359 + MLX5_SET(save_vhca_state_in, in, incremental, inc); 360 + MLX5_SET(save_vhca_state_in, in, set_track, track); 557 361 558 362 async_data = &migf->async_data; 363 + async_data->buf = buf; 364 + async_data->last_chunk = !track; 559 365 async_data->out = kvzalloc(out_size, GFP_KERNEL); 560 366 if (!async_data->out) { 561 367 err = -ENOMEM; 562 368 goto err_out; 563 369 } 564 370 565 - /* no data exists till the callback comes back */ 566 - migf->total_length = 0; 371 + if (MLX5VF_PRE_COPY_SUPP(mvdev)) { 372 + header_buf = mlx5vf_get_data_buffer(migf, 373 + sizeof(struct mlx5_vf_migration_header), DMA_NONE); 374 + if (IS_ERR(header_buf)) { 375 + err = PTR_ERR(header_buf); 376 + goto err_free; 377 + } 378 + } 379 + 380 + if (async_data->last_chunk) 381 + migf->state = MLX5_MIGF_STATE_SAVE_LAST; 382 + 383 + async_data->header_buf = header_buf; 567 384 get_file(migf->filp); 568 - async_data->mkey = mkey; 569 - async_data->pdn = pdn; 570 385 err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), 571 386 async_data->out, 572 387 out_size, mlx5vf_save_callback, ··· 606 363 return 0; 607 364 608 365 err_exec: 366 + if (header_buf) 367 + mlx5vf_put_data_buffer(header_buf); 609 368 fput(migf->filp); 369 + err_free: 610 370 kvfree(async_data->out); 611 371 err_out: 612 - mlx5_core_destroy_mkey(mdev, mkey); 613 - err_create_mkey: 614 - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); 615 - err_dma_map: 616 - mlx5_core_dealloc_pd(mdev, pdn); 372 + 
complete(&migf->save_comp); 617 373 return err; 618 374 } 619 375 620 376 int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, 621 - struct mlx5_vf_migration_file *migf) 377 + struct mlx5_vf_migration_file *migf, 378 + struct mlx5_vhca_data_buffer *buf) 622 379 { 623 - struct mlx5_core_dev *mdev; 624 - u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {}; 625 - u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; 626 - u32 pdn, mkey; 380 + u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; 381 + u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; 627 382 int err; 628 383 629 384 lockdep_assert_held(&mvdev->state_mutex); 630 385 if (mvdev->mdev_detach) 631 386 return -ENOTCONN; 632 387 633 - mutex_lock(&migf->lock); 634 - if (!migf->total_length) { 635 - err = -EINVAL; 636 - goto end; 388 + if (!buf->dmaed) { 389 + err = mlx5vf_dma_data_buffer(buf); 390 + if (err) 391 + return err; 637 392 } 638 - 639 - mdev = mvdev->mdev; 640 - err = mlx5_core_alloc_pd(mdev, &pdn); 641 - if (err) 642 - goto end; 643 - 644 - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); 645 - if (err) 646 - goto err_reg; 647 - 648 - err = _create_mkey(mdev, pdn, migf, NULL, &mkey); 649 - if (err) 650 - goto err_mkey; 651 393 652 394 MLX5_SET(load_vhca_state_in, in, opcode, 653 395 MLX5_CMD_OP_LOAD_VHCA_STATE); 654 396 MLX5_SET(load_vhca_state_in, in, op_mod, 0); 655 397 MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id); 656 - MLX5_SET(load_vhca_state_in, in, mkey, mkey); 657 - MLX5_SET(load_vhca_state_in, in, size, migf->total_length); 398 + MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey); 399 + MLX5_SET(load_vhca_state_in, in, size, buf->length); 400 + return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out); 401 + } 658 402 659 - err = mlx5_cmd_exec_inout(mdev, load_vhca_state, in, out); 403 + int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf) 404 + { 405 + int err; 660 406 661 - mlx5_core_destroy_mkey(mdev, mkey); 662 - err_mkey: 663 - 
dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); 664 - err_reg: 665 - mlx5_core_dealloc_pd(mdev, pdn); 666 - end: 667 - mutex_unlock(&migf->lock); 407 + lockdep_assert_held(&migf->mvdev->state_mutex); 408 + if (migf->mvdev->mdev_detach) 409 + return -ENOTCONN; 410 + 411 + err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn); 668 412 return err; 413 + } 414 + 415 + void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) 416 + { 417 + lockdep_assert_held(&migf->mvdev->state_mutex); 418 + if (migf->mvdev->mdev_detach) 419 + return; 420 + 421 + mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn); 422 + } 423 + 424 + void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) 425 + { 426 + struct mlx5_vhca_data_buffer *entry; 427 + 428 + lockdep_assert_held(&migf->mvdev->state_mutex); 429 + WARN_ON(migf->mvdev->mdev_detach); 430 + 431 + if (migf->buf) { 432 + mlx5vf_free_data_buffer(migf->buf); 433 + migf->buf = NULL; 434 + } 435 + 436 + if (migf->buf_header) { 437 + mlx5vf_free_data_buffer(migf->buf_header); 438 + migf->buf_header = NULL; 439 + } 440 + 441 + list_splice(&migf->avail_list, &migf->buf_list); 442 + 443 + while ((entry = list_first_entry_or_null(&migf->buf_list, 444 + struct mlx5_vhca_data_buffer, buf_elm))) { 445 + list_del(&entry->buf_elm); 446 + mlx5vf_free_data_buffer(entry); 447 + } 448 + 449 + mlx5vf_cmd_dealloc_pd(migf); 669 450 } 670 451 671 452 static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes,
+81 -15
drivers/vfio/pci/mlx5/cmd.h
··· 12 12 #include <linux/mlx5/cq.h> 13 13 #include <linux/mlx5/qp.h> 14 14 15 + #define MLX5VF_PRE_COPY_SUPP(mvdev) \ 16 + ((mvdev)->core_device.vdev.migration_flags & VFIO_MIGRATION_PRE_COPY) 17 + 18 + enum mlx5_vf_migf_state { 19 + MLX5_MIGF_STATE_ERROR = 1, 20 + MLX5_MIGF_STATE_PRE_COPY_ERROR, 21 + MLX5_MIGF_STATE_PRE_COPY, 22 + MLX5_MIGF_STATE_SAVE_LAST, 23 + MLX5_MIGF_STATE_COMPLETE, 24 + }; 25 + 26 + enum mlx5_vf_load_state { 27 + MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER, 28 + MLX5_VF_LOAD_STATE_READ_HEADER, 29 + MLX5_VF_LOAD_STATE_PREP_IMAGE, 30 + MLX5_VF_LOAD_STATE_READ_IMAGE, 31 + MLX5_VF_LOAD_STATE_LOAD_IMAGE, 32 + }; 33 + 34 + struct mlx5_vf_migration_header { 35 + __le64 image_size; 36 + /* For future use in case we may need to change the kernel protocol */ 37 + __le64 flags; 38 + }; 39 + 40 + struct mlx5_vhca_data_buffer { 41 + struct sg_append_table table; 42 + loff_t start_pos; 43 + u64 length; 44 + u64 allocated_length; 45 + u64 header_image_size; 46 + u32 mkey; 47 + enum dma_data_direction dma_dir; 48 + u8 dmaed:1; 49 + struct list_head buf_elm; 50 + struct mlx5_vf_migration_file *migf; 51 + /* Optimize mlx5vf_get_migration_page() for sequential access */ 52 + struct scatterlist *last_offset_sg; 53 + unsigned int sg_last_entry; 54 + unsigned long last_offset; 55 + }; 56 + 15 57 struct mlx5vf_async_data { 16 58 struct mlx5_async_work cb_work; 17 59 struct work_struct work; 60 + struct mlx5_vhca_data_buffer *buf; 61 + struct mlx5_vhca_data_buffer *header_buf; 18 62 int status; 19 - u32 pdn; 20 - u32 mkey; 63 + u8 last_chunk:1; 21 64 void *out; 22 65 }; 23 66 24 67 struct mlx5_vf_migration_file { 25 68 struct file *filp; 26 69 struct mutex lock; 27 - u8 disabled:1; 28 - u8 is_err:1; 70 + enum mlx5_vf_migf_state state; 29 71 30 - struct sg_append_table table; 31 - size_t total_length; 32 - size_t allocated_length; 33 - 34 - /* Optimize mlx5vf_get_migration_page() for sequential access */ 35 - struct scatterlist *last_offset_sg; 36 - unsigned int 
sg_last_entry; 37 - unsigned long last_offset; 72 + enum mlx5_vf_load_state load_state; 73 + u32 pdn; 74 + loff_t max_pos; 75 + struct mlx5_vhca_data_buffer *buf; 76 + struct mlx5_vhca_data_buffer *buf_header; 77 + spinlock_t list_lock; 78 + struct list_head buf_list; 79 + struct list_head avail_list; 38 80 struct mlx5vf_pci_core_device *mvdev; 39 81 wait_queue_head_t poll_wait; 82 + struct completion save_comp; 40 83 struct mlx5_async_ctx async_ctx; 41 84 struct mlx5vf_async_data async_data; 42 85 }; ··· 156 113 struct mlx5_core_dev *mdev; 157 114 }; 158 115 116 + enum { 117 + MLX5VF_QUERY_INC = (1UL << 0), 118 + MLX5VF_QUERY_FINAL = (1UL << 1), 119 + }; 120 + 159 121 int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); 160 122 int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); 161 123 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, 162 - size_t *state_size); 124 + size_t *state_size, u8 query_flags); 163 125 void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, 164 126 const struct vfio_migration_ops *mig_ops, 165 127 const struct vfio_log_ops *log_ops); 166 128 void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); 167 129 void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev); 168 130 int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, 169 - struct mlx5_vf_migration_file *migf); 131 + struct mlx5_vf_migration_file *migf, 132 + struct mlx5_vhca_data_buffer *buf, bool inc, 133 + bool track); 170 134 int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, 171 - struct mlx5_vf_migration_file *migf); 135 + struct mlx5_vf_migration_file *migf, 136 + struct mlx5_vhca_data_buffer *buf); 137 + int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); 138 + void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); 139 + void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); 140 + 
struct mlx5_vhca_data_buffer * 141 + mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, 142 + size_t length, enum dma_data_direction dma_dir); 143 + void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); 144 + struct mlx5_vhca_data_buffer * 145 + mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, 146 + size_t length, enum dma_data_direction dma_dir); 147 + void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); 148 + int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, 149 + unsigned int npages); 150 + struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, 151 + unsigned long offset); 172 152 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); 173 153 void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); 174 154 void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work);
+651 -139
drivers/vfio/pci/mlx5/main.c
··· 32 32 core_device); 33 33 } 34 34 35 - static struct page * 36 - mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, 35 + struct page * 36 + mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, 37 37 unsigned long offset) 38 38 { 39 39 unsigned long cur_offset = 0; ··· 41 41 unsigned int i; 42 42 43 43 /* All accesses are sequential */ 44 - if (offset < migf->last_offset || !migf->last_offset_sg) { 45 - migf->last_offset = 0; 46 - migf->last_offset_sg = migf->table.sgt.sgl; 47 - migf->sg_last_entry = 0; 44 + if (offset < buf->last_offset || !buf->last_offset_sg) { 45 + buf->last_offset = 0; 46 + buf->last_offset_sg = buf->table.sgt.sgl; 47 + buf->sg_last_entry = 0; 48 48 } 49 49 50 - cur_offset = migf->last_offset; 50 + cur_offset = buf->last_offset; 51 51 52 - for_each_sg(migf->last_offset_sg, sg, 53 - migf->table.sgt.orig_nents - migf->sg_last_entry, i) { 52 + for_each_sg(buf->last_offset_sg, sg, 53 + buf->table.sgt.orig_nents - buf->sg_last_entry, i) { 54 54 if (offset < sg->length + cur_offset) { 55 - migf->last_offset_sg = sg; 56 - migf->sg_last_entry += i; 57 - migf->last_offset = cur_offset; 55 + buf->last_offset_sg = sg; 56 + buf->sg_last_entry += i; 57 + buf->last_offset = cur_offset; 58 58 return nth_page(sg_page(sg), 59 59 (offset - cur_offset) / PAGE_SIZE); 60 60 } ··· 63 63 return NULL; 64 64 } 65 65 66 - static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf, 67 - unsigned int npages) 66 + int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, 67 + unsigned int npages) 68 68 { 69 69 unsigned int to_alloc = npages; 70 70 struct page **page_list; ··· 85 85 } 86 86 to_alloc -= filled; 87 87 ret = sg_alloc_append_table_from_pages( 88 - &migf->table, page_list, filled, 0, 88 + &buf->table, page_list, filled, 0, 89 89 filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, 90 90 GFP_KERNEL); 91 91 92 92 if (ret) 93 93 goto err; 94 - migf->allocated_length += filled * PAGE_SIZE; 94 + buf->allocated_length += 
filled * PAGE_SIZE; 95 95 /* clean input for another bulk allocation */ 96 96 memset(page_list, 0, filled * sizeof(*page_list)); 97 97 to_fill = min_t(unsigned int, to_alloc, ··· 108 108 109 109 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) 110 110 { 111 - struct sg_page_iter sg_iter; 112 - 113 111 mutex_lock(&migf->lock); 114 - /* Undo alloc_pages_bulk_array() */ 115 - for_each_sgtable_page(&migf->table.sgt, &sg_iter, 0) 116 - __free_page(sg_page_iter_page(&sg_iter)); 117 - sg_free_append_table(&migf->table); 118 - migf->disabled = true; 119 - migf->total_length = 0; 120 - migf->allocated_length = 0; 112 + migf->state = MLX5_MIGF_STATE_ERROR; 121 113 migf->filp->f_pos = 0; 122 114 mutex_unlock(&migf->lock); 123 115 } ··· 124 132 return 0; 125 133 } 126 134 135 + static struct mlx5_vhca_data_buffer * 136 + mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos, 137 + bool *end_of_data) 138 + { 139 + struct mlx5_vhca_data_buffer *buf; 140 + bool found = false; 141 + 142 + *end_of_data = false; 143 + spin_lock_irq(&migf->list_lock); 144 + if (list_empty(&migf->buf_list)) { 145 + *end_of_data = true; 146 + goto end; 147 + } 148 + 149 + buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer, 150 + buf_elm); 151 + if (pos >= buf->start_pos && 152 + pos < buf->start_pos + buf->length) { 153 + found = true; 154 + goto end; 155 + } 156 + 157 + /* 158 + * As we use a stream based FD we may expect having the data always 159 + * on first chunk 160 + */ 161 + migf->state = MLX5_MIGF_STATE_ERROR; 162 + 163 + end: 164 + spin_unlock_irq(&migf->list_lock); 165 + return found ? 
buf : NULL; 166 + } 167 + 168 + static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, 169 + char __user **buf, size_t *len, loff_t *pos) 170 + { 171 + unsigned long offset; 172 + ssize_t done = 0; 173 + size_t copy_len; 174 + 175 + copy_len = min_t(size_t, 176 + vhca_buf->start_pos + vhca_buf->length - *pos, *len); 177 + while (copy_len) { 178 + size_t page_offset; 179 + struct page *page; 180 + size_t page_len; 181 + u8 *from_buff; 182 + int ret; 183 + 184 + offset = *pos - vhca_buf->start_pos; 185 + page_offset = offset % PAGE_SIZE; 186 + offset -= page_offset; 187 + page = mlx5vf_get_migration_page(vhca_buf, offset); 188 + if (!page) 189 + return -EINVAL; 190 + page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset); 191 + from_buff = kmap_local_page(page); 192 + ret = copy_to_user(*buf, from_buff + page_offset, page_len); 193 + kunmap_local(from_buff); 194 + if (ret) 195 + return -EFAULT; 196 + *pos += page_len; 197 + *len -= page_len; 198 + *buf += page_len; 199 + done += page_len; 200 + copy_len -= page_len; 201 + } 202 + 203 + if (*pos >= vhca_buf->start_pos + vhca_buf->length) { 204 + spin_lock_irq(&vhca_buf->migf->list_lock); 205 + list_del_init(&vhca_buf->buf_elm); 206 + list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); 207 + spin_unlock_irq(&vhca_buf->migf->list_lock); 208 + } 209 + 210 + return done; 211 + } 212 + 127 213 static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, 128 214 loff_t *pos) 129 215 { 130 216 struct mlx5_vf_migration_file *migf = filp->private_data; 217 + struct mlx5_vhca_data_buffer *vhca_buf; 218 + bool first_loop_call = true; 219 + bool end_of_data; 131 220 ssize_t done = 0; 132 221 133 222 if (pos) ··· 217 144 218 145 if (!(filp->f_flags & O_NONBLOCK)) { 219 146 if (wait_event_interruptible(migf->poll_wait, 220 - READ_ONCE(migf->total_length) || migf->is_err)) 147 + !list_empty(&migf->buf_list) || 148 + migf->state == MLX5_MIGF_STATE_ERROR || 149 + migf->state == 
MLX5_MIGF_STATE_PRE_COPY_ERROR || 150 + migf->state == MLX5_MIGF_STATE_PRE_COPY || 151 + migf->state == MLX5_MIGF_STATE_COMPLETE)) 221 152 return -ERESTARTSYS; 222 153 } 223 154 224 155 mutex_lock(&migf->lock); 225 - if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(migf->total_length)) { 226 - done = -EAGAIN; 227 - goto out_unlock; 228 - } 229 - if (*pos > migf->total_length) { 230 - done = -EINVAL; 231 - goto out_unlock; 232 - } 233 - if (migf->disabled || migf->is_err) { 156 + if (migf->state == MLX5_MIGF_STATE_ERROR) { 234 157 done = -ENODEV; 235 158 goto out_unlock; 236 159 } 237 160 238 - len = min_t(size_t, migf->total_length - *pos, len); 239 161 while (len) { 240 - size_t page_offset; 241 - struct page *page; 242 - size_t page_len; 243 - u8 *from_buff; 244 - int ret; 162 + ssize_t count; 245 163 246 - page_offset = (*pos) % PAGE_SIZE; 247 - page = mlx5vf_get_migration_page(migf, *pos - page_offset); 248 - if (!page) { 249 - if (done == 0) 250 - done = -EINVAL; 164 + vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos, 165 + &end_of_data); 166 + if (first_loop_call) { 167 + first_loop_call = false; 168 + /* Temporary end of file as part of PRE_COPY */ 169 + if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY || 170 + migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) { 171 + done = -ENOMSG; 172 + goto out_unlock; 173 + } 174 + 175 + if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) { 176 + if (filp->f_flags & O_NONBLOCK) { 177 + done = -EAGAIN; 178 + goto out_unlock; 179 + } 180 + } 181 + } 182 + 183 + if (end_of_data) 184 + goto out_unlock; 185 + 186 + if (!vhca_buf) { 187 + done = -EINVAL; 251 188 goto out_unlock; 252 189 } 253 190 254 - page_len = min_t(size_t, len, PAGE_SIZE - page_offset); 255 - from_buff = kmap_local_page(page); 256 - ret = copy_to_user(buf, from_buff + page_offset, page_len); 257 - kunmap_local(from_buff); 258 - if (ret) { 259 - done = -EFAULT; 191 + count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos); 192 + if (count < 0) 
{ 193 + done = count; 260 194 goto out_unlock; 261 195 } 262 - *pos += page_len; 263 - len -= page_len; 264 - done += page_len; 265 - buf += page_len; 196 + done += count; 266 197 } 267 198 268 199 out_unlock: ··· 283 206 poll_wait(filp, &migf->poll_wait, wait); 284 207 285 208 mutex_lock(&migf->lock); 286 - if (migf->disabled || migf->is_err) 209 + if (migf->state == MLX5_MIGF_STATE_ERROR) 287 210 pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; 288 - else if (READ_ONCE(migf->total_length)) 211 + else if (!list_empty(&migf->buf_list) || 212 + migf->state == MLX5_MIGF_STATE_COMPLETE) 289 213 pollflags = EPOLLIN | EPOLLRDNORM; 290 214 mutex_unlock(&migf->lock); 291 215 292 216 return pollflags; 293 217 } 294 218 219 + /* 220 + * FD is exposed and user can use it after receiving an error. 221 + * Mark migf in error, and wake the user. 222 + */ 223 + static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) 224 + { 225 + migf->state = MLX5_MIGF_STATE_ERROR; 226 + wake_up_interruptible(&migf->poll_wait); 227 + } 228 + 229 + static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, 230 + unsigned long arg) 231 + { 232 + struct mlx5_vf_migration_file *migf = filp->private_data; 233 + struct mlx5vf_pci_core_device *mvdev = migf->mvdev; 234 + struct mlx5_vhca_data_buffer *buf; 235 + struct vfio_precopy_info info = {}; 236 + loff_t *pos = &filp->f_pos; 237 + unsigned long minsz; 238 + size_t inc_length = 0; 239 + bool end_of_data; 240 + int ret; 241 + 242 + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) 243 + return -ENOTTY; 244 + 245 + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); 246 + 247 + if (copy_from_user(&info, (void __user *)arg, minsz)) 248 + return -EFAULT; 249 + 250 + if (info.argsz < minsz) 251 + return -EINVAL; 252 + 253 + mutex_lock(&mvdev->state_mutex); 254 + if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && 255 + mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { 256 + ret = -EINVAL; 257 + goto err_state_unlock; 258 + } 259 + 
260 + /* 261 + * We can't issue a SAVE command when the device is suspended, so as 262 + * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra 263 + * bytes that can't be read. 264 + */ 265 + if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) { 266 + /* 267 + * Once the query returns it's guaranteed that there is no 268 + * active SAVE command. 269 + * As so, the other code below is safe with the proper locks. 270 + */ 271 + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length, 272 + MLX5VF_QUERY_INC); 273 + if (ret) 274 + goto err_state_unlock; 275 + } 276 + 277 + mutex_lock(&migf->lock); 278 + if (migf->state == MLX5_MIGF_STATE_ERROR) { 279 + ret = -ENODEV; 280 + goto err_migf_unlock; 281 + } 282 + 283 + buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data); 284 + if (buf) { 285 + if (buf->start_pos == 0) { 286 + info.initial_bytes = buf->header_image_size - *pos; 287 + } else if (buf->start_pos == 288 + sizeof(struct mlx5_vf_migration_header)) { 289 + /* First data buffer following the header */ 290 + info.initial_bytes = buf->start_pos + 291 + buf->length - *pos; 292 + } else { 293 + info.dirty_bytes = buf->start_pos + buf->length - *pos; 294 + } 295 + } else { 296 + if (!end_of_data) { 297 + ret = -EINVAL; 298 + goto err_migf_unlock; 299 + } 300 + 301 + info.dirty_bytes = inc_length; 302 + } 303 + 304 + if (!end_of_data || !inc_length) { 305 + mutex_unlock(&migf->lock); 306 + goto done; 307 + } 308 + 309 + mutex_unlock(&migf->lock); 310 + /* 311 + * We finished transferring the current state and the device has a 312 + * dirty state, save a new state to be ready for. 
313 + */ 314 + buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE); 315 + if (IS_ERR(buf)) { 316 + ret = PTR_ERR(buf); 317 + mlx5vf_mark_err(migf); 318 + goto err_state_unlock; 319 + } 320 + 321 + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true); 322 + if (ret) { 323 + mlx5vf_mark_err(migf); 324 + mlx5vf_put_data_buffer(buf); 325 + goto err_state_unlock; 326 + } 327 + 328 + done: 329 + mlx5vf_state_mutex_unlock(mvdev); 330 + if (copy_to_user((void __user *)arg, &info, minsz)) 331 + return -EFAULT; 332 + return 0; 333 + 334 + err_migf_unlock: 335 + mutex_unlock(&migf->lock); 336 + err_state_unlock: 337 + mlx5vf_state_mutex_unlock(mvdev); 338 + return ret; 339 + } 340 + 295 341 static const struct file_operations mlx5vf_save_fops = { 296 342 .owner = THIS_MODULE, 297 343 .read = mlx5vf_save_read, 298 344 .poll = mlx5vf_save_poll, 345 + .unlocked_ioctl = mlx5vf_precopy_ioctl, 346 + .compat_ioctl = compat_ptr_ioctl, 299 347 .release = mlx5vf_release_file, 300 348 .llseek = no_llseek, 301 349 }; 302 350 351 + static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) 352 + { 353 + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; 354 + struct mlx5_vhca_data_buffer *buf; 355 + size_t length; 356 + int ret; 357 + 358 + if (migf->state == MLX5_MIGF_STATE_ERROR) 359 + return -ENODEV; 360 + 361 + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 362 + MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL); 363 + if (ret) 364 + goto err; 365 + 366 + buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); 367 + if (IS_ERR(buf)) { 368 + ret = PTR_ERR(buf); 369 + goto err; 370 + } 371 + 372 + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false); 373 + if (ret) 374 + goto err_save; 375 + 376 + return 0; 377 + 378 + err_save: 379 + mlx5vf_put_data_buffer(buf); 380 + err: 381 + mlx5vf_mark_err(migf); 382 + return ret; 383 + } 384 + 303 385 static struct mlx5_vf_migration_file * 304 - 
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) 386 + mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) 305 387 { 306 388 struct mlx5_vf_migration_file *migf; 389 + struct mlx5_vhca_data_buffer *buf; 390 + size_t length; 307 391 int ret; 308 392 309 393 migf = kzalloc(sizeof(*migf), GFP_KERNEL); ··· 474 236 migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf, 475 237 O_RDONLY); 476 238 if (IS_ERR(migf->filp)) { 477 - int err = PTR_ERR(migf->filp); 478 - 479 - kfree(migf); 480 - return ERR_PTR(err); 239 + ret = PTR_ERR(migf->filp); 240 + goto end; 481 241 } 242 + 243 + migf->mvdev = mvdev; 244 + ret = mlx5vf_cmd_alloc_pd(migf); 245 + if (ret) 246 + goto out_free; 482 247 483 248 stream_open(migf->filp->f_inode, migf->filp); 484 249 mutex_init(&migf->lock); 485 250 init_waitqueue_head(&migf->poll_wait); 251 + init_completion(&migf->save_comp); 252 + /* 253 + * save_comp is being used as a binary semaphore built from 254 + * a completion. A normal mutex cannot be used because the lock is 255 + * passed between kernel threads and lockdep can't model this. 
256 + */ 257 + complete(&migf->save_comp); 486 258 mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); 487 259 INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); 488 - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, 489 - &migf->total_length); 260 + INIT_LIST_HEAD(&migf->buf_list); 261 + INIT_LIST_HEAD(&migf->avail_list); 262 + spin_lock_init(&migf->list_lock); 263 + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0); 490 264 if (ret) 491 - goto out_free; 265 + goto out_pd; 492 266 493 - ret = mlx5vf_add_migration_pages( 494 - migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE)); 495 - if (ret) 496 - goto out_free; 267 + buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE); 268 + if (IS_ERR(buf)) { 269 + ret = PTR_ERR(buf); 270 + goto out_pd; 271 + } 497 272 498 - migf->mvdev = mvdev; 499 - ret = mlx5vf_cmd_save_vhca_state(mvdev, migf); 273 + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track); 500 274 if (ret) 501 - goto out_free; 275 + goto out_save; 502 276 return migf; 277 + out_save: 278 + mlx5vf_free_data_buffer(buf); 279 + out_pd: 280 + mlx5vf_cmd_dealloc_pd(migf); 503 281 out_free: 504 282 fput(migf->filp); 283 + end: 284 + kfree(migf); 505 285 return ERR_PTR(ret); 286 + } 287 + 288 + static int 289 + mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf, 290 + const char __user **buf, size_t *len, 291 + loff_t *pos, ssize_t *done) 292 + { 293 + unsigned long offset; 294 + size_t page_offset; 295 + struct page *page; 296 + size_t page_len; 297 + u8 *to_buff; 298 + int ret; 299 + 300 + offset = *pos - vhca_buf->start_pos; 301 + page_offset = offset % PAGE_SIZE; 302 + 303 + page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset); 304 + if (!page) 305 + return -EINVAL; 306 + page_len = min_t(size_t, *len, PAGE_SIZE - page_offset); 307 + to_buff = kmap_local_page(page); 308 + ret = copy_from_user(to_buff + page_offset, *buf, page_len); 309 + kunmap_local(to_buff); 310 + if (ret) 311 
+ return -EFAULT; 312 + 313 + *pos += page_len; 314 + *done += page_len; 315 + *buf += page_len; 316 + *len -= page_len; 317 + vhca_buf->length += page_len; 318 + return 0; 319 + } 320 + 321 + static int 322 + mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf, 323 + loff_t requested_length, 324 + const char __user **buf, size_t *len, 325 + loff_t *pos, ssize_t *done) 326 + { 327 + int ret; 328 + 329 + if (requested_length > MAX_MIGRATION_SIZE) 330 + return -ENOMEM; 331 + 332 + if (vhca_buf->allocated_length < requested_length) { 333 + ret = mlx5vf_add_migration_pages( 334 + vhca_buf, 335 + DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, 336 + PAGE_SIZE)); 337 + if (ret) 338 + return ret; 339 + } 340 + 341 + while (*len) { 342 + ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos, 343 + done); 344 + if (ret) 345 + return ret; 346 + } 347 + 348 + return 0; 349 + } 350 + 351 + static ssize_t 352 + mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf, 353 + struct mlx5_vhca_data_buffer *vhca_buf, 354 + size_t image_size, const char __user **buf, 355 + size_t *len, loff_t *pos, ssize_t *done, 356 + bool *has_work) 357 + { 358 + size_t copy_len, to_copy; 359 + int ret; 360 + 361 + to_copy = min_t(size_t, *len, image_size - vhca_buf->length); 362 + copy_len = to_copy; 363 + while (to_copy) { 364 + ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos, 365 + done); 366 + if (ret) 367 + return ret; 368 + } 369 + 370 + *len -= copy_len; 371 + if (vhca_buf->length == image_size) { 372 + migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE; 373 + migf->max_pos += image_size; 374 + *has_work = true; 375 + } 376 + 377 + return 0; 378 + } 379 + 380 + static int 381 + mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf, 382 + struct mlx5_vhca_data_buffer *vhca_buf, 383 + const char __user **buf, 384 + size_t *len, loff_t *pos, 385 + ssize_t *done, bool *has_work) 386 + { 387 + struct page *page; 388 + size_t 
copy_len; 389 + u8 *to_buff; 390 + int ret; 391 + 392 + copy_len = min_t(size_t, *len, 393 + sizeof(struct mlx5_vf_migration_header) - vhca_buf->length); 394 + page = mlx5vf_get_migration_page(vhca_buf, 0); 395 + if (!page) 396 + return -EINVAL; 397 + to_buff = kmap_local_page(page); 398 + ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len); 399 + if (ret) { 400 + ret = -EFAULT; 401 + goto end; 402 + } 403 + 404 + *buf += copy_len; 405 + *pos += copy_len; 406 + *done += copy_len; 407 + *len -= copy_len; 408 + vhca_buf->length += copy_len; 409 + if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) { 410 + u64 flags; 411 + 412 + vhca_buf->header_image_size = le64_to_cpup((__le64 *)to_buff); 413 + if (vhca_buf->header_image_size > MAX_MIGRATION_SIZE) { 414 + ret = -ENOMEM; 415 + goto end; 416 + } 417 + 418 + flags = le64_to_cpup((__le64 *)(to_buff + 419 + offsetof(struct mlx5_vf_migration_header, flags))); 420 + if (flags) { 421 + ret = -EOPNOTSUPP; 422 + goto end; 423 + } 424 + 425 + migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE; 426 + migf->max_pos += vhca_buf->length; 427 + *has_work = true; 428 + } 429 + end: 430 + kunmap_local(to_buff); 431 + return ret; 506 432 } 507 433 508 434 static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, 509 435 size_t len, loff_t *pos) 510 436 { 511 437 struct mlx5_vf_migration_file *migf = filp->private_data; 438 + struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; 439 + struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header; 512 440 loff_t requested_length; 441 + bool has_work = false; 513 442 ssize_t done = 0; 443 + int ret = 0; 514 444 515 445 if (pos) 516 446 return -ESPIPE; ··· 688 282 check_add_overflow((loff_t)len, *pos, &requested_length)) 689 283 return -EINVAL; 690 284 691 - if (requested_length > MAX_MIGRATION_SIZE) 692 - return -ENOMEM; 693 - 285 + mutex_lock(&migf->mvdev->state_mutex); 694 286 mutex_lock(&migf->lock); 695 - if (migf->disabled) { 696 - done = 
-ENODEV; 287 + if (migf->state == MLX5_MIGF_STATE_ERROR) { 288 + ret = -ENODEV; 697 289 goto out_unlock; 698 290 } 699 291 700 - if (migf->allocated_length < requested_length) { 701 - done = mlx5vf_add_migration_pages( 702 - migf, 703 - DIV_ROUND_UP(requested_length - migf->allocated_length, 704 - PAGE_SIZE)); 705 - if (done) 706 - goto out_unlock; 292 + while (len || has_work) { 293 + has_work = false; 294 + switch (migf->load_state) { 295 + case MLX5_VF_LOAD_STATE_READ_HEADER: 296 + ret = mlx5vf_resume_read_header(migf, vhca_buf_header, 297 + &buf, &len, pos, 298 + &done, &has_work); 299 + if (ret) 300 + goto out_unlock; 301 + break; 302 + case MLX5_VF_LOAD_STATE_PREP_IMAGE: 303 + { 304 + u64 size = vhca_buf_header->header_image_size; 305 + 306 + if (vhca_buf->allocated_length < size) { 307 + mlx5vf_free_data_buffer(vhca_buf); 308 + 309 + migf->buf = mlx5vf_alloc_data_buffer(migf, 310 + size, DMA_TO_DEVICE); 311 + if (IS_ERR(migf->buf)) { 312 + ret = PTR_ERR(migf->buf); 313 + migf->buf = NULL; 314 + goto out_unlock; 315 + } 316 + 317 + vhca_buf = migf->buf; 318 + } 319 + 320 + vhca_buf->start_pos = migf->max_pos; 321 + migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE; 322 + break; 323 + } 324 + case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER: 325 + ret = mlx5vf_resume_read_image_no_header(vhca_buf, 326 + requested_length, 327 + &buf, &len, pos, &done); 328 + if (ret) 329 + goto out_unlock; 330 + break; 331 + case MLX5_VF_LOAD_STATE_READ_IMAGE: 332 + ret = mlx5vf_resume_read_image(migf, vhca_buf, 333 + vhca_buf_header->header_image_size, 334 + &buf, &len, pos, &done, &has_work); 335 + if (ret) 336 + goto out_unlock; 337 + break; 338 + case MLX5_VF_LOAD_STATE_LOAD_IMAGE: 339 + ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf); 340 + if (ret) 341 + goto out_unlock; 342 + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; 343 + 344 + /* prep header buf for next image */ 345 + vhca_buf_header->length = 0; 346 + vhca_buf_header->header_image_size = 0; 347 + /* 
prep data buf for next image */ 348 + vhca_buf->length = 0; 349 + 350 + break; 351 + default: 352 + break; 353 + } 707 354 } 708 355 709 - while (len) { 710 - size_t page_offset; 711 - struct page *page; 712 - size_t page_len; 713 - u8 *to_buff; 714 - int ret; 715 - 716 - page_offset = (*pos) % PAGE_SIZE; 717 - page = mlx5vf_get_migration_page(migf, *pos - page_offset); 718 - if (!page) { 719 - if (done == 0) 720 - done = -EINVAL; 721 - goto out_unlock; 722 - } 723 - 724 - page_len = min_t(size_t, len, PAGE_SIZE - page_offset); 725 - to_buff = kmap_local_page(page); 726 - ret = copy_from_user(to_buff + page_offset, buf, page_len); 727 - kunmap_local(to_buff); 728 - if (ret) { 729 - done = -EFAULT; 730 - goto out_unlock; 731 - } 732 - *pos += page_len; 733 - len -= page_len; 734 - done += page_len; 735 - buf += page_len; 736 - migf->total_length += page_len; 737 - } 738 356 out_unlock: 357 + if (ret) 358 + migf->state = MLX5_MIGF_STATE_ERROR; 739 359 mutex_unlock(&migf->lock); 740 - return done; 360 + mlx5vf_state_mutex_unlock(migf->mvdev); 361 + return ret ? 
ret : done; 741 362 } 742 363 743 364 static const struct file_operations mlx5vf_resume_fops = { ··· 778 345 mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) 779 346 { 780 347 struct mlx5_vf_migration_file *migf; 348 + struct mlx5_vhca_data_buffer *buf; 349 + int ret; 781 350 782 351 migf = kzalloc(sizeof(*migf), GFP_KERNEL); 783 352 if (!migf) ··· 788 353 migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf, 789 354 O_WRONLY); 790 355 if (IS_ERR(migf->filp)) { 791 - int err = PTR_ERR(migf->filp); 792 - 793 - kfree(migf); 794 - return ERR_PTR(err); 356 + ret = PTR_ERR(migf->filp); 357 + goto end; 795 358 } 359 + 360 + migf->mvdev = mvdev; 361 + ret = mlx5vf_cmd_alloc_pd(migf); 362 + if (ret) 363 + goto out_free; 364 + 365 + buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE); 366 + if (IS_ERR(buf)) { 367 + ret = PTR_ERR(buf); 368 + goto out_pd; 369 + } 370 + 371 + migf->buf = buf; 372 + if (MLX5VF_PRE_COPY_SUPP(mvdev)) { 373 + buf = mlx5vf_alloc_data_buffer(migf, 374 + sizeof(struct mlx5_vf_migration_header), DMA_NONE); 375 + if (IS_ERR(buf)) { 376 + ret = PTR_ERR(buf); 377 + goto out_buf; 378 + } 379 + 380 + migf->buf_header = buf; 381 + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; 382 + } else { 383 + /* Initial state will be to read the image */ 384 + migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER; 385 + } 386 + 796 387 stream_open(migf->filp->f_inode, migf->filp); 797 388 mutex_init(&migf->lock); 389 + INIT_LIST_HEAD(&migf->buf_list); 390 + INIT_LIST_HEAD(&migf->avail_list); 391 + spin_lock_init(&migf->list_lock); 798 392 return migf; 393 + out_buf: 394 + mlx5vf_free_data_buffer(migf->buf); 395 + out_pd: 396 + mlx5vf_cmd_dealloc_pd(migf); 397 + out_free: 398 + fput(migf->filp); 399 + end: 400 + kfree(migf); 401 + return ERR_PTR(ret); 799 402 } 800 403 801 404 void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) 802 405 { 803 406 if (mvdev->resuming_migf) { 804 407 
mlx5vf_disable_fd(mvdev->resuming_migf); 408 + mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf); 805 409 fput(mvdev->resuming_migf->filp); 806 410 mvdev->resuming_migf = NULL; 807 411 } ··· 848 374 mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); 849 375 cancel_work_sync(&mvdev->saving_migf->async_data.work); 850 376 mlx5vf_disable_fd(mvdev->saving_migf); 377 + mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); 851 378 fput(mvdev->saving_migf->filp); 852 379 mvdev->saving_migf = NULL; 853 380 } ··· 877 402 return NULL; 878 403 } 879 404 880 - if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) { 405 + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) || 406 + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { 881 407 ret = mlx5vf_cmd_suspend_vhca(mvdev, 882 408 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR); 883 409 if (ret) ··· 886 410 return NULL; 887 411 } 888 412 889 - if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { 413 + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) || 414 + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) { 890 415 ret = mlx5vf_cmd_resume_vhca(mvdev, 891 416 MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR); 892 417 if (ret) ··· 898 421 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { 899 422 struct mlx5_vf_migration_file *migf; 900 423 901 - migf = mlx5vf_pci_save_device_data(mvdev); 424 + migf = mlx5vf_pci_save_device_data(mvdev, false); 902 425 if (IS_ERR(migf)) 903 426 return ERR_CAST(migf); 904 427 get_file(migf->filp); ··· 906 429 return migf->filp; 907 430 } 908 431 909 - if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP)) { 432 + if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) || 433 + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || 434 + (cur == 
VFIO_DEVICE_STATE_PRE_COPY_P2P && 435 + new == VFIO_DEVICE_STATE_RUNNING_P2P)) { 910 436 mlx5vf_disable_fds(mvdev); 911 437 return NULL; 912 438 } ··· 926 446 } 927 447 928 448 if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { 929 - ret = mlx5vf_cmd_load_vhca_state(mvdev, 930 - mvdev->resuming_migf); 931 - if (ret) 932 - return ERR_PTR(ret); 449 + if (!MLX5VF_PRE_COPY_SUPP(mvdev)) { 450 + ret = mlx5vf_cmd_load_vhca_state(mvdev, 451 + mvdev->resuming_migf, 452 + mvdev->resuming_migf->buf); 453 + if (ret) 454 + return ERR_PTR(ret); 455 + } 933 456 mlx5vf_disable_fds(mvdev); 934 457 return NULL; 458 + } 459 + 460 + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) || 461 + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && 462 + new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { 463 + struct mlx5_vf_migration_file *migf; 464 + 465 + migf = mlx5vf_pci_save_device_data(mvdev, true); 466 + if (IS_ERR(migf)) 467 + return ERR_CAST(migf); 468 + get_file(migf->filp); 469 + mvdev->saving_migf = migf; 470 + return migf->filp; 471 + } 472 + 473 + if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) { 474 + ret = mlx5vf_cmd_suspend_vhca(mvdev, 475 + MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER); 476 + if (ret) 477 + return ERR_PTR(ret); 478 + ret = mlx5vf_pci_save_device_inc_data(mvdev); 479 + return ret ? 
ERR_PTR(ret) : NULL; 935 480 } 936 481 937 482 /* ··· 1015 510 } 1016 511 mlx5vf_state_mutex_unlock(mvdev); 1017 512 return res; 513 + } 514 + 515 + static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, 516 + unsigned long *stop_copy_length) 517 + { 518 + struct mlx5vf_pci_core_device *mvdev = container_of( 519 + vdev, struct mlx5vf_pci_core_device, core_device.vdev); 520 + size_t state_size; 521 + int ret; 522 + 523 + mutex_lock(&mvdev->state_mutex); 524 + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, 525 + &state_size, 0); 526 + if (!ret) 527 + *stop_copy_length = state_size; 528 + mlx5vf_state_mutex_unlock(mvdev); 529 + return ret; 1018 530 } 1019 531 1020 532 static int mlx5vf_pci_get_device_state(struct vfio_device *vdev, ··· 1099 577 static const struct vfio_migration_ops mlx5vf_pci_mig_ops = { 1100 578 .migration_set_state = mlx5vf_pci_set_device_state, 1101 579 .migration_get_state = mlx5vf_pci_get_device_state, 580 + .migration_get_data_size = mlx5vf_pci_get_data_size, 1102 581 }; 1103 582 1104 583 static const struct vfio_log_ops mlx5vf_pci_log_ops = { ··· 1202 679 .driver_managed_dma = true, 1203 680 }; 1204 681 1205 - static void __exit mlx5vf_pci_cleanup(void) 1206 - { 1207 - pci_unregister_driver(&mlx5vf_pci_driver); 1208 - } 1209 - 1210 - static int __init mlx5vf_pci_init(void) 1211 - { 1212 - return pci_register_driver(&mlx5vf_pci_driver); 1213 - } 1214 - 1215 - module_init(mlx5vf_pci_init); 1216 - module_exit(mlx5vf_pci_cleanup); 682 + module_pci_driver(mlx5vf_pci_driver); 1217 683 1218 684 MODULE_LICENSE("GPL"); 1219 685 MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
+11 -4
drivers/vfio/pci/vfio_pci_core.c
··· 27 27 #include <linux/vgaarb.h> 28 28 #include <linux/nospec.h> 29 29 #include <linux/sched/mm.h> 30 + #if IS_ENABLED(CONFIG_EEH) 31 + #include <asm/eeh.h> 32 + #endif 30 33 31 34 #include "vfio_pci_priv.h" 32 35 ··· 689 686 vdev->sriov_pf_core_dev->vf_token->users--; 690 687 mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock); 691 688 } 692 - vfio_spapr_pci_eeh_release(vdev->pdev); 689 + #if IS_ENABLED(CONFIG_EEH) 690 + eeh_dev_release(vdev->pdev); 691 + #endif 693 692 vfio_pci_core_disable(vdev); 694 693 695 694 mutex_lock(&vdev->igate); ··· 710 705 void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) 711 706 { 712 707 vfio_pci_probe_mmaps(vdev); 713 - vfio_spapr_pci_eeh_open(vdev->pdev); 708 + #if IS_ENABLED(CONFIG_EEH) 709 + eeh_dev_open(vdev->pdev); 710 + #endif 714 711 715 712 if (vdev->sriov_pf_core_dev) { 716 713 mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock); ··· 2116 2109 mutex_destroy(&vdev->vma_lock); 2117 2110 kfree(vdev->region); 2118 2111 kfree(vdev->pm_save); 2119 - vfio_free_device(core_vdev); 2120 2112 } 2121 2113 EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev); 2122 2114 ··· 2134 2128 2135 2129 if (vdev->vdev.mig_ops) { 2136 2130 if (!(vdev->vdev.mig_ops->migration_get_state && 2137 - vdev->vdev.mig_ops->migration_set_state) || 2131 + vdev->vdev.mig_ops->migration_set_state && 2132 + vdev->vdev.mig_ops->migration_get_data_size) || 2138 2133 !(vdev->vdev.migration_flags & VFIO_MIGRATION_STOP_COPY)) 2139 2134 return -EINVAL; 2140 2135 }
-1
drivers/vfio/platform/vfio_amba.c
··· 95 95 96 96 vfio_platform_release_common(vdev); 97 97 kfree(vdev->name); 98 - vfio_free_device(core_vdev); 99 98 } 100 99 101 100 static void vfio_amba_remove(struct amba_device *adev)
-1
drivers/vfio/platform/vfio_platform.c
··· 83 83 container_of(core_vdev, struct vfio_platform_device, vdev); 84 84 85 85 vfio_platform_release_common(vdev); 86 - vfio_free_device(core_vdev); 87 86 } 88 87 89 88 static int vfio_platform_remove(struct platform_device *pdev)
+1 -2
drivers/vfio/platform/vfio_platform_common.c
··· 72 72 const char **extra_dbg) 73 73 { 74 74 #ifdef CONFIG_ACPI 75 - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; 76 75 struct device *dev = vdev->device; 77 76 acpi_handle handle = ACPI_HANDLE(dev); 78 77 acpi_status acpi_ret; 79 78 80 - acpi_ret = acpi_evaluate_object(handle, "_RST", NULL, &buffer); 79 + acpi_ret = acpi_evaluate_object(handle, "_RST", NULL, NULL); 81 80 if (ACPI_FAILURE(acpi_ret)) { 82 81 if (extra_dbg) 83 82 *extra_dbg = acpi_format_exception(acpi_ret);
+13
drivers/vfio/vfio.h
··· 232 232 } 233 233 #endif 234 234 235 + #if IS_ENABLED(CONFIG_VFIO_VIRQFD) 236 + int __init vfio_virqfd_init(void); 237 + void vfio_virqfd_exit(void); 238 + #else 239 + static inline int __init vfio_virqfd_init(void) 240 + { 241 + return 0; 242 + } 243 + static inline void vfio_virqfd_exit(void) 244 + { 245 + } 246 + #endif 247 + 235 248 #ifdef CONFIG_VFIO_NOIOMMU 236 249 extern bool vfio_noiommu __read_mostly; 237 250 #else
+57 -8
drivers/vfio/vfio_iommu_spapr_tce.c
··· 4 4 * 5 5 * Copyright (C) 2013 IBM Corp. All rights reserved. 6 6 * Author: Alexey Kardashevskiy <aik@ozlabs.ru> 7 + * Copyright Gavin Shan, IBM Corporation 2014. 7 8 * 8 9 * Derived from original vfio_iommu_type1.c: 9 10 * Copyright (C) 2012 Red Hat, Inc. All rights reserved. ··· 774 773 return ret; 775 774 } 776 775 776 + static long vfio_spapr_ioctl_eeh_pe_op(struct iommu_group *group, 777 + unsigned long arg) 778 + { 779 + struct eeh_pe *pe; 780 + struct vfio_eeh_pe_op op; 781 + unsigned long minsz; 782 + 783 + pe = eeh_iommu_group_to_pe(group); 784 + if (!pe) 785 + return -ENODEV; 786 + 787 + minsz = offsetofend(struct vfio_eeh_pe_op, op); 788 + if (copy_from_user(&op, (void __user *)arg, minsz)) 789 + return -EFAULT; 790 + if (op.argsz < minsz || op.flags) 791 + return -EINVAL; 792 + 793 + switch (op.op) { 794 + case VFIO_EEH_PE_DISABLE: 795 + return eeh_pe_set_option(pe, EEH_OPT_DISABLE); 796 + case VFIO_EEH_PE_ENABLE: 797 + return eeh_pe_set_option(pe, EEH_OPT_ENABLE); 798 + case VFIO_EEH_PE_UNFREEZE_IO: 799 + return eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO); 800 + case VFIO_EEH_PE_UNFREEZE_DMA: 801 + return eeh_pe_set_option(pe, EEH_OPT_THAW_DMA); 802 + case VFIO_EEH_PE_GET_STATE: 803 + return eeh_pe_get_state(pe); 804 + break; 805 + case VFIO_EEH_PE_RESET_DEACTIVATE: 806 + return eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true); 807 + case VFIO_EEH_PE_RESET_HOT: 808 + return eeh_pe_reset(pe, EEH_RESET_HOT, true); 809 + case VFIO_EEH_PE_RESET_FUNDAMENTAL: 810 + return eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true); 811 + case VFIO_EEH_PE_CONFIGURE: 812 + return eeh_pe_configure(pe); 813 + case VFIO_EEH_PE_INJECT_ERR: 814 + minsz = offsetofend(struct vfio_eeh_pe_op, err.mask); 815 + if (op.argsz < minsz) 816 + return -EINVAL; 817 + if (copy_from_user(&op, (void __user *)arg, minsz)) 818 + return -EFAULT; 819 + 820 + return eeh_pe_inject_err(pe, op.err.type, op.err.func, 821 + op.err.addr, op.err.mask); 822 + default: 823 + return -EINVAL; 824 + } 825 + } 826 
+ 777 827 static long tce_iommu_ioctl(void *iommu_data, 778 828 unsigned int cmd, unsigned long arg) 779 829 { ··· 837 785 switch (arg) { 838 786 case VFIO_SPAPR_TCE_IOMMU: 839 787 case VFIO_SPAPR_TCE_v2_IOMMU: 840 - ret = 1; 841 - break; 788 + return 1; 789 + case VFIO_EEH: 790 + return eeh_enabled(); 842 791 default: 843 - ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg); 844 - break; 792 + return 0; 845 793 } 846 - 847 - return (ret < 0) ? 0 : ret; 848 794 } 849 795 850 796 /* ··· 1096 1046 1097 1047 ret = 0; 1098 1048 list_for_each_entry(tcegrp, &container->group_list, next) { 1099 - ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp, 1100 - cmd, arg); 1049 + ret = vfio_spapr_ioctl_eeh_pe_op(tcegrp->grp, arg); 1101 1050 if (ret) 1102 1051 return ret; 1103 1052 }
+120 -25
drivers/vfio/vfio_main.c
··· 158 158 vfio_release_device_set(device); 159 159 ida_free(&vfio.device_ida, device->index); 160 160 161 - /* 162 - * kvfree() cannot be done here due to a life cycle mess in 163 - * vfio-ccw. Before the ccw part is fixed all drivers are 164 - * required to support @release and call vfio_free_device() 165 - * from there. 166 - */ 167 - device->ops->release(device); 161 + if (device->ops->release) 162 + device->ops->release(device); 163 + 164 + kvfree(device); 168 165 } 166 + 167 + static int vfio_init_device(struct vfio_device *device, struct device *dev, 168 + const struct vfio_device_ops *ops); 169 169 170 170 /* 171 171 * Allocate and initialize vfio_device so it can be registered to vfio ··· 205 205 206 206 /* 207 207 * Initialize a vfio_device so it can be registered to vfio core. 208 - * 209 - * Only vfio-ccw driver should call this interface. 210 208 */ 211 - int vfio_init_device(struct vfio_device *device, struct device *dev, 212 - const struct vfio_device_ops *ops) 209 + static int vfio_init_device(struct vfio_device *device, struct device *dev, 210 + const struct vfio_device_ops *ops) 213 211 { 214 212 int ret; 215 213 ··· 239 241 ida_free(&vfio.device_ida, device->index); 240 242 return ret; 241 243 } 242 - EXPORT_SYMBOL_GPL(vfio_init_device); 243 - 244 - /* 245 - * The helper called by driver @release callback to free the device 246 - * structure. Drivers which don't have private data to clean can 247 - * simply use this helper as its @release. 
248 - */ 249 - void vfio_free_device(struct vfio_device *device) 250 - { 251 - kvfree(device); 252 - } 253 - EXPORT_SYMBOL_GPL(vfio_free_device); 254 244 255 245 static int __vfio_register_dev(struct vfio_device *device, 256 246 enum vfio_group_type type) ··· 490 504 enum vfio_device_mig_state new_fsm, 491 505 enum vfio_device_mig_state *next_fsm) 492 506 { 493 - enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 }; 507 + enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 }; 494 508 /* 495 509 * The coding in this table requires the driver to implement the 496 510 * following FSM arcs: ··· 505 519 * RUNNING_P2P -> RUNNING 506 520 * RUNNING_P2P -> STOP 507 521 * STOP -> RUNNING_P2P 508 - * Without P2P the driver must implement: 522 + * 523 + * If precopy is supported then the driver must support these additional 524 + * FSM arcs: 525 + * RUNNING -> PRE_COPY 526 + * PRE_COPY -> RUNNING 527 + * PRE_COPY -> STOP_COPY 528 + * However, if precopy and P2P are supported together then the driver 529 + * must support these additional arcs beyond the P2P arcs above: 530 + * PRE_COPY -> RUNNING 531 + * PRE_COPY -> PRE_COPY_P2P 532 + * PRE_COPY_P2P -> PRE_COPY 533 + * PRE_COPY_P2P -> RUNNING_P2P 534 + * PRE_COPY_P2P -> STOP_COPY 535 + * RUNNING -> PRE_COPY 536 + * RUNNING_P2P -> PRE_COPY_P2P 537 + * 538 + * Without P2P and precopy the driver must implement: 509 539 * RUNNING -> STOP 510 540 * STOP -> RUNNING 511 541 * 512 542 * The coding will step through multiple states for some combination 513 543 * transitions; if all optional features are supported, this means the 514 544 * following ones: 545 + * PRE_COPY -> PRE_COPY_P2P -> STOP_COPY 546 + * PRE_COPY -> RUNNING -> RUNNING_P2P 547 + * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP 548 + * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING 549 + * PRE_COPY_P2P -> RUNNING_P2P -> RUNNING 550 + * PRE_COPY_P2P -> RUNNING_P2P -> STOP 551 + * PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING 515 552 * 
RESUMING -> STOP -> RUNNING_P2P 553 + * RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P 516 554 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING 555 + * RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY 517 556 * RESUMING -> STOP -> STOP_COPY 557 + * RUNNING -> RUNNING_P2P -> PRE_COPY_P2P 518 558 * RUNNING -> RUNNING_P2P -> STOP 519 559 * RUNNING -> RUNNING_P2P -> STOP -> RESUMING 520 560 * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY 561 + * RUNNING_P2P -> RUNNING -> PRE_COPY 521 562 * RUNNING_P2P -> STOP -> RESUMING 522 563 * RUNNING_P2P -> STOP -> STOP_COPY 564 + * STOP -> RUNNING_P2P -> PRE_COPY_P2P 523 565 * STOP -> RUNNING_P2P -> RUNNING 566 + * STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY 524 567 * STOP_COPY -> STOP -> RESUMING 525 568 * STOP_COPY -> STOP -> RUNNING_P2P 526 569 * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING 570 + * 571 + * The following transitions are blocked: 572 + * STOP_COPY -> PRE_COPY 573 + * STOP_COPY -> PRE_COPY_P2P 527 574 */ 528 575 static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = { 529 576 [VFIO_DEVICE_STATE_STOP] = { 530 577 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, 531 578 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P, 579 + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P, 580 + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 532 581 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, 533 582 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, 534 583 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, ··· 572 551 [VFIO_DEVICE_STATE_RUNNING] = { 573 552 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P, 574 553 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, 554 + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, 555 + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 575 556 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P, 557 + [VFIO_DEVICE_STATE_RESUMING] = 
VFIO_DEVICE_STATE_RUNNING_P2P, 558 + [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 559 + [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 560 + }, 561 + [VFIO_DEVICE_STATE_PRE_COPY] = { 562 + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING, 563 + [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, 564 + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, 565 + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, 566 + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P, 567 + [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING, 568 + [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING, 569 + [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, 570 + }, 571 + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = { 572 + [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P, 573 + [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P, 574 + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY, 575 + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, 576 + [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, 576 577 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P, 577 578 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, 578 579 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR, ··· 602 559 [VFIO_DEVICE_STATE_STOP_COPY] = { 603 560 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, 604 561 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, 562 + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR, 563 + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR, 605 564 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY, 606 565 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, 607 566 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, ··· 612 567 [VFIO_DEVICE_STATE_RESUMING] = { 613 568 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, 614 569 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP, 570 + 
[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP, 571 + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP, 615 572 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, 616 573 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING, 617 574 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP, ··· 622 575 [VFIO_DEVICE_STATE_RUNNING_P2P] = { 623 576 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP, 624 577 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING, 578 + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING, 579 + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P, 625 580 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP, 626 581 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP, 627 582 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P, ··· 632 583 [VFIO_DEVICE_STATE_ERROR] = { 633 584 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR, 634 585 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR, 586 + [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR, 587 + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR, 635 588 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR, 636 589 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR, 637 590 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR, ··· 644 593 static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = { 645 594 [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY, 646 595 [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY, 596 + [VFIO_DEVICE_STATE_PRE_COPY] = 597 + VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY, 598 + [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY | 599 + VFIO_MIGRATION_P2P | 600 + VFIO_MIGRATION_PRE_COPY, 647 601 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY, 648 602 [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY, 649 603 [VFIO_DEVICE_STATE_RUNNING_P2P] = ··· 757 701 return -EFAULT; 758 702 if (IS_ERR(filp)) 759 703 return 
PTR_ERR(filp); 704 + return 0; 705 + } 706 + 707 + static int 708 + vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device, 709 + u32 flags, void __user *arg, 710 + size_t argsz) 711 + { 712 + struct vfio_device_feature_mig_data_size data_size = {}; 713 + unsigned long stop_copy_length; 714 + int ret; 715 + 716 + if (!device->mig_ops) 717 + return -ENOTTY; 718 + 719 + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, 720 + sizeof(data_size)); 721 + if (ret != 1) 722 + return ret; 723 + 724 + ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length); 725 + if (ret) 726 + return ret; 727 + 728 + data_size.stop_copy_length = stop_copy_length; 729 + if (copy_to_user(arg, &data_size, sizeof(data_size))) 730 + return -EFAULT; 731 + 760 732 return 0; 761 733 } 762 734 ··· 1013 929 feature.argsz - minsz); 1014 930 case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT: 1015 931 return vfio_ioctl_device_feature_logging_report( 932 + device, feature.flags, arg->data, 933 + feature.argsz - minsz); 934 + case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE: 935 + return vfio_ioctl_device_feature_migration_data_size( 1016 936 device, feature.flags, arg->data, 1017 937 feature.argsz - minsz); 1018 938 default: ··· 1348 1260 if (ret) 1349 1261 return ret; 1350 1262 1263 + ret = vfio_virqfd_init(); 1264 + if (ret) 1265 + goto err_virqfd; 1266 + 1351 1267 /* /sys/class/vfio-dev/vfioX */ 1352 1268 vfio.device_class = class_create(THIS_MODULE, "vfio-dev"); 1353 1269 if (IS_ERR(vfio.device_class)) { ··· 1363 1271 return 0; 1364 1272 1365 1273 err_dev_class: 1274 + vfio_virqfd_exit(); 1275 + err_virqfd: 1366 1276 vfio_group_cleanup(); 1367 1277 return ret; 1368 1278 } ··· 1374 1280 ida_destroy(&vfio.device_ida); 1375 1281 class_destroy(vfio.device_class); 1376 1282 vfio.device_class = NULL; 1283 + vfio_virqfd_exit(); 1377 1284 vfio_group_cleanup(); 1378 1285 xa_destroy(&vfio_device_set_xa); 1379 1286 }
-107
drivers/vfio/vfio_spapr_eeh.c
··· 1 - // SPDX-License-Identifier: GPL-2.0-only 2 - /* 3 - * EEH functionality support for VFIO devices. The feature is only 4 - * available on sPAPR compatible platforms. 5 - * 6 - * Copyright Gavin Shan, IBM Corporation 2014. 7 - */ 8 - 9 - #include <linux/module.h> 10 - #include <linux/uaccess.h> 11 - #include <linux/vfio.h> 12 - #include <asm/eeh.h> 13 - 14 - #define DRIVER_VERSION "0.1" 15 - #define DRIVER_AUTHOR "Gavin Shan, IBM Corporation" 16 - #define DRIVER_DESC "VFIO IOMMU SPAPR EEH" 17 - 18 - /* We might build address mapping here for "fast" path later */ 19 - void vfio_spapr_pci_eeh_open(struct pci_dev *pdev) 20 - { 21 - eeh_dev_open(pdev); 22 - } 23 - EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_open); 24 - 25 - void vfio_spapr_pci_eeh_release(struct pci_dev *pdev) 26 - { 27 - eeh_dev_release(pdev); 28 - } 29 - EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_release); 30 - 31 - long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, 32 - unsigned int cmd, unsigned long arg) 33 - { 34 - struct eeh_pe *pe; 35 - struct vfio_eeh_pe_op op; 36 - unsigned long minsz; 37 - long ret = -EINVAL; 38 - 39 - switch (cmd) { 40 - case VFIO_CHECK_EXTENSION: 41 - if (arg == VFIO_EEH) 42 - ret = eeh_enabled() ? 
1 : 0; 43 - else 44 - ret = 0; 45 - break; 46 - case VFIO_EEH_PE_OP: 47 - pe = eeh_iommu_group_to_pe(group); 48 - if (!pe) 49 - return -ENODEV; 50 - 51 - minsz = offsetofend(struct vfio_eeh_pe_op, op); 52 - if (copy_from_user(&op, (void __user *)arg, minsz)) 53 - return -EFAULT; 54 - if (op.argsz < minsz || op.flags) 55 - return -EINVAL; 56 - 57 - switch (op.op) { 58 - case VFIO_EEH_PE_DISABLE: 59 - ret = eeh_pe_set_option(pe, EEH_OPT_DISABLE); 60 - break; 61 - case VFIO_EEH_PE_ENABLE: 62 - ret = eeh_pe_set_option(pe, EEH_OPT_ENABLE); 63 - break; 64 - case VFIO_EEH_PE_UNFREEZE_IO: 65 - ret = eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO); 66 - break; 67 - case VFIO_EEH_PE_UNFREEZE_DMA: 68 - ret = eeh_pe_set_option(pe, EEH_OPT_THAW_DMA); 69 - break; 70 - case VFIO_EEH_PE_GET_STATE: 71 - ret = eeh_pe_get_state(pe); 72 - break; 73 - case VFIO_EEH_PE_RESET_DEACTIVATE: 74 - ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true); 75 - break; 76 - case VFIO_EEH_PE_RESET_HOT: 77 - ret = eeh_pe_reset(pe, EEH_RESET_HOT, true); 78 - break; 79 - case VFIO_EEH_PE_RESET_FUNDAMENTAL: 80 - ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true); 81 - break; 82 - case VFIO_EEH_PE_CONFIGURE: 83 - ret = eeh_pe_configure(pe); 84 - break; 85 - case VFIO_EEH_PE_INJECT_ERR: 86 - minsz = offsetofend(struct vfio_eeh_pe_op, err.mask); 87 - if (op.argsz < minsz) 88 - return -EINVAL; 89 - if (copy_from_user(&op, (void __user *)arg, minsz)) 90 - return -EFAULT; 91 - 92 - ret = eeh_pe_inject_err(pe, op.err.type, op.err.func, 93 - op.err.addr, op.err.mask); 94 - break; 95 - default: 96 - ret = -EINVAL; 97 - } 98 - } 99 - 100 - return ret; 101 - } 102 - EXPORT_SYMBOL_GPL(vfio_spapr_iommu_eeh_ioctl); 103 - 104 - MODULE_VERSION(DRIVER_VERSION); 105 - MODULE_LICENSE("GPL v2"); 106 - MODULE_AUTHOR(DRIVER_AUTHOR); 107 - MODULE_DESCRIPTION(DRIVER_DESC);
+3 -14
drivers/vfio/virqfd.c
··· 12 12 #include <linux/file.h> 13 13 #include <linux/module.h> 14 14 #include <linux/slab.h> 15 - 16 - #define DRIVER_VERSION "0.1" 17 - #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" 18 - #define DRIVER_DESC "IRQFD support for VFIO bus drivers" 15 + #include "vfio.h" 19 16 20 17 static struct workqueue_struct *vfio_irqfd_cleanup_wq; 21 18 static DEFINE_SPINLOCK(virqfd_lock); 22 19 23 - static int __init vfio_virqfd_init(void) 20 + int __init vfio_virqfd_init(void) 24 21 { 25 22 vfio_irqfd_cleanup_wq = 26 23 create_singlethread_workqueue("vfio-irqfd-cleanup"); ··· 27 30 return 0; 28 31 } 29 32 30 - static void __exit vfio_virqfd_exit(void) 33 + void vfio_virqfd_exit(void) 31 34 { 32 35 destroy_workqueue(vfio_irqfd_cleanup_wq); 33 36 } ··· 213 216 flush_workqueue(vfio_irqfd_cleanup_wq); 214 217 } 215 218 EXPORT_SYMBOL_GPL(vfio_virqfd_disable); 216 - 217 - module_init(vfio_virqfd_init); 218 - module_exit(vfio_virqfd_exit); 219 - 220 - MODULE_VERSION(DRIVER_VERSION); 221 - MODULE_LICENSE("GPL v2"); 222 - MODULE_AUTHOR(DRIVER_AUTHOR); 223 - MODULE_DESCRIPTION(DRIVER_DESC);
+11 -3
include/linux/mlx5/mlx5_ifc.h
··· 1891 1891 u8 max_reformat_remove_size[0x8]; 1892 1892 u8 max_reformat_remove_offset[0x8]; 1893 1893 1894 - u8 reserved_at_c0[0xe0]; 1894 + u8 reserved_at_c0[0x8]; 1895 + u8 migration_multi_load[0x1]; 1896 + u8 migration_tracking_state[0x1]; 1897 + u8 reserved_at_ca[0x16]; 1898 + 1899 + u8 reserved_at_e0[0xc0]; 1895 1900 1896 1901 u8 reserved_at_1a0[0xb]; 1897 1902 u8 log_min_mkey_entity_size[0x5]; ··· 12038 12033 u8 reserved_at_20[0x10]; 12039 12034 u8 op_mod[0x10]; 12040 12035 12041 - u8 reserved_at_40[0x10]; 12036 + u8 incremental[0x1]; 12037 + u8 reserved_at_41[0xf]; 12042 12038 u8 vhca_id[0x10]; 12043 12039 12044 12040 u8 reserved_at_60[0x20]; ··· 12065 12059 u8 reserved_at_20[0x10]; 12066 12060 u8 op_mod[0x10]; 12067 12061 12068 - u8 reserved_at_40[0x10]; 12062 + u8 incremental[0x1]; 12063 + u8 set_track[0x1]; 12064 + u8 reserved_at_42[0xe]; 12069 12065 u8 vhca_id[0x10]; 12070 12066 12071 12067 u8 reserved_at_60[0x20];
+5 -26
include/linux/vfio.h
··· 146 146 * @migration_get_state: Optional callback to get the migration state for 147 147 * devices that support migration. It's mandatory for 148 148 * VFIO_DEVICE_FEATURE_MIGRATION migration support. 149 + * @migration_get_data_size: Optional callback to get the estimated data 150 + * length that will be required to complete stop copy. It's mandatory for 151 + * VFIO_DEVICE_FEATURE_MIGRATION migration support. 149 152 */ 150 153 struct vfio_migration_ops { 151 154 struct file *(*migration_set_state)( ··· 156 153 enum vfio_device_mig_state new_state); 157 154 int (*migration_get_state)(struct vfio_device *device, 158 155 enum vfio_device_mig_state *curr_state); 156 + int (*migration_get_data_size)(struct vfio_device *device, 157 + unsigned long *stop_copy_length); 159 158 }; 160 159 161 160 /** ··· 220 215 dev, ops), \ 221 216 struct dev_struct, member) 222 217 223 - int vfio_init_device(struct vfio_device *device, struct device *dev, 224 - const struct vfio_device_ops *ops); 225 - void vfio_free_device(struct vfio_device *device); 226 218 static inline void vfio_put_device(struct vfio_device *device) 227 219 { 228 220 put_device(&device->device); ··· 272 270 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, 273 271 int num_irqs, int max_irq_type, 274 272 size_t *data_size); 275 - 276 - struct pci_dev; 277 - #if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH) 278 - void vfio_spapr_pci_eeh_open(struct pci_dev *pdev); 279 - void vfio_spapr_pci_eeh_release(struct pci_dev *pdev); 280 - long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, 281 - unsigned long arg); 282 - #else 283 - static inline void vfio_spapr_pci_eeh_open(struct pci_dev *pdev) 284 - { 285 - } 286 - 287 - static inline void vfio_spapr_pci_eeh_release(struct pci_dev *pdev) 288 - { 289 - } 290 - 291 - static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, 292 - unsigned int cmd, 293 - unsigned long arg) 294 - { 295 - return -ENOTTY; 296 - } 297 - #endif /* 
CONFIG_VFIO_SPAPR_EEH */ 298 273 299 274 /* 300 275 * IRQfd - generic
+133 -5
include/uapi/linux/vfio.h
··· 819 819 * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P means that RUNNING_P2P 820 820 * is supported in addition to the STOP_COPY states. 821 821 * 822 + * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY means that 823 + * PRE_COPY is supported in addition to the STOP_COPY states. 824 + * 825 + * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P | VFIO_MIGRATION_PRE_COPY 826 + * means that RUNNING_P2P, PRE_COPY and PRE_COPY_P2P are supported 827 + * in addition to the STOP_COPY states. 828 + * 822 829 * Other combinations of flags have behavior to be defined in the future. 823 830 */ 824 831 struct vfio_device_feature_migration { 825 832 __aligned_u64 flags; 826 833 #define VFIO_MIGRATION_STOP_COPY (1 << 0) 827 834 #define VFIO_MIGRATION_P2P (1 << 1) 835 + #define VFIO_MIGRATION_PRE_COPY (1 << 2) 828 836 }; 829 837 #define VFIO_DEVICE_FEATURE_MIGRATION 1 830 838 ··· 883 875 * RESUMING - The device is stopped and is loading a new internal state 884 876 * ERROR - The device has failed and must be reset 885 877 * 886 - * And 1 optional state to support VFIO_MIGRATION_P2P: 878 + * And optional states to support VFIO_MIGRATION_P2P: 887 879 * RUNNING_P2P - RUNNING, except the device cannot do peer to peer DMA 880 + * And VFIO_MIGRATION_PRE_COPY: 881 + * PRE_COPY - The device is running normally but tracking internal state 882 + * changes 883 + * And VFIO_MIGRATION_P2P | VFIO_MIGRATION_PRE_COPY: 884 + * PRE_COPY_P2P - PRE_COPY, except the device cannot do peer to peer DMA 888 885 * 889 886 * The FSM takes actions on the arcs between FSM states. The driver implements 890 887 * the following behavior for the FSM arcs: ··· 921 908 * 922 909 * To abort a RESUMING session the device must be reset. 
923 910 * 911 + * PRE_COPY -> RUNNING 924 912 * RUNNING_P2P -> RUNNING 925 913 * While in RUNNING the device is fully operational, the device may generate 926 914 * interrupts, DMA, respond to MMIO, all vfio device regions are functional, 927 915 * and the device may advance its internal state. 928 916 * 917 + * The PRE_COPY arc will terminate a data transfer session. 918 + * 919 + * PRE_COPY_P2P -> RUNNING_P2P 929 920 * RUNNING -> RUNNING_P2P 930 921 * STOP -> RUNNING_P2P 931 922 * While in RUNNING_P2P the device is partially running in the P2P quiescent 932 923 * state defined below. 933 924 * 934 - * STOP -> STOP_COPY 935 - * This arc begin the process of saving the device state and will return a 936 - * new data_fd. 925 + * The PRE_COPY_P2P arc will terminate a data transfer session. 937 926 * 927 + * RUNNING -> PRE_COPY 928 + * RUNNING_P2P -> PRE_COPY_P2P 929 + * STOP -> STOP_COPY 930 + * PRE_COPY, PRE_COPY_P2P and STOP_COPY form the "saving group" of states 931 + * which share a data transfer session. Moving between these states alters 932 + * what is streamed in session, but does not terminate or otherwise affect 933 + * the associated fd. 934 + * 935 + * These arcs begin the process of saving the device state and will return a 936 + * new data_fd. The migration driver may perform actions such as enabling 937 + * dirty logging of device state when entering PRE_COPY or PRE_COPY_P2P. 938 + * 939 + * Each arc does not change the device operation, the device remains 940 + * RUNNING, P2P quiesced or in STOP. The STOP_COPY state is described below 941 + * in PRE_COPY_P2P -> STOP_COPY. 942 + * 943 + * PRE_COPY -> PRE_COPY_P2P 944 + * Entering PRE_COPY_P2P continues all the behaviors of PRE_COPY above. 945 + * However, while in the PRE_COPY_P2P state, the device is partially running 946 + * in the P2P quiescent state defined below, like RUNNING_P2P. 
947 + * 948 + * PRE_COPY_P2P -> PRE_COPY 949 + * This arc allows returning the device to a full RUNNING behavior while 950 + * continuing all the behaviors of PRE_COPY. 951 + * 952 + * PRE_COPY_P2P -> STOP_COPY 938 953 * While in the STOP_COPY state the device has the same behavior as STOP 939 954 * with the addition that the data transfers session continues to stream the 940 955 * migration state. End of stream on the FD indicates the entire device ··· 980 939 * device state for this arc if required to prepare the device to receive the 981 940 * migration data. 982 941 * 942 + * STOP_COPY -> PRE_COPY 943 + * STOP_COPY -> PRE_COPY_P2P 944 + * These arcs are not permitted and return error if requested. Future 945 + * revisions of this API may define behaviors for these arcs, in this case 946 + * support will be discoverable by a new flag in 947 + * VFIO_DEVICE_FEATURE_MIGRATION. 948 + * 983 949 * any -> ERROR 984 950 * ERROR cannot be specified as a device state, however any transition request 985 951 * can be failed with an errno return and may then move the device_state into ··· 998 950 * The optional peer to peer (P2P) quiescent state is intended to be a quiescent 999 951 * state for the device for the purposes of managing multiple devices within a 1000 952 * user context where peer-to-peer DMA between devices may be active. The 1001 - * RUNNING_P2P states must prevent the device from initiating 953 + * RUNNING_P2P and PRE_COPY_P2P states must prevent the device from initiating 1002 954 * any new P2P DMA transactions. If the device can identify P2P transactions 1003 955 * then it can stop only P2P DMA, otherwise it must stop all DMA. The migration 1004 956 * driver must complete any such outstanding operations prior to completing the ··· 1011 963 * above FSM arcs. As there are multiple paths through the FSM arcs the path 1012 964 * should be selected based on the following rules: 1013 965 * - Select the shortest path. 
966 + * - The path cannot have saving group states as interior arcs, only 967 + * starting/end states. 1014 968 * Refer to vfio_mig_get_next_state() for the result of the algorithm. 1015 969 * 1016 970 * The automatic transit through the FSM arcs that make up the combination ··· 1026 976 * support them. The user can discover if these states are supported by using 1027 977 * VFIO_DEVICE_FEATURE_MIGRATION. By using combination transitions the user can 1028 978 * avoid knowing about these optional states if the kernel driver supports them. 979 + * 980 + * Arcs touching PRE_COPY and PRE_COPY_P2P are removed if support for PRE_COPY 981 + * is not present. 1029 982 */ 1030 983 enum vfio_device_mig_state { 1031 984 VFIO_DEVICE_STATE_ERROR = 0, ··· 1037 984 VFIO_DEVICE_STATE_STOP_COPY = 3, 1038 985 VFIO_DEVICE_STATE_RESUMING = 4, 1039 986 VFIO_DEVICE_STATE_RUNNING_P2P = 5, 987 + VFIO_DEVICE_STATE_PRE_COPY = 6, 988 + VFIO_DEVICE_STATE_PRE_COPY_P2P = 7, 1040 989 }; 990 + 991 + /** 992 + * VFIO_MIG_GET_PRECOPY_INFO - _IO(VFIO_TYPE, VFIO_BASE + 21) 993 + * 994 + * This ioctl is used on the migration data FD in the precopy phase of the 995 + * migration data transfer. It returns an estimate of the current data sizes 996 + * remaining to be transferred. It allows the user to judge when it is 997 + * appropriate to leave PRE_COPY for STOP_COPY. 998 + * 999 + * This ioctl is valid only in PRE_COPY states and kernel driver should 1000 + * return -EINVAL from any other migration state. 1001 + * 1002 + * The vfio_precopy_info data structure returned by this ioctl provides 1003 + * estimates of data available from the device during the PRE_COPY states. 1004 + * This estimate is split into two categories, initial_bytes and 1005 + * dirty_bytes. 1006 + * 1007 + * The initial_bytes field indicates the amount of initial precopy 1008 + * data available from the device. This field should have a non-zero initial 1009 + * value and decrease as migration data is read from the device. 
1010 + * It is recommended to leave PRE_COPY for STOP_COPY only after this field 1011 + * reaches zero. Leaving PRE_COPY earlier might make things slower. 1012 + * 1013 + * The dirty_bytes field tracks device state changes relative to data 1014 + * previously retrieved. This field starts at zero and may increase as 1015 + * the internal device state is modified or decrease as that modified 1016 + * state is read from the device. 1017 + * 1018 + * Userspace may use the combination of these fields to estimate the 1019 + * potential data size available during the PRE_COPY phases, as well as 1020 + * trends relative to the rate the device is dirtying its internal 1021 + * state, but these fields are not required to have any bearing relative 1022 + * to the data size available during the STOP_COPY phase. 1023 + * 1024 + * Drivers have a lot of flexibility in when and what they transfer during the 1025 + * PRE_COPY phase, and how they report this from VFIO_MIG_GET_PRECOPY_INFO. 1026 + * 1027 + * During pre-copy the migration data FD has a temporary "end of stream" that is 1028 + * reached when both initial_bytes and dirty_bytes are zero. For instance, this 1029 + * may indicate that the device is idle and not currently dirtying any internal 1030 + * state. When read() is done on this temporary end of stream the kernel driver 1031 + * should return ENOMSG from read(). Userspace can wait for more data (which may 1032 + * never come) by using poll. 1033 + * 1034 + * Once in STOP_COPY the migration data FD has a permanent end of stream 1035 + * signaled in the usual way by read() always returning 0 and poll always 1036 + * returning readable. ENOMSG may not be returned in STOP_COPY. 1037 + * Support for this ioctl is mandatory if a driver claims to support 1038 + * VFIO_MIGRATION_PRE_COPY. 1039 + * 1040 + * Return: 0 on success, -1 and errno set on failure. 
1041 + */ 1042 + struct vfio_precopy_info { 1043 + __u32 argsz; 1044 + __u32 flags; 1045 + __aligned_u64 initial_bytes; 1046 + __aligned_u64 dirty_bytes; 1047 + }; 1048 + 1049 + #define VFIO_MIG_GET_PRECOPY_INFO _IO(VFIO_TYPE, VFIO_BASE + 21) 1041 1050 1042 1051 /* 1043 1052 * Upon VFIO_DEVICE_FEATURE_SET, allow the device to be moved into a low power ··· 1242 1127 }; 1243 1128 1244 1129 #define VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT 8 1130 + 1131 + /* 1132 + * Upon VFIO_DEVICE_FEATURE_GET read back the estimated data length that will 1133 + * be required to complete stop copy. 1134 + * 1135 + * Note: Can be called on each device state. 1136 + */ 1137 + 1138 + struct vfio_device_feature_mig_data_size { 1139 + __aligned_u64 stop_copy_length; 1140 + }; 1141 + 1142 + #define VFIO_DEVICE_FEATURE_MIG_DATA_SIZE 9 1245 1143 1246 1144 /* -------- API for Type1 VFIO IOMMU -------- */ 1247 1145
+4 -4
samples/vfio-mdev/mbochs.c
··· 594 594 atomic_add(mdev_state->type->mbytes, &mbochs_avail_mbytes); 595 595 kfree(mdev_state->pages); 596 596 kfree(mdev_state->vconfig); 597 - vfio_free_device(vdev); 598 597 } 599 598 600 599 static void mbochs_remove(struct mdev_device *mdev) ··· 1430 1431 1431 1432 ret = device_register(&mbochs_dev); 1432 1433 if (ret) 1433 - goto err_class; 1434 + goto err_put; 1434 1435 1435 1436 ret = mdev_register_parent(&mbochs_parent, &mbochs_dev, &mbochs_driver, 1436 1437 mbochs_mdev_types, ··· 1441 1442 return 0; 1442 1443 1443 1444 err_device: 1444 - device_unregister(&mbochs_dev); 1445 - err_class: 1445 + device_del(&mbochs_dev); 1446 + err_put: 1447 + put_device(&mbochs_dev); 1446 1448 class_destroy(mbochs_class); 1447 1449 err_driver: 1448 1450 mdev_unregister_driver(&mbochs_driver);
+7 -1
samples/vfio-mdev/mdpy-fb.c
··· 109 109 110 110 ret = pci_request_regions(pdev, "mdpy-fb"); 111 111 if (ret < 0) 112 - return ret; 112 + goto err_disable_dev; 113 113 114 114 pci_read_config_dword(pdev, MDPY_FORMAT_OFFSET, &format); 115 115 pci_read_config_dword(pdev, MDPY_WIDTH_OFFSET, &width); ··· 191 191 err_release_regions: 192 192 pci_release_regions(pdev); 193 193 194 + err_disable_dev: 195 + pci_disable_device(pdev); 196 + 194 197 return ret; 195 198 } 196 199 ··· 202 199 struct fb_info *info = pci_get_drvdata(pdev); 203 200 204 201 unregister_framebuffer(info); 202 + iounmap(info->screen_base); 205 203 framebuffer_release(info); 204 + pci_release_regions(pdev); 205 + pci_disable_device(pdev); 206 206 } 207 207 208 208 static struct pci_device_id mdpy_fb_pci_table[] = {
+4 -4
samples/vfio-mdev/mdpy.c
··· 283 283 284 284 vfree(mdev_state->memblk); 285 285 kfree(mdev_state->vconfig); 286 - vfio_free_device(vdev); 287 286 } 288 287 289 288 static void mdpy_remove(struct mdev_device *mdev) ··· 717 718 718 719 ret = device_register(&mdpy_dev); 719 720 if (ret) 720 - goto err_class; 721 + goto err_put; 721 722 722 723 ret = mdev_register_parent(&mdpy_parent, &mdpy_dev, &mdpy_driver, 723 724 mdpy_mdev_types, ··· 728 729 return 0; 729 730 730 731 err_device: 731 - device_unregister(&mdpy_dev); 732 - err_class: 732 + device_del(&mdpy_dev); 733 + err_put: 734 + put_device(&mdpy_dev); 733 735 class_destroy(mdpy_class); 734 736 err_driver: 735 737 mdev_unregister_driver(&mdpy_driver);
+4 -4
samples/vfio-mdev/mtty.c
··· 784 784 785 785 atomic_add(mdev_state->nr_ports, &mdev_avail_ports); 786 786 kfree(mdev_state->vconfig); 787 - vfio_free_device(vdev); 788 787 } 789 788 790 789 static void mtty_remove(struct mdev_device *mdev) ··· 1330 1331 1331 1332 ret = device_register(&mtty_dev.dev); 1332 1333 if (ret) 1333 - goto err_class; 1334 + goto err_put; 1334 1335 1335 1336 ret = mdev_register_parent(&mtty_dev.parent, &mtty_dev.dev, 1336 1337 &mtty_driver, mtty_mdev_types, ··· 1340 1341 return 0; 1341 1342 1342 1343 err_device: 1343 - device_unregister(&mtty_dev.dev); 1344 - err_class: 1344 + device_del(&mtty_dev.dev); 1345 + err_put: 1346 + put_device(&mtty_dev.dev); 1345 1347 class_destroy(mtty_dev.vd_class); 1346 1348 err_driver: 1347 1349 mdev_unregister_driver(&mtty_driver);