Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe/vf: Wakeup in GuC backend on VF post migration recovery

If VF post-migration recovery is in progress, the recovery flow will
rebuild all GuC submission state. In this case, exit all waiters to
ensure that submission queue scheduling can also be paused. Avoid taking
any adverse actions after aborting the wait.

As part of waking up the GuC backend, suspend_wait can now return
-EAGAIN indicating the waiter should be retried. If the caller is
running on a work item, that work item needs to be requeued to avoid a
deadlock caused by the work item blocking the VF migration recovery work item.

v3:
- Don't block in preempt fence work queue as this can interfere with VF
post-migration work queue scheduling leading to deadlock (Testing)
- Use xe_gt_recovery_inprogress (Michal)
v5:
- Use static function for vf_recovery (Michal)
- Add helper to wake CT waiters (Michal)
- Move some code to following patch (Michal)
- Adjust commit message to explain suspend_wait returning -EAGAIN (Michal)
- Add kernel doc to suspend_wait around returning -EAGAIN
v7:
- Add comment on why a shared wait queue is needed on VFs (Michal)
- Guard against suspend_wait signaling early on resfix done (Tomasz)
v8:
- Fix kernel doc (CI)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Link: https://lore.kernel.org/r/20251008214532.3442967-18-matthew.brost@intel.com

+99 -21
+3
drivers/gpu/drm/xe/xe_exec_queue_types.h
··· 207 207 * call after suspend. In dma-fencing path thus must return within a 208 208 * reasonable amount of time. -ETIME return shall indicate an error 209 209 * waiting for suspend resulting in associated VM getting killed. 210 + * -EAGAIN return indicates the wait should be tried again, if the wait 211 + * is within a work item, the work item should be requeued as deadlock 212 + * avoidance mechanism. 210 213 */ 211 214 int (*suspend_wait)(struct xe_exec_queue *q); 212 215 /**
+4
drivers/gpu/drm/xe/xe_gt_sriov_vf.c
··· 23 23 #include "xe_gt_sriov_vf.h" 24 24 #include "xe_gt_sriov_vf_types.h" 25 25 #include "xe_guc.h" 26 + #include "xe_guc_ct.h" 26 27 #include "xe_guc_hxg_helpers.h" 27 28 #include "xe_guc_relay.h" 28 29 #include "xe_guc_submit.h" ··· 730 729 !gt->sriov.vf.migration.recovery_teardown) { 731 730 gt->sriov.vf.migration.recovery_queued = true; 732 731 WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, true); 732 + smp_wmb(); /* Ensure above write visable before wake */ 733 + 734 + xe_guc_ct_wake_waiters(&gt->uc.guc.ct); 733 735 734 736 started = queue_work(gt->ordered_wq, &gt->sriov.vf.migration.worker); 735 737 xe_gt_sriov_info(gt, "VF migration recovery %s\n", started ?
+9
drivers/gpu/drm/xe/xe_guc_ct.h
··· 72 72 73 73 long xe_guc_ct_queue_proc_time_jiffies(struct xe_guc_ct *ct); 74 74 75 + /** 76 + * xe_guc_ct_wake_waiters() - GuC CT wake up waiters 77 + * @ct: GuC CT object 78 + */ 79 + static inline void xe_guc_ct_wake_waiters(struct xe_guc_ct *ct) 80 + { 81 + wake_up_all(&ct->wq); 82 + } 83 + 75 84 #endif
+72 -21
drivers/gpu/drm/xe/xe_guc_submit.c
··· 27 27 #include "xe_gt.h" 28 28 #include "xe_gt_clock.h" 29 29 #include "xe_gt_printk.h" 30 - #include "xe_gt_sriov_vf.h" 31 30 #include "xe_guc.h" 32 31 #include "xe_guc_capture.h" 33 32 #include "xe_guc_ct.h" ··· 701 702 return (WQ_SIZE - q->guc->wqi_tail); 702 703 } 703 704 705 + static bool vf_recovery(struct xe_guc *guc) 706 + { 707 + return xe_gt_recovery_pending(guc_to_gt(guc)); 708 + } 709 + 704 710 static int wq_wait_for_space(struct xe_exec_queue *q, u32 wqi_size) 705 711 { 706 712 struct xe_guc *guc = exec_queue_to_guc(q); ··· 715 711 716 712 #define AVAILABLE_SPACE \ 717 713 CIRC_SPACE(q->guc->wqi_tail, q->guc->wqi_head, WQ_SIZE) 718 - if (wqi_size > AVAILABLE_SPACE) { 714 + if (wqi_size > AVAILABLE_SPACE && !vf_recovery(guc)) { 719 715 try_again: 720 716 q->guc->wqi_head = parallel_read(xe, map, wq_desc.head); 721 717 if (wqi_size > AVAILABLE_SPACE) { ··· 914 910 ret = wait_event_timeout(guc->ct.wq, 915 911 (!exec_queue_pending_enable(q) && 916 912 !exec_queue_pending_disable(q)) || 917 - xe_guc_read_stopped(guc), 913 + xe_guc_read_stopped(guc) || 914 + vf_recovery(guc), 918 915 HZ * 5); 919 - if (!ret) { 916 + if (!ret && !vf_recovery(guc)) { 920 917 struct xe_gpu_scheduler *sched = &q->guc->sched; 921 918 922 919 xe_gt_warn(q->gt, "Pending enable/disable failed to respond\n"); ··· 1020 1015 bool wedged = false; 1021 1016 1022 1017 xe_gt_assert(guc_to_gt(guc), xe_exec_queue_is_lr(q)); 1018 + 1019 + if (vf_recovery(guc)) 1020 + return; 1021 + 1023 1022 trace_xe_exec_queue_lr_cleanup(q); 1024 1023 1025 1024 if (!exec_queue_killed(q)) ··· 1056 1047 */ 1057 1048 ret = wait_event_timeout(guc->ct.wq, 1058 1049 !exec_queue_pending_disable(q) || 1059 - xe_guc_read_stopped(guc), HZ * 5); 1050 + xe_guc_read_stopped(guc) || 1051 + vf_recovery(guc), HZ * 5); 1052 + if (vf_recovery(guc)) 1053 + return; 1054 + 1060 1055 if (!ret) { 1061 1056 xe_gt_warn(q->gt, "Schedule disable failed to respond, guc_id=%d\n", 1062 1057 q->guc->id); ··· 1150 1137 1151 1138 ret = 
wait_event_timeout(guc->ct.wq, 1152 1139 !exec_queue_pending_enable(q) || 1153 - xe_guc_read_stopped(guc), HZ * 5); 1154 - if (!ret || xe_guc_read_stopped(guc)) { 1140 + xe_guc_read_stopped(guc) || 1141 + vf_recovery(guc), HZ * 5); 1142 + if ((!ret && !vf_recovery(guc)) || xe_guc_read_stopped(guc)) { 1155 1143 xe_gt_warn(guc_to_gt(guc), "Schedule enable failed to respond"); 1156 1144 set_exec_queue_banned(q); 1157 1145 xe_gt_reset_async(q->gt); ··· 1223 1209 * list so job can be freed and kick scheduler ensuring free job is not 1224 1210 * lost. 1225 1211 */ 1226 - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags)) 1212 + if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags) || 1213 + vf_recovery(guc)) 1227 1214 return DRM_GPU_SCHED_STAT_NO_HANG; 1228 1215 1229 1216 /* Kill the run_job entry point */ ··· 1276 1261 ret = wait_event_timeout(guc->ct.wq, 1277 1262 (!exec_queue_pending_enable(q) && 1278 1263 !exec_queue_pending_disable(q)) || 1279 - xe_guc_read_stopped(guc), HZ * 5); 1264 + xe_guc_read_stopped(guc) || 1265 + vf_recovery(guc), HZ * 5); 1266 + if (vf_recovery(guc)) 1267 + goto handle_vf_resume; 1280 1268 if (!ret || xe_guc_read_stopped(guc)) 1281 1269 goto trigger_reset; 1282 1270 ··· 1304 1286 smp_rmb(); 1305 1287 ret = wait_event_timeout(guc->ct.wq, 1306 1288 !exec_queue_pending_disable(q) || 1307 - xe_guc_read_stopped(guc), HZ * 5); 1289 + xe_guc_read_stopped(guc) || 1290 + vf_recovery(guc), HZ * 5); 1291 + if (vf_recovery(guc)) 1292 + goto handle_vf_resume; 1308 1293 if (!ret || xe_guc_read_stopped(guc)) { 1309 1294 trigger_reset: 1310 1295 if (!ret) ··· 1412 1391 * some thought, do this in a follow up. 
1413 1392 */ 1414 1393 xe_sched_submission_start(sched); 1394 + handle_vf_resume: 1415 1395 return DRM_GPU_SCHED_STAT_NO_HANG; 1416 1396 } 1417 1397 ··· 1509 1487 1510 1488 static void __suspend_fence_signal(struct xe_exec_queue *q) 1511 1489 { 1490 + struct xe_guc *guc = exec_queue_to_guc(q); 1491 + struct xe_device *xe = guc_to_xe(guc); 1492 + 1512 1493 if (!q->guc->suspend_pending) 1513 1494 return; 1514 1495 1515 1496 WRITE_ONCE(q->guc->suspend_pending, false); 1516 - wake_up(&q->guc->suspend_wait); 1497 + 1498 + /* 1499 + * We use a GuC shared wait queue for VFs because the VF resfix start 1500 + * interrupt must be able to wake all instances of suspend_wait. This 1501 + * prevents the VF migration worker from being starved during 1502 + * scheduling. 1503 + */ 1504 + if (IS_SRIOV_VF(xe)) 1505 + wake_up_all(&guc->ct.wq); 1506 + else 1507 + wake_up(&q->guc->suspend_wait); 1517 1508 } 1518 1509 1519 1510 static void suspend_fence_signal(struct xe_exec_queue *q) ··· 1547 1512 1548 1513 if (guc_exec_queue_allowed_to_change_state(q) && !exec_queue_suspended(q) && 1549 1514 exec_queue_enabled(q)) { 1550 - wait_event(guc->ct.wq, (q->guc->resume_time != RESUME_PENDING || 1551 - xe_guc_read_stopped(guc)) && !exec_queue_pending_disable(q)); 1515 + wait_event(guc->ct.wq, vf_recovery(guc) || 1516 + ((q->guc->resume_time != RESUME_PENDING || 1517 + xe_guc_read_stopped(guc)) && !exec_queue_pending_disable(q))); 1552 1518 1553 1519 if (!xe_guc_read_stopped(guc)) { 1554 1520 s64 since_resume_ms = ··· 1676 1640 1677 1641 q->entity = &ge->entity; 1678 1642 1679 - if (xe_guc_read_stopped(guc)) 1643 + if (xe_guc_read_stopped(guc) || vf_recovery(guc)) 1680 1644 xe_sched_stop(sched); 1681 1645 1682 1646 mutex_unlock(&guc->submission_state.lock); ··· 1822 1786 static int guc_exec_queue_suspend_wait(struct xe_exec_queue *q) 1823 1787 { 1824 1788 struct xe_guc *guc = exec_queue_to_guc(q); 1789 + struct xe_device *xe = guc_to_xe(guc); 1825 1790 int ret; 1826 1791 1827 1792 /* ··· 1830 
1793 * suspend_pending upon kill but to be paranoid but races in which 1831 1794 * suspend_pending is set after kill also check kill here. 1832 1795 */ 1833 - ret = wait_event_interruptible_timeout(q->guc->suspend_wait, 1834 - !READ_ONCE(q->guc->suspend_pending) || 1835 - exec_queue_killed(q) || 1836 - xe_guc_read_stopped(guc), 1837 - HZ * 5); 1796 + #define WAIT_COND \ 1797 + (!READ_ONCE(q->guc->suspend_pending) || exec_queue_killed(q) || \ 1798 + xe_guc_read_stopped(guc)) 1799 + 1800 + retry: 1801 + if (IS_SRIOV_VF(xe)) 1802 + ret = wait_event_interruptible_timeout(guc->ct.wq, WAIT_COND || 1803 + vf_recovery(guc), 1804 + HZ * 5); 1805 + else 1806 + ret = wait_event_interruptible_timeout(q->guc->suspend_wait, 1807 + WAIT_COND, HZ * 5); 1808 + 1809 + if (vf_recovery(guc) && !xe_device_wedged((guc_to_xe(guc)))) 1810 + return -EAGAIN; 1838 1811 1839 1812 if (!ret) { 1840 1813 xe_gt_warn(guc_to_gt(guc), ··· 1852 1805 q->guc->id); 1853 1806 /* XXX: Trigger GT reset? */ 1854 1807 return -ETIME; 1808 + } else if (IS_SRIOV_VF(xe) && !WAIT_COND) { 1809 + /* Corner case on RESFIX DONE where vf_recovery() changes */ 1810 + goto retry; 1855 1811 } 1812 + 1813 + #undef WAIT_COND 1856 1814 1857 1815 return ret < 0 ? ret : 0; 1858 1816 } ··· 1957 1905 { 1958 1906 int ret; 1959 1907 1960 - if (xe_gt_WARN_ON(guc_to_gt(guc), 1961 - xe_gt_sriov_vf_recovery_pending(guc_to_gt(guc)))) 1908 + if (xe_gt_WARN_ON(guc_to_gt(guc), vf_recovery(guc))) 1962 1909 return 0; 1963 1910 1964 1911 if (!guc->submission_state.initialized)
+11
drivers/gpu/drm/xe/xe_preempt_fence.c
··· 8 8 #include <linux/slab.h> 9 9 10 10 #include "xe_exec_queue.h" 11 + #include "xe_gt_printk.h" 12 + #include "xe_guc_exec_queue_types.h" 11 13 #include "xe_vm.h" 12 14 13 15 static void preempt_fence_work_func(struct work_struct *w) ··· 23 21 dma_fence_set_error(&pfence->base, pfence->error); 24 22 } else if (!q->ops->reset_status(q)) { 25 23 int err = q->ops->suspend_wait(q); 24 + 25 + if (err == -EAGAIN) { 26 + xe_gt_dbg(q->gt, "PREEMPT FENCE RETRY guc_id=%d", 27 + q->guc->id); 28 + queue_work(q->vm->xe->preempt_fence_wq, 29 + &pfence->preempt_work); 30 + dma_fence_end_signalling(cookie); 31 + return; 32 + } 26 33 27 34 if (err) 28 35 dma_fence_set_error(&pfence->base, err);