Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe: Fix error handling if PXP fails to start

Since the PXP start comes after __xe_exec_queue_init() has completed,
we need to clean up what was done in that function in case of a PXP
start error.
__xe_exec_queue_init calls the submission backend init() function,
so we need to introduce an opposite for that. Unfortunately, while
we already have a fini() function pointer, it performs other
operations in addition to cleaning up what was done by the init().
Therefore, for clarity, the existing fini() has been renamed to
destroy(), while a new fini() has been added to only clean up what was
done by the init(), with the latter being called by the former (via
xe_exec_queue_fini).

Fixes: 72d479601d67 ("drm/xe/pxp/uapi: Add userspace and LRC support for PXP-using queues")
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: John Harrison <John.C.Harrison@Intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://lore.kernel.org/r/20250909221240.3711023-3-daniele.ceraolospurio@intel.com
(cherry picked from commit 626667321deb4c7a294725406faa3dd71c3d445d)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>

authored by

Daniele Ceraolo Spurio and committed by
Rodrigo Vivi
ae5fbbda ff89a4d2

+73 -42
+15 -7
drivers/gpu/drm/xe/xe_exec_queue.c
··· 151 151 return err; 152 152 } 153 153 154 + static void __xe_exec_queue_fini(struct xe_exec_queue *q) 155 + { 156 + int i; 157 + 158 + q->ops->fini(q); 159 + 160 + for (i = 0; i < q->width; ++i) 161 + xe_lrc_put(q->lrc[i]); 162 + } 163 + 154 164 struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *vm, 155 165 u32 logical_mask, u16 width, 156 166 struct xe_hw_engine *hwe, u32 flags, ··· 191 181 if (xe_exec_queue_uses_pxp(q)) { 192 182 err = xe_pxp_exec_queue_add(xe->pxp, q); 193 183 if (err) 194 - goto err_post_alloc; 184 + goto err_post_init; 195 185 } 196 186 197 187 return q; 198 188 189 + err_post_init: 190 + __xe_exec_queue_fini(q); 199 191 err_post_alloc: 200 192 __xe_exec_queue_free(q); 201 193 return ERR_PTR(err); ··· 295 283 xe_exec_queue_put(eq); 296 284 } 297 285 298 - q->ops->fini(q); 286 + q->ops->destroy(q); 299 287 } 300 288 301 289 void xe_exec_queue_fini(struct xe_exec_queue *q) 302 290 { 303 - int i; 304 - 305 291 /* 306 292 * Before releasing our ref to lrc and xef, accumulate our run ticks 307 293 * and wakeup any waiters. ··· 308 298 if (q->xef && atomic_dec_and_test(&q->xef->exec_queue.pending_removal)) 309 299 wake_up_var(&q->xef->exec_queue.pending_removal); 310 300 311 - for (i = 0; i < q->width; ++i) 312 - xe_lrc_put(q->lrc[i]); 313 - 301 + __xe_exec_queue_fini(q); 314 302 __xe_exec_queue_free(q); 315 303 } 316 304
+7 -1
drivers/gpu/drm/xe/xe_exec_queue_types.h
··· 166 166 int (*init)(struct xe_exec_queue *q); 167 167 /** @kill: Kill inflight submissions for backend */ 168 168 void (*kill)(struct xe_exec_queue *q); 169 - /** @fini: Fini exec queue for submission backend */ 169 + /** @fini: Undoes the init() for submission backend */ 170 170 void (*fini)(struct xe_exec_queue *q); 171 + /** 172 + * @destroy: Destroy exec queue for submission backend. The backend 173 + * function must call xe_exec_queue_fini() (which will in turn call the 174 + * fini() backend function) to ensure the queue is properly cleaned up. 175 + */ 176 + void (*destroy)(struct xe_exec_queue *q); 171 177 /** @set_priority: Set priority for exec queue */ 172 178 int (*set_priority)(struct xe_exec_queue *q, 173 179 enum xe_exec_queue_priority priority);
+16 -9
drivers/gpu/drm/xe/xe_execlist.c
··· 385 385 return err; 386 386 } 387 387 388 - static void execlist_exec_queue_fini_async(struct work_struct *w) 388 + static void execlist_exec_queue_fini(struct xe_exec_queue *q) 389 + { 390 + struct xe_execlist_exec_queue *exl = q->execlist; 391 + 392 + drm_sched_entity_fini(&exl->entity); 393 + drm_sched_fini(&exl->sched); 394 + 395 + kfree(exl); 396 + } 397 + 398 + static void execlist_exec_queue_destroy_async(struct work_struct *w) 389 399 { 390 400 struct xe_execlist_exec_queue *ee = 391 - container_of(w, struct xe_execlist_exec_queue, fini_async); 401 + container_of(w, struct xe_execlist_exec_queue, destroy_async); 392 402 struct xe_exec_queue *q = ee->q; 393 403 struct xe_execlist_exec_queue *exl = q->execlist; 394 404 struct xe_device *xe = gt_to_xe(q->gt); ··· 411 401 list_del(&exl->active_link); 412 402 spin_unlock_irqrestore(&exl->port->lock, flags); 413 403 414 - drm_sched_entity_fini(&exl->entity); 415 - drm_sched_fini(&exl->sched); 416 - kfree(exl); 417 - 418 404 xe_exec_queue_fini(q); 419 405 } 420 406 ··· 419 413 /* NIY */ 420 414 } 421 415 422 - static void execlist_exec_queue_fini(struct xe_exec_queue *q) 416 + static void execlist_exec_queue_destroy(struct xe_exec_queue *q) 423 417 { 424 - INIT_WORK(&q->execlist->fini_async, execlist_exec_queue_fini_async); 425 - queue_work(system_unbound_wq, &q->execlist->fini_async); 418 + INIT_WORK(&q->execlist->destroy_async, execlist_exec_queue_destroy_async); 419 + queue_work(system_unbound_wq, &q->execlist->destroy_async); 426 420 } 427 421 428 422 static int execlist_exec_queue_set_priority(struct xe_exec_queue *q, ··· 473 467 .init = execlist_exec_queue_init, 474 468 .kill = execlist_exec_queue_kill, 475 469 .fini = execlist_exec_queue_fini, 470 + .destroy = execlist_exec_queue_destroy, 476 471 .set_priority = execlist_exec_queue_set_priority, 477 472 .set_timeslice = execlist_exec_queue_set_timeslice, 478 473 .set_preempt_timeout = execlist_exec_queue_set_preempt_timeout,
+1 -1
drivers/gpu/drm/xe/xe_execlist_types.h
··· 42 42 43 43 bool has_run; 44 44 45 - struct work_struct fini_async; 45 + struct work_struct destroy_async; 46 46 47 47 enum xe_exec_queue_priority active_priority; 48 48 struct list_head active_link;
+2 -2
drivers/gpu/drm/xe/xe_guc_exec_queue_types.h
··· 35 35 struct xe_sched_msg static_msgs[MAX_STATIC_MSG_TYPE]; 36 36 /** @lr_tdr: long running TDR worker */ 37 37 struct work_struct lr_tdr; 38 - /** @fini_async: do final fini async from this worker */ 39 - struct work_struct fini_async; 38 + /** @destroy_async: do final destroy async from this worker */ 39 + struct work_struct destroy_async; 40 40 /** @resume_time: time of last resume */ 41 41 u64 resume_time; 42 42 /** @state: GuC specific state for this xe_exec_queue */
+32 -22
drivers/gpu/drm/xe/xe_guc_submit.c
··· 1277 1277 return DRM_GPU_SCHED_STAT_NO_HANG; 1278 1278 } 1279 1279 1280 - static void __guc_exec_queue_fini_async(struct work_struct *w) 1280 + static void guc_exec_queue_fini(struct xe_exec_queue *q) 1281 1281 { 1282 - struct xe_guc_exec_queue *ge = 1283 - container_of(w, struct xe_guc_exec_queue, fini_async); 1284 - struct xe_exec_queue *q = ge->q; 1282 + struct xe_guc_exec_queue *ge = q->guc; 1285 1283 struct xe_guc *guc = exec_queue_to_guc(q); 1286 1284 1287 - xe_pm_runtime_get(guc_to_xe(guc)); 1288 - trace_xe_exec_queue_destroy(q); 1289 - 1290 1285 release_guc_id(guc, q); 1291 - if (xe_exec_queue_is_lr(q)) 1292 - cancel_work_sync(&ge->lr_tdr); 1293 - /* Confirm no work left behind accessing device structures */ 1294 - cancel_delayed_work_sync(&ge->sched.base.work_tdr); 1295 1286 xe_sched_entity_fini(&ge->entity); 1296 1287 xe_sched_fini(&ge->sched); 1297 1288 ··· 1291 1300 * (timeline name). 1292 1301 */ 1293 1302 kfree_rcu(ge, rcu); 1303 + } 1304 + 1305 + static void __guc_exec_queue_destroy_async(struct work_struct *w) 1306 + { 1307 + struct xe_guc_exec_queue *ge = 1308 + container_of(w, struct xe_guc_exec_queue, destroy_async); 1309 + struct xe_exec_queue *q = ge->q; 1310 + struct xe_guc *guc = exec_queue_to_guc(q); 1311 + 1312 + xe_pm_runtime_get(guc_to_xe(guc)); 1313 + trace_xe_exec_queue_destroy(q); 1314 + 1315 + if (xe_exec_queue_is_lr(q)) 1316 + cancel_work_sync(&ge->lr_tdr); 1317 + /* Confirm no work left behind accessing device structures */ 1318 + cancel_delayed_work_sync(&ge->sched.base.work_tdr); 1319 + 1294 1320 xe_exec_queue_fini(q); 1321 + 1295 1322 xe_pm_runtime_put(guc_to_xe(guc)); 1296 1323 } 1297 1324 1298 - static void guc_exec_queue_fini_async(struct xe_exec_queue *q) 1325 + static void guc_exec_queue_destroy_async(struct xe_exec_queue *q) 1299 1326 { 1300 1327 struct xe_guc *guc = exec_queue_to_guc(q); 1301 1328 struct xe_device *xe = guc_to_xe(guc); 1302 1329 1303 - INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async); 1330 + INIT_WORK(&q->guc->destroy_async, __guc_exec_queue_destroy_async); 1304 1331 1305 1332 /* We must block on kernel engines so slabs are empty on driver unload */ 1306 1333 if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q)) 1307 - __guc_exec_queue_fini_async(&q->guc->fini_async); 1334 + __guc_exec_queue_destroy_async(&q->guc->destroy_async); 1308 1335 else 1309 - queue_work(xe->destroy_wq, &q->guc->fini_async); 1336 + queue_work(xe->destroy_wq, &q->guc->destroy_async); 1310 1337 } 1311 1338 1312 - static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q) 1339 + static void __guc_exec_queue_destroy(struct xe_guc *guc, struct xe_exec_queue *q) 1313 1340 { 1314 1341 /* 1315 1342 * Might be done from within the GPU scheduler, need to do async as we ··· 1336 1327 * this we and don't really care when everything is fini'd, just that it 1337 1328 * is. 1338 1329 */ 1339 - guc_exec_queue_fini_async(q); 1330 + guc_exec_queue_destroy_async(q); 1340 1331 } 1341 1332 1342 1333 static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg) ··· 1350 1341 if (exec_queue_registered(q)) 1351 1342 disable_scheduling_deregister(guc, q); 1352 1343 else 1353 - __guc_exec_queue_fini(guc, q); 1344 + __guc_exec_queue_destroy(guc, q); 1354 1345 } 1355 1346 1356 1347 static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q) ··· 1583 1574 #define STATIC_MSG_CLEANUP 0 1584 1575 #define STATIC_MSG_SUSPEND 1 1585 1576 #define STATIC_MSG_RESUME 2 1586 - static void guc_exec_queue_fini(struct xe_exec_queue *q) 1577 + static void guc_exec_queue_destroy(struct xe_exec_queue *q) 1587 1578 { 1588 1579 struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP; 1589 1580 1590 1581 if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && !exec_queue_wedged(q)) 1591 1582 guc_exec_queue_add_msg(q, msg, CLEANUP); 1592 1583 else 1593 - __guc_exec_queue_fini(exec_queue_to_guc(q), q); 1584 + __guc_exec_queue_destroy(exec_queue_to_guc(q), q); 1594 1585 } 1595 1586 1596 1587 static int guc_exec_queue_set_priority(struct xe_exec_queue *q, ··· 1720 1711 .init = guc_exec_queue_init, 1721 1712 .kill = guc_exec_queue_kill, 1722 1713 .fini = guc_exec_queue_fini, 1714 + .destroy = guc_exec_queue_destroy, 1723 1715 .set_priority = guc_exec_queue_set_priority, 1724 1716 .set_timeslice = guc_exec_queue_set_timeslice, 1725 1717 .set_preempt_timeout = guc_exec_queue_set_preempt_timeout, ··· 1742 1732 if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q)) 1743 1733 xe_exec_queue_put(q); 1744 1734 else if (exec_queue_destroyed(q)) 1745 - __guc_exec_queue_fini(guc, q); 1735 + __guc_exec_queue_destroy(guc, q); 1746 1736 } 1747 1737 if (q->guc->suspend_pending) { 1748 1738 set_exec_queue_suspended(q); ··· 1999 1989 if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q)) 2000 1990 xe_exec_queue_put(q); 2001 1991 else 2002 - __guc_exec_queue_fini(guc, q); 1992 + __guc_exec_queue_destroy(guc, q); 2003 1993 } 2004 1994 2005 1995 int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len)