Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe: Fix error handling if PXP fails to start

Since the PXP start comes after __xe_exec_queue_init() has completed,
we need to cleanup what was done in that function in case of a PXP
start error.
__xe_exec_queue_init calls the submission backend init() function,
so we need to introduce an opposite for that. Unfortunately, while
we already have a fini() function pointer, it performs other
operations in addition to cleaning up what was done by the init().
Therefore, for clarity, the existing fini() has been renamed to
destroy(), while a new fini() has been added to only clean up what was
done by the init(), with the latter being called by the former (via
xe_exec_queue_fini).

Fixes: 72d479601d67 ("drm/xe/pxp/uapi: Add userspace and LRC support for PXP-using queues")
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: John Harrison <John.C.Harrison@Intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://lore.kernel.org/r/20250909221240.3711023-3-daniele.ceraolospurio@intel.com

authored by

Daniele Ceraolo Spurio and committed by
John Harrison
62666732 9e6eb49e

+73 -42
+15 -7
drivers/gpu/drm/xe/xe_exec_queue.c
··· 199 199 return err; 200 200 } 201 201 202 + static void __xe_exec_queue_fini(struct xe_exec_queue *q) 203 + { 204 + int i; 205 + 206 + q->ops->fini(q); 207 + 208 + for (i = 0; i < q->width; ++i) 209 + xe_lrc_put(q->lrc[i]); 210 + } 211 + 202 212 struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *vm, 203 213 u32 logical_mask, u16 width, 204 214 struct xe_hw_engine *hwe, u32 flags, ··· 239 229 if (xe_exec_queue_uses_pxp(q)) { 240 230 err = xe_pxp_exec_queue_add(xe->pxp, q); 241 231 if (err) 242 - goto err_post_alloc; 232 + goto err_post_init; 243 233 } 244 234 245 235 return q; 246 236 237 + err_post_init: 238 + __xe_exec_queue_fini(q); 247 239 err_post_alloc: 248 240 __xe_exec_queue_free(q); 249 241 return ERR_PTR(err); ··· 343 331 xe_exec_queue_put(eq); 344 332 } 345 333 346 - q->ops->fini(q); 334 + q->ops->destroy(q); 347 335 } 348 336 349 337 void xe_exec_queue_fini(struct xe_exec_queue *q) 350 338 { 351 - int i; 352 - 353 339 /* 354 340 * Before releasing our ref to lrc and xef, accumulate our run ticks 355 341 * and wakeup any waiters. ··· 356 346 if (q->xef && atomic_dec_and_test(&q->xef->exec_queue.pending_removal)) 357 347 wake_up_var(&q->xef->exec_queue.pending_removal); 358 348 359 - for (i = 0; i < q->width; ++i) 360 - xe_lrc_put(q->lrc[i]); 361 - 349 + __xe_exec_queue_fini(q); 362 350 __xe_exec_queue_free(q); 363 351 } 364 352
+7 -1
drivers/gpu/drm/xe/xe_exec_queue_types.h
··· 181 181 int (*init)(struct xe_exec_queue *q); 182 182 /** @kill: Kill inflight submissions for backend */ 183 183 void (*kill)(struct xe_exec_queue *q); 184 - /** @fini: Fini exec queue for submission backend */ 184 + /** @fini: Undoes the init() for submission backend */ 185 185 void (*fini)(struct xe_exec_queue *q); 186 + /** 187 + * @destroy: Destroy exec queue for submission backend. The backend 188 + * function must call xe_exec_queue_fini() (which will in turn call the 189 + * fini() backend function) to ensure the queue is properly cleaned up. 190 + */ 191 + void (*destroy)(struct xe_exec_queue *q); 186 192 /** @set_priority: Set priority for exec queue */ 187 193 int (*set_priority)(struct xe_exec_queue *q, 188 194 enum xe_exec_queue_priority priority);
+16 -9
drivers/gpu/drm/xe/xe_execlist.c
··· 385 385 return err; 386 386 } 387 387 388 - static void execlist_exec_queue_fini_async(struct work_struct *w) 388 + static void execlist_exec_queue_fini(struct xe_exec_queue *q) 389 + { 390 + struct xe_execlist_exec_queue *exl = q->execlist; 391 + 392 + drm_sched_entity_fini(&exl->entity); 393 + drm_sched_fini(&exl->sched); 394 + 395 + kfree(exl); 396 + } 397 + 398 + static void execlist_exec_queue_destroy_async(struct work_struct *w) 389 399 { 390 400 struct xe_execlist_exec_queue *ee = 391 - container_of(w, struct xe_execlist_exec_queue, fini_async); 401 + container_of(w, struct xe_execlist_exec_queue, destroy_async); 392 402 struct xe_exec_queue *q = ee->q; 393 403 struct xe_execlist_exec_queue *exl = q->execlist; 394 404 struct xe_device *xe = gt_to_xe(q->gt); ··· 411 401 list_del(&exl->active_link); 412 402 spin_unlock_irqrestore(&exl->port->lock, flags); 413 403 414 - drm_sched_entity_fini(&exl->entity); 415 - drm_sched_fini(&exl->sched); 416 - kfree(exl); 417 - 418 404 xe_exec_queue_fini(q); 419 405 } 420 406 ··· 419 413 /* NIY */ 420 414 } 421 415 422 - static void execlist_exec_queue_fini(struct xe_exec_queue *q) 416 + static void execlist_exec_queue_destroy(struct xe_exec_queue *q) 423 417 { 424 - INIT_WORK(&q->execlist->fini_async, execlist_exec_queue_fini_async); 425 - queue_work(system_unbound_wq, &q->execlist->fini_async); 418 + INIT_WORK(&q->execlist->destroy_async, execlist_exec_queue_destroy_async); 419 + queue_work(system_unbound_wq, &q->execlist->destroy_async); 426 420 } 427 421 428 422 static int execlist_exec_queue_set_priority(struct xe_exec_queue *q, ··· 473 467 .init = execlist_exec_queue_init, 474 468 .kill = execlist_exec_queue_kill, 475 469 .fini = execlist_exec_queue_fini, 470 + .destroy = execlist_exec_queue_destroy, 476 471 .set_priority = execlist_exec_queue_set_priority, 477 472 .set_timeslice = execlist_exec_queue_set_timeslice, 478 473 .set_preempt_timeout = execlist_exec_queue_set_preempt_timeout,
+1 -1
drivers/gpu/drm/xe/xe_execlist_types.h
··· 42 42 43 43 bool has_run; 44 44 45 - struct work_struct fini_async; 45 + struct work_struct destroy_async; 46 46 47 47 enum xe_exec_queue_priority active_priority; 48 48 struct list_head active_link;
+2 -2
drivers/gpu/drm/xe/xe_guc_exec_queue_types.h
··· 35 35 struct xe_sched_msg static_msgs[MAX_STATIC_MSG_TYPE]; 36 36 /** @lr_tdr: long running TDR worker */ 37 37 struct work_struct lr_tdr; 38 - /** @fini_async: do final fini async from this worker */ 39 - struct work_struct fini_async; 38 + /** @destroy_async: do final destroy async from this worker */ 39 + struct work_struct destroy_async; 40 40 /** @resume_time: time of last resume */ 41 41 u64 resume_time; 42 42 /** @state: GuC specific state for this xe_exec_queue */
+32 -22
drivers/gpu/drm/xe/xe_guc_submit.c
··· 1419 1419 return DRM_GPU_SCHED_STAT_NO_HANG; 1420 1420 } 1421 1421 1422 - static void __guc_exec_queue_fini_async(struct work_struct *w) 1422 + static void guc_exec_queue_fini(struct xe_exec_queue *q) 1423 1423 { 1424 - struct xe_guc_exec_queue *ge = 1425 - container_of(w, struct xe_guc_exec_queue, fini_async); 1426 - struct xe_exec_queue *q = ge->q; 1424 + struct xe_guc_exec_queue *ge = q->guc; 1427 1425 struct xe_guc *guc = exec_queue_to_guc(q); 1428 1426 1429 - xe_pm_runtime_get(guc_to_xe(guc)); 1430 - trace_xe_exec_queue_destroy(q); 1431 - 1432 1427 release_guc_id(guc, q); 1433 - if (xe_exec_queue_is_lr(q)) 1434 - cancel_work_sync(&ge->lr_tdr); 1435 - /* Confirm no work left behind accessing device structures */ 1436 - cancel_delayed_work_sync(&ge->sched.base.work_tdr); 1437 1428 xe_sched_entity_fini(&ge->entity); 1438 1429 xe_sched_fini(&ge->sched); 1439 1430 ··· 1433 1442 * (timeline name). 1434 1443 */ 1435 1444 kfree_rcu(ge, rcu); 1445 + } 1446 + 1447 + static void __guc_exec_queue_destroy_async(struct work_struct *w) 1448 + { 1449 + struct xe_guc_exec_queue *ge = 1450 + container_of(w, struct xe_guc_exec_queue, destroy_async); 1451 + struct xe_exec_queue *q = ge->q; 1452 + struct xe_guc *guc = exec_queue_to_guc(q); 1453 + 1454 + xe_pm_runtime_get(guc_to_xe(guc)); 1455 + trace_xe_exec_queue_destroy(q); 1456 + 1457 + if (xe_exec_queue_is_lr(q)) 1458 + cancel_work_sync(&ge->lr_tdr); 1459 + /* Confirm no work left behind accessing device structures */ 1460 + cancel_delayed_work_sync(&ge->sched.base.work_tdr); 1461 + 1436 1462 xe_exec_queue_fini(q); 1463 + 1437 1464 xe_pm_runtime_put(guc_to_xe(guc)); 1438 1465 } 1439 1466 1440 - static void guc_exec_queue_fini_async(struct xe_exec_queue *q) 1467 + static void guc_exec_queue_destroy_async(struct xe_exec_queue *q) 1441 1468 { 1442 1469 struct xe_guc *guc = exec_queue_to_guc(q); 1443 1470 struct xe_device *xe = guc_to_xe(guc); 1444 1471 1445 - INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async); 1472 + INIT_WORK(&q->guc->destroy_async, __guc_exec_queue_destroy_async); 1446 1473 1447 1474 /* We must block on kernel engines so slabs are empty on driver unload */ 1448 1475 if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q)) 1449 - __guc_exec_queue_fini_async(&q->guc->fini_async); 1476 + __guc_exec_queue_destroy_async(&q->guc->destroy_async); 1450 1477 else 1451 - queue_work(xe->destroy_wq, &q->guc->fini_async); 1478 + queue_work(xe->destroy_wq, &q->guc->destroy_async); 1452 1479 } 1453 1480 1454 - static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q) 1481 + static void __guc_exec_queue_destroy(struct xe_guc *guc, struct xe_exec_queue *q) 1455 1482 { 1456 1483 /* 1457 1484 * Might be done from within the GPU scheduler, need to do async as we ··· 1478 1469 * this we and don't really care when everything is fini'd, just that it 1479 1470 * is. 1480 1471 */ 1481 - guc_exec_queue_fini_async(q); 1472 + guc_exec_queue_destroy_async(q); 1482 1473 } 1483 1474 1484 1475 static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg) ··· 1492 1483 if (exec_queue_registered(q)) 1493 1484 disable_scheduling_deregister(guc, q); 1494 1485 else 1495 - __guc_exec_queue_fini(guc, q); 1486 + __guc_exec_queue_destroy(guc, q); 1496 1487 } 1497 1488 1498 1489 static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q) ··· 1725 1716 #define STATIC_MSG_CLEANUP 0 1726 1717 #define STATIC_MSG_SUSPEND 1 1727 1718 #define STATIC_MSG_RESUME 2 1728 - static void guc_exec_queue_fini(struct xe_exec_queue *q) 1719 + static void guc_exec_queue_destroy(struct xe_exec_queue *q) 1729 1720 { 1730 1721 struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP; 1731 1722 1732 1723 if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && !exec_queue_wedged(q)) 1733 1724 guc_exec_queue_add_msg(q, msg, CLEANUP); 1734 1725 else 1735 1726 __guc_exec_queue_destroy(exec_queue_to_guc(q), q); 1736 1727 } 1737 1728 1738 1729 static int guc_exec_queue_set_priority(struct xe_exec_queue *q, ··· 1862 1853 .init = guc_exec_queue_init, 1863 1854 .kill = guc_exec_queue_kill, 1864 1855 .fini = guc_exec_queue_fini, 1856 + .destroy = guc_exec_queue_destroy, 1865 1857 .set_priority = guc_exec_queue_set_priority, 1866 1858 .set_timeslice = guc_exec_queue_set_timeslice, 1867 1859 .set_preempt_timeout = guc_exec_queue_set_preempt_timeout, ··· 1884 1874 if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q)) 1885 1875 xe_exec_queue_put(q); 1886 1876 else if (exec_queue_destroyed(q)) 1887 - __guc_exec_queue_fini(guc, q); 1877 + __guc_exec_queue_destroy(guc, q); 1888 1878 } 1889 1879 if (q->guc->suspend_pending) { 1890 1880 set_exec_queue_suspended(q); ··· 2213 2203 if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q)) 2214 2204 xe_exec_queue_put(q); 2215 2205 else 2216 - __guc_exec_queue_fini(guc, q); 2206 + __guc_exec_queue_destroy(guc, q); 2217 2207 } 2218 2208 2219 2209 int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len)