Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe: Decouple xe_exec_queue and xe_lrc

Decouple xe_lrc from xe_exec_queue and reference count xe_lrc.
Removing hard coupling between xe_exec_queue and xe_lrc allows
flexible design where the user interface xe_exec_queue can be
destroyed independent of the hardware/firmware interface xe_lrc.

v2: Fix lrc indexing in wq_item_append()

Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240530032211.29299-1-niranjana.vishwanathapura@intel.com

Authored by Niranjana Vishwanathapura and committed by Matthew Brost.
264eecdb 0568a408

+109 -59
+14 -12
drivers/gpu/drm/xe/xe_exec_queue.c
··· 86 86 87 87 if (extensions) { 88 88 /* 89 - * may set q->usm, must come before xe_lrc_init(), 89 + * may set q->usm, must come before xe_lrc_create(), 90 90 * may overwrite q->sched_props, must come before q->ops->init() 91 91 */ 92 92 err = exec_queue_user_extensions(xe, q, extensions, 0); ··· 104 104 int i, err; 105 105 106 106 for (i = 0; i < q->width; ++i) { 107 - err = xe_lrc_init(q->lrc + i, q->hwe, q, q->vm, SZ_16K); 108 - if (err) 107 + q->lrc[i] = xe_lrc_create(q->hwe, q->vm, SZ_16K); 108 + if (IS_ERR(q->lrc[i])) { 109 + err = PTR_ERR(q->lrc[i]); 109 110 goto err_lrc; 111 + } 110 112 } 111 113 112 114 err = q->ops->init(q); ··· 119 117 120 118 err_lrc: 121 119 for (i = i - 1; i >= 0; --i) 122 - xe_lrc_finish(q->lrc + i); 120 + xe_lrc_put(q->lrc[i]); 123 121 return err; 124 122 } 125 123 ··· 200 198 int i; 201 199 202 200 for (i = 0; i < q->width; ++i) 203 - xe_lrc_finish(q->lrc + i); 201 + xe_lrc_put(q->lrc[i]); 204 202 __xe_exec_queue_free(q); 205 203 } 206 204 ··· 703 701 704 702 static s32 xe_exec_queue_num_job_inflight(struct xe_exec_queue *q) 705 703 { 706 - return q->lrc->fence_ctx.next_seqno - xe_lrc_seqno(q->lrc) - 1; 704 + return q->lrc[0]->fence_ctx.next_seqno - xe_lrc_seqno(q->lrc[0]) - 1; 707 705 } 708 706 709 707 /** ··· 714 712 */ 715 713 bool xe_exec_queue_ring_full(struct xe_exec_queue *q) 716 714 { 717 - struct xe_lrc *lrc = q->lrc; 715 + struct xe_lrc *lrc = q->lrc[0]; 718 716 s32 max_job = lrc->ring.size / MAX_JOB_SIZE_BYTES; 719 717 720 718 return xe_exec_queue_num_job_inflight(q) >= max_job; ··· 740 738 int i; 741 739 742 740 for (i = 0; i < q->width; ++i) { 743 - if (xe_lrc_seqno(&q->lrc[i]) != 744 - q->lrc[i].fence_ctx.next_seqno - 1) 741 + if (xe_lrc_seqno(q->lrc[i]) != 742 + q->lrc[i]->fence_ctx.next_seqno - 1) 745 743 return false; 746 744 } 747 745 748 746 return true; 749 747 } 750 748 751 - return xe_lrc_seqno(&q->lrc[0]) == 752 - q->lrc[0].fence_ctx.next_seqno - 1; 749 + return xe_lrc_seqno(q->lrc[0]) == 750 + 
q->lrc[0]->fence_ctx.next_seqno - 1; 753 751 } 754 752 755 753 /** ··· 781 779 * the LRCs and reading them in different time could also introduce 782 780 * errors. 783 781 */ 784 - lrc = &q->lrc[0]; 782 + lrc = q->lrc[0]; 785 783 new_ts = xe_lrc_update_timestamp(lrc, &old_ts); 786 784 q->run_ticks += (new_ts - old_ts) * q->width; 787 785 }
+1 -1
drivers/gpu/drm/xe/xe_exec_queue_types.h
··· 146 146 /** @run_ticks: hw engine class run time in ticks for this exec queue */ 147 147 u64 run_ticks; 148 148 /** @lrc: logical ring context for this exec queue */ 149 - struct xe_lrc lrc[]; 149 + struct xe_lrc *lrc[]; 150 150 }; 151 151 152 152 /**
+5 -5
drivers/gpu/drm/xe/xe_execlist.c
··· 109 109 port->last_ctx_id = 1; 110 110 } 111 111 112 - __start_lrc(port->hwe, exl->q->lrc, port->last_ctx_id); 112 + __start_lrc(port->hwe, exl->q->lrc[0], port->last_ctx_id); 113 113 port->running_exl = exl; 114 114 exl->has_run = true; 115 115 } ··· 123 123 if (!port->running_exl) 124 124 return; 125 125 126 - xe_lrc_write_ring(&port->hwe->kernel_lrc, noop, sizeof(noop)); 127 - __start_lrc(port->hwe, &port->hwe->kernel_lrc, 0); 126 + xe_lrc_write_ring(port->hwe->kernel_lrc, noop, sizeof(noop)); 127 + __start_lrc(port->hwe, port->hwe->kernel_lrc, 0); 128 128 port->running_exl = NULL; 129 129 } 130 130 131 131 static bool xe_execlist_is_idle(struct xe_execlist_exec_queue *exl) 132 132 { 133 - struct xe_lrc *lrc = exl->q->lrc; 133 + struct xe_lrc *lrc = exl->q->lrc[0]; 134 134 135 135 return lrc->ring.tail == lrc->ring.old_tail; 136 136 } ··· 333 333 exl->q = q; 334 334 335 335 err = drm_sched_init(&exl->sched, &drm_sched_ops, NULL, 1, 336 - q->lrc[0].ring.size / MAX_JOB_SIZE_BYTES, 336 + q->lrc[0]->ring.size / MAX_JOB_SIZE_BYTES, 337 337 XE_SCHED_HANG_LIMIT, XE_SCHED_JOB_TIMEOUT, 338 338 NULL, NULL, q->hwe->name, 339 339 gt_to_xe(q->gt)->drm.dev);
+2 -2
drivers/gpu/drm/xe/xe_gt.c
··· 297 297 } 298 298 299 299 xe_map_memcpy_from(xe, default_lrc, 300 - &q->lrc[0].bo->vmap, 301 - xe_lrc_pphwsp_offset(&q->lrc[0]), 300 + &q->lrc[0]->bo->vmap, 301 + xe_lrc_pphwsp_offset(q->lrc[0]), 302 302 xe_gt_lrc_size(gt, hwe->class)); 303 303 304 304 gt->default_lrc[hwe->class] = default_lrc;
+15 -15
drivers/gpu/drm/xe/xe_guc_submit.c
··· 490 490 action[len++] = info->hwlrca_hi; 491 491 492 492 for (i = 1; i < q->width; ++i) { 493 - struct xe_lrc *lrc = q->lrc + i; 493 + struct xe_lrc *lrc = q->lrc[i]; 494 494 495 495 action[len++] = lower_32_bits(xe_lrc_descriptor(lrc)); 496 496 action[len++] = upper_32_bits(xe_lrc_descriptor(lrc)); ··· 527 527 { 528 528 struct xe_guc *guc = exec_queue_to_guc(q); 529 529 struct xe_device *xe = guc_to_xe(guc); 530 - struct xe_lrc *lrc = q->lrc; 530 + struct xe_lrc *lrc = q->lrc[0]; 531 531 struct guc_ctxt_registration_info info; 532 532 533 533 xe_assert(xe, !exec_queue_registered(q)); ··· 586 586 { 587 587 struct xe_guc *guc = exec_queue_to_guc(q); 588 588 struct xe_device *xe = guc_to_xe(guc); 589 - struct iosys_map map = xe_lrc_parallel_map(q->lrc); 589 + struct iosys_map map = xe_lrc_parallel_map(q->lrc[0]); 590 590 unsigned int sleep_period_ms = 1; 591 591 592 592 #define AVAILABLE_SPACE \ ··· 614 614 { 615 615 struct xe_guc *guc = exec_queue_to_guc(q); 616 616 struct xe_device *xe = guc_to_xe(guc); 617 - struct iosys_map map = xe_lrc_parallel_map(q->lrc); 617 + struct iosys_map map = xe_lrc_parallel_map(q->lrc[0]); 618 618 u32 len_dw = wq_space_until_wrap(q) / sizeof(u32) - 1; 619 619 620 620 if (wq_wait_for_space(q, wq_space_until_wrap(q))) ··· 634 634 { 635 635 struct xe_guc *guc = exec_queue_to_guc(q); 636 636 struct xe_device *xe = guc_to_xe(guc); 637 - struct iosys_map map = xe_lrc_parallel_map(q->lrc); 637 + struct iosys_map map = xe_lrc_parallel_map(q->lrc[0]); 638 638 #define WQ_HEADER_SIZE 4 /* Includes 1 LRC address too */ 639 639 u32 wqi[XE_HW_ENGINE_MAX_INSTANCE + (WQ_HEADER_SIZE - 1)]; 640 640 u32 wqi_size = (q->width + (WQ_HEADER_SIZE - 1)) * sizeof(u32); ··· 650 650 651 651 wqi[i++] = FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_MULTI_LRC) | 652 652 FIELD_PREP(WQ_LEN_MASK, len_dw); 653 - wqi[i++] = xe_lrc_descriptor(q->lrc); 653 + wqi[i++] = xe_lrc_descriptor(q->lrc[0]); 654 654 wqi[i++] = FIELD_PREP(WQ_GUC_ID_MASK, q->guc->id) | 655 - 
FIELD_PREP(WQ_RING_TAIL_MASK, q->lrc->ring.tail / sizeof(u64)); 655 + FIELD_PREP(WQ_RING_TAIL_MASK, q->lrc[0]->ring.tail / sizeof(u64)); 656 656 wqi[i++] = 0; 657 657 for (j = 1; j < q->width; ++j) { 658 - struct xe_lrc *lrc = q->lrc + j; 658 + struct xe_lrc *lrc = q->lrc[j]; 659 659 660 660 wqi[i++] = lrc->ring.tail / sizeof(u64); 661 661 } ··· 670 670 671 671 xe_device_wmb(xe); 672 672 673 - map = xe_lrc_parallel_map(q->lrc); 673 + map = xe_lrc_parallel_map(q->lrc[0]); 674 674 parallel_write(xe, map, wq_desc.tail, q->guc->wqi_tail); 675 675 } 676 676 ··· 679 679 { 680 680 struct xe_guc *guc = exec_queue_to_guc(q); 681 681 struct xe_device *xe = guc_to_xe(guc); 682 - struct xe_lrc *lrc = q->lrc; 682 + struct xe_lrc *lrc = q->lrc[0]; 683 683 u32 action[3]; 684 684 u32 g2h_len = 0; 685 685 u32 num_g2h = 0; ··· 1236 1236 msecs_to_jiffies(q->sched_props.job_timeout_ms); 1237 1237 err = xe_sched_init(&ge->sched, &drm_sched_ops, &xe_sched_ops, 1238 1238 get_submit_wq(guc), 1239 - q->lrc[0].ring.size / MAX_JOB_SIZE_BYTES, 64, 1239 + q->lrc[0]->ring.size / MAX_JOB_SIZE_BYTES, 64, 1240 1240 timeout, guc_to_gt(guc)->ordered_wq, NULL, 1241 1241 q->name, gt_to_xe(q->gt)->drm.dev); 1242 1242 if (err) ··· 1464 1464 ban = true; 1465 1465 } 1466 1466 } else if (xe_exec_queue_is_lr(q) && 1467 - (xe_lrc_ring_head(q->lrc) != xe_lrc_ring_tail(q->lrc))) { 1467 + (xe_lrc_ring_head(q->lrc[0]) != xe_lrc_ring_tail(q->lrc[0]))) { 1468 1468 ban = true; 1469 1469 } 1470 1470 ··· 1529 1529 1530 1530 trace_xe_exec_queue_resubmit(q); 1531 1531 for (i = 0; i < q->width; ++i) 1532 - xe_lrc_set_ring_head(q->lrc + i, q->lrc[i].ring.tail); 1532 + xe_lrc_set_ring_head(q->lrc[i], q->lrc[i]->ring.tail); 1533 1533 xe_sched_resubmit_jobs(sched); 1534 1534 } 1535 1535 ··· 1775 1775 { 1776 1776 struct xe_guc *guc = exec_queue_to_guc(q); 1777 1777 struct xe_device *xe = guc_to_xe(guc); 1778 - struct iosys_map map = xe_lrc_parallel_map(q->lrc); 1778 + struct iosys_map map = xe_lrc_parallel_map(q->lrc[0]); 
1779 1779 int i; 1780 1780 1781 1781 snapshot->guc.wqi_head = q->guc->wqi_head; ··· 1855 1855 1856 1856 if (snapshot->lrc) { 1857 1857 for (i = 0; i < q->width; ++i) { 1858 - struct xe_lrc *lrc = q->lrc + i; 1858 + struct xe_lrc *lrc = q->lrc[i]; 1859 1859 1860 1860 snapshot->lrc[i] = xe_lrc_snapshot_capture(lrc); 1861 1861 }
+6 -4
drivers/gpu/drm/xe/xe_hw_engine.c
··· 268 268 269 269 if (hwe->exl_port) 270 270 xe_execlist_port_destroy(hwe->exl_port); 271 - xe_lrc_finish(&hwe->kernel_lrc); 271 + xe_lrc_put(hwe->kernel_lrc); 272 272 273 273 hwe->gt = NULL; 274 274 } ··· 527 527 goto err_name; 528 528 } 529 529 530 - err = xe_lrc_init(&hwe->kernel_lrc, hwe, NULL, NULL, SZ_16K); 531 - if (err) 530 + hwe->kernel_lrc = xe_lrc_create(hwe, NULL, SZ_16K); 531 + if (IS_ERR(hwe->kernel_lrc)) { 532 + err = PTR_ERR(hwe->kernel_lrc); 532 533 goto err_hwsp; 534 + } 533 535 534 536 if (!xe_device_uc_enabled(xe)) { 535 537 hwe->exl_port = xe_execlist_port_create(xe, hwe); ··· 556 554 return drmm_add_action_or_reset(&xe->drm, hw_engine_fini, hwe); 557 555 558 556 err_kernel_lrc: 559 - xe_lrc_finish(&hwe->kernel_lrc); 557 + xe_lrc_put(hwe->kernel_lrc); 560 558 err_hwsp: 561 559 xe_bo_unpin_map_no_vm(hwe->hwsp); 562 560 err_name:
+1 -1
drivers/gpu/drm/xe/xe_hw_engine_types.h
··· 137 137 /** @hwsp: hardware status page buffer object */ 138 138 struct xe_bo *hwsp; 139 139 /** @kernel_lrc: Kernel LRC (should be replaced /w an xe_engine) */ 140 - struct xe_lrc kernel_lrc; 140 + struct xe_lrc *kernel_lrc; 141 141 /** @exl_port: execlists port */ 142 142 struct xe_execlist_port *exl_port; 143 143 /** @fence_irq: fence IRQ to run when a hw engine IRQ is received */
+36 -8
drivers/gpu/drm/xe/xe_lrc.c
··· 808 808 xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc)); 809 809 } 810 810 811 + static void xe_lrc_finish(struct xe_lrc *lrc) 812 + { 813 + xe_hw_fence_ctx_finish(&lrc->fence_ctx); 814 + xe_bo_lock(lrc->bo, false); 815 + xe_bo_unpin(lrc->bo); 816 + xe_bo_unlock(lrc->bo); 817 + xe_bo_put(lrc->bo); 818 + } 819 + 811 820 #define PVC_CTX_ASID (0x2e + 1) 812 821 #define PVC_CTX_ACC_CTR_THOLD (0x2a + 1) 813 822 814 - int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, 815 - struct xe_exec_queue *q, struct xe_vm *vm, u32 ring_size) 823 + static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, 824 + struct xe_vm *vm, u32 ring_size) 816 825 { 817 826 struct xe_gt *gt = hwe->gt; 818 827 struct xe_tile *tile = gt_to_tile(gt); ··· 832 823 u32 lrc_size; 833 824 int err; 834 825 826 + kref_init(&lrc->refcount); 835 827 lrc->flags = 0; 836 828 lrc_size = ring_size + xe_gt_lrc_size(gt, hwe->class); 837 829 if (xe_gt_has_indirect_ring_state(gt)) ··· 945 935 return err; 946 936 } 947 937 948 - void xe_lrc_finish(struct xe_lrc *lrc) 938 + struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm, 939 + u32 ring_size) 949 940 { 950 - xe_hw_fence_ctx_finish(&lrc->fence_ctx); 951 - xe_bo_lock(lrc->bo, false); 952 - xe_bo_unpin(lrc->bo); 953 - xe_bo_unlock(lrc->bo); 954 - xe_bo_put(lrc->bo); 941 + struct xe_lrc *lrc; 942 + int err; 943 + 944 + lrc = kzalloc(sizeof(*lrc), GFP_KERNEL); 945 + if (!lrc) 946 + return ERR_PTR(-ENOMEM); 947 + 948 + err = xe_lrc_init(lrc, hwe, vm, ring_size); 949 + if (err) { 950 + kfree(lrc); 951 + return ERR_PTR(err); 952 + } 953 + 954 + return lrc; 955 + } 956 + 957 + void xe_lrc_destroy(struct kref *ref) 958 + { 959 + struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount); 960 + 961 + xe_lrc_finish(lrc); 962 + kfree(lrc); 955 963 } 956 964 957 965 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
+16 -3
drivers/gpu/drm/xe/xe_lrc.h
··· 7 7 8 8 #include <linux/types.h> 9 9 10 + #include "xe_lrc_types.h" 11 + 10 12 struct drm_printer; 11 13 struct xe_bb; 12 14 struct xe_device; ··· 22 20 23 21 #define LRC_PPHWSP_SCRATCH_ADDR (0x34 * 4) 24 22 25 - int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, 26 - struct xe_exec_queue *q, struct xe_vm *vm, u32 ring_size); 27 - void xe_lrc_finish(struct xe_lrc *lrc); 23 + struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm, 24 + u32 ring_size); 25 + void xe_lrc_destroy(struct kref *ref); 26 + 27 + static inline struct xe_lrc *xe_lrc_get(struct xe_lrc *lrc) 28 + { 29 + kref_get(&lrc->refcount); 30 + return lrc; 31 + } 32 + 33 + static inline void xe_lrc_put(struct xe_lrc *lrc) 34 + { 35 + kref_put(&lrc->refcount, xe_lrc_destroy); 36 + } 28 37 29 38 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class); 30 39 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc);
+5
drivers/gpu/drm/xe/xe_lrc_types.h
··· 6 6 #ifndef _XE_LRC_TYPES_H_ 7 7 #define _XE_LRC_TYPES_H_ 8 8 9 + #include <linux/kref.h> 10 + 9 11 #include "xe_hw_fence_types.h" 10 12 11 13 struct xe_bo; ··· 31 29 /** @flags: LRC flags */ 32 30 #define XE_LRC_FLAG_INDIRECT_RING_STATE 0x1 33 31 u32 flags; 32 + 33 + /** @refcount: ref count of this lrc */ 34 + struct kref refcount; 34 35 35 36 /** @ring: submission ring state */ 36 37 struct {
+5 -5
drivers/gpu/drm/xe/xe_ring_ops.c
··· 396 396 397 397 xe_gt_assert(gt, job->q->width <= 1); /* no parallel submission for GSCCS */ 398 398 399 - __emit_job_gen12_simple(job, job->q->lrc, 399 + __emit_job_gen12_simple(job, job->q->lrc[0], 400 400 job->ptrs[0].batch_addr, 401 401 xe_sched_job_lrc_seqno(job)); 402 402 } ··· 406 406 int i; 407 407 408 408 if (xe_sched_job_is_migration(job->q)) { 409 - emit_migration_job_gen12(job, job->q->lrc, 409 + emit_migration_job_gen12(job, job->q->lrc[0], 410 410 xe_sched_job_lrc_seqno(job)); 411 411 return; 412 412 } 413 413 414 414 for (i = 0; i < job->q->width; ++i) 415 - __emit_job_gen12_simple(job, job->q->lrc + i, 415 + __emit_job_gen12_simple(job, job->q->lrc[i], 416 416 job->ptrs[i].batch_addr, 417 417 xe_sched_job_lrc_seqno(job)); 418 418 } ··· 423 423 424 424 /* FIXME: Not doing parallel handshake for now */ 425 425 for (i = 0; i < job->q->width; ++i) 426 - __emit_job_gen12_video(job, job->q->lrc + i, 426 + __emit_job_gen12_video(job, job->q->lrc[i], 427 427 job->ptrs[i].batch_addr, 428 428 xe_sched_job_lrc_seqno(job)); 429 429 } ··· 433 433 int i; 434 434 435 435 for (i = 0; i < job->q->width; ++i) 436 - __emit_job_gen12_render_compute(job, job->q->lrc + i, 436 + __emit_job_gen12_render_compute(job, job->q->lrc[i], 437 437 job->ptrs[i].batch_addr, 438 438 xe_sched_job_lrc_seqno(job)); 439 439 }
+3 -3
drivers/gpu/drm/xe/xe_sched_job.c
··· 216 216 217 217 bool xe_sched_job_started(struct xe_sched_job *job) 218 218 { 219 - struct xe_lrc *lrc = job->q->lrc; 219 + struct xe_lrc *lrc = job->q->lrc[0]; 220 220 221 221 return !__dma_fence_is_later(xe_sched_job_lrc_seqno(job), 222 222 xe_lrc_start_seqno(lrc), ··· 225 225 226 226 bool xe_sched_job_completed(struct xe_sched_job *job) 227 227 { 228 - struct xe_lrc *lrc = job->q->lrc; 228 + struct xe_lrc *lrc = job->q->lrc[0]; 229 229 230 230 /* 231 231 * Can safely check just LRC[0] seqno as that is last seqno written when ··· 265 265 struct dma_fence_chain *chain; 266 266 267 267 fence = job->ptrs[i].lrc_fence; 268 - xe_lrc_init_seqno_fence(&q->lrc[i], fence); 268 + xe_lrc_init_seqno_fence(q->lrc[i], fence); 269 269 job->ptrs[i].lrc_fence = NULL; 270 270 if (!i) { 271 271 job->lrc_seqno = fence->seqno;