Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe/guc: Configure TLB timeout based on CT buffer size

The duration of a GuC TLB invalidation depends on the time the GuC needs to
process the request from the CT queue plus the actual time to invalidate the
TLB. Add a function that returns an overestimate of the time a TLB
invalidation H2G might take, which can be used as the timeout value when
waiting for a TLB invalidation.

v4: Make sure CTB is in 4K blocks (Michal) and other doc fixes
v3: Pass CT to xe_guc_ct_queue_proc_time_jiffies() (Michal)
Add tlb_timeout_jiffies() that replaces TLB_TIMEOUT (Michal)
v2: Address reviews from Michal.

Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1622
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Suggested-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Acked-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240628085845.2369-1-nirmoy.das@intel.com
Signed-off-by: Nirmoy Das <nirmoy.das@intel.com>

+41 -8
+22 -8
drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
··· 17 17 #include "xe_trace.h" 18 18 #include "regs/xe_guc_regs.h" 19 19 20 - #define TLB_TIMEOUT (HZ / 4) 20 + /* 21 + * TLB inval depends on pending commands in the CT queue and then the real 22 + * invalidation time. Double up the time to process full CT queue 23 + * just to be on the safe side. 24 + */ 25 + static long tlb_timeout_jiffies(struct xe_gt *gt) 26 + { 27 + /* this reflects what HW/GuC needs to process TLB inv request */ 28 + const long hw_tlb_timeout = HZ / 4; 29 + 30 + /* this estimates actual delay caused by the CTB transport */ 31 + long delay = xe_guc_ct_queue_proc_time_jiffies(&gt->uc.guc.ct); 32 + 33 + return hw_tlb_timeout + 2 * delay; 34 + } 35 + 21 36 22 37 static void xe_gt_tlb_fence_timeout(struct work_struct *work) 23 38 { ··· 47 32 s64 since_inval_ms = ktime_ms_delta(ktime_get(), 48 33 fence->invalidation_time); 49 34 50 - if (msecs_to_jiffies(since_inval_ms) < TLB_TIMEOUT) 35 + if (msecs_to_jiffies(since_inval_ms) < tlb_timeout_jiffies(gt)) 51 36 break; 52 37 53 38 trace_xe_gt_tlb_invalidation_fence_timeout(xe, fence); ··· 62 47 if (!list_empty(&gt->tlb_invalidation.pending_fences)) 63 48 queue_delayed_work(system_wq, 64 49 &gt->tlb_invalidation.fence_tdr, 65 - TLB_TIMEOUT); 50 + tlb_timeout_jiffies(gt)); 66 51 spin_unlock_irq(&gt->tlb_invalidation.pending_lock); 67 52 } 68 53 ··· 198 183 if (list_is_singular(&gt->tlb_invalidation.pending_fences)) 199 184 queue_delayed_work(system_wq, 200 185 &gt->tlb_invalidation.fence_tdr, 201 - TLB_TIMEOUT); 186 + tlb_timeout_jiffies(gt)); 202 187 } 203 188 spin_unlock_irq(&gt->tlb_invalidation.pending_lock); 204 189 } else if (ret < 0 && fence) { ··· 405 390 * @gt: graphics tile 406 391 * @seqno: seqno to wait which was returned from xe_gt_tlb_invalidation 407 392 * 408 - * Wait for 200ms for a TLB invalidation to complete, in practice we always 409 - * should receive the TLB invalidation within 200ms. 393 + * Wait for tlb_timeout_jiffies() for a TLB invalidation to complete. 
410 394 * 411 395 * Return: 0 on success, -ETIME on TLB invalidation timeout 412 396 */ ··· 424 410 */ 425 411 ret = wait_event_timeout(guc->ct.wq, 426 412 tlb_invalidation_seqno_past(gt, seqno), 427 - TLB_TIMEOUT); 413 + tlb_timeout_jiffies(gt)); 428 414 if (!ret) { 429 415 struct drm_printer p = xe_gt_err_printer(gt); 430 416 ··· 500 486 if (!list_empty(&gt->tlb_invalidation.pending_fences)) 501 487 mod_delayed_work(system_wq, 502 488 &gt->tlb_invalidation.fence_tdr, 503 - TLB_TIMEOUT); 489 + tlb_timeout_jiffies(gt)); 504 490 else 505 491 cancel_delayed_work(&gt->tlb_invalidation.fence_tdr); 506 492
+17
drivers/gpu/drm/xe/xe_guc_ct.c
··· 112 112 #define CTB_G2H_BUFFER_SIZE (4 * CTB_H2G_BUFFER_SIZE) 113 113 #define G2H_ROOM_BUFFER_SIZE (CTB_G2H_BUFFER_SIZE / 4) 114 114 115 + /** 116 + * xe_guc_ct_queue_proc_time_jiffies - Return maximum time to process a full 117 + * CT command queue 118 + * @ct: the &xe_guc_ct. Unused at this moment but will be used in the future. 119 + * 120 + * Observation is that a 4KiB buffer full of commands takes a little over a 121 + * second to process. Use that to calculate maximum time to process a full CT 122 + * command queue. 123 + * 124 + * Return: Maximum time to process a full CT queue in jiffies. 125 + */ 126 + long xe_guc_ct_queue_proc_time_jiffies(struct xe_guc_ct *ct) 127 + { 128 + BUILD_BUG_ON(!IS_ALIGNED(CTB_H2G_BUFFER_SIZE, SZ_4)); 129 + return (CTB_H2G_BUFFER_SIZE / SZ_4K) * HZ; 130 + } 131 + 115 132 static size_t guc_ct_size(void) 116 133 { 117 134 return 2 * CTB_DESC_SIZE + CTB_H2G_BUFFER_SIZE +
+2
drivers/gpu/drm/xe/xe_guc_ct.h
··· 64 64 return xe_guc_ct_send_recv_no_fail(ct, action, len, NULL); 65 65 } 66 66 67 + long xe_guc_ct_queue_proc_time_jiffies(struct xe_guc_ct *ct); 68 + 67 69 #endif