Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe: Nuke simple error capture

This error capture prints HW state into dmesg when a GPU hang happens.
It was useful when we did not have devcoredump; now it is an incomplete
version of devcoredump that has the potential to flood dmesg.

Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: José Roberto de Souza <jose.souza@intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240522203431.191594-1-jose.souza@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>

authored by

José Roberto de Souza and committed by
Rodrigo Vivi
83ee002d b10d0c5e

+1 -113
-10
drivers/gpu/drm/xe/Kconfig.debug
··· 61 61 62 62 If in doubt, say "N". 63 63 64 - config DRM_XE_SIMPLE_ERROR_CAPTURE 65 - bool "Enable simple error capture to dmesg on job timeout" 66 - default n 67 - help 68 - Choose this option when debugging an unexpected job timeout 69 - 70 - Recommended for driver developers only. 71 - 72 - If in doubt, say "N". 73 - 74 64 config DRM_XE_KUNIT_TEST 75 65 tristate "KUnit tests for the drm xe driver" if !KUNIT_ALL_TESTS 76 66 depends on DRM_XE && KUNIT && DEBUG_FS
+1 -52
drivers/gpu/drm/xe/xe_guc_submit.c
··· 816 816 G2H_LEN_DW_DEREGISTER_CONTEXT, 2); 817 817 } 818 818 819 - static void guc_exec_queue_print(struct xe_exec_queue *q, struct drm_printer *p); 820 - 821 - #if IS_ENABLED(CONFIG_DRM_XE_SIMPLE_ERROR_CAPTURE) 822 - static void simple_error_capture(struct xe_exec_queue *q) 823 - { 824 - struct xe_guc *guc = exec_queue_to_guc(q); 825 - struct xe_device *xe = guc_to_xe(guc); 826 - struct drm_printer p = drm_err_printer(&xe->drm, NULL); 827 - struct xe_hw_engine *hwe; 828 - enum xe_hw_engine_id id; 829 - u32 adj_logical_mask = q->logical_mask; 830 - u32 width_mask = (0x1 << q->width) - 1; 831 - int i; 832 - bool cookie; 833 - 834 - if (q->vm && !q->vm->error_capture.capture_once) { 835 - q->vm->error_capture.capture_once = true; 836 - cookie = dma_fence_begin_signalling(); 837 - for (i = 0; q->width > 1 && i < XE_HW_ENGINE_MAX_INSTANCE;) { 838 - if (adj_logical_mask & BIT(i)) { 839 - adj_logical_mask |= width_mask << i; 840 - i += q->width; 841 - } else { 842 - ++i; 843 - } 844 - } 845 - 846 - if (xe_force_wake_get(gt_to_fw(guc_to_gt(guc)), XE_FORCEWAKE_ALL)) 847 - xe_gt_info(guc_to_gt(guc), 848 - "failed to get forcewake for error capture"); 849 - xe_guc_ct_print(&guc->ct, &p, true); 850 - guc_exec_queue_print(q, &p); 851 - for_each_hw_engine(hwe, guc_to_gt(guc), id) { 852 - if (hwe->class != q->hwe->class || 853 - !(BIT(hwe->logical_instance) & adj_logical_mask)) 854 - continue; 855 - xe_hw_engine_print(hwe, &p); 856 - } 857 - xe_analyze_vm(&p, q->vm, q->gt->info.id); 858 - xe_force_wake_put(gt_to_fw(guc_to_gt(guc)), XE_FORCEWAKE_ALL); 859 - dma_fence_end_signalling(cookie); 860 - } 861 - } 862 - #else 863 - static void simple_error_capture(struct xe_exec_queue *q) 864 - { 865 - } 866 - #endif 867 - 868 819 static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q) 869 820 { 870 821 struct xe_guc *guc = exec_queue_to_guc(q); ··· 947 996 xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q), 948 997 "VM job timed out on non-killed 
execqueue\n"); 949 998 950 - if (!exec_queue_killed(q)) { 951 - simple_error_capture(q); 999 + if (!exec_queue_killed(q)) 952 1000 xe_devcoredump(job); 953 - } 954 1001 955 1002 trace_xe_sched_job_timedout(job); 956 1003
-49
drivers/gpu/drm/xe/xe_vm.c
··· 3395 3395 return 0; 3396 3396 } 3397 3397 3398 - int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id) 3399 - { 3400 - struct drm_gpuva *gpuva; 3401 - bool is_vram; 3402 - uint64_t addr; 3403 - 3404 - if (!down_read_trylock(&vm->lock)) { 3405 - drm_printf(p, " Failed to acquire VM lock to dump capture"); 3406 - return 0; 3407 - } 3408 - if (vm->pt_root[gt_id]) { 3409 - addr = xe_bo_addr(vm->pt_root[gt_id]->bo, 0, XE_PAGE_SIZE); 3410 - is_vram = xe_bo_is_vram(vm->pt_root[gt_id]->bo); 3411 - drm_printf(p, " VM root: A:0x%llx %s\n", addr, 3412 - is_vram ? "VRAM" : "SYS"); 3413 - } 3414 - 3415 - drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) { 3416 - struct xe_vma *vma = gpuva_to_vma(gpuva); 3417 - bool is_userptr = xe_vma_is_userptr(vma); 3418 - bool is_null = xe_vma_is_null(vma); 3419 - 3420 - if (is_null) { 3421 - addr = 0; 3422 - } else if (is_userptr) { 3423 - struct sg_table *sg = to_userptr_vma(vma)->userptr.sg; 3424 - struct xe_res_cursor cur; 3425 - 3426 - if (sg) { 3427 - xe_res_first_sg(sg, 0, XE_PAGE_SIZE, &cur); 3428 - addr = xe_res_dma(&cur); 3429 - } else { 3430 - addr = 0; 3431 - } 3432 - } else { 3433 - addr = __xe_bo_addr(xe_vma_bo(vma), 0, XE_PAGE_SIZE); 3434 - is_vram = xe_bo_is_vram(xe_vma_bo(vma)); 3435 - } 3436 - drm_printf(p, " [%016llx-%016llx] S:0x%016llx A:%016llx %s\n", 3437 - xe_vma_start(vma), xe_vma_end(vma) - 1, 3438 - xe_vma_size(vma), 3439 - addr, is_null ? "NULL" : is_userptr ? "USR" : 3440 - is_vram ? "VRAM" : "SYS"); 3441 - } 3442 - up_read(&vm->lock); 3443 - 3444 - return 0; 3445 - } 3446 - 3447 3398 struct xe_vm_snapshot { 3448 3399 unsigned long num_snaps; 3449 3400 struct {
-2
drivers/gpu/drm/xe/xe_vm.h
··· 243 243 244 244 bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end); 245 245 246 - int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id); 247 - 248 246 int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma); 249 247 250 248 int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,