Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: Show warning message if IH ring overflow

If the IH primary ring and the KFD ih fifo overflow, we may miss CP and
SDMA interrupts, causing an application soft hang. Show a warning message
with the ring name when an overflow happens.

Add a function to get the ih ring name to avoid duplicating it. To keep
the warning message consistent between GPU generations, change all
*_ih.c files except those for ASICs older than Vega, which have only one ih ring.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Philip Yang and committed by
Alex Deucher
e37ccf44 de844846

+15 -11
+6
drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
··· 298 298 dw2 = le32_to_cpu(ih->ring[ring_index + 2]); 299 299 return dw1 | ((u64)(dw2 & 0xffff) << 32); 300 300 } 301 + 302 + const char *amdgpu_ih_ring_name(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) 303 + { 304 + return ih == &adev->irq.ih ? "ih" : ih == &adev->irq.ih_soft ? "sw ih" : 305 + ih == &adev->irq.ih1 ? "ih1" : ih == &adev->irq.ih2 ? "ih2" : "unknown"; 306 + }
+1
drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
··· 110 110 struct amdgpu_iv_entry *entry); 111 111 uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr, 112 112 signed int offset); 113 + const char *amdgpu_ih_ring_name(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih); 113 114 #endif
+2 -3
drivers/gpu/drm/amd/amdgpu/navi10_ih.c
··· 434 434 * this should allow us to catch up. 435 435 */ 436 436 tmp = (wptr + 32) & ih->ptr_mask; 437 - dev_warn(adev->dev, "IH ring buffer overflow " 438 - "(0x%08X, 0x%08X, 0x%08X)\n", 439 - wptr, ih->rptr, tmp); 437 + dev_warn(adev->dev, "%s ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n", 438 + amdgpu_ih_ring_name(adev, ih), wptr, ih->rptr, tmp); 440 439 ih->rptr = tmp; 441 440 442 441 tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
+2 -3
drivers/gpu/drm/amd/amdgpu/vega10_ih.c
··· 364 364 * this should allow us to catchup. 365 365 */ 366 366 tmp = (wptr + 32) & ih->ptr_mask; 367 - dev_warn(adev->dev, "IH ring buffer overflow " 368 - "(0x%08X, 0x%08X, 0x%08X)\n", 369 - wptr, ih->rptr, tmp); 367 + dev_warn_ratelimited(adev->dev, "%s ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n", 368 + amdgpu_ih_ring_name(adev, ih), wptr, ih->rptr, tmp); 370 369 ih->rptr = tmp; 371 370 372 371 tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
+2 -3
drivers/gpu/drm/amd/amdgpu/vega20_ih.c
··· 444 444 * this should allow us to catchup. 445 445 */ 446 446 tmp = (wptr + 32) & ih->ptr_mask; 447 - dev_warn(adev->dev, "IH ring buffer overflow " 448 - "(0x%08X, 0x%08X, 0x%08X)\n", 449 - wptr, ih->rptr, tmp); 447 + dev_warn_ratelimited(adev->dev, "%s ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n", 448 + amdgpu_ih_ring_name(adev, ih), wptr, ih->rptr, tmp); 450 449 ih->rptr = tmp; 451 450 452 451 tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl);
+2 -2
drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
··· 108 108 bool enqueue_ih_ring_entry(struct kfd_node *node, const void *ih_ring_entry) 109 109 { 110 110 if (kfifo_is_full(&node->ih_fifo)) { 111 - dev_dbg_ratelimited(node->adev->dev, 112 - "Interrupt ring overflow, dropping interrupt\n"); 111 + dev_warn_ratelimited(node->adev->dev, "KFD node %d ih_fifo overflow\n", 112 + node->node_id); 113 113 return false; 114 114 } 115 115