Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests.

When the radeon driver resets a gpu, it attempts to test whether all the
rings can successfully handle an IB. If these rings fail to respond, the
process will wait forever. Another gpu reset can't happen at this point,
as the current reset holds a lock required to do so. Instead, make all
the IB tests run with a timeout, so the system can attempt to recover
in this case.

While this doesn't fix the underlying issue with card resets failing, it
gives the system a higher chance of recovering. These timeouts have been
confirmed to help both a Tahiti and Hawaii card recover after a gpu reset.

This also adds a new function, radeon_fence_wait_timeout, that behaves like
fence_wait_timeout. It is used instead of fence_wait_timeout as it continues
to work during a reset. radeon_fence_wait is changed to be implemented
using this function.

V2:
- Changed the timeout to 1s, as the default 10s from radeon_wait_timeout was
too long. A timeout of 100ms was tested and found to be too short.
- Changed radeon_fence_wait_timeout to behave more like fence_wait_timeout.

Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Matthew Dawson and committed by
Alex Deucher
04db4caf 6e9821b2

+101 -35
+9 -2
drivers/gpu/drm/radeon/cik.c
··· 4219 4219 DRM_ERROR("radeon: failed to schedule ib (%d).\n", r); 4220 4220 return r; 4221 4221 } 4222 - r = radeon_fence_wait(ib.fence, false); 4223 - if (r) { 4222 + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies( 4223 + RADEON_USEC_IB_TEST_TIMEOUT)); 4224 + if (r < 0) { 4224 4225 DRM_ERROR("radeon: fence wait failed (%d).\n", r); 4225 4226 radeon_scratch_free(rdev, scratch); 4226 4227 radeon_ib_free(rdev, &ib); 4227 4228 return r; 4229 + } else if (r == 0) { 4230 + DRM_ERROR("radeon: fence wait timed out.\n"); 4231 + radeon_scratch_free(rdev, scratch); 4232 + radeon_ib_free(rdev, &ib); 4233 + return -ETIMEDOUT; 4228 4234 } 4235 + r = 0; 4229 4236 for (i = 0; i < rdev->usec_timeout; i++) { 4230 4237 tmp = RREG32(scratch); 4231 4238 if (tmp == 0xDEADBEEF)
+7 -2
drivers/gpu/drm/radeon/cik_sdma.c
··· 737 737 DRM_ERROR("radeon: failed to schedule ib (%d).\n", r); 738 738 return r; 739 739 } 740 - r = radeon_fence_wait(ib.fence, false); 741 - if (r) { 740 + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies( 741 + RADEON_USEC_IB_TEST_TIMEOUT)); 742 + if (r < 0) { 742 743 DRM_ERROR("radeon: fence wait failed (%d).\n", r); 743 744 return r; 745 + } else if (r == 0) { 746 + DRM_ERROR("radeon: fence wait timed out.\n"); 747 + return -ETIMEDOUT; 744 748 } 749 + r = 0; 745 750 for (i = 0; i < rdev->usec_timeout; i++) { 746 751 tmp = le32_to_cpu(rdev->wb.wb[index/4]); 747 752 if (tmp == 0xDEADBEEF)
+8 -2
drivers/gpu/drm/radeon/r100.c
··· 3732 3732 DRM_ERROR("radeon: failed to schedule ib (%d).\n", r); 3733 3733 goto free_ib; 3734 3734 } 3735 - r = radeon_fence_wait(ib.fence, false); 3736 - if (r) { 3735 + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies( 3736 + RADEON_USEC_IB_TEST_TIMEOUT)); 3737 + if (r < 0) { 3737 3738 DRM_ERROR("radeon: fence wait failed (%d).\n", r); 3738 3739 goto free_ib; 3740 + } else if (r == 0) { 3741 + DRM_ERROR("radeon: fence wait timed out.\n"); 3742 + r = -ETIMEDOUT; 3743 + goto free_ib; 3739 3744 } 3745 + r = 0; 3740 3746 for (i = 0; i < rdev->usec_timeout; i++) { 3741 3747 tmp = RREG32(scratch); 3742 3748 if (tmp == 0xDEADBEEF) {
+8 -2
drivers/gpu/drm/radeon/r600.c
··· 3381 3381 DRM_ERROR("radeon: failed to schedule ib (%d).\n", r); 3382 3382 goto free_ib; 3383 3383 } 3384 - r = radeon_fence_wait(ib.fence, false); 3385 - if (r) { 3384 + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies( 3385 + RADEON_USEC_IB_TEST_TIMEOUT)); 3386 + if (r < 0) { 3386 3387 DRM_ERROR("radeon: fence wait failed (%d).\n", r); 3387 3388 goto free_ib; 3389 + } else if (r == 0) { 3390 + DRM_ERROR("radeon: fence wait timed out.\n"); 3391 + r = -ETIMEDOUT; 3392 + goto free_ib; 3388 3393 } 3394 + r = 0; 3389 3395 for (i = 0; i < rdev->usec_timeout; i++) { 3390 3396 tmp = RREG32(scratch); 3391 3397 if (tmp == 0xDEADBEEF)
+7 -2
drivers/gpu/drm/radeon/r600_dma.c
··· 368 368 DRM_ERROR("radeon: failed to schedule ib (%d).\n", r); 369 369 return r; 370 370 } 371 - r = radeon_fence_wait(ib.fence, false); 372 - if (r) { 371 + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies( 372 + RADEON_USEC_IB_TEST_TIMEOUT)); 373 + if (r < 0) { 373 374 DRM_ERROR("radeon: fence wait failed (%d).\n", r); 374 375 return r; 376 + } else if (r == 0) { 377 + DRM_ERROR("radeon: fence wait timed out.\n"); 378 + return -ETIMEDOUT; 375 379 } 380 + r = 0; 376 381 for (i = 0; i < rdev->usec_timeout; i++) { 377 382 tmp = le32_to_cpu(rdev->wb.wb[index/4]); 378 383 if (tmp == 0xDEADBEEF)
+2
drivers/gpu/drm/radeon/radeon.h
··· 120 120 */ 121 121 #define RADEON_MAX_USEC_TIMEOUT 100000 /* 100 ms */ 122 122 #define RADEON_FENCE_JIFFIES_TIMEOUT (HZ / 2) 123 + #define RADEON_USEC_IB_TEST_TIMEOUT 1000000 /* 1s */ 123 124 /* RADEON_IB_POOL_SIZE must be a power of 2 */ 124 125 #define RADEON_IB_POOL_SIZE 16 125 126 #define RADEON_DEBUGFS_MAX_COMPONENTS 32 ··· 383 382 int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence **fence, int ring); 384 383 void radeon_fence_process(struct radeon_device *rdev, int ring); 385 384 bool radeon_fence_signaled(struct radeon_fence *fence); 385 + long radeon_fence_wait_timeout(struct radeon_fence *fence, bool interruptible, long timeout); 386 386 int radeon_fence_wait(struct radeon_fence *fence, bool interruptible); 387 387 int radeon_fence_wait_next(struct radeon_device *rdev, int ring); 388 388 int radeon_fence_wait_empty(struct radeon_device *rdev, int ring);
+44 -20
drivers/gpu/drm/radeon/radeon_fence.c
··· 527 527 } 528 528 529 529 /** 530 + * radeon_fence_wait_timeout - wait for a fence to signal with timeout 531 + * 532 + * @fence: radeon fence object 533 + * @intr: use interruptible sleep 534 + * 535 + * Wait for the requested fence to signal (all asics). 536 + * @intr selects whether to use interruptable (true) or non-interruptable 537 + * (false) sleep when waiting for the fence. 538 + * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite wait 539 + * Returns remaining time if the sequence number has passed, 0 when 540 + * the wait timeout, or an error for all other cases. 541 + */ 542 + long radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr, long timeout) 543 + { 544 + uint64_t seq[RADEON_NUM_RINGS] = {}; 545 + long r; 546 + int r_sig; 547 + 548 + /* 549 + * This function should not be called on !radeon fences. 550 + * If this is the case, it would mean this function can 551 + * also be called on radeon fences belonging to another card. 552 + * exclusive_lock is not held in that case. 553 + */ 554 + if (WARN_ON_ONCE(!to_radeon_fence(&fence->base))) 555 + return fence_wait(&fence->base, intr); 556 + 557 + seq[fence->ring] = fence->seq; 558 + r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, timeout); 559 + if (r <= 0) { 560 + return r; 561 + } 562 + 563 + r_sig = fence_signal(&fence->base); 564 + if (!r_sig) 565 + FENCE_TRACE(&fence->base, "signaled from fence_wait\n"); 566 + return r; 567 + } 568 + 569 + /** 530 570 * radeon_fence_wait - wait for a fence to signal 531 571 * 532 572 * @fence: radeon fence object ··· 579 539 */ 580 540 int radeon_fence_wait(struct radeon_fence *fence, bool intr) 581 541 { 582 - uint64_t seq[RADEON_NUM_RINGS] = {}; 583 - long r; 584 - 585 - /* 586 - * This function should not be called on !radeon fences. 587 - * If this is the case, it would mean this function can 588 - * also be called on radeon fences belonging to another card. 589 - * exclusive_lock is not held in that case. 
590 - */ 591 - if (WARN_ON_ONCE(!to_radeon_fence(&fence->base))) 592 - return fence_wait(&fence->base, intr); 593 - 594 - seq[fence->ring] = fence->seq; 595 - r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, MAX_SCHEDULE_TIMEOUT); 596 - if (r < 0) { 542 + long r = radeon_fence_wait_timeout(fence, intr, MAX_SCHEDULE_TIMEOUT); 543 + if (r > 0) { 544 + return 0; 545 + } else { 597 546 return r; 598 547 } 599 - 600 - r = fence_signal(&fence->base); 601 - if (!r) 602 - FENCE_TRACE(&fence->base, "signaled from fence_wait\n"); 603 - return 0; 604 548 } 605 549 606 550 /**
+8 -3
drivers/gpu/drm/radeon/radeon_vce.c
··· 810 810 goto error; 811 811 } 812 812 813 - r = radeon_fence_wait(fence, false); 814 - if (r) { 813 + r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies( 814 + RADEON_USEC_IB_TEST_TIMEOUT)); 815 + if (r < 0) { 815 816 DRM_ERROR("radeon: fence wait failed (%d).\n", r); 817 + } else if (r == 0) { 818 + DRM_ERROR("radeon: fence wait timed out.\n"); 819 + r = -ETIMEDOUT; 816 820 } else { 817 - DRM_INFO("ib test on ring %d succeeded\n", ring->idx); 821 + DRM_INFO("ib test on ring %d succeeded\n", ring->idx); 822 + r = 0; 818 823 } 819 824 error: 820 825 radeon_fence_unref(&fence);
+8 -2
drivers/gpu/drm/radeon/uvd_v1_0.c
··· 522 522 goto error; 523 523 } 524 524 525 - r = radeon_fence_wait(fence, false); 526 - if (r) { 525 + r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies( 526 + RADEON_USEC_IB_TEST_TIMEOUT)); 527 + if (r < 0) { 527 528 DRM_ERROR("radeon: fence wait failed (%d).\n", r); 528 529 goto error; 530 + } else if (r == 0) { 531 + DRM_ERROR("radeon: fence wait timed out.\n"); 532 + r = -ETIMEDOUT; 533 + goto error; 529 534 } 535 + r = 0; 530 536 DRM_INFO("ib test on ring %d succeeded\n", ring->idx); 531 537 error: 532 538 radeon_fence_unref(&fence);