Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu/gfx9: manually control gfxoff for CS on RV

When Mesa started using compute queues more often,
we started seeing additional hangs with compute queues.
Disabling gfxoff seems to mitigate that. Manually
control gfxoff and gfx PG around command submissions to avoid
any issues related to gfxoff. KFD already does the same
thing for these chips.

v2: limit to compute
v3: limit to APUs
v4: limit to Raven/PCO
v5: only update the compute ring_funcs
v6: Disable GFX PG
v7: adjust order

Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Suggested-by: Błażej Szczygieł <mumei6102@gmail.com>
Suggested-by: Sergey Kovalenko <seryoga.engineering@gmail.com>
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3861
Link: https://lists.freedesktop.org/archives/amd-gfx/2025-January/119116.html
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org # 6.12.x

+34 -2
+34 -2
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
··· 7437 7437 amdgpu_ring_write(ring, 0); /* RESERVED field, programmed to zero */ 7438 7438 } 7439 7439 7440 + static void gfx_v9_0_ring_begin_use_compute(struct amdgpu_ring *ring) 7441 + { 7442 + struct amdgpu_device *adev = ring->adev; 7443 + struct amdgpu_ip_block *gfx_block = 7444 + amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 7445 + 7446 + amdgpu_gfx_enforce_isolation_ring_begin_use(ring); 7447 + 7448 + /* Raven and PCO APUs seem to have stability issues 7449 + * with compute and gfxoff and gfx pg. Disable gfx pg during 7450 + * submission and allow again afterwards. 7451 + */ 7452 + if (gfx_block && amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 1, 0)) 7453 + gfx_v9_0_set_powergating_state(gfx_block, AMD_PG_STATE_UNGATE); 7454 + } 7455 + 7456 + static void gfx_v9_0_ring_end_use_compute(struct amdgpu_ring *ring) 7457 + { 7458 + struct amdgpu_device *adev = ring->adev; 7459 + struct amdgpu_ip_block *gfx_block = 7460 + amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 7461 + 7462 + /* Raven and PCO APUs seem to have stability issues 7463 + * with compute and gfxoff and gfx pg. Disable gfx pg during 7464 + * submission and allow again afterwards. 
7465 + */ 7466 + if (gfx_block && amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 1, 0)) 7467 + gfx_v9_0_set_powergating_state(gfx_block, AMD_PG_STATE_GATE); 7468 + 7469 + amdgpu_gfx_enforce_isolation_ring_end_use(ring); 7470 + } 7471 + 7440 7472 static const struct amd_ip_funcs gfx_v9_0_ip_funcs = { 7441 7473 .name = "gfx_v9_0", 7442 7474 .early_init = gfx_v9_0_early_init, ··· 7645 7613 .emit_wave_limit = gfx_v9_0_emit_wave_limit, 7646 7614 .reset = gfx_v9_0_reset_kcq, 7647 7615 .emit_cleaner_shader = gfx_v9_0_ring_emit_cleaner_shader, 7648 - .begin_use = amdgpu_gfx_enforce_isolation_ring_begin_use, 7649 - .end_use = amdgpu_gfx_enforce_isolation_ring_end_use, 7616 + .begin_use = gfx_v9_0_ring_begin_use_compute, 7617 + .end_use = gfx_v9_0_ring_end_use_compute, 7650 7618 }; 7651 7619 7652 7620 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {