Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues

I'm not increasing the DRM version because GDS isn't totally without bugs yet.

v2: update emit_ib_size

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by Marek Olšák and committed by Alex Deucher
41cca166 67dd1a36

+84 -6
+2 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
··· 72 72 * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE. 73 73 * - 3.27.0 - Add new chunk to to AMDGPU_CS to enable BO_LIST creation. 74 74 * - 3.28.0 - Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES 75 + * - 3.29.0 - Add AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID 75 76 */ 76 77 #define KMS_DRIVER_MAJOR 3 77 - #define KMS_DRIVER_MINOR 28 78 + #define KMS_DRIVER_MINOR 29 78 79 #define KMS_DRIVER_PATCHLEVEL 0 79 80 80 81 int amdgpu_vram_limit = 0;
+2
drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
··· 37 37 struct amdgpu_gds_asic_info mem; 38 38 struct amdgpu_gds_asic_info gws; 39 39 struct amdgpu_gds_asic_info oa; 40 + uint32_t gds_compute_max_wave_id; 41 + 40 42 /* At present, GDS, GWS and OA resources for gfx (graphics) 41 43 * is always pre-allocated and available for graphics operation. 42 44 * Such resource is shared between all gfx clients.
+18 -1
drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
··· 2264 2264 unsigned vmid = AMDGPU_JOB_GET_VMID(job); 2265 2265 u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24); 2266 2266 2267 + /* Currently, there is a high possibility to get wave ID mismatch 2268 + * between ME and GDS, leading to a hw deadlock, because ME generates 2269 + * different wave IDs than the GDS expects. This situation happens 2270 + * randomly when at least 5 compute pipes use GDS ordered append. 2271 + * The wave IDs generated by ME are also wrong after suspend/resume. 2272 + * Those are probably bugs somewhere else in the kernel driver. 2273 + * 2274 + * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and 2275 + * GDS to 0 for this ring (me/pipe). 2276 + */ 2277 + if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) { 2278 + amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1)); 2279 + amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START); 2280 + amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id); 2281 + } 2282 + 2267 2283 amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2)); 2268 2284 amdgpu_ring_write(ring, 2269 2285 #ifdef __BIG_ENDIAN ··· 5016 5000 7 + /* gfx_v7_0_ring_emit_pipeline_sync */ 5017 5001 CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v7_0_ring_emit_vm_flush */ 5018 5002 7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user fence, vm fence */ 5019 - .emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_compute */ 5003 + .emit_ib_size = 7, /* gfx_v7_0_ring_emit_ib_compute */ 5020 5004 .emit_ib = gfx_v7_0_ring_emit_ib_compute, 5021 5005 .emit_fence = gfx_v7_0_ring_emit_fence_compute, 5022 5006 .emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync, ··· 5073 5057 adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE); 5074 5058 adev->gds.gws.total_size = 64; 5075 5059 adev->gds.oa.total_size = 16; 5060 + adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID); 5076 5061 5077 5062 if (adev->gds.mem.total_size == 64 * 1024) { 5078 5063 
adev->gds.mem.gfx_partition_size = 4096;
+19 -2
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
··· 6084 6084 unsigned vmid = AMDGPU_JOB_GET_VMID(job); 6085 6085 u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24); 6086 6086 6087 + /* Currently, there is a high possibility to get wave ID mismatch 6088 + * between ME and GDS, leading to a hw deadlock, because ME generates 6089 + * different wave IDs than the GDS expects. This situation happens 6090 + * randomly when at least 5 compute pipes use GDS ordered append. 6091 + * The wave IDs generated by ME are also wrong after suspend/resume. 6092 + * Those are probably bugs somewhere else in the kernel driver. 6093 + * 6094 + * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and 6095 + * GDS to 0 for this ring (me/pipe). 6096 + */ 6097 + if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) { 6098 + amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1)); 6099 + amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START); 6100 + amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id); 6101 + } 6102 + 6087 6103 amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2)); 6088 6104 amdgpu_ring_write(ring, 6089 6105 #ifdef __BIG_ENDIAN ··· 6906 6890 7 + /* gfx_v8_0_ring_emit_pipeline_sync */ 6907 6891 VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */ 6908 6892 7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */ 6909 - .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */ 6893 + .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */ 6910 6894 .emit_ib = gfx_v8_0_ring_emit_ib_compute, 6911 6895 .emit_fence = gfx_v8_0_ring_emit_fence_compute, 6912 6896 .emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync, ··· 6936 6920 7 + /* gfx_v8_0_ring_emit_pipeline_sync */ 6937 6921 17 + /* gfx_v8_0_ring_emit_vm_flush */ 6938 6922 7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for user fence, vm fence */ 6939 - .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */ 6923 + .emit_ib_size = 7, /* 
gfx_v8_0_ring_emit_ib_compute */ 6940 6924 .emit_fence = gfx_v8_0_ring_emit_fence_kiq, 6941 6925 .test_ring = gfx_v8_0_ring_test_ring, 6942 6926 .insert_nop = amdgpu_ring_insert_nop, ··· 7012 6996 adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE); 7013 6997 adev->gds.gws.total_size = 64; 7014 6998 adev->gds.oa.total_size = 16; 6999 + adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID); 7015 7000 7016 7001 if (adev->gds.mem.total_size == 64 * 1024) { 7017 7002 adev->gds.mem.gfx_partition_size = 4096;
+38 -2
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
··· 4010 4010 unsigned vmid = AMDGPU_JOB_GET_VMID(job); 4011 4011 u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24); 4012 4012 4013 + /* Currently, there is a high possibility to get wave ID mismatch 4014 + * between ME and GDS, leading to a hw deadlock, because ME generates 4015 + * different wave IDs than the GDS expects. This situation happens 4016 + * randomly when at least 5 compute pipes use GDS ordered append. 4017 + * The wave IDs generated by ME are also wrong after suspend/resume. 4018 + * Those are probably bugs somewhere else in the kernel driver. 4019 + * 4020 + * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and 4021 + * GDS to 0 for this ring (me/pipe). 4022 + */ 4023 + if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) { 4024 + amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1)); 4025 + amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID); 4026 + amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id); 4027 + } 4028 + 4013 4029 amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2)); 4014 4030 BUG_ON(ib->gpu_addr & 0x3); /* Dword align */ 4015 4031 amdgpu_ring_write(ring, ··· 4745 4729 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + 4746 4730 2 + /* gfx_v9_0_ring_emit_vm_flush */ 4747 4731 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */ 4748 - .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */ 4732 + .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */ 4749 4733 .emit_ib = gfx_v9_0_ring_emit_ib_compute, 4750 4734 .emit_fence = gfx_v9_0_ring_emit_fence, 4751 4735 .emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync, ··· 4780 4764 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + 4781 4765 2 + /* gfx_v9_0_ring_emit_vm_flush */ 4782 4766 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */ 4783 - .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */ 4767 + .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */ 4784 4768 .emit_fence = gfx_v9_0_ring_emit_fence_kiq, 
4785 4769 .test_ring = gfx_v9_0_ring_test_ring, 4786 4770 .insert_nop = amdgpu_ring_insert_nop, ··· 4859 4843 break; 4860 4844 default: 4861 4845 adev->gds.mem.total_size = 0x10000; 4846 + break; 4847 + } 4848 + 4849 + switch (adev->asic_type) { 4850 + case CHIP_VEGA10: 4851 + case CHIP_VEGA20: 4852 + adev->gds.gds_compute_max_wave_id = 0x7ff; 4853 + break; 4854 + case CHIP_VEGA12: 4855 + adev->gds.gds_compute_max_wave_id = 0x27f; 4856 + break; 4857 + case CHIP_RAVEN: 4858 + if (adev->rev_id >= 0x8) 4859 + adev->gds.gds_compute_max_wave_id = 0x77; /* raven2 */ 4860 + else 4861 + adev->gds.gds_compute_max_wave_id = 0x15f; /* raven1 */ 4862 + break; 4863 + default: 4864 + /* this really depends on the chip */ 4865 + adev->gds.gds_compute_max_wave_id = 0x7ff; 4862 4866 break; 4863 4867 } 4864 4868
+5
include/uapi/drm/amdgpu_drm.h
··· 566 566 * caches (L2/vL1/sL1/I$). */ 567 567 #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3) 568 568 569 + /* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER. 570 + * This will reset wave ID counters for the IB. 571 + */ 572 + #define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4) 573 + 569 574 struct drm_amdgpu_cs_chunk_ib { 570 575 __u32 _pad; 571 576 /** AMDGPU_IB_FLAG_* */