drm/amdgpu/gfx10: Add cleaner shader for GFX10.3.0

+15

drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c

··· 45 45 #include "clearstate_gfx10.h" 46 46 #include "v10_structs.h" 47 47 #include "gfx_v10_0.h" 48 + #include "gfx_v10_0_cleaner_shader.h" 48 49 #include "nbio_v2_3.h" 49 50 50 51 /* ··· 4739 4738 break; 4740 4739 } 4741 4740 switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { 4741 + case IP_VERSION(10, 3, 0): 4742 + adev->gfx.cleaner_shader_ptr = gfx_10_3_0_cleaner_shader_hex; 4743 + adev->gfx.cleaner_shader_size = sizeof(gfx_10_3_0_cleaner_shader_hex); 4744 + if (adev->gfx.me_fw_version >= 64 && 4745 + adev->gfx.pfp_fw_version >= 100 && 4746 + adev->gfx.mec_fw_version >= 122) { 4747 + adev->gfx.enable_cleaner_shader = true; 4748 + r = amdgpu_gfx_cleaner_shader_sw_init(adev, adev->gfx.cleaner_shader_size); 4749 + if (r) { 4750 + adev->gfx.enable_cleaner_shader = false; 4751 + dev_err(adev->dev, "Failed to initialize cleaner shader\n"); 4752 + } 4753 + } 4754 + break; 4742 4755 default: 4743 4756 adev->gfx.enable_cleaner_shader = false; 4744 4757 break;

+56

drivers/gpu/drm/amd/amdgpu/gfx_v10_0_cleaner_shader.h

··· 1 + /* SPDX-License-Identifier: MIT */ 2 + /* 3 + * Copyright 2025 Advanced Micro Devices, Inc. 4 + * 5 + * Permission is hereby granted, free of charge, to any person obtaining a 6 + * copy of this software and associated documentation files (the "Software"), 7 + * to deal in the Software without restriction, including without limitation 8 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 + * and/or sell copies of the Software, and to permit persons to whom the 10 + * Software is furnished to do so, subject to the following conditions: 11 + * 12 + * The above copyright notice and this permission notice shall be included in 13 + * all copies or substantial portions of the Software. 14 + * 15 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 + * OTHER DEALINGS IN THE SOFTWARE. 
22 + */ 23 + 24 + /* Define the cleaner shader gfx_10_3_0 */ 25 + static const u32 gfx_10_3_0_cleaner_shader_hex[] = { 26 + 0xb0804004, 0xbf8a0000, 27 + 0xbe8203b8, 0xbefc0380, 28 + 0x7e008480, 0x7e028480, 29 + 0x7e048480, 0x7e068480, 30 + 0x7e088480, 0x7e0a8480, 31 + 0x7e0c8480, 0x7e0e8480, 32 + 0xbefc0302, 0x80828802, 33 + 0xbf84fff5, 0xbe8203ff, 34 + 0x80000000, 0x87020002, 35 + 0xbf840012, 0xbefe03c1, 36 + 0xbeff03c1, 0xd7650001, 37 + 0x0001007f, 0xd7660001, 38 + 0x0002027e, 0x16020288, 39 + 0xbe8203bf, 0xbefc03c1, 40 + 0xd9382000, 0x00020201, 41 + 0xd9386040, 0x00040401, 42 + 0xd70f6a01, 0x000202ff, 43 + 0x00000400, 0x80828102, 44 + 0xbf84fff7, 0xbefc03ff, 45 + 0x00000068, 0xbe803080, 46 + 0xbe813080, 0xbe823080, 47 + 0xbe833080, 0x80fc847c, 48 + 0xbf84fffa, 0xbeea0480, 49 + 0xbeec0480, 0xbeee0480, 50 + 0xbef00480, 0xbef20480, 51 + 0xbef40480, 0xbef60480, 52 + 0xbef80480, 0xbefa0480, 53 + 0xbf810000, 0xbf9f0000, 54 + 0xbf9f0000, 0xbf9f0000, 55 + 0xbf9f0000, 0xbf9f0000, 56 + };

+124

drivers/gpu/drm/amd/amdgpu/gfx_v10_3_0_cleaner_shader.asm

··· 1 + /* SPDX-License-Identifier: MIT */ 2 + /* 3 + * Copyright 2025 Advanced Micro Devices, Inc. 4 + * 5 + * Permission is hereby granted, free of charge, to any person obtaining a 6 + * copy of this software and associated documentation files (the "Software"), 7 + * to deal in the Software without restriction, including without limitation 8 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 + * and/or sell copies of the Software, and to permit persons to whom the 10 + * Software is furnished to do so, subject to the following conditions: 11 + * 12 + * The above copyright notice and this permission notice shall be included in 13 + * all copies or substantial portions of the Software. 14 + * 15 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 + * OTHER DEALINGS IN THE SOFTWARE. 22 + */ 23 + 24 + // This shader cleans LDS, SGPRs and VGPRs. It is the first 64 Dwords (256 bytes) of the 192-Dword cleaner shader. 
25 + //To turn this shader program on for compilation change this to main and lower shader main to main_1 26 + 27 + // GFX10.3 : Clear SGPRs, VGPRs and LDS 28 + // Launch 32 waves per CU (16 per SIMD) as a workgroup (threadgroup) to fill every wave slot 29 + // Waves are "wave32" and have 64 VGPRs each, which uses all 1024 VGPRs per SIMD 30 + // Waves are launched in "CU" mode, and the workgroup shares 64KB of LDS (half of the WGP's LDS) 31 + // It takes 2 workgroups to use all of LDS: one on each CU of the WGP 32 + // Each wave clears SGPRs 0 - 107 33 + // Each wave clears VGPRs 0 - 63 34 + // The first wave of the workgroup clears its 64KB of LDS 35 + // The shader starts with "S_BARRIER" to ensure SPI has launched all waves of the workgroup 36 + // before any wave in the workgroup could end. Without this, it is possible not all SGPRs get cleared. 37 + 38 + 39 + shader main 40 + asic(GFX10) 41 + type(CS) 42 + wave_size(32) 43 + // Note: original source code from SQ team 44 + 45 + // 46 + // Create 32 waves in a threadgroup (CS waves) 47 + // Each allocates 64 VGPRs 48 + // The workgroup allocates all of LDS (64kbytes) 49 + // 50 + // Takes about 2500 clocks to run. 
51 + // (theoretical fastest = 1024clks vgpr + 640lds = 1660 clks) 52 + // 53 + S_BARRIER 54 + s_mov_b32 s2, 0x00000038 // Loop 64/8=8 times (loop unrolled for performance) 55 + s_mov_b32 m0, 0 56 + // 57 + // CLEAR VGPRs 58 + // 59 + label_0005: 60 + v_movreld_b32 v0, 0 61 + v_movreld_b32 v1, 0 62 + v_movreld_b32 v2, 0 63 + v_movreld_b32 v3, 0 64 + v_movreld_b32 v4, 0 65 + v_movreld_b32 v5, 0 66 + v_movreld_b32 v6, 0 67 + v_movreld_b32 v7, 0 68 + s_mov_b32 m0, s2 69 + s_sub_u32 s2, s2, 8 70 + s_cbranch_scc0 label_0005 71 + // 72 + s_mov_b32 s2, 0x80000000 // Bit31 is first_wave 73 + s_and_b32 s2, s2, s0 // sgpr0 has tg_size (first_wave) term as in ucode only COMPUTE_PGM_RSRC2.tg_size_en is set 74 + s_cbranch_scc0 label_0023 // Clean LDS if its first wave of ThreadGroup/WorkGroup 75 + // CLEAR LDS 76 + // 77 + s_mov_b32 exec_lo, 0xffffffff 78 + s_mov_b32 exec_hi, 0xffffffff 79 + v_mbcnt_lo_u32_b32 v1, exec_hi, 0 // Set V1 to thread-ID (0..63) 80 + v_mbcnt_hi_u32_b32 v1, exec_lo, v1 // Set V1 to thread-ID (0..63) 81 + v_mul_u32_u24 v1, 0x00000008, v1 // * 8, so each thread is a double-dword address (8byte) 82 + s_mov_b32 s2, 0x00000003f // 64 loop iterations 83 + s_mov_b32 m0, 0xffffffff 84 + // Clear all of LDS space 85 + // Each FirstWave of WorkGroup clears 64kbyte block 86 + 87 + label_001F: 88 + ds_write2_b64 v1, v[2:3], v[2:3] offset1:32 89 + ds_write2_b64 v1, v[4:5], v[4:5] offset0:64 offset1:96 90 + v_add_co_u32 v1, vcc, 0x00000400, v1 91 + s_sub_u32 s2, s2, 1 92 + s_cbranch_scc0 label_001F 93 + 94 + // 95 + // CLEAR SGPRs 96 + // 97 + label_0023: 98 + s_mov_b32 m0, 0x00000068 // Loop 108/4=27 times (loop unrolled for performance) 99 + label_sgpr_loop: 100 + s_movreld_b32 s0, 0 101 + s_movreld_b32 s1, 0 102 + s_movreld_b32 s2, 0 103 + s_movreld_b32 s3, 0 104 + s_sub_u32 m0, m0, 4 105 + s_cbranch_scc0 label_sgpr_loop 106 + 107 + //clear flat_scratch, vcc and ttmp0-ttmp15 108 + s_mov_b32 flat_scratch_lo, 0 //clear flat scratch lo SGPR 109 + s_mov_b32 flat_scratch_hi, 0 //clear flat scratch 
hi SGPR 110 + s_mov_b64 vcc, 0 //clear vcc 111 + s_mov_b64 ttmp0, 0 //Clear ttmp0 and ttmp1 112 + s_mov_b64 ttmp2, 0 //Clear ttmp2 and ttmp3 113 + s_mov_b64 ttmp4, 0 //Clear ttmp4 and ttmp5 114 + s_mov_b64 ttmp6, 0 //Clear ttmp6 and ttmp7 115 + s_mov_b64 ttmp8, 0 //Clear ttmp8 and ttmp9 116 + s_mov_b64 ttmp10, 0 //Clear ttmp10 and ttmp11 117 + s_mov_b64 ttmp12, 0 //Clear ttmp12 and ttmp13 118 + s_mov_b64 ttmp14, 0 //Clear ttmp14 and ttmp15 119 + 120 + s_endpgm 121 + 122 + end 123 + 124 +