Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: decouple EccErrCnt query and clear operation

Due to a hardware bug, when the RSMU UMC index is disabled,
clearing EccErrCnt on the first UMC instance also clears the
EccErrCnt registers of all other instances at the same time. This
breaks the correctable error count log in the EccErrCnt register
once it is queried. So decouple the query and clear operations to
keep the error count query working.

Signed-off-by: Guchun Chen <guchun.chen@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Guchun Chen and committed by
Alex Deucher
fd90456c 40e73314

+79 -4
+79 -4
drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
··· 104 104 return adev->umc.channel_offs*ch_inst + UMC_6_INST_DIST*umc_inst; 105 105 } 106 106 107 + static void umc_v6_1_clear_error_count_per_channel(struct amdgpu_device *adev, 108 + uint32_t umc_reg_offset) 109 + { 110 + uint32_t ecc_err_cnt_addr; 111 + uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; 112 + 113 + if (adev->asic_type == CHIP_ARCTURUS) { 114 + /* UMC 6_1_2 registers */ 115 + ecc_err_cnt_sel_addr = 116 + SOC15_REG_OFFSET(UMC, 0, 117 + mmUMCCH0_0_EccErrCntSel_ARCT); 118 + ecc_err_cnt_addr = 119 + SOC15_REG_OFFSET(UMC, 0, 120 + mmUMCCH0_0_EccErrCnt_ARCT); 121 + } else { 122 + /* UMC 6_1_1 registers */ 123 + ecc_err_cnt_sel_addr = 124 + SOC15_REG_OFFSET(UMC, 0, 125 + mmUMCCH0_0_EccErrCntSel); 126 + ecc_err_cnt_addr = 127 + SOC15_REG_OFFSET(UMC, 0, 128 + mmUMCCH0_0_EccErrCnt); 129 + } 130 + 131 + /* select the lower chip */ 132 + ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + 133 + umc_reg_offset) * 4); 134 + ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, 135 + UMCCH0_0_EccErrCntSel, 136 + EccErrCntCsSel, 0); 137 + WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, 138 + ecc_err_cnt_sel); 139 + 140 + /* clear lower chip error count */ 141 + WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, 142 + UMC_V6_1_CE_CNT_INIT); 143 + 144 + /* select the higher chip */ 145 + ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + 146 + umc_reg_offset) * 4); 147 + ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, 148 + UMCCH0_0_EccErrCntSel, 149 + EccErrCntCsSel, 1); 150 + WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, 151 + ecc_err_cnt_sel); 152 + 153 + /* clear higher chip error count */ 154 + WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, 155 + UMC_V6_1_CE_CNT_INIT); 156 + } 157 + 158 + static void umc_v6_1_clear_error_count(struct amdgpu_device *adev) 159 + { 160 + uint32_t umc_inst = 0; 161 + uint32_t ch_inst = 0; 162 + uint32_t umc_reg_offset = 0; 163 + uint32_t rsmu_umc_index_state = 164 + 
umc_v6_1_get_umc_index_mode_state(adev); 165 + 166 + if (rsmu_umc_index_state) 167 + umc_v6_1_disable_umc_index_mode(adev); 168 + 169 + LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) { 170 + umc_reg_offset = get_umc_6_reg_offset(adev, 171 + umc_inst, 172 + ch_inst); 173 + 174 + umc_v6_1_clear_error_count_per_channel(adev, 175 + umc_reg_offset); 176 + } 177 + 178 + if (rsmu_umc_index_state) 179 + umc_v6_1_enable_umc_index_mode(adev); 180 + } 181 + 107 182 static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, 108 183 uint32_t umc_reg_offset, 109 184 unsigned long *error_count) ··· 211 136 ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel, 212 137 EccErrCntCsSel, 0); 213 138 WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel); 139 + 214 140 ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4); 215 141 *error_count += 216 142 (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) - 217 143 UMC_V6_1_CE_CNT_INIT); 218 - /* clear the lower chip err count */ 219 - WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT); 220 144 221 145 /* select the higher chip and check the err counter */ 222 146 ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel, 223 147 EccErrCntCsSel, 1); 224 148 WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel); 149 + 225 150 ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4); 226 151 *error_count += 227 152 (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) - 228 153 UMC_V6_1_CE_CNT_INIT); 229 - /* clear the higher chip err count */ 230 - WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT); 231 154 232 155 /* check for SRAM correctable error 233 156 MCUMC_STATUS is a 64 bit register */ ··· 301 228 302 229 if (rsmu_umc_index_state) 303 230 umc_v6_1_enable_umc_index_mode(adev); 231 + 232 + umc_v6_1_clear_error_count(adev); 304 233 } 305 234 306 235 static void 
umc_v6_1_query_error_address(struct amdgpu_device *adev,