Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: resolve bug in UMC 6 error counter query

iterate over all error counter registers in SMN space

removed support for error counter access via MMIO

Reviewed-by: Guchun Chen <guchun.chen@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

John Clements and committed by
Alex Deucher
bd68fb94 a210d698

+64 -90
-35
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
··· 21 21 #ifndef __AMDGPU_UMC_H__ 22 22 #define __AMDGPU_UMC_H__ 23 23 24 - /* implement 64 bits REG operations via 32 bits interface */ 25 - #define RREG64_UMC(reg) (RREG32(reg) | \ 26 - ((uint64_t)RREG32((reg) + 1) << 32)) 27 - #define WREG64_UMC(reg, v) \ 28 - do { \ 29 - WREG32((reg), lower_32_bits(v)); \ 30 - WREG32((reg) + 1, upper_32_bits(v)); \ 31 - } while (0) 32 - 33 - /* 34 - * void (*func)(struct amdgpu_device *adev, struct ras_err_data *err_data, 35 - * uint32_t umc_reg_offset, uint32_t channel_index) 36 - */ 37 - #define amdgpu_umc_for_each_channel(func) \ 38 - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; \ 39 - uint32_t umc_inst, channel_inst, umc_reg_offset, channel_index; \ 40 - for (umc_inst = 0; umc_inst < adev->umc.umc_inst_num; umc_inst++) { \ 41 - /* enable the index mode to query eror count per channel */ \ 42 - adev->umc.funcs->enable_umc_index_mode(adev, umc_inst); \ 43 - for (channel_inst = 0; \ 44 - channel_inst < adev->umc.channel_inst_num; \ 45 - channel_inst++) { \ 46 - /* calc the register offset according to channel instance */ \ 47 - umc_reg_offset = adev->umc.channel_offs * channel_inst; \ 48 - /* get channel index of interleaved memory */ \ 49 - channel_index = adev->umc.channel_idx_tbl[ \ 50 - umc_inst * adev->umc.channel_inst_num + channel_inst]; \ 51 - (func)(adev, err_data, umc_reg_offset, channel_index); \ 52 - } \ 53 - } \ 54 - adev->umc.funcs->disable_umc_index_mode(adev); 55 - 56 24 struct amdgpu_umc_funcs { 57 25 void (*err_cnt_init)(struct amdgpu_device *adev); 58 26 int (*ras_late_init)(struct amdgpu_device *adev); ··· 28 60 void *ras_error_status); 29 61 void (*query_ras_error_address)(struct amdgpu_device *adev, 30 62 void *ras_error_status); 31 - void (*enable_umc_index_mode)(struct amdgpu_device *adev, 32 - uint32_t umc_instance); 33 - void (*disable_umc_index_mode)(struct amdgpu_device *adev); 34 63 void (*init_registers)(struct amdgpu_device *adev); 35 64 }; 36 65
+64 -55
drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
··· 32 32 33 33 #define smnMCA_UMC0_MCUMC_ADDRT0 0x50f10 34 34 35 + #define UMC_6_INST_DIST 0x40000 36 + 35 37 /* 36 38 * (addr / 256) * 8192, the higher 26 bits in ErrorAddr 37 39 * is the index of 8KB block 38 40 */ 39 - #define ADDR_OF_8KB_BLOCK(addr) (((addr) & ~0xffULL) << 5) 41 + #define ADDR_OF_8KB_BLOCK(addr) (((addr) & ~0xffULL) << 5) 40 42 /* channel index is the index of 256B block */ 41 43 #define ADDR_OF_256B_BLOCK(channel_index) ((channel_index) << 8) 42 44 /* offset in 256B block */ ··· 52 50 {9, 25, 0, 16}, {15, 31, 6, 22} 53 51 }; 54 52 55 - static void umc_v6_1_enable_umc_index_mode(struct amdgpu_device *adev, 56 - uint32_t umc_instance) 53 + static inline uint32_t get_umc_6_reg_offset(struct amdgpu_device *adev, 54 + uint32_t umc_inst, 55 + uint32_t ch_inst) 57 56 { 58 - uint32_t rsmu_umc_index; 59 - 60 - rsmu_umc_index = RREG32_SOC15(RSMU, 0, 61 - mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU); 62 - rsmu_umc_index = REG_SET_FIELD(rsmu_umc_index, 63 - RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 64 - RSMU_UMC_INDEX_MODE_EN, 1); 65 - rsmu_umc_index = REG_SET_FIELD(rsmu_umc_index, 66 - RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 67 - RSMU_UMC_INDEX_INSTANCE, umc_instance); 68 - rsmu_umc_index = REG_SET_FIELD(rsmu_umc_index, 69 - RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 70 - RSMU_UMC_INDEX_WREN, 1 << umc_instance); 71 - WREG32_SOC15(RSMU, 0, mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 72 - rsmu_umc_index); 73 - } 74 - 75 - static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev) 76 - { 77 - WREG32_FIELD15(RSMU, 0, RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 78 - RSMU_UMC_INDEX_MODE_EN, 0); 79 - } 80 - 81 - static uint32_t umc_v6_1_get_umc_inst(struct amdgpu_device *adev) 82 - { 83 - uint32_t rsmu_umc_index; 84 - 85 - rsmu_umc_index = RREG32_SOC15(RSMU, 0, 86 - mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU); 87 - return REG_GET_FIELD(rsmu_umc_index, 88 - RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, 89 - RSMU_UMC_INDEX_INSTANCE); 57 + return adev->umc.channel_offs*ch_inst + 
UMC_6_INST_DIST*umc_inst; 90 58 } 91 59 92 60 static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, ··· 146 174 *error_count += 1; 147 175 } 148 176 149 - static void umc_v6_1_query_error_count(struct amdgpu_device *adev, 150 - struct ras_err_data *err_data, uint32_t umc_reg_offset, 151 - uint32_t channel_index) 152 - { 153 - umc_v6_1_query_correctable_error_count(adev, umc_reg_offset, 154 - &(err_data->ce_count)); 155 - umc_v6_1_querry_uncorrectable_error_count(adev, umc_reg_offset, 156 - &(err_data->ue_count)); 157 - } 158 - 159 177 static void umc_v6_1_query_ras_error_count(struct amdgpu_device *adev, 160 178 void *ras_error_status) 161 179 { 162 - amdgpu_umc_for_each_channel(umc_v6_1_query_error_count); 180 + struct ras_err_data* err_data = (struct ras_err_data*)ras_error_status; 181 + 182 + uint32_t umc_inst = 0; 183 + uint32_t ch_inst = 0; 184 + uint32_t umc_reg_offset = 0; 185 + 186 + for (umc_inst = 0; umc_inst < adev->umc.umc_inst_num; umc_inst++) { 187 + for (ch_inst = 0; ch_inst < adev->umc.channel_inst_num; ch_inst++) { 188 + umc_reg_offset = get_umc_6_reg_offset(adev, 189 + umc_inst, 190 + ch_inst); 191 + 192 + umc_v6_1_query_correctable_error_count(adev, 193 + umc_reg_offset, 194 + &(err_data->ce_count)); 195 + umc_v6_1_querry_uncorrectable_error_count(adev, 196 + umc_reg_offset, 197 + &(err_data->ue_count)); 198 + } 199 + } 163 200 } 164 201 165 202 static void umc_v6_1_query_error_address(struct amdgpu_device *adev, 166 203 struct ras_err_data *err_data, 167 - uint32_t umc_reg_offset, uint32_t channel_index) 204 + uint32_t umc_reg_offset, 205 + uint32_t channel_index, 206 + uint32_t umc_inst) 168 207 { 169 208 uint32_t lsb, mc_umc_status_addr; 170 209 uint64_t mc_umc_status, err_addr, retired_page; ··· 227 244 err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; 228 245 err_rec->cu = 0; 229 246 err_rec->mem_channel = channel_index; 230 - err_rec->mcumc_id = umc_v6_1_get_umc_inst(adev); 247 + err_rec->mcumc_id = 
umc_inst; 231 248 232 249 err_data->err_addr_cnt++; 233 250 } ··· 240 257 static void umc_v6_1_query_ras_error_address(struct amdgpu_device *adev, 241 258 void *ras_error_status) 242 259 { 243 - amdgpu_umc_for_each_channel(umc_v6_1_query_error_address); 260 + struct ras_err_data* err_data = (struct ras_err_data*)ras_error_status; 261 + 262 + uint32_t umc_inst = 0; 263 + uint32_t ch_inst = 0; 264 + uint32_t umc_reg_offset = 0; 265 + 266 + for (umc_inst = 0; umc_inst < adev->umc.umc_inst_num; umc_inst++) { 267 + for (ch_inst = 0; ch_inst < adev->umc.channel_inst_num; ch_inst++) { 268 + umc_reg_offset = get_umc_6_reg_offset(adev, 269 + umc_inst, 270 + ch_inst); 271 + 272 + umc_v6_1_query_error_address(adev, 273 + err_data, 274 + umc_reg_offset, 275 + ch_inst, 276 + umc_inst); 277 + } 278 + } 279 + 244 280 } 245 281 246 282 static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev, 247 - struct ras_err_data *err_data, 248 - uint32_t umc_reg_offset, uint32_t channel_index) 283 + uint32_t umc_reg_offset) 249 284 { 250 285 uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; 251 286 uint32_t ecc_err_cnt_addr; ··· 302 301 303 302 static void umc_v6_1_err_cnt_init(struct amdgpu_device *adev) 304 303 { 305 - void *ras_error_status = NULL; 304 + uint32_t umc_inst = 0; 305 + uint32_t ch_inst = 0; 306 + uint32_t umc_reg_offset = 0; 306 307 307 - amdgpu_umc_for_each_channel(umc_v6_1_err_cnt_init_per_channel); 308 + for (umc_inst = 0; umc_inst < adev->umc.umc_inst_num; umc_inst++) { 309 + for (ch_inst = 0; ch_inst < adev->umc.channel_inst_num; ch_inst++) { 310 + umc_reg_offset = get_umc_6_reg_offset(adev, 311 + umc_inst, 312 + ch_inst); 313 + 314 + umc_v6_1_err_cnt_init_per_channel(adev, umc_reg_offset); 315 + } 316 + } 308 317 } 309 318 310 319 const struct amdgpu_umc_funcs umc_v6_1_funcs = { ··· 322 311 .ras_late_init = amdgpu_umc_ras_late_init, 323 312 .query_ras_error_count = umc_v6_1_query_ras_error_count, 324 313 .query_ras_error_address = 
umc_v6_1_query_ras_error_address, 325 - .enable_umc_index_mode = umc_v6_1_enable_umc_index_mode, 326 - .disable_umc_index_mode = umc_v6_1_disable_umc_index_mode, 327 314 };