Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: implement query_ras_error_address callback

query_ras_error_address will be invoked to query bad
page address when there is poison data in HBM consumed
by GPU engines.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: John Clements <John.Clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by Hawking Zhang and committed by Alex Deucher.
87da0cc1 878b9e94

+90
+90
drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
··· 183 183 umc_v6_7_reset_error_count(adev); 184 184 } 185 185 186 + static void umc_v6_7_query_error_address(struct amdgpu_device *adev, 187 + struct ras_err_data *err_data, 188 + uint32_t umc_reg_offset, 189 + uint32_t ch_inst, 190 + uint32_t umc_inst) 191 + { 192 + uint32_t mc_umc_status_addr; 193 + uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0; 194 + struct eeprom_table_record *err_rec; 195 + uint32_t channel_index; 196 + 197 + mc_umc_status_addr = 198 + SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); 199 + mc_umc_addrt0 = 200 + SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0); 201 + 202 + mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4); 203 + 204 + if (mc_umc_status == 0) 205 + return; 206 + 207 + if (!err_data->err_addr) { 208 + /* clear umc status */ 209 + WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); 210 + return; 211 + } 212 + 213 + err_rec = &err_data->err_addr[err_data->err_addr_cnt]; 214 + 215 + channel_index = 216 + adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst]; 217 + 218 + /* calculate error address if ue/ce error is detected */ 219 + if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && 220 + (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || 221 + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) { 222 + 223 + err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4); 224 + err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); 225 + 226 + /* translate umc channel address to soc pa, 3 parts are included */ 227 + retired_page = ADDR_OF_8KB_BLOCK(err_addr) | 228 + ADDR_OF_256B_BLOCK(channel_index) | 229 + OFFSET_IN_256B_BLOCK(err_addr); 230 + 231 + /* we only save ue error information currently, ce is skipped */ 232 + if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) 233 + == 1) { 234 + err_rec->address = err_addr; 235 + /* page frame address is 
saved */ 236 + err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT; 237 + err_rec->ts = (uint64_t)ktime_get_real_seconds(); 238 + err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; 239 + err_rec->cu = 0; 240 + err_rec->mem_channel = channel_index; 241 + err_rec->mcumc_id = umc_inst; 242 + 243 + err_data->err_addr_cnt++; 244 + } 245 + } 246 + 247 + /* clear umc status */ 248 + WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); 249 + } 250 + 251 + static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev, 252 + void *ras_error_status) 253 + { 254 + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; 255 + 256 + uint32_t umc_inst = 0; 257 + uint32_t ch_inst = 0; 258 + uint32_t umc_reg_offset = 0; 259 + 260 + /*TODO: driver needs to toggle DF Cstate to ensure 261 + * safe access of UMC resgisters. Will add the protection 262 + * when firmware interface is ready */ 263 + LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) { 264 + umc_reg_offset = get_umc_v6_7_reg_offset(adev, 265 + umc_inst, 266 + ch_inst); 267 + umc_v6_7_query_error_address(adev, 268 + err_data, 269 + umc_reg_offset, 270 + ch_inst, 271 + umc_inst); 272 + } 273 + } 274 + 186 275 const struct amdgpu_umc_funcs umc_v6_7_funcs = { 187 276 .ras_late_init = amdgpu_umc_ras_late_init, 188 277 .query_ras_error_count = umc_v6_7_query_ras_error_count, 278 + .query_ras_error_address = umc_v6_7_query_ras_error_address, 189 279 };