Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: retire RAS bad pages in different NPS modes

The format of the memory normalized address differs between NPS modes,
so the bit mapping must be adjusted according to the NPS mode.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by Tao Zhou and committed by Alex Deucher
19d4b27a c3d4acf0

+53 -24
+42 -24
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
··· 179 179 struct ta_ras_query_address_output *addr_out, 180 180 bool dump_addr) 181 181 { 182 - uint32_t col, row, bank, channel_index, umc_inst = 0; 183 - uint64_t soc_pa, retired_page, column, err_addr; 182 + uint32_t col, col_lower, row, row_lower, bank; 183 + uint32_t channel_index, umc_inst = 0; 184 + uint32_t i, loop_bits[UMC_V12_0_RETIRE_LOOP_BITS]; 185 + uint64_t soc_pa, column, err_addr; 184 186 struct ta_ras_query_address_output addr_out_tmp; 185 187 struct ta_ras_query_address_output *paddr_out; 188 + enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE; 186 189 int ret = 0; 187 190 188 191 if (!addr_out) ··· 202 199 dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx", 203 200 err_addr); 204 201 205 - return ret; 202 + goto out; 206 203 } 207 204 208 205 bank = paddr_out->pa.bank; ··· 211 208 umc_inst = addr_in->ma.umc_inst; 212 209 } 213 210 211 + loop_bits[0] = UMC_V12_0_PA_C2_BIT; 212 + loop_bits[1] = UMC_V12_0_PA_C3_BIT; 213 + loop_bits[2] = UMC_V12_0_PA_C4_BIT; 214 + loop_bits[3] = UMC_V12_0_PA_R13_BIT; 215 + 216 + if (adev->gmc.gmc_funcs->query_mem_partition_mode) 217 + nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev); 218 + 219 + /* other nps modes are taken as nps1 */ 220 + if (nps == AMDGPU_NPS4_PARTITION_MODE) { 221 + loop_bits[0] = UMC_V12_0_PA_CH4_BIT; 222 + loop_bits[1] = UMC_V12_0_PA_CH5_BIT; 223 + loop_bits[2] = UMC_V12_0_PA_B0_BIT; 224 + loop_bits[3] = UMC_V12_0_PA_R11_BIT; 225 + } 226 + 214 227 soc_pa = paddr_out->pa.pa; 215 - 216 - if (!err_data && !dump_addr) 217 - return ret; 218 - 219 - col = (err_addr >> 1) & 0x1fULL; 220 - /* clear [C3 C2] in soc physical address */ 221 - soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT); 222 - /* clear [C4] in soc physical address */ 223 - soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT); 224 - /* clear [R13] in soc physical address */ 225 - soc_pa &= ~(0x1ULL << UMC_V12_0_PA_R13_BIT); 228 + /* clear loop bits in soc physical address */ 229 + for (i = 0; i < 
UMC_V12_0_RETIRE_LOOP_BITS; i++) 230 + soc_pa &= ~BIT_ULL(loop_bits[i]); 226 231 227 232 paddr_out->pa.pa = soc_pa; 233 + /* get column bit 0 and 1 in mca address */ 234 + col_lower = (err_addr >> 1) & 0x3ULL; 235 + /* MA_R13_BIT will be handled later */ 236 + row_lower = (err_addr >> UMC_V12_0_MA_R0_BIT) & 0x1fffULL; 228 237 229 - /* loop for all possibilities of [R13 C4 C3 C2] */ 238 + if (!err_data && !dump_addr) 239 + goto out; 240 + 241 + /* loop for all possibilities of retired bits */ 230 242 for (column = 0; column < UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; column++) { 231 - retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT); 232 - retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT); 233 - retired_page |= (((column & 0x8) >> 3) << UMC_V12_0_PA_R13_BIT); 243 + soc_pa = paddr_out->pa.pa; 244 + for (i = 0; i < UMC_V12_0_RETIRE_LOOP_BITS; i++) 245 + soc_pa |= (((column >> i) & 0x1ULL) << loop_bits[i]); 234 246 235 - /* include column bit 0 and 1 */ 236 - col &= 0x3; 237 - col |= (column << 2); 238 - row = (retired_page >> UMC_V12_0_PA_R0_BIT) & 0x3fffULL; 247 + col = ((column & 0x7) << 2) | col_lower; 248 + /* add row bit 13 */ 249 + row = ((column >> 3) << 13) | row_lower; 239 250 240 251 if (dump_addr) 241 252 dev_info(adev->dev, 242 253 "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", 243 - retired_page, row, col, bank, channel_index); 254 + soc_pa, row, col, bank, channel_index); 244 255 245 256 if (err_data) 246 257 amdgpu_umc_fill_error_record(err_data, err_addr, 247 - retired_page, channel_index, umc_inst); 258 + soc_pa, channel_index, umc_inst); 248 259 } 249 260 261 + out: 250 262 return ret; 251 263 } 252 264
+11
drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
··· 55 55 #define UMC_V12_0_NA_MAP_PA_NUM 8 56 56 /* R13 bit shift should be considered, double the number */ 57 57 #define UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL (UMC_V12_0_NA_MAP_PA_NUM * 2) 58 + /* C2, C3, C4, R13, four bits in MCA address are looped in retirement */ 59 + #define UMC_V12_0_RETIRE_LOOP_BITS 4 58 60 59 61 /* column bits in SOC physical address */ 60 62 #define UMC_V12_0_PA_C2_BIT 15 63 + #define UMC_V12_0_PA_C3_BIT 16 61 64 #define UMC_V12_0_PA_C4_BIT 21 62 65 /* row bits in SOC physical address */ 63 66 #define UMC_V12_0_PA_R0_BIT 22 67 + #define UMC_V12_0_PA_R11_BIT 33 64 68 #define UMC_V12_0_PA_R13_BIT 35 69 + /* channel bit in SOC physical address */ 70 + #define UMC_V12_0_PA_CH4_BIT 12 71 + #define UMC_V12_0_PA_CH5_BIT 13 72 + /* bank bit in SOC physical address */ 73 + #define UMC_V12_0_PA_B0_BIT 19 74 + /* row bits in MCA address */ 75 + #define UMC_V12_0_MA_R0_BIT 10 65 76 66 77 #define MCA_UMC_HWID_V12_0 0x96 67 78 #define MCA_UMC_MCATYPE_V12_0 0x0