Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

EDAC, MCE: Pass complete MCE info to decoders

... instead of the MCi_STATUS info only for improved handling of certain
types of errors later.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

Authored and committed by Borislav Petkov
7cfd4a87 6337583d

+56 -47
+10 -3
drivers/edac/amd64_edac.c
··· 2073 2073 amd64_handle_ue(mci, info); 2074 2074 } 2075 2075 2076 - void amd64_decode_bus_error(int node_id, struct err_regs *regs) 2076 + void amd64_decode_bus_error(int node_id, struct mce *m, u32 nbcfg) 2077 2077 { 2078 2078 struct mem_ctl_info *mci = mci_lookup[node_id]; 2079 + struct err_regs regs; 2079 2080 2080 - __amd64_decode_bus_error(mci, regs); 2081 + regs.nbsl = (u32) m->status; 2082 + regs.nbsh = (u32)(m->status >> 32); 2083 + regs.nbeal = (u32) m->addr; 2084 + regs.nbeah = (u32)(m->addr >> 32); 2085 + regs.nbcfg = nbcfg; 2086 + 2087 + __amd64_decode_bus_error(mci, &regs); 2081 2088 2082 2089 /* 2083 2090 * Check the UE bit of the NB status high register, if set generate some ··· 2093 2086 * 2094 2087 * FIXME: this should go somewhere else, if at all. 2095 2088 */ 2096 - if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors) 2089 + if (regs.nbsh & K8_NBSH_UC_ERR && !report_gart_errors) 2097 2090 edac_mc_handle_ue_no_info(mci, "UE bit is set"); 2098 2091 2099 2092 }
+8 -2
drivers/edac/amd64_edac_dbg.c
··· 10 10 size_t count) 11 11 { 12 12 struct amd64_pvt *pvt = mci->pvt_info; 13 - unsigned long long value; 13 + u64 value; 14 14 int ret = 0; 15 + struct mce m; 15 16 16 17 ret = strict_strtoull(data, 16, &value); 17 18 if (ret != -EINVAL) { 19 + struct err_regs *regs = &pvt->ctl_error_info; 20 + 18 21 debugf0("received NBEA= 0x%llx\n", value); 19 22 20 23 /* place the value into the virtual error packet */ ··· 25 22 value >>= 32; 26 23 pvt->ctl_error_info.nbeah = (u32) value; 27 24 25 + m.addr = value; 26 + m.status = regs->nbsl | ((u64)regs->nbsh << 32); 27 + 28 28 /* Process the Mapping request */ 29 29 /* TODO: Add race prevention */ 30 - amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info); 30 + amd_decode_nb_mce(pvt->mc_node_id, &m, regs->nbcfg); 31 31 32 32 return count; 33 33 }
+35 -39
drivers/edac/edac_mce_amd.c
··· 2 2 #include "edac_mce_amd.h" 3 3 4 4 static bool report_gart_errors; 5 - static void (*nb_bus_decoder)(int node_id, struct err_regs *regs); 5 + static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); 6 6 7 7 void amd_report_gart_errors(bool v) 8 8 { ··· 10 10 } 11 11 EXPORT_SYMBOL_GPL(amd_report_gart_errors); 12 12 13 - void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)) 13 + void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)) 14 14 { 15 15 nb_bus_decoder = f; 16 16 } 17 17 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); 18 18 19 - void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)) 19 + void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)) 20 20 { 21 21 if (nb_bus_decoder) { 22 22 WARN_ON(nb_bus_decoder != f); ··· 97 97 }; 98 98 EXPORT_SYMBOL_GPL(ext_msgs); 99 99 100 - static void amd_decode_dc_mce(u64 mc0_status) 100 + static void amd_decode_dc_mce(struct mce *m) 101 101 { 102 - u32 ec = mc0_status & 0xffff; 103 - u32 xec = (mc0_status >> 16) & 0xf; 102 + u32 ec = m->status & 0xffff; 103 + u32 xec = (m->status >> 16) & 0xf; 104 104 105 105 pr_emerg(HW_ERR "Data Cache Error: "); 106 106 107 107 if (xec == 1 && TLB_ERROR(ec)) 108 108 pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); 109 109 else if (xec == 0) { 110 - if (mc0_status & (1ULL << 40)) 110 + if (m->status & (1ULL << 40)) 111 111 pr_cont(" during Data Scrub.\n"); 112 112 else if (TLB_ERROR(ec)) 113 113 pr_cont(": %s TLB parity error.\n", LL_MSG(ec)); ··· 140 140 pr_emerg(HW_ERR "Corrupted DC MCE info?\n"); 141 141 } 142 142 143 - static void amd_decode_ic_mce(u64 mc1_status) 143 + static void amd_decode_ic_mce(struct mce *m) 144 144 { 145 - u32 ec = mc1_status & 0xffff; 146 - u32 xec = (mc1_status >> 16) & 0xf; 145 + u32 ec = m->status & 0xffff; 146 + u32 xec = (m->status >> 16) & 0xf; 147 147 148 148 pr_emerg(HW_ERR "Instruction Cache Error"); 149 149 ··· 154 154 pr_cont(": %s TLB Parity error.\n", LL_MSG(ec)); 155 155 else if 
(BUS_ERROR(ec)) { 156 156 if (boot_cpu_data.x86 == 0xf && 157 - (mc1_status & (1ULL << 58))) 157 + (m->status & BIT(58))) 158 158 pr_cont(" during system linefill.\n"); 159 159 else 160 160 pr_cont(" during attempted NB data read.\n"); ··· 197 197 pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); 198 198 } 199 199 200 - static void amd_decode_bu_mce(u64 mc2_status) 200 + static void amd_decode_bu_mce(struct mce *m) 201 201 { 202 - u32 ec = mc2_status & 0xffff; 203 - u32 xec = (mc2_status >> 16) & 0xf; 202 + u32 ec = m->status & 0xffff; 203 + u32 xec = (m->status >> 16) & 0xf; 204 204 205 205 pr_emerg(HW_ERR "Bus Unit Error"); 206 206 ··· 239 239 pr_emerg(HW_ERR "Corrupted BU MCE info?\n"); 240 240 } 241 241 242 - static void amd_decode_ls_mce(u64 mc3_status) 242 + static void amd_decode_ls_mce(struct mce *m) 243 243 { 244 - u32 ec = mc3_status & 0xffff; 245 - u32 xec = (mc3_status >> 16) & 0xf; 244 + u32 ec = m->status & 0xffff; 245 + u32 xec = (m->status >> 16) & 0xf; 246 246 247 247 pr_emerg(HW_ERR "Load Store Error"); 248 248 ··· 260 260 pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); 261 261 } 262 262 263 - void amd_decode_nb_mce(int node_id, struct err_regs *regs) 263 + void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) 264 264 { 265 - u32 ec = ERROR_CODE(regs->nbsl); 265 + u32 ec = m->status & 0xffff; 266 + u32 nbsh = (u32)(m->status >> 32); 267 + u32 nbsl = (u32)m->status; 266 268 267 269 /* 268 270 * GART TLB error reporting is disabled by default. Bail out early. 
··· 280 278 */ 281 279 if ((boot_cpu_data.x86 == 0x10) && 282 280 (boot_cpu_data.x86_model > 7)) { 283 - if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) 284 - pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); 281 + if (nbsh & K8_NBSH_ERR_CPU_VAL) 282 + pr_cont(", core: %u\n", (u8)(nbsh & 0xf)); 285 283 } else { 286 - u8 assoc_cpus = regs->nbsh & 0xf; 284 + u8 assoc_cpus = nbsh & 0xf; 287 285 288 286 if (assoc_cpus > 0) 289 287 pr_cont(", core: %d", fls(assoc_cpus) - 1); ··· 291 289 pr_cont("\n"); 292 290 } 293 291 294 - pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(regs->nbsl)); 292 + pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl)); 295 293 296 294 if (BUS_ERROR(ec) && nb_bus_decoder) 297 - nb_bus_decoder(node_id, regs); 295 + nb_bus_decoder(node_id, m, nbcfg); 298 296 } 299 297 EXPORT_SYMBOL_GPL(amd_decode_nb_mce); 300 298 301 - static void amd_decode_fr_mce(u64 mc5_status) 299 + static void amd_decode_fr_mce(struct mce *m) 302 300 { 303 301 /* we have only one error signature so match all fields at once. */ 304 - if ((mc5_status & 0xffff) == 0x0f0f) 302 + if ((m->status & 0xffff) == 0x0f0f) 305 303 pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n"); 306 304 else 307 305 pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); ··· 328 326 void *data) 329 327 { 330 328 struct mce *m = (struct mce *)data; 331 - struct err_regs regs; 332 329 int node, ecc; 333 330 334 331 pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); ··· 347 346 348 347 switch (m->bank) { 349 348 case 0: 350 - amd_decode_dc_mce(m->status); 349 + amd_decode_dc_mce(m); 351 350 break; 352 351 353 352 case 1: 354 - amd_decode_ic_mce(m->status); 353 + amd_decode_ic_mce(m); 355 354 break; 356 355 357 356 case 2: 358 - amd_decode_bu_mce(m->status); 357 + amd_decode_bu_mce(m); 359 358 break; 360 359 361 360 case 3: 362 - amd_decode_ls_mce(m->status); 361 + amd_decode_ls_mce(m); 363 362 break; 364 363 365 364 case 4: 366 - regs.nbsl = (u32) m->status; 367 - regs.nbsh = (u32)(m->status >> 32); 368 - regs.nbeal = (u32) m->addr; 369 - 
regs.nbeah = (u32)(m->addr >> 32); 370 - node = amd_get_nb_id(m->extcpu); 371 - 372 - amd_decode_nb_mce(node, &regs); 365 + node = amd_get_nb_id(m->extcpu); 366 + amd_decode_nb_mce(node, m, 0); 373 367 break; 374 368 375 369 case 5: 376 - amd_decode_fr_mce(m->status); 370 + amd_decode_fr_mce(m); 377 371 break; 378 372 379 373 default:
+3 -3
drivers/edac/edac_mce_amd.h
··· 63 63 64 64 65 65 void amd_report_gart_errors(bool); 66 - void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)); 67 - void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)); 68 - void amd_decode_nb_mce(int, struct err_regs *); 66 + void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)); 67 + void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)); 68 + void amd_decode_nb_mce(int, struct mce *, u32); 69 69 70 70 #endif /* _EDAC_MCE_AMD_H */