Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0-only
2#include <linux/module.h>
3#include <linux/slab.h>
4
5#include <asm/cpu.h>
6
7#include "mce_amd.h"
8
9static struct amd_decoder_ops fam_ops;
10
11static u8 xec_mask = 0xf;
12
13static void (*decode_dram_ecc)(int node_id, struct mce *m);
14
15void amd_register_ecc_decoder(void (*f)(int, struct mce *))
16{
17 decode_dram_ecc = f;
18}
19EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
20
21void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
22{
23 if (decode_dram_ecc) {
24 WARN_ON(decode_dram_ecc != f);
25
26 decode_dram_ecc = NULL;
27 }
28}
29EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
30
/*
 * String representations of the individual fields of the MCA error code,
 * see F3x48 or MSR0000_0411. Each table is indexed by the raw field value.
 */

/* transaction type */
static const char * const tt_msgs[] = {
	[0] = "INSN",
	[1] = "DATA",
	[2] = "GEN",
	[3] = "RESV",
};

/* cache level */
static const char * const ll_msgs[] = {
	[0] = "RESV",
	[1] = "L1",
	[2] = "L2",
	[3] = "L3/GEN",
};

/* memory transaction type */
static const char * const rrrr_msgs[] = {
	[0] = "GEN",
	[1] = "RD",
	[2] = "WR",
	[3] = "DRD",
	[4] = "DWR",
	[5] = "IRD",
	[6] = "PRF",
	[7] = "EV",
	[8] = "SNP",
};
46
/* participating processor (exported for use outside this file) */
const char * const pp_msgs[] = {
	[0] = "SRC",
	[1] = "RES",
	[2] = "OBS",
	[3] = "GEN",
};
EXPORT_SYMBOL_GPL(pp_msgs);
50
/* request timeout */
static const char * const to_msgs[] = {
	[0] = "no timeout",
	[1] = "timed out",
};

/* memory or i/o */
static const char * const ii_msgs[] = {
	[0] = "MEM",
	[1] = "RESV",
	[2] = "IO",
	[3] = "GEN",
};

/* internal error type */
static const char * const uu_msgs[] = {
	[0] = "RESV",
	[1] = "RESV",
	[2] = "HWA",
	[3] = "RESV",
};
59
/*
 * MC1 descriptions used by f15h_mc1_mce(). The extended error codes are
 * sparse, so the table slot differs from xec past slot 10; the xec each
 * run starts at is noted on the right.
 */
static const char * const f15h_mc1_mce_desc[] = {
	[0]  = "UC during a demand linefill from L2",	/* xec = 0x0 */
	[1]  = "Parity error during data load from IC",
	[2]  = "Parity error for IC valid bit",
	[3]  = "Main tag parity error",
	[4]  = "Parity error in prediction queue",
	[5]  = "PFB data/address parity error",
	[6]  = "Parity error in the branch status reg",
	[7]  = "PFB promotion address error",
	[8]  = "Tag error during probe/victimization",
	[9]  = "Parity error for IC probe tag valid bit",
	[10] = "PFB non-cacheable bit parity error",	/* xec = 0xa */
	[11] = "PFB valid bit parity error",		/* xec = 0xd */
	[12] = "Microcode Patch Buffer",		/* xec = 0x10 */
	[13] = "uop queue",				/* xec = 0x11 */
	[14] = "insn buffer",
	[15] = "predecode buffer",
	[16] = "fetch address FIFO",
	[17] = "dispatch uop queue",			/* xec = 0x15 */
};
80
/*
 * MC2 descriptions used by f15h_mc2_mce(). Slots 0-8 cover xec 0x4-0xc,
 * slots 9-13 cover xec 0x10-0x14.
 */
static const char * const f15h_mc2_mce_desc[] = {
	[0]  = "Fill ECC error on data fills",		/* xec = 0x4 */
	[1]  = "Fill parity error on insn fills",
	[2]  = "Prefetcher request FIFO parity error",
	[3]  = "PRQ address parity error",
	[4]  = "PRQ data parity error",
	[5]  = "WCC Tag ECC error",
	[6]  = "WCC Data ECC error",
	[7]  = "WCB Data parity error",
	[8]  = "VB Data ECC or parity error",		/* xec = 0xc */
	[9]  = "L2 Tag ECC error",			/* xec = 0x10 */
	[10] = "Hard L2 Tag ECC error",
	[11] = "Multiple hits on L2 tag",
	[12] = "XAB parity error",
	[13] = "PRB address parity error",		/* xec = 0x14 */
};
97
/*
 * MC4 descriptions used by decode_mc4_mce(). Slots 0-14 are indexed by
 * xec directly; slots 15-18 hold xec 0x1c-0x1f (xec - 13).
 */
static const char * const mc4_mce_desc[] = {
	[0]  = "DRAM ECC error detected on the NB",
	[1]  = "CRC error detected on HT link",
	[2]  = "Link-defined sync error packets detected on HT link",
	[3]  = "HT Master abort",
	[4]  = "HT Target abort",
	[5]  = "Invalid GART PTE entry during GART table walk",
	[6]  = "Unsupported atomic RMW received from an IO link",
	[7]  = "Watchdog timeout due to lack of progress",
	[8]  = "DRAM ECC error detected on the NB",
	[9]  = "SVM DMA Exclusion Vector error",
	[10] = "HT data error detected on link",
	[11] = "Protocol error (link, L3, probe filter)",
	[12] = "NB internal arrays parity error",
	[13] = "DRAM addr/ctl signals parity error",
	[14] = "IO link transmission error",
	[15] = "L3 data cache ECC error",		/* xec = 0x1c */
	[16] = "L3 cache tag error",
	[17] = "L3 LRU parity bits error",
	[18] = "ECC Error in the Probe Filter directory",
};
119
/* MC5 descriptions, indexed by xec (0x0-0xd); see decode_mc5_mce(). */
static const char * const mc5_mce_desc[] = {
	[0]  = "CPU Watchdog timer expire",
	[1]  = "Wakeup array dest tag",
	[2]  = "AG payload array",
	[3]  = "EX payload array",
	[4]  = "IDRF array",
	[5]  = "Retire dispatch queue",
	[6]  = "Mapper checkpoint array",
	[7]  = "Physical register file EX0 port",
	[8]  = "Physical register file EX1 port",
	[9]  = "Physical register file AG0 port",
	[10] = "Physical register file AG1 port",
	[11] = "Flag register file",
	[12] = "DE error occurred",
	[13] = "Retire status queue",
};
136
/* MC6 descriptions, indexed by xec (0x0-0x5); see decode_mc6_mce(). */
static const char * const mc6_mce_desc[] = {
	[0] = "Hardware Assertion",
	[1] = "Free List",
	[2] = "Physical Register File",
	[3] = "Retire Queue",
	[4] = "Scheduler table",
	[5] = "Status Register File",
};
145
/*
 * Scalable MCA error strings.
 *
 * Each smca_*_mce_desc table is indexed by the extended error code; the
 * explicit indices below are those codes in decimal.
 */
static const char * const smca_ls_mce_desc[] = {
	[0]  = "Load queue parity error",
	[1]  = "Store queue parity error",
	[2]  = "Miss address buffer payload parity error",
	[3]  = "Level 1 TLB parity error",
	[4]  = "DC Tag error type 5",
	[5]  = "DC Tag error type 6",
	[6]  = "DC Tag error type 1",
	[7]  = "Internal error type 1",
	[8]  = "Internal error type 2",
	[9]  = "System Read Data Error Thread 0",
	[10] = "System Read Data Error Thread 1",
	[11] = "DC Tag error type 2",
	[12] = "DC Data error type 1 and poison consumption",
	[13] = "DC Data error type 2",
	[14] = "DC Data error type 3",
	[15] = "DC Tag error type 4",
	[16] = "Level 2 TLB parity error",
	[17] = "PDC parity error",
	[18] = "DC Tag error type 3",
	[19] = "DC Tag error type 5",
	[20] = "L2 Fill Data error",
};
170
/* SMCA_LS_V2 descriptions, indexed by extended error code. */
static const char * const smca_ls2_mce_desc[] = {
	[0]  = "An ECC error was detected on a data cache read by a probe or victimization",
	[1]  = "An ECC error or L2 poison was detected on a data cache read by a load",
	[2]  = "An ECC error was detected on a data cache read-modify-write by a store",
	[3]  = "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization",
	[4]  = "An ECC error or poison bit mismatch was detected on a tag read by a load",
	[5]  = "An ECC error or poison bit mismatch was detected on a tag read by a store",
	[6]  = "An ECC error was detected on an EMEM read by a load",
	[7]  = "An ECC error was detected on an EMEM read-modify-write by a store",
	[8]  = "A parity error was detected in an L1 TLB entry by any access",
	[9]  = "A parity error was detected in an L2 TLB entry by any access",
	[10] = "A parity error was detected in a PWC entry by any access",
	[11] = "A parity error was detected in an STQ entry by any access",
	[12] = "A parity error was detected in an LDQ entry by any access",
	[13] = "A parity error was detected in a MAB entry by any access",
	[14] = "A parity error was detected in an SCB entry state field by any access",
	[15] = "A parity error was detected in an SCB entry address field by any access",
	[16] = "A parity error was detected in an SCB entry data field by any access",
	[17] = "A parity error was detected in a WCB entry by any access",
	[18] = "A poisoned line was detected in an SCB entry by any access",
	[19] = "A SystemReadDataError error was reported on read data returned from L2 for a load",
	[20] = "A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
	[21] = "A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
	[22] = "A hardware assertion error was reported",
	[23] = "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
};
197
/* SMCA_IF descriptions, indexed by extended error code. */
static const char * const smca_if_mce_desc[] = {
	[0]  = "Op Cache Microtag Probe Port Parity Error",
	[1]  = "IC Microtag or Full Tag Multi-hit Error",
	[2]  = "IC Full Tag Parity Error",
	[3]  = "IC Data Array Parity Error",
	[4]  = "Decoupling Queue PhysAddr Parity Error",
	[5]  = "L0 ITLB Parity Error",
	[6]  = "L1 ITLB Parity Error",
	[7]  = "L2 ITLB Parity Error",
	[8]  = "BPQ Thread 0 Snoop Parity Error",
	[9]  = "BPQ Thread 1 Snoop Parity Error",
	[10] = "L1 BTB Multi-Match Error",
	[11] = "L2 BTB Multi-Match Error",
	[12] = "L2 Cache Response Poison Error",
	[13] = "System Read Data Error",
};
214
/* SMCA_L2_CACHE descriptions, indexed by extended error code. */
static const char * const smca_l2_mce_desc[] = {
	[0] = "L2M Tag Multiple-Way-Hit error",
	[1] = "L2M Tag or State Array ECC Error",
	[2] = "L2M Data Array ECC Error",
	[3] = "Hardware Assert Error",
};
221
/* SMCA_DE descriptions, indexed by extended error code. */
static const char * const smca_de_mce_desc[] = {
	[0] = "Micro-op cache tag parity error",
	[1] = "Micro-op cache data parity error",
	[2] = "Instruction buffer parity error",
	[3] = "Micro-op queue parity error",
	[4] = "Instruction dispatch queue parity error",
	[5] = "Fetch address FIFO parity error",
	[6] = "Patch RAM data parity error",
	[7] = "Patch RAM sequencer parity error",
	[8] = "Micro-op buffer parity error",
};
233
/* SMCA_EX descriptions, indexed by extended error code. */
static const char * const smca_ex_mce_desc[] = {
	[0]  = "Watchdog Timeout error",
	[1]  = "Physical register file parity error",
	[2]  = "Flag register file parity error",
	[3]  = "Immediate displacement register file parity error",
	[4]  = "Address generator payload parity error",
	[5]  = "EX payload parity error",
	[6]  = "Checkpoint queue parity error",
	[7]  = "Retire dispatch queue parity error",
	[8]  = "Retire status queue parity error",
	[9]  = "Scheduling queue parity error",
	[10] = "Branch buffer queue parity error",
	[11] = "Hardware Assertion error",
};
248
/* SMCA_FP descriptions, indexed by extended error code. */
static const char * const smca_fp_mce_desc[] = {
	[0] = "Physical register file (PRF) parity error",
	[1] = "Freelist (FL) parity error",
	[2] = "Schedule queue parity error",
	[3] = "NSQ parity error",
	[4] = "Retire queue (RQ) parity error",
	[5] = "Status register file (SRF) parity error",
	[6] = "Hardware assertion",
};
258
/* SMCA_L3_CACHE descriptions, indexed by extended error code. */
static const char * const smca_l3_mce_desc[] = {
	[0] = "Shadow Tag Macro ECC Error",
	[1] = "Shadow Tag Macro Multi-way-hit Error",
	[2] = "L3M Tag ECC Error",
	[3] = "L3M Tag Multi-way-hit Error",
	[4] = "L3M Data ECC Error",
	[5] = "SDP Parity Error or SystemReadDataError from XI",
	[6] = "L3 Victim Queue Parity Error",
	[7] = "L3 Hardware Assertion",
};
269
/* SMCA_CS descriptions, indexed by extended error code. */
static const char * const smca_cs_mce_desc[] = {
	[0] = "Illegal Request",
	[1] = "Address Violation",
	[2] = "Security Violation",
	[3] = "Illegal Response",
	[4] = "Unexpected Response",
	[5] = "Request or Probe Parity Error",
	[6] = "Read Response Parity Error",
	[7] = "Atomic Request Parity Error",
	[8] = "Probe Filter ECC Error",
};
281
/* SMCA_CS_V2 descriptions, indexed by extended error code. */
static const char * const smca_cs2_mce_desc[] = {
	[0]  = "Illegal Request",
	[1]  = "Address Violation",
	[2]  = "Security Violation",
	[3]  = "Illegal Response",
	[4]  = "Unexpected Response",
	[5]  = "Request or Probe Parity Error",
	[6]  = "Read Response Parity Error",
	[7]  = "Atomic Request Parity Error",
	[8]  = "SDP read response had no match in the CS queue",
	[9]  = "Probe Filter Protocol Error",
	[10] = "Probe Filter ECC Error",
	[11] = "SDP read response had an unexpected RETRY error",
	[12] = "Counter overflow error",
	[13] = "Counter underflow error",
};
298
/* SMCA_PIE descriptions, indexed by extended error code. */
static const char * const smca_pie_mce_desc[] = {
	[0] = "Hardware Assert",
	[1] = "Register security violation",
	[2] = "Link Error",
	[3] = "Poison data consumption",
	[4] = "A deferred error was detected in the DF",
};
306
/*
 * SMCA_UMC descriptions, indexed by extended error code. Code 0 is the
 * one decode_smca_error() additionally hands to the DRAM ECC hook.
 */
static const char * const smca_umc_mce_desc[] = {
	[0] = "DRAM ECC error",
	[1] = "Data poison error",
	[2] = "SDP parity error",
	[3] = "Advanced peripheral bus error",
	[4] = "Address/Command parity error",
	[5] = "Write data CRC error",
	[6] = "DCQ SRAM ECC error",
	[7] = "AES SRAM ECC error",
};
317
/* SMCA_PB descriptions, indexed by extended error code. */
static const char * const smca_pb_mce_desc[] = {
	[0] = "An ECC error in the Parameter Block RAM array",
};
321
/* SMCA_PSP descriptions, indexed by extended error code. */
static const char * const smca_psp_mce_desc[] = {
	[0] = "An ECC or parity error in a PSP RAM instance",
};
325
/* SMCA_PSP_V2 descriptions, indexed by extended error code. */
static const char * const smca_psp2_mce_desc[] = {
	[0]  = "High SRAM ECC or parity error",
	[1]  = "Low SRAM ECC or parity error",
	[2]  = "Instruction Cache Bank 0 ECC or parity error",
	[3]  = "Instruction Cache Bank 1 ECC or parity error",
	[4]  = "Instruction Tag Ram 0 parity error",
	[5]  = "Instruction Tag Ram 1 parity error",
	[6]  = "Data Cache Bank 0 ECC or parity error",
	[7]  = "Data Cache Bank 1 ECC or parity error",
	[8]  = "Data Cache Bank 2 ECC or parity error",
	[9]  = "Data Cache Bank 3 ECC or parity error",
	[10] = "Data Tag Bank 0 parity error",
	[11] = "Data Tag Bank 1 parity error",
	[12] = "Data Tag Bank 2 parity error",
	[13] = "Data Tag Bank 3 parity error",
	[14] = "Dirty Data Ram parity error",
	[15] = "TLB Bank 0 parity error",
	[16] = "TLB Bank 1 parity error",
	[17] = "System Hub Read Buffer ECC or parity error",
};
346
/* SMCA_SMU descriptions, indexed by extended error code. */
static const char * const smca_smu_mce_desc[] = {
	[0] = "An ECC or parity error in an SMU RAM instance",
};
350
/* SMCA_SMU_V2 descriptions, indexed by extended error code. */
static const char * const smca_smu2_mce_desc[] = {
	[0]  = "High SRAM ECC or parity error",
	[1]  = "Low SRAM ECC or parity error",
	[2]  = "Data Cache Bank A ECC or parity error",
	[3]  = "Data Cache Bank B ECC or parity error",
	[4]  = "Data Tag Cache Bank A ECC or parity error",
	[5]  = "Data Tag Cache Bank B ECC or parity error",
	[6]  = "Instruction Cache Bank A ECC or parity error",
	[7]  = "Instruction Cache Bank B ECC or parity error",
	[8]  = "Instruction Tag Cache Bank A ECC or parity error",
	[9]  = "Instruction Tag Cache Bank B ECC or parity error",
	[10] = "System Hub Read Buffer ECC or parity error",
};
364
/* SMCA_MP5 descriptions, indexed by extended error code. */
static const char * const smca_mp5_mce_desc[] = {
	[0] = "High SRAM ECC or parity error",
	[1] = "Low SRAM ECC or parity error",
	[2] = "Data Cache Bank A ECC or parity error",
	[3] = "Data Cache Bank B ECC or parity error",
	[4] = "Data Tag Cache Bank A ECC or parity error",
	[5] = "Data Tag Cache Bank B ECC or parity error",
	[6] = "Instruction Cache Bank A ECC or parity error",
	[7] = "Instruction Cache Bank B ECC or parity error",
	[8] = "Instruction Tag Cache Bank A ECC or parity error",
	[9] = "Instruction Tag Cache Bank B ECC or parity error",
};
377
/* SMCA_NBIO descriptions, indexed by extended error code. */
static const char * const smca_nbio_mce_desc[] = {
	[0] = "ECC or Parity error",
	[1] = "PCIE error",
	[2] = "SDP ErrEvent error",
	[3] = "SDP Egress Poison Error",
	[4] = "IOHC Internal Poison Error",
};
385
/* SMCA_PCIE descriptions, indexed by extended error code. */
static const char * const smca_pcie_mce_desc[] = {
	[0] = "CCIX PER Message logging",
	[1] = "CCIX Read Response with Status: Non-Data Error",
	[2] = "CCIX Write Response with Status: Non-Data Error",
	[3] = "CCIX Read Response with Status: Data Error",
	[4] = "CCIX Non-okay write response with data error",
};
393
/* One bank type's description table together with its entry count. */
struct smca_mce_desc {
	/* array of description strings */
	const char * const *descs;
	/* number of entries in @descs */
	unsigned int num_descs;
};
398
399static struct smca_mce_desc smca_mce_descs[] = {
400 [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) },
401 [SMCA_LS_V2] = { smca_ls2_mce_desc, ARRAY_SIZE(smca_ls2_mce_desc) },
402 [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) },
403 [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) },
404 [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) },
405 [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) },
406 [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
407 [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
408 [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
409 [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
410 [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
411 [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
412 [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
413 [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
414 [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) },
415 [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
416 [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) },
417 [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
418 [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) },
419 [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) },
420};
421
422static bool f12h_mc0_mce(u16 ec, u8 xec)
423{
424 bool ret = false;
425
426 if (MEM_ERROR(ec)) {
427 u8 ll = LL(ec);
428 ret = true;
429
430 if (ll == LL_L2)
431 pr_cont("during L1 linefill from L2.\n");
432 else if (ll == LL_L1)
433 pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
434 else
435 ret = false;
436 }
437 return ret;
438}
439
440static bool f10h_mc0_mce(u16 ec, u8 xec)
441{
442 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
443 pr_cont("during data scrub.\n");
444 return true;
445 }
446 return f12h_mc0_mce(ec, xec);
447}
448
449static bool k8_mc0_mce(u16 ec, u8 xec)
450{
451 if (BUS_ERROR(ec)) {
452 pr_cont("during system linefill.\n");
453 return true;
454 }
455
456 return f10h_mc0_mce(ec, xec);
457}
458
459static bool cat_mc0_mce(u16 ec, u8 xec)
460{
461 u8 r4 = R4(ec);
462 bool ret = true;
463
464 if (MEM_ERROR(ec)) {
465
466 if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
467 return false;
468
469 switch (r4) {
470 case R4_DRD:
471 case R4_DWR:
472 pr_cont("Data/Tag parity error due to %s.\n",
473 (r4 == R4_DRD ? "load/hw prf" : "store"));
474 break;
475 case R4_EVICT:
476 pr_cont("Copyback parity error on a tag miss.\n");
477 break;
478 case R4_SNOOP:
479 pr_cont("Tag parity error during snoop.\n");
480 break;
481 default:
482 ret = false;
483 }
484 } else if (BUS_ERROR(ec)) {
485
486 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
487 return false;
488
489 pr_cont("System read data error on a ");
490
491 switch (r4) {
492 case R4_RD:
493 pr_cont("TLB reload.\n");
494 break;
495 case R4_DWR:
496 pr_cont("store.\n");
497 break;
498 case R4_DRD:
499 pr_cont("load.\n");
500 break;
501 default:
502 ret = false;
503 }
504 } else {
505 ret = false;
506 }
507
508 return ret;
509}
510
511static bool f15h_mc0_mce(u16 ec, u8 xec)
512{
513 bool ret = true;
514
515 if (MEM_ERROR(ec)) {
516
517 switch (xec) {
518 case 0x0:
519 pr_cont("Data Array access error.\n");
520 break;
521
522 case 0x1:
523 pr_cont("UC error during a linefill from L2/NB.\n");
524 break;
525
526 case 0x2:
527 case 0x11:
528 pr_cont("STQ access error.\n");
529 break;
530
531 case 0x3:
532 pr_cont("SCB access error.\n");
533 break;
534
535 case 0x10:
536 pr_cont("Tag error.\n");
537 break;
538
539 case 0x12:
540 pr_cont("LDQ access error.\n");
541 break;
542
543 default:
544 ret = false;
545 }
546 } else if (BUS_ERROR(ec)) {
547
548 if (!xec)
549 pr_cont("System Read Data Error.\n");
550 else
551 pr_cont(" Internal error condition type %d.\n", xec);
552 } else if (INT_ERROR(ec)) {
553 if (xec <= 0x1f)
554 pr_cont("Hardware Assert.\n");
555 else
556 ret = false;
557
558 } else
559 ret = false;
560
561 return ret;
562}
563
564static void decode_mc0_mce(struct mce *m)
565{
566 u16 ec = EC(m->status);
567 u8 xec = XEC(m->status, xec_mask);
568
569 pr_emerg(HW_ERR "MC0 Error: ");
570
571 /* TLB error signatures are the same across families */
572 if (TLB_ERROR(ec)) {
573 if (TT(ec) == TT_DATA) {
574 pr_cont("%s TLB %s.\n", LL_MSG(ec),
575 ((xec == 2) ? "locked miss"
576 : (xec ? "multimatch" : "parity")));
577 return;
578 }
579 } else if (fam_ops.mc0_mce(ec, xec))
580 ;
581 else
582 pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
583}
584
585static bool k8_mc1_mce(u16 ec, u8 xec)
586{
587 u8 ll = LL(ec);
588 bool ret = true;
589
590 if (!MEM_ERROR(ec))
591 return false;
592
593 if (ll == 0x2)
594 pr_cont("during a linefill from L2.\n");
595 else if (ll == 0x1) {
596 switch (R4(ec)) {
597 case R4_IRD:
598 pr_cont("Parity error during data load.\n");
599 break;
600
601 case R4_EVICT:
602 pr_cont("Copyback Parity/Victim error.\n");
603 break;
604
605 case R4_SNOOP:
606 pr_cont("Tag Snoop error.\n");
607 break;
608
609 default:
610 ret = false;
611 break;
612 }
613 } else
614 ret = false;
615
616 return ret;
617}
618
619static bool cat_mc1_mce(u16 ec, u8 xec)
620{
621 u8 r4 = R4(ec);
622 bool ret = true;
623
624 if (!MEM_ERROR(ec))
625 return false;
626
627 if (TT(ec) != TT_INSTR)
628 return false;
629
630 if (r4 == R4_IRD)
631 pr_cont("Data/tag array parity error for a tag hit.\n");
632 else if (r4 == R4_SNOOP)
633 pr_cont("Tag error during snoop/victimization.\n");
634 else if (xec == 0x0)
635 pr_cont("Tag parity error from victim castout.\n");
636 else if (xec == 0x2)
637 pr_cont("Microcode patch RAM parity error.\n");
638 else
639 ret = false;
640
641 return ret;
642}
643
644static bool f15h_mc1_mce(u16 ec, u8 xec)
645{
646 bool ret = true;
647
648 if (!MEM_ERROR(ec))
649 return false;
650
651 switch (xec) {
652 case 0x0 ... 0xa:
653 pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
654 break;
655
656 case 0xd:
657 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
658 break;
659
660 case 0x10:
661 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
662 break;
663
664 case 0x11 ... 0x15:
665 pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
666 break;
667
668 default:
669 ret = false;
670 }
671 return ret;
672}
673
674static void decode_mc1_mce(struct mce *m)
675{
676 u16 ec = EC(m->status);
677 u8 xec = XEC(m->status, xec_mask);
678
679 pr_emerg(HW_ERR "MC1 Error: ");
680
681 if (TLB_ERROR(ec))
682 pr_cont("%s TLB %s.\n", LL_MSG(ec),
683 (xec ? "multimatch" : "parity error"));
684 else if (BUS_ERROR(ec)) {
685 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
686
687 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
688 } else if (INT_ERROR(ec)) {
689 if (xec <= 0x3f)
690 pr_cont("Hardware Assert.\n");
691 else
692 goto wrong_mc1_mce;
693 } else if (fam_ops.mc1_mce(ec, xec))
694 ;
695 else
696 goto wrong_mc1_mce;
697
698 return;
699
700wrong_mc1_mce:
701 pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
702}
703
704static bool k8_mc2_mce(u16 ec, u8 xec)
705{
706 bool ret = true;
707
708 if (xec == 0x1)
709 pr_cont(" in the write data buffers.\n");
710 else if (xec == 0x3)
711 pr_cont(" in the victim data buffers.\n");
712 else if (xec == 0x2 && MEM_ERROR(ec))
713 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
714 else if (xec == 0x0) {
715 if (TLB_ERROR(ec))
716 pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
717 TT_MSG(ec));
718 else if (BUS_ERROR(ec))
719 pr_cont(": %s/ECC error in data read from NB: %s.\n",
720 R4_MSG(ec), PP_MSG(ec));
721 else if (MEM_ERROR(ec)) {
722 u8 r4 = R4(ec);
723
724 if (r4 >= 0x7)
725 pr_cont(": %s error during data copyback.\n",
726 R4_MSG(ec));
727 else if (r4 <= 0x1)
728 pr_cont(": %s parity/ECC error during data "
729 "access from L2.\n", R4_MSG(ec));
730 else
731 ret = false;
732 } else
733 ret = false;
734 } else
735 ret = false;
736
737 return ret;
738}
739
740static bool f15h_mc2_mce(u16 ec, u8 xec)
741{
742 bool ret = true;
743
744 if (TLB_ERROR(ec)) {
745 if (xec == 0x0)
746 pr_cont("Data parity TLB read error.\n");
747 else if (xec == 0x1)
748 pr_cont("Poison data provided for TLB fill.\n");
749 else
750 ret = false;
751 } else if (BUS_ERROR(ec)) {
752 if (xec > 2)
753 ret = false;
754
755 pr_cont("Error during attempted NB data read.\n");
756 } else if (MEM_ERROR(ec)) {
757 switch (xec) {
758 case 0x4 ... 0xc:
759 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
760 break;
761
762 case 0x10 ... 0x14:
763 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
764 break;
765
766 default:
767 ret = false;
768 }
769 } else if (INT_ERROR(ec)) {
770 if (xec <= 0x3f)
771 pr_cont("Hardware Assert.\n");
772 else
773 ret = false;
774 }
775
776 return ret;
777}
778
779static bool f16h_mc2_mce(u16 ec, u8 xec)
780{
781 u8 r4 = R4(ec);
782
783 if (!MEM_ERROR(ec))
784 return false;
785
786 switch (xec) {
787 case 0x04 ... 0x05:
788 pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
789 break;
790
791 case 0x09 ... 0x0b:
792 case 0x0d ... 0x0f:
793 pr_cont("ECC error in L2 tag (%s).\n",
794 ((r4 == R4_GEN) ? "BankReq" :
795 ((r4 == R4_SNOOP) ? "Prb" : "Fill")));
796 break;
797
798 case 0x10 ... 0x19:
799 case 0x1b:
800 pr_cont("ECC error in L2 data array (%s).\n",
801 (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" :
802 ((r4 == R4_GEN) ? "Attr" :
803 ((r4 == R4_EVICT) ? "Vict" : "Fill"))));
804 break;
805
806 case 0x1c ... 0x1d:
807 case 0x1f:
808 pr_cont("Parity error in L2 attribute bits (%s).\n",
809 ((r4 == R4_RD) ? "Hit" :
810 ((r4 == R4_GEN) ? "Attr" : "Fill")));
811 break;
812
813 default:
814 return false;
815 }
816
817 return true;
818}
819
820static void decode_mc2_mce(struct mce *m)
821{
822 u16 ec = EC(m->status);
823 u8 xec = XEC(m->status, xec_mask);
824
825 pr_emerg(HW_ERR "MC2 Error: ");
826
827 if (!fam_ops.mc2_mce(ec, xec))
828 pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
829}
830
831static void decode_mc3_mce(struct mce *m)
832{
833 u16 ec = EC(m->status);
834 u8 xec = XEC(m->status, xec_mask);
835
836 if (boot_cpu_data.x86 >= 0x14) {
837 pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
838 " please report on LKML.\n");
839 return;
840 }
841
842 pr_emerg(HW_ERR "MC3 Error");
843
844 if (xec == 0x0) {
845 u8 r4 = R4(ec);
846
847 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
848 goto wrong_mc3_mce;
849
850 pr_cont(" during %s.\n", R4_MSG(ec));
851 } else
852 goto wrong_mc3_mce;
853
854 return;
855
856 wrong_mc3_mce:
857 pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
858}
859
860static void decode_mc4_mce(struct mce *m)
861{
862 unsigned int fam = x86_family(m->cpuid);
863 int node_id = amd_get_nb_id(m->extcpu);
864 u16 ec = EC(m->status);
865 u8 xec = XEC(m->status, 0x1f);
866 u8 offset = 0;
867
868 pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
869
870 switch (xec) {
871 case 0x0 ... 0xe:
872
873 /* special handling for DRAM ECCs */
874 if (xec == 0x0 || xec == 0x8) {
875 /* no ECCs on F11h */
876 if (fam == 0x11)
877 goto wrong_mc4_mce;
878
879 pr_cont("%s.\n", mc4_mce_desc[xec]);
880
881 if (decode_dram_ecc)
882 decode_dram_ecc(node_id, m);
883 return;
884 }
885 break;
886
887 case 0xf:
888 if (TLB_ERROR(ec))
889 pr_cont("GART Table Walk data error.\n");
890 else if (BUS_ERROR(ec))
891 pr_cont("DMA Exclusion Vector Table Walk error.\n");
892 else
893 goto wrong_mc4_mce;
894 return;
895
896 case 0x19:
897 if (fam == 0x15 || fam == 0x16)
898 pr_cont("Compute Unit Data Error.\n");
899 else
900 goto wrong_mc4_mce;
901 return;
902
903 case 0x1c ... 0x1f:
904 offset = 13;
905 break;
906
907 default:
908 goto wrong_mc4_mce;
909 }
910
911 pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
912 return;
913
914 wrong_mc4_mce:
915 pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
916}
917
918static void decode_mc5_mce(struct mce *m)
919{
920 unsigned int fam = x86_family(m->cpuid);
921 u16 ec = EC(m->status);
922 u8 xec = XEC(m->status, xec_mask);
923
924 if (fam == 0xf || fam == 0x11)
925 goto wrong_mc5_mce;
926
927 pr_emerg(HW_ERR "MC5 Error: ");
928
929 if (INT_ERROR(ec)) {
930 if (xec <= 0x1f) {
931 pr_cont("Hardware Assert.\n");
932 return;
933 } else
934 goto wrong_mc5_mce;
935 }
936
937 if (xec == 0x0 || xec == 0xc)
938 pr_cont("%s.\n", mc5_mce_desc[xec]);
939 else if (xec <= 0xd)
940 pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
941 else
942 goto wrong_mc5_mce;
943
944 return;
945
946 wrong_mc5_mce:
947 pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
948}
949
950static void decode_mc6_mce(struct mce *m)
951{
952 u8 xec = XEC(m->status, xec_mask);
953
954 pr_emerg(HW_ERR "MC6 Error: ");
955
956 if (xec > 0x5)
957 goto wrong_mc6_mce;
958
959 pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
960 return;
961
962 wrong_mc6_mce:
963 pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
964}
965
966/* Decode errors according to Scalable MCA specification */
967static void decode_smca_error(struct mce *m)
968{
969 struct smca_hwid *hwid;
970 enum smca_bank_types bank_type;
971 const char *ip_name;
972 u8 xec = XEC(m->status, xec_mask);
973
974 if (m->bank >= ARRAY_SIZE(smca_banks))
975 return;
976
977 hwid = smca_banks[m->bank].hwid;
978 if (!hwid)
979 return;
980
981 bank_type = hwid->bank_type;
982
983 if (bank_type == SMCA_RESERVED) {
984 pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
985 return;
986 }
987
988 ip_name = smca_get_long_name(bank_type);
989
990 pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec);
991
992 /* Only print the decode of valid error codes */
993 if (xec < smca_mce_descs[bank_type].num_descs &&
994 (hwid->xec_bitmap & BIT_ULL(xec))) {
995 pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]);
996 }
997
998 if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
999 decode_dram_ecc(cpu_to_node(m->extcpu), m);
1000}
1001
1002static inline void amd_decode_err_code(u16 ec)
1003{
1004 if (INT_ERROR(ec)) {
1005 pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
1006 return;
1007 }
1008
1009 pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
1010
1011 if (BUS_ERROR(ec))
1012 pr_cont(", mem/io: %s", II_MSG(ec));
1013 else
1014 pr_cont(", tx: %s", TT_MSG(ec));
1015
1016 if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
1017 pr_cont(", mem-tx: %s", R4_MSG(ec));
1018
1019 if (BUS_ERROR(ec))
1020 pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
1021 }
1022
1023 pr_cont("\n");
1024}
1025
1026static const char *decode_error_status(struct mce *m)
1027{
1028 if (m->status & MCI_STATUS_UC) {
1029 if (m->status & MCI_STATUS_PCC)
1030 return "System Fatal error.";
1031 if (m->mcgstatus & MCG_STATUS_RIPV)
1032 return "Uncorrected, software restartable error.";
1033 return "Uncorrected, software containable error.";
1034 }
1035
1036 if (m->status & MCI_STATUS_DEFERRED)
1037 return "Deferred error, no action required.";
1038
1039 return "Corrected error, no action required.";
1040}
1041
1042static int
1043amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
1044{
1045 struct mce *m = (struct mce *)data;
1046 unsigned int fam = x86_family(m->cpuid);
1047 int ecc;
1048
1049 if (m->kflags & MCE_HANDLED_CEC)
1050 return NOTIFY_DONE;
1051
1052 pr_emerg(HW_ERR "%s\n", decode_error_status(m));
1053
1054 pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
1055 m->extcpu,
1056 fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
1057 m->bank,
1058 ((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
1059 ((m->status & MCI_STATUS_UC) ? "UE" :
1060 (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"),
1061 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
1062 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"),
1063 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"));
1064
1065 if (boot_cpu_has(X86_FEATURE_SMCA)) {
1066 u32 low, high;
1067 u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
1068
1069 if (!rdmsr_safe(addr, &low, &high) &&
1070 (low & MCI_CONFIG_MCAX))
1071 pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
1072
1073 pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
1074 }
1075
1076 /* do the two bits[14:13] together */
1077 ecc = (m->status >> 45) & 0x3;
1078 if (ecc)
1079 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
1080
1081 if (fam >= 0x15) {
1082 pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
1083
1084 /* F15h, bank4, bit 43 is part of McaStatSubCache. */
1085 if (fam != 0x15 || m->bank != 4)
1086 pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
1087 }
1088
1089 if (fam >= 0x17)
1090 pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));
1091
1092 pr_cont("]: 0x%016llx\n", m->status);
1093
1094 if (m->status & MCI_STATUS_ADDRV)
1095 pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);
1096
1097 if (boot_cpu_has(X86_FEATURE_SMCA)) {
1098 pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
1099
1100 if (m->status & MCI_STATUS_SYNDV)
1101 pr_cont(", Syndrome: 0x%016llx", m->synd);
1102
1103 pr_cont("\n");
1104
1105 decode_smca_error(m);
1106 goto err_code;
1107 }
1108
1109 if (m->tsc)
1110 pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);
1111
1112 /* Doesn't matter which member to test. */
1113 if (!fam_ops.mc0_mce)
1114 goto err_code;
1115
1116 switch (m->bank) {
1117 case 0:
1118 decode_mc0_mce(m);
1119 break;
1120
1121 case 1:
1122 decode_mc1_mce(m);
1123 break;
1124
1125 case 2:
1126 decode_mc2_mce(m);
1127 break;
1128
1129 case 3:
1130 decode_mc3_mce(m);
1131 break;
1132
1133 case 4:
1134 decode_mc4_mce(m);
1135 break;
1136
1137 case 5:
1138 decode_mc5_mce(m);
1139 break;
1140
1141 case 6:
1142 decode_mc6_mce(m);
1143 break;
1144
1145 default:
1146 break;
1147 }
1148
1149 err_code:
1150 amd_decode_err_code(m->status & 0xffff);
1151
1152 m->kflags |= MCE_HANDLED_EDAC;
1153 return NOTIFY_OK;
1154}
1155
1156static struct notifier_block amd_mce_dec_nb = {
1157 .notifier_call = amd_decode_mce,
1158 .priority = MCE_PRIO_EDAC,
1159};
1160
1161static int __init mce_amd_init(void)
1162{
1163 struct cpuinfo_x86 *c = &boot_cpu_data;
1164
1165 if (c->x86_vendor != X86_VENDOR_AMD &&
1166 c->x86_vendor != X86_VENDOR_HYGON)
1167 return -ENODEV;
1168
1169 if (boot_cpu_has(X86_FEATURE_SMCA)) {
1170 xec_mask = 0x3f;
1171 goto out;
1172 }
1173
1174 switch (c->x86) {
1175 case 0xf:
1176 fam_ops.mc0_mce = k8_mc0_mce;
1177 fam_ops.mc1_mce = k8_mc1_mce;
1178 fam_ops.mc2_mce = k8_mc2_mce;
1179 break;
1180
1181 case 0x10:
1182 fam_ops.mc0_mce = f10h_mc0_mce;
1183 fam_ops.mc1_mce = k8_mc1_mce;
1184 fam_ops.mc2_mce = k8_mc2_mce;
1185 break;
1186
1187 case 0x11:
1188 fam_ops.mc0_mce = k8_mc0_mce;
1189 fam_ops.mc1_mce = k8_mc1_mce;
1190 fam_ops.mc2_mce = k8_mc2_mce;
1191 break;
1192
1193 case 0x12:
1194 fam_ops.mc0_mce = f12h_mc0_mce;
1195 fam_ops.mc1_mce = k8_mc1_mce;
1196 fam_ops.mc2_mce = k8_mc2_mce;
1197 break;
1198
1199 case 0x14:
1200 fam_ops.mc0_mce = cat_mc0_mce;
1201 fam_ops.mc1_mce = cat_mc1_mce;
1202 fam_ops.mc2_mce = k8_mc2_mce;
1203 break;
1204
1205 case 0x15:
1206 xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1207
1208 fam_ops.mc0_mce = f15h_mc0_mce;
1209 fam_ops.mc1_mce = f15h_mc1_mce;
1210 fam_ops.mc2_mce = f15h_mc2_mce;
1211 break;
1212
1213 case 0x16:
1214 xec_mask = 0x1f;
1215 fam_ops.mc0_mce = cat_mc0_mce;
1216 fam_ops.mc1_mce = cat_mc1_mce;
1217 fam_ops.mc2_mce = f16h_mc2_mce;
1218 break;
1219
1220 case 0x17:
1221 case 0x18:
1222 pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
1223 return -EINVAL;
1224
1225 default:
1226 printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1227 return -EINVAL;
1228 }
1229
1230out:
1231 pr_info("MCE: In-kernel MCE decoding enabled.\n");
1232
1233 mce_register_decode_chain(&amd_mce_dec_nb);
1234
1235 return 0;
1236}
1237early_initcall(mce_amd_init);
1238
#ifdef MODULE
/* Module unload: take the decoder back out of the MCE decode chain. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
}
module_exit(mce_amd_exit);

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
#endif