Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86, mce, severity: Extend the mce_severity mechanism to handle UCNA/DEFERRED error

Until now, the mce_severity mechanism could only classify a UCNA error
as MCE_KEEP_SEVERITY. It was also unable to filter out DEFERRED errors
on AMD platforms.

This patch extends the mce_severity mechanism to handle UCNA/DEFERRED
errors. To do this, the patch introduces a new severity level,
MCE_DEFERRED_SEVERITY, with MCE_UCNA_SEVERITY defined as an alias for it.

In addition, mce_severity is specific to the machine check exception
and checks the MCIP/EIPV/RIPV bits. To allow the mce_severity mechanism
to be used in a non-exception context, the patch also adds a new
argument (is_excp) to mce_severity. `is_excp' explicitly specifies the
context from which mce_severity is called.

Reviewed-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
Signed-off-by: Chen Yucong <slaoub@gmail.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>

authored by

Chen Yucong and committed by
Tony Luck
e3480271 8dcf32ea

+32 -16
+4
arch/x86/include/asm/mce.h
··· 34 34 #define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ 35 35 #define MCI_STATUS_AR (1ULL<<55) /* Action required */ 36 36 37 + /* AMD-specific bits */ 38 + #define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ 39 + #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ 40 + 37 41 /* 38 42 * Note that the full MCACOD field of IA32_MCi_STATUS MSR is 39 43 * bits 15:0. But bit 12 is the 'F' bit, defined for corrected
+3 -1
arch/x86/kernel/cpu/mcheck/mce-internal.h
··· 3 3 4 4 enum severity_level { 5 5 MCE_NO_SEVERITY, 6 + MCE_DEFERRED_SEVERITY, 7 + MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY, 6 8 MCE_KEEP_SEVERITY, 7 9 MCE_SOME_SEVERITY, 8 10 MCE_AO_SEVERITY, ··· 23 21 char attrname[ATTR_LEN]; /* attribute name */ 24 22 }; 25 23 26 - int mce_severity(struct mce *a, int tolerant, char **msg); 24 + int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp); 27 25 struct dentry *mce_get_debugfs_dir(void); 28 26 29 27 extern struct mce_bank *mce_banks;
+17 -6
arch/x86/kernel/cpu/mcheck/mce-severity.c
··· 31 31 32 32 enum context { IN_KERNEL = 1, IN_USER = 2 }; 33 33 enum ser { SER_REQUIRED = 1, NO_SER = 2 }; 34 + enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; 34 35 35 36 static struct severity { 36 37 u64 mask; ··· 41 40 unsigned char mcgres; 42 41 unsigned char ser; 43 42 unsigned char context; 43 + unsigned char excp; 44 44 unsigned char covered; 45 45 char *msg; 46 46 } severities[] = { ··· 50 48 #define USER .context = IN_USER 51 49 #define SER .ser = SER_REQUIRED 52 50 #define NOSER .ser = NO_SER 51 + #define EXCP .excp = EXCP_CONTEXT 52 + #define NOEXCP .excp = NO_EXCP 53 53 #define BITCLR(x) .mask = x, .result = 0 54 54 #define BITSET(x) .mask = x, .result = x 55 55 #define MCGMASK(x, y) .mcgmask = x, .mcgres = y ··· 66 62 ), 67 63 MCESEV( 68 64 NO, "Not enabled", 69 - BITCLR(MCI_STATUS_EN) 65 + EXCP, BITCLR(MCI_STATUS_EN) 70 66 ), 71 67 MCESEV( 72 68 PANIC, "Processor context corrupt", ··· 75 71 /* When MCIP is not set something is very confused */ 76 72 MCESEV( 77 73 PANIC, "MCIP not set in MCA handler", 78 - MCGMASK(MCG_STATUS_MCIP, 0) 74 + EXCP, MCGMASK(MCG_STATUS_MCIP, 0) 79 75 ), 80 76 /* Neither return not error IP -- no chance to recover -> PANIC */ 81 77 MCESEV( 82 78 PANIC, "Neither restart nor error IP", 83 - MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) 79 + EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) 84 80 ), 85 81 MCESEV( 86 82 PANIC, "In kernel and no restart IP", 87 - KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) 83 + EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) 84 + ), 85 + MCESEV( 86 + DEFERRED, "Deferred error", 87 + NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) 88 88 ), 89 89 MCESEV( 90 90 KEEP, "Corrected error", ··· 97 89 98 90 /* ignore OVER for UCNA */ 99 91 MCESEV( 100 - KEEP, "Uncorrected no action required", 92 + UCNA, "Uncorrected no action required", 101 93 SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) 102 94 ), 103 95 MCESEV( ··· 186 178 return ((m->cs & 3) == 3) ? 
IN_USER : IN_KERNEL; 187 179 } 188 180 189 - int mce_severity(struct mce *m, int tolerant, char **msg) 181 + int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) 190 182 { 183 + enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); 191 184 enum context ctx = error_context(m); 192 185 struct severity *s; 193 186 ··· 202 193 if (s->ser == NO_SER && mca_cfg.ser) 203 194 continue; 204 195 if (s->context && ctx != s->context) 196 + continue; 197 + if (s->excp && excp != s->excp) 205 198 continue; 206 199 if (msg) 207 200 *msg = s->msg;
+8 -6
arch/x86/kernel/cpu/mcheck/mce.c
··· 668 668 if (quirk_no_way_out) 669 669 quirk_no_way_out(i, m, regs); 670 670 } 671 - if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) 671 + if (mce_severity(m, mca_cfg.tolerant, msg, true) >= 672 + MCE_PANIC_SEVERITY) 672 673 ret = 1; 673 674 } 674 675 return ret; ··· 755 754 for_each_possible_cpu(cpu) { 756 755 int severity = mce_severity(&per_cpu(mces_seen, cpu), 757 756 mca_cfg.tolerant, 758 - &nmsg); 757 + &nmsg, true); 759 758 if (severity > global_worst) { 760 759 msg = nmsg; 761 760 global_worst = severity; ··· 1096 1095 */ 1097 1096 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); 1098 1097 1099 - severity = mce_severity(&m, cfg->tolerant, NULL); 1098 + severity = mce_severity(&m, cfg->tolerant, NULL, true); 1100 1099 1101 1100 /* 1102 - * When machine check was for corrected handler don't touch, 1103 - * unless we're panicing. 1101 + * When machine check was for corrected/deferred handler don't 1102 + * touch, unless we're panicing. 1104 1103 */ 1105 - if (severity == MCE_KEEP_SEVERITY && !no_way_out) 1104 + if ((severity == MCE_KEEP_SEVERITY || 1105 + severity == MCE_UCNA_SEVERITY) && !no_way_out) 1106 1106 continue; 1107 1107 __set_bit(i, toclear); 1108 1108 if (severity == MCE_NO_SEVERITY) {
-3
drivers/edac/mce_amd.h
··· 32 32 #define R4(x) (((x) >> 4) & 0xf) 33 33 #define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!") 34 34 35 - #define MCI_STATUS_DEFERRED BIT_64(44) 36 - #define MCI_STATUS_POISON BIT_64(43) 37 - 38 35 extern const char * const pp_msgs[]; 39 36 40 37 enum tt_ids {