Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/mce: Save and use APEI corrected threshold limit

The MCA threshold limit generally is not something that needs to change during
runtime. It is common for a system administrator to decide on a policy for
their managed systems.

If MCA thresholding is OS-managed, then the threshold limit must be set at
every boot. However, many systems allow the user to set a value in their BIOS,
and this value is reported through an APEI HEST entry even if thresholding is
not in FW-First mode.

Use this value, if available, to set the OS-managed threshold limit. Users
can still override it through sysfs if desired for testing or debug.

APEI is parsed after MCE is initialized, so reset the thresholding blocks
later to pick up the threshold limit.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/20251104-wip-mca-updates-v8-0-66c8eacf67b9@amd.com

Authored by Yazen Ghannam and committed by Borislav Petkov (AMD)
eeb3f76d 56f17be6

+39 -2
+6
arch/x86/include/asm/mce.h
··· 308 308 /* Disable CMCI/polling for MCA bank claimed by firmware */ 309 309 extern void mce_disable_bank(int bank); 310 310 311 + #ifdef CONFIG_X86_MCE_THRESHOLD 312 + void mce_save_apei_thr_limit(u32 thr_limit); 313 + #else 314 + static inline void mce_save_apei_thr_limit(u32 thr_limit) { } 315 + #endif /* CONFIG_X86_MCE_THRESHOLD */ 316 + 311 317 /* 312 318 * Exception handler 313 319 */
+2
arch/x86/kernel/acpi/apei.c
··· 19 19 if (!cmc->enabled) 20 20 return 0; 21 21 22 + mce_save_apei_thr_limit(cmc->notify.error_threshold_value); 23 + 22 24 /* 23 25 * We expect HEST to provide a list of MC banks that report errors 24 26 * in firmware first mode. Otherwise, return non-zero value to
+16 -2
arch/x86/kernel/cpu/mce/amd.c
··· 489 489 } 490 490 } 491 491 492 + /* Try to use the threshold limit reported through APEI. */ 493 + static u16 get_thr_limit(void) 494 + { 495 + u32 thr_limit = mce_get_apei_thr_limit(); 496 + 497 + /* Fallback to old default if APEI limit is not available. */ 498 + if (!thr_limit) 499 + return THRESHOLD_MAX; 500 + 501 + return min(thr_limit, THRESHOLD_MAX); 502 + } 503 + 492 504 static void mce_threshold_block_init(struct threshold_block *b, int offset) 493 505 { 494 506 struct thresh_restart tr = { ··· 509 497 .lvt_off = offset, 510 498 }; 511 499 512 - b->threshold_limit = THRESHOLD_MAX; 500 + b->threshold_limit = get_thr_limit(); 513 501 threshold_restart_block(&tr); 514 502 }; 515 503 ··· 1083 1071 b->address = address; 1084 1072 b->interrupt_enable = 0; 1085 1073 b->interrupt_capable = lvt_interrupt_supported(bank, high); 1086 - b->threshold_limit = THRESHOLD_MAX; 1074 + b->threshold_limit = get_thr_limit(); 1087 1075 1088 1076 if (b->interrupt_capable) { 1089 1077 default_attrs[2] = &interrupt_enable.attr; ··· 1093 1081 } 1094 1082 1095 1083 list_add(&b->miscj, &tb->miscj); 1084 + 1085 + mce_threshold_block_init(b, (high & MASK_LVTOFF_HI) >> 20); 1096 1086 1097 1087 err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b)); 1098 1088 if (err)
+2
arch/x86/kernel/cpu/mce/internal.h
··· 67 67 void mce_inherit_storm(unsigned int bank); 68 68 bool mce_get_storm_mode(void); 69 69 void mce_set_storm_mode(bool storm); 70 + u32 mce_get_apei_thr_limit(void); 70 71 #else 71 72 static inline void cmci_storm_begin(unsigned int bank) {} 72 73 static inline void cmci_storm_end(unsigned int bank) {} ··· 75 74 static inline void mce_inherit_storm(unsigned int bank) {} 76 75 static inline bool mce_get_storm_mode(void) { return false; } 77 76 static inline void mce_set_storm_mode(bool storm) {} 77 + static inline u32 mce_get_apei_thr_limit(void) { return 0; } 78 78 #endif 79 79 80 80 /*
+13
arch/x86/kernel/cpu/mce/threshold.c
··· 13 13 14 14 #include "internal.h" 15 15 16 + static u32 mce_apei_thr_limit; 17 + 18 + void mce_save_apei_thr_limit(u32 thr_limit) 19 + { 20 + mce_apei_thr_limit = thr_limit; 21 + pr_info("HEST corrected error threshold limit: %u\n", thr_limit); 22 + } 23 + 24 + u32 mce_get_apei_thr_limit(void) 25 + { 26 + return mce_apei_thr_limit; 27 + } 28 + 16 29 static void default_threshold_interrupt(void) 17 30 { 18 31 pr_err("Unexpected threshold interrupt at vector %x\n",