vmcoreinfo: track and log recoverable hardware errors

+60

Documentation/driver-api/hw-recoverable-errors.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + ================================================= 4 + Recoverable Hardware Error Tracking in vmcoreinfo 5 + ================================================= 6 + 7 + Overview 8 + -------- 9 + 10 + This feature provides a generic infrastructure within the Linux kernel to track 11 + and log recoverable hardware errors. These are recoverable hardware errors 12 + visible to the kernel that might not cause immediate panics but may influence 13 + system health, mainly because new code paths will be executed in the kernel. 14 + 15 + By recording counts and timestamps of recoverable errors into the vmcoreinfo 16 + crash dump notes, this infrastructure aids post-mortem crash analysis tools in 17 + correlating hardware events with kernel failures. This enables faster triage 18 + and better understanding of root causes, especially in large-scale cloud 19 + environments where hardware issues are common. 20 + 21 + Benefits 22 + -------- 23 + 24 + - Facilitates correlation of recoverable hardware errors with kernel panics or 25 + unusual code paths that lead to system crashes. 26 + - Provides operators and cloud providers quick insights, improving reliability 27 + and reducing troubleshooting time. 28 + - Complements existing full hardware diagnostics without replacing them. 29 + 30 + Data Exposure and Consumption 31 + ----------------------------- 32 + 33 + - The tracked error data consists of per-error-type counts and timestamps of 34 + last occurrence. 35 + - This data is stored in the `hwerr_data` array, categorized by error source 36 + types like CPU, memory, PCI, CXL, and others. 37 + - It is exposed via vmcoreinfo crash dump notes and can be read using tools 38 + like `crash`, `drgn`, or other kernel crash analysis utilities. 39 + - There is no way to read this data other than from crash dumps. 40 + - These errors are divided by area, which includes CPU, Memory, PCI, CXL and 41 + others.
42 + 43 + Typical usage example (in drgn REPL): 44 + 45 + .. code-block:: python 46 + 47 + >>> prog['hwerr_data'] 48 + (struct hwerr_info[HWERR_RECOV_MAX]){ 49 + { 50 + .count = (int)844, 51 + .timestamp = (time64_t)1752852018, 52 + }, 53 + ... 54 + } 55 + 56 + Enabling 57 + -------- 58 + 59 + - This feature is enabled when CONFIG_VMCORE_INFO is set. 60 +

+1

Documentation/driver-api/index.rst

··· 96 96 gpio/index 97 97 hsi 98 98 hte/index 99 + hw-recoverable-errors 99 100 i2c 100 101 iio/index 101 102 infiniband

+4

arch/x86/kernel/cpu/mce/core.c

··· 45 45 #include <linux/task_work.h> 46 46 #include <linux/hardirq.h> 47 47 #include <linux/kexec.h> 48 + #include <linux/vmcore_info.h> 48 49 49 50 #include <asm/fred.h> 50 51 #include <asm/cpu_device_id.h> ··· 1701 1700 } 1702 1701 1703 1702 out: 1703 + /* Given it didn't panic, mark it as recoverable */ 1704 + hwerr_log_error_type(HWERR_RECOV_OTHERS); 1705 + 1704 1706 instrumentation_end(); 1705 1707 1706 1708 clear:

+36

drivers/acpi/apei/ghes.c

··· 43 43 #include <linux/uuid.h> 44 44 #include <linux/ras.h> 45 45 #include <linux/task_work.h> 46 + #include <linux/vmcore_info.h> 46 47 47 48 #include <acpi/actbl1.h> 48 49 #include <acpi/ghes.h> ··· 868 867 } 869 868 EXPORT_SYMBOL_NS_GPL(cxl_cper_kfifo_get, "CXL"); 870 869 870 + static void ghes_log_hwerr(int sev, guid_t *sec_type) 871 + { 872 + if (sev != CPER_SEV_RECOVERABLE) 873 + return; 874 + 875 + if (guid_equal(sec_type, &CPER_SEC_PROC_ARM) || 876 + guid_equal(sec_type, &CPER_SEC_PROC_GENERIC) || 877 + guid_equal(sec_type, &CPER_SEC_PROC_IA)) { 878 + hwerr_log_error_type(HWERR_RECOV_CPU); 879 + return; 880 + } 881 + 882 + if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR) || 883 + guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID) || 884 + guid_equal(sec_type, &CPER_SEC_CXL_DRAM_GUID) || 885 + guid_equal(sec_type, &CPER_SEC_CXL_MEM_MODULE_GUID)) { 886 + hwerr_log_error_type(HWERR_RECOV_CXL); 887 + return; 888 + } 889 + 890 + if (guid_equal(sec_type, &CPER_SEC_PCIE) || 891 + guid_equal(sec_type, &CPER_SEC_PCI_X_BUS)) { 892 + hwerr_log_error_type(HWERR_RECOV_PCI); 893 + return; 894 + } 895 + 896 + if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) { 897 + hwerr_log_error_type(HWERR_RECOV_MEMORY); 898 + return; 899 + } 900 + 901 + hwerr_log_error_type(HWERR_RECOV_OTHERS); 902 + } 903 + 871 904 static void ghes_do_proc(struct ghes *ghes, 872 905 const struct acpi_hest_generic_status *estatus) 873 906 { ··· 923 888 if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT) 924 889 fru_text = gdata->fru_text; 925 890 891 + ghes_log_hwerr(sev, sec_type); 926 892 if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) { 927 893 struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); 928 894

+2

drivers/pci/pcie/aer.c

··· 30 30 #include <linux/kfifo.h> 31 31 #include <linux/ratelimit.h> 32 32 #include <linux/slab.h> 33 + #include <linux/vmcore_info.h> 33 34 #include <acpi/apei.h> 34 35 #include <acpi/ghes.h> 35 36 #include <ras/ras_event.h> ··· 766 765 break; 767 766 case AER_NONFATAL: 768 767 aer_info->dev_total_nonfatal_errs++; 768 + hwerr_log_error_type(HWERR_RECOV_PCI); 769 769 counter = &aer_info->dev_nonfatal_errs[0]; 770 770 max = AER_MAX_TYPEOF_UNCOR_ERRS; 771 771 break;

+8

include/linux/vmcore_info.h

··· 5 5 #include <linux/linkage.h> 6 6 #include <linux/elfcore.h> 7 7 #include <linux/elf.h> 8 + #include <uapi/linux/vmcore.h> 8 9 9 10 #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) 10 11 #define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4) ··· 78 77 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, 79 78 void *data, size_t data_len); 80 79 void final_note(Elf_Word *buf); 80 + 81 + #ifdef CONFIG_VMCORE_INFO 82 + void hwerr_log_error_type(enum hwerr_error_type src); 83 + #else 84 + static inline void hwerr_log_error_type(enum hwerr_error_type src) {}; 85 + #endif 86 + 81 87 #endif /* LINUX_VMCORE_INFO_H */

+9

include/uapi/linux/vmcore.h

··· 15 15 __u8 dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Device dump's name */ 16 16 }; 17 17 18 + enum hwerr_error_type { 19 + HWERR_RECOV_CPU, 20 + HWERR_RECOV_MEMORY, 21 + HWERR_RECOV_PCI, 22 + HWERR_RECOV_CXL, 23 + HWERR_RECOV_OTHERS, 24 + HWERR_RECOV_MAX, 25 + }; 26 + 18 27 #endif /* _UAPI_VMCORE_H */

+17

kernel/vmcore_info.c

··· 31 31 /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ 32 32 static unsigned char *vmcoreinfo_data_safecopy; 33 33 34 + struct hwerr_info { 35 + atomic_t count; 36 + time64_t timestamp; 37 + }; 38 + 39 + static struct hwerr_info hwerr_data[HWERR_RECOV_MAX]; 40 + 34 41 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, 35 42 void *data, size_t data_len) 36 43 { ··· 124 117 return __pa(vmcoreinfo_note); 125 118 } 126 119 EXPORT_SYMBOL(paddr_vmcoreinfo_note); 120 + 121 + void hwerr_log_error_type(enum hwerr_error_type src) 122 + { 123 + if (src < 0 || src >= HWERR_RECOV_MAX) 124 + return; 125 + 126 + atomic_inc(&hwerr_data[src].count); 127 + WRITE_ONCE(hwerr_data[src].timestamp, ktime_get_real_seconds()); 128 + } 129 + EXPORT_SYMBOL_GPL(hwerr_log_error_type); 127 130 128 131 static int __init crash_save_vmcoreinfo_init(void) 129 132 {