Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

trace, RAS: Add eMCA trace event interface

Add a trace interface that reports all hardware-error-related information in detail.

Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Tony Luck <tony.luck@intel.com>

authored by

Chen, Gong and committed by
Tony Luck
2dfb7d51 d963cd95

+158 -8
+3 -1
drivers/acpi/Kconfig
··· 370 370 tristate "Extended Error Log support" 371 371 depends on X86_MCE && X86_LOCAL_APIC 372 372 select UEFI_CPER 373 + select RAS 373 374 default n 374 375 help 375 376 Certain usages such as Predictive Failure Analysis (PFA) require ··· 385 384 386 385 Enhanced MCA Logging allows firmware to provide additional error 387 386 information to system software, synchronous with MCE or CMCI. This 388 - driver adds support for that functionality. 387 + driver adds support for that functionality with corresponding 388 + tracepoint which carries that information to userspace. 389 389 390 390 endif # ACPI
+24 -3
drivers/acpi/acpi_extlog.c
··· 16 16 #include <asm/mce.h> 17 17 18 18 #include "apei/apei-internal.h" 19 + #include <ras/ras_event.h> 19 20 20 21 #define EXT_ELOG_ENTRY_MASK GENMASK_ULL(51, 0) /* elog entry address mask */ 21 22 ··· 138 137 struct mce *mce = (struct mce *)data; 139 138 int bank = mce->bank; 140 139 int cpu = mce->extcpu; 141 - struct acpi_generic_status *estatus; 142 - int rc; 140 + struct acpi_generic_status *estatus, *tmp; 141 + struct acpi_generic_data *gdata; 142 + const uuid_le *fru_id = &NULL_UUID_LE; 143 + char *fru_text = ""; 144 + uuid_le *sec_type; 145 + static u32 err_seq; 143 146 144 147 estatus = extlog_elog_entry_check(cpu, bank); 145 148 if (estatus == NULL) ··· 153 148 /* clear record status to enable BIOS to update it again */ 154 149 estatus->block_status = 0; 155 150 156 - rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu); 151 + tmp = (struct acpi_generic_status *)elog_buf; 152 + print_extlog_rcd(NULL, tmp, cpu); 153 + 154 + /* log event via trace */ 155 + err_seq++; 156 + gdata = (struct acpi_generic_data *)(tmp + 1); 157 + if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID) 158 + fru_id = (uuid_le *)gdata->fru_id; 159 + if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT) 160 + fru_text = gdata->fru_text; 161 + sec_type = (uuid_le *)gdata->section_type; 162 + if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) { 163 + struct cper_sec_mem_err *mem = (void *)(gdata + 1); 164 + if (gdata->error_data_length >= sizeof(*mem)) 165 + trace_extlog_mem_event(mem, err_seq, fru_id, fru_text, 166 + (u8)gdata->error_severity); 167 + } 157 168 158 169 return NOTIFY_STOP; 159 170 }
+41 -4
drivers/firmware/efi/cper.c
··· 207 207 } 208 208 EXPORT_SYMBOL_GPL(cper_mem_err_type_str); 209 209 210 - static int cper_mem_err_location(const struct cper_sec_mem_err *mem, char *msg) 210 + static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg) 211 211 { 212 212 u32 len, n; 213 213 ··· 249 249 return n; 250 250 } 251 251 252 - static int cper_dimm_err_location(const struct cper_sec_mem_err *mem, char *msg) 252 + static int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg) 253 253 { 254 254 u32 len, n; 255 255 const char *bank = NULL, *device = NULL; ··· 271 271 return n; 272 272 } 273 273 274 + void cper_mem_err_pack(const struct cper_sec_mem_err *mem, 275 + struct cper_mem_err_compact *cmem) 276 + { 277 + cmem->validation_bits = mem->validation_bits; 278 + cmem->node = mem->node; 279 + cmem->card = mem->card; 280 + cmem->module = mem->module; 281 + cmem->bank = mem->bank; 282 + cmem->device = mem->device; 283 + cmem->row = mem->row; 284 + cmem->column = mem->column; 285 + cmem->bit_pos = mem->bit_pos; 286 + cmem->requestor_id = mem->requestor_id; 287 + cmem->responder_id = mem->responder_id; 288 + cmem->target_id = mem->target_id; 289 + cmem->rank = mem->rank; 290 + cmem->mem_array_handle = mem->mem_array_handle; 291 + cmem->mem_dev_handle = mem->mem_dev_handle; 292 + } 293 + 294 + const char *cper_mem_err_unpack(struct trace_seq *p, 295 + struct cper_mem_err_compact *cmem) 296 + { 297 + const char *ret = p->buffer + p->len; 298 + 299 + if (cper_mem_err_location(cmem, rcd_decode_str)) 300 + trace_seq_printf(p, "%s", rcd_decode_str); 301 + if (cper_dimm_err_location(cmem, rcd_decode_str)) 302 + trace_seq_printf(p, "%s", rcd_decode_str); 303 + trace_seq_putc(p, '\0'); 304 + 305 + return ret; 306 + } 307 + 274 308 static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem) 275 309 { 310 + struct cper_mem_err_compact cmem; 311 + 276 312 if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS) 277 313 printk("%s""error_status: 
0x%016llx\n", pfx, mem->error_status); 278 314 if (mem->validation_bits & CPER_MEM_VALID_PA) ··· 317 281 if (mem->validation_bits & CPER_MEM_VALID_PA_MASK) 318 282 printk("%s""physical_address_mask: 0x%016llx\n", 319 283 pfx, mem->physical_addr_mask); 320 - if (cper_mem_err_location(mem, rcd_decode_str)) 284 + cper_mem_err_pack(mem, &cmem); 285 + if (cper_mem_err_location(&cmem, rcd_decode_str)) 321 286 printk("%s%s\n", pfx, rcd_decode_str); 322 287 if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) { 323 288 u8 etype = mem->error_type; 324 289 printk("%s""error_type: %d, %s\n", pfx, etype, 325 290 cper_mem_err_type_str(etype)); 326 291 } 327 - if (cper_dimm_err_location(mem, rcd_decode_str)) 292 + if (cper_dimm_err_location(&cmem, rcd_decode_str)) 328 293 printk("%s%s\n", pfx, rcd_decode_str); 329 294 } 330 295
+3
drivers/ras/ras.c
··· 23 23 } 24 24 subsys_initcall(ras_init); 25 25 26 + #if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE) 27 + EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event); 28 + #endif 26 29 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
+23
include/linux/cper.h
··· 22 22 #define LINUX_CPER_H 23 23 24 24 #include <linux/uuid.h> 25 + #include <linux/trace_seq.h> 25 26 26 27 /* CPER record signature and the size */ 27 28 #define CPER_SIG_RECORD "CPER" ··· 364 363 __u16 mem_dev_handle; /* module handle in UEFI 2.4 */ 365 364 }; 366 365 366 + struct cper_mem_err_compact { 367 + __u64 validation_bits; 368 + __u16 node; 369 + __u16 card; 370 + __u16 module; 371 + __u16 bank; 372 + __u16 device; 373 + __u16 row; 374 + __u16 column; 375 + __u16 bit_pos; 376 + __u64 requestor_id; 377 + __u64 responder_id; 378 + __u64 target_id; 379 + __u16 rank; 380 + __u16 mem_array_handle; 381 + __u16 mem_dev_handle; 382 + }; 383 + 367 384 struct cper_sec_pcie { 368 385 __u64 validation_bits; 369 386 __u32 port_type; ··· 425 406 const char *cper_mem_err_type_str(unsigned int); 426 407 void cper_print_bits(const char *prefix, unsigned int bits, 427 408 const char * const strs[], unsigned int strs_size); 409 + void cper_mem_err_pack(const struct cper_sec_mem_err *, 410 + struct cper_mem_err_compact *); 411 + const char *cper_mem_err_unpack(struct trace_seq *, 412 + struct cper_mem_err_compact *); 428 413 429 414 #endif
+64
include/ras/ras_event.h
··· 9 9 #include <linux/edac.h> 10 10 #include <linux/ktime.h> 11 11 #include <linux/aer.h> 12 + #include <linux/cper.h> 13 + 14 + /* 15 + * MCE Extended Error Log trace event 16 + * 17 + * These events are generated when hardware detects a corrected or 18 + * uncorrected event. 19 + */ 20 + 21 + /* memory trace event */ 22 + 23 + #if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE) 24 + TRACE_EVENT(extlog_mem_event, 25 + TP_PROTO(struct cper_sec_mem_err *mem, 26 + u32 err_seq, 27 + const uuid_le *fru_id, 28 + const char *fru_text, 29 + u8 sev), 30 + 31 + TP_ARGS(mem, err_seq, fru_id, fru_text, sev), 32 + 33 + TP_STRUCT__entry( 34 + __field(u32, err_seq) 35 + __field(u8, etype) 36 + __field(u8, sev) 37 + __field(u64, pa) 38 + __field(u8, pa_mask_lsb) 39 + __field_struct(uuid_le, fru_id) 40 + __string(fru_text, fru_text) 41 + __field_struct(struct cper_mem_err_compact, data) 42 + ), 43 + 44 + TP_fast_assign( 45 + __entry->err_seq = err_seq; 46 + if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) 47 + __entry->etype = mem->error_type; 48 + else 49 + __entry->etype = ~0; 50 + __entry->sev = sev; 51 + if (mem->validation_bits & CPER_MEM_VALID_PA) 52 + __entry->pa = mem->physical_addr; 53 + else 54 + __entry->pa = ~0ull; 55 + 56 + if (mem->validation_bits & CPER_MEM_VALID_PA_MASK) 57 + __entry->pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask); 58 + else 59 + __entry->pa_mask_lsb = ~0; 60 + __entry->fru_id = *fru_id; 61 + __assign_str(fru_text, fru_text); 62 + cper_mem_err_pack(mem, &__entry->data); 63 + ), 64 + 65 + TP_printk("{%d} %s error: %s physical addr: %016llx (mask lsb: %x) %sFRU: %pUl %.20s", 66 + __entry->err_seq, 67 + cper_severity_str(__entry->sev), 68 + cper_mem_err_type_str(__entry->etype), 69 + __entry->pa, 70 + __entry->pa_mask_lsb, 71 + cper_mem_err_unpack(p, &__entry->data), 72 + &__entry->fru_id, 73 + __get_str(fru_text)) 74 + ); 75 + #endif 12 76 13 77 /* 14 78 * Hardware Events Report