Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1/* SPDX-License-Identifier: GPL-2.0 */
2#undef TRACE_SYSTEM
3#define TRACE_SYSTEM ras
4#define TRACE_INCLUDE_FILE ras_event
5
6#if !defined(_TRACE_HW_EVENT_MC_H) || defined(TRACE_HEADER_MULTI_READ)
7#define _TRACE_HW_EVENT_MC_H
8
9#include <linux/tracepoint.h>
10#include <linux/edac.h>
11#include <linux/ktime.h>
12#include <linux/pci.h>
13#include <linux/aer.h>
14#include <linux/cper.h>
15
16/*
17 * MCE Extended Error Log trace event
18 *
19 * These events are generated when hardware detects a corrected or
20 * uncorrected event.
21 */
22
23/* memory trace event */
24
25#if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE)
26TRACE_EVENT(extlog_mem_event,
27 TP_PROTO(struct cper_sec_mem_err *mem,
28 u32 err_seq,
29 const guid_t *fru_id,
30 const char *fru_text,
31 u8 sev),
32
33 TP_ARGS(mem, err_seq, fru_id, fru_text, sev),
34
35 TP_STRUCT__entry(
36 __field(u32, err_seq)
37 __field(u8, etype)
38 __field(u8, sev)
39 __field(u64, pa)
40 __field(u8, pa_mask_lsb)
41 __field_struct(guid_t, fru_id)
42 __string(fru_text, fru_text)
43 __field_struct(struct cper_mem_err_compact, data)
44 ),
45
46 TP_fast_assign(
47 __entry->err_seq = err_seq;
48 if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE)
49 __entry->etype = mem->error_type;
50 else
51 __entry->etype = ~0;
52 __entry->sev = sev;
53 if (mem->validation_bits & CPER_MEM_VALID_PA)
54 __entry->pa = mem->physical_addr;
55 else
56 __entry->pa = ~0ull;
57
58 if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
59 __entry->pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask);
60 else
61 __entry->pa_mask_lsb = ~0;
62 __entry->fru_id = *fru_id;
63 __assign_str(fru_text);
64 cper_mem_err_pack(mem, &__entry->data);
65 ),
66
67 TP_printk("{%d} %s error: %s physical addr: %016llx (mask lsb: %x) %sFRU: %pUl %.20s",
68 __entry->err_seq,
69 cper_severity_str(__entry->sev),
70 cper_mem_err_type_str(__entry->etype),
71 __entry->pa,
72 __entry->pa_mask_lsb,
73 cper_mem_err_unpack(p, &__entry->data),
74 &__entry->fru_id,
75 __get_str(fru_text))
76);
77#endif
78
/*
 * Hardware Events Report
 *
 * These events are generated when hardware detects a corrected or
 * uncorrected event, and are meant to replace the current APIs for
 * reporting errors defined by both the EDAC and MCE subsystems.
 *
 * FIXME: Add events for handling memory errors originated from the
 *        MCE subsystem.
 */
89
90/*
91 * Hardware-independent Memory Controller specific events
92 */
93
94/*
95 * Default error mechanisms for Memory Controller errors (CE and UE)
96 */
97TRACE_EVENT(mc_event,
98
99 TP_PROTO(const unsigned int err_type,
100 const char *error_msg,
101 const char *label,
102 const int error_count,
103 const u8 mc_index,
104 const s8 top_layer,
105 const s8 mid_layer,
106 const s8 low_layer,
107 unsigned long address,
108 const u8 grain_bits,
109 unsigned long syndrome,
110 const char *driver_detail),
111
112 TP_ARGS(err_type, error_msg, label, error_count, mc_index,
113 top_layer, mid_layer, low_layer, address, grain_bits,
114 syndrome, driver_detail),
115
116 TP_STRUCT__entry(
117 __field( unsigned int, error_type )
118 __string( msg, error_msg )
119 __string( label, label )
120 __field( u16, error_count )
121 __field( u8, mc_index )
122 __field( s8, top_layer )
123 __field( s8, middle_layer )
124 __field( s8, lower_layer )
125 __field( long, address )
126 __field( u8, grain_bits )
127 __field( long, syndrome )
128 __string( driver_detail, driver_detail )
129 ),
130
131 TP_fast_assign(
132 __entry->error_type = err_type;
133 __assign_str(msg);
134 __assign_str(label);
135 __entry->error_count = error_count;
136 __entry->mc_index = mc_index;
137 __entry->top_layer = top_layer;
138 __entry->middle_layer = mid_layer;
139 __entry->lower_layer = low_layer;
140 __entry->address = address;
141 __entry->grain_bits = grain_bits;
142 __entry->syndrome = syndrome;
143 __assign_str(driver_detail);
144 ),
145
146 TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)",
147 __entry->error_count,
148 mc_event_error_type(__entry->error_type),
149 __entry->error_count > 1 ? "s" : "",
150 __get_str(msg)[0] ? " " : "",
151 __get_str(msg),
152 __get_str(label),
153 __entry->mc_index,
154 __entry->top_layer,
155 __entry->middle_layer,
156 __entry->lower_layer,
157 __entry->address,
158 1 << __entry->grain_bits,
159 __entry->syndrome,
160 __get_str(driver_detail)[0] ? " " : "",
161 __get_str(driver_detail))
162);
163
164/*
165 * ARM Processor Events Report
166 *
167 * This event is generated when hardware detects an ARM processor error
168 * has occurred. UEFI 2.6 spec section N.2.4.4.
169 */
170#define APEIL "ARM Processor Err Info data len"
171#define APEID "ARM Processor Err Info raw data"
172#define APECIL "ARM Processor Err Context Info data len"
173#define APECID "ARM Processor Err Context Info raw data"
174#define VSEIL "Vendor Specific Err Info data len"
175#define VSEID "Vendor Specific Err Info raw data"
176TRACE_EVENT(arm_event,
177
178 TP_PROTO(const struct cper_sec_proc_arm *proc,
179 const u8 *pei_err,
180 const u32 pei_len,
181 const u8 *ctx_err,
182 const u32 ctx_len,
183 const u8 *oem,
184 const u32 oem_len,
185 u8 sev,
186 int cpu),
187
188 TP_ARGS(proc, pei_err, pei_len, ctx_err, ctx_len, oem, oem_len, sev, cpu),
189
190 TP_STRUCT__entry(
191 __field(u64, mpidr)
192 __field(u64, midr)
193 __field(u32, running_state)
194 __field(u32, psci_state)
195 __field(u8, affinity)
196 __field(u32, pei_len)
197 __dynamic_array(u8, pei_buf, pei_len)
198 __field(u32, ctx_len)
199 __dynamic_array(u8, ctx_buf, ctx_len)
200 __field(u32, oem_len)
201 __dynamic_array(u8, oem_buf, oem_len)
202 __field(u8, sev)
203 __field(int, cpu)
204 ),
205
206 TP_fast_assign(
207 if (proc->validation_bits & CPER_ARM_VALID_AFFINITY_LEVEL)
208 __entry->affinity = proc->affinity_level;
209 else
210 __entry->affinity = ~0;
211 if (proc->validation_bits & CPER_ARM_VALID_MPIDR)
212 __entry->mpidr = proc->mpidr;
213 else
214 __entry->mpidr = 0ULL;
215 __entry->midr = proc->midr;
216 if (proc->validation_bits & CPER_ARM_VALID_RUNNING_STATE) {
217 __entry->running_state = proc->running_state;
218 __entry->psci_state = proc->psci_state;
219 } else {
220 __entry->running_state = ~0;
221 __entry->psci_state = ~0;
222 }
223 __entry->pei_len = pei_len;
224 memcpy(__get_dynamic_array(pei_buf), pei_err, pei_len);
225 __entry->ctx_len = ctx_len;
226 memcpy(__get_dynamic_array(ctx_buf), ctx_err, ctx_len);
227 __entry->oem_len = oem_len;
228 memcpy(__get_dynamic_array(oem_buf), oem, oem_len);
229 __entry->sev = sev;
230 __entry->cpu = cpu;
231 ),
232
233 TP_printk("cpu: %d; error: %d; affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
234 "running state: %d; PSCI state: %d; "
235 "%s: %d; %s: %s; %s: %d; %s: %s; %s: %d; %s: %s",
236 __entry->cpu,
237 __entry->sev,
238 __entry->affinity, __entry->mpidr, __entry->midr,
239 __entry->running_state, __entry->psci_state,
240 APEIL, __entry->pei_len, APEID,
241 __print_hex(__get_dynamic_array(pei_buf), __entry->pei_len),
242 APECIL, __entry->ctx_len, APECID,
243 __print_hex(__get_dynamic_array(ctx_buf), __entry->ctx_len),
244 VSEIL, __entry->oem_len, VSEID,
245 __print_hex(__get_dynamic_array(oem_buf), __entry->oem_len))
246);
247
/*
 * Non-Standard Section Report
 *
 * This event is generated when hardware reports an error whose section
 * is either a non-standard section as defined in the UEFI spec appendix
 * "Common Platform Error Record", or a section for which no dedicated
 * TRACE_EVENT is defined.
 *
 */
257TRACE_EVENT(non_standard_event,
258
259 TP_PROTO(const guid_t *sec_type,
260 const guid_t *fru_id,
261 const char *fru_text,
262 const u8 sev,
263 const u8 *err,
264 const u32 len),
265
266 TP_ARGS(sec_type, fru_id, fru_text, sev, err, len),
267
268 TP_STRUCT__entry(
269 __array(char, sec_type, UUID_SIZE)
270 __array(char, fru_id, UUID_SIZE)
271 __string(fru_text, fru_text)
272 __field(u8, sev)
273 __field(u32, len)
274 __dynamic_array(u8, buf, len)
275 ),
276
277 TP_fast_assign(
278 memcpy(__entry->sec_type, sec_type, UUID_SIZE);
279 memcpy(__entry->fru_id, fru_id, UUID_SIZE);
280 __assign_str(fru_text);
281 __entry->sev = sev;
282 __entry->len = len;
283 memcpy(__get_dynamic_array(buf), err, len);
284 ),
285
286 TP_printk("severity: %d; sec type:%pU; FRU: %pU %s; data len:%d; raw data:%s",
287 __entry->sev, __entry->sec_type,
288 __entry->fru_id, __get_str(fru_text),
289 __entry->len,
290 __print_hex(__get_dynamic_array(buf), __entry->len))
291);
292
293#ifdef CONFIG_PCIEAER
/*
 * PCIe AER Trace event
 *
 * These events are generated when hardware detects a corrected or
 * uncorrected event on a PCIe device. The event report has
 * the following structure:
 *
 * char * dev_name -	The name of the slot where the device resides
 *			([domain:]bus:device.function).
 * u32 status -		Either the correctable or uncorrectable register
 *			indicating what error or errors have been seen
 * u8 severity -	error severity 0:NONFATAL 1:FATAL 2:CORRECTED
 * u8 tlp_header_valid - non-zero when a TLP header log accompanies the error
 * u32 tlp_header[] -	the logged TLP header dwords; only printed when
 *			tlp_header_valid is set
 */
307
/*
 * Flag-to-name table for the correctable error status bits, in the form
 * expected by __print_flags().
 */
#define aer_correctable_errors					\
	{PCI_ERR_COR_RCVR,	"Receiver Error"},		\
	{PCI_ERR_COR_BAD_TLP,	"Bad TLP"},			\
	{PCI_ERR_COR_BAD_DLLP,	"Bad DLLP"},			\
	{PCI_ERR_COR_REP_ROLL,	"REPLAY_NUM Rollover"},		\
	{PCI_ERR_COR_REP_TIMER,	"Replay Timer Timeout"},	\
	{PCI_ERR_COR_ADV_NFAT,	"Advisory Non-Fatal Error"},	\
	{PCI_ERR_COR_INTERNAL,	"Corrected Internal Error"},	\
	{PCI_ERR_COR_LOG_OVER,	"Header Log Overflow"}
317
/*
 * Flag-to-name table for the uncorrectable error status bits, in the form
 * expected by __print_flags().
 */
#define aer_uncorrectable_errors					\
	{ PCI_ERR_UNC_UND,		"Undefined" },			\
	{ PCI_ERR_UNC_DLP,		"Data Link Protocol Error" },	\
	{ PCI_ERR_UNC_SURPDN,		"Surprise Down Error" },	\
	{ PCI_ERR_UNC_POISON_TLP,	"Poisoned TLP" },		\
	{ PCI_ERR_UNC_FCP,		"Flow Control Protocol Error" },\
	{ PCI_ERR_UNC_COMP_TIME,	"Completion Timeout" },		\
	{ PCI_ERR_UNC_COMP_ABORT,	"Completer Abort" },		\
	{ PCI_ERR_UNC_UNX_COMP,		"Unexpected Completion" },	\
	{ PCI_ERR_UNC_RX_OVER,		"Receiver Overflow" },		\
	{ PCI_ERR_UNC_MALF_TLP,		"Malformed TLP" },		\
	{ PCI_ERR_UNC_ECRC,		"ECRC Error" },			\
	{ PCI_ERR_UNC_UNSUP,		"Unsupported Request Error" },	\
	{ PCI_ERR_UNC_ACSV,		"ACS Violation" },		\
	{ PCI_ERR_UNC_INTN,		"Uncorrectable Internal Error" },\
	{ PCI_ERR_UNC_MCBTLP,		"MC Blocked TLP" },		\
	{ PCI_ERR_UNC_ATOMEG,		"AtomicOp Egress Blocked" },	\
	{ PCI_ERR_UNC_TLPPRE,		"TLP Prefix Blocked Error" }
336
337TRACE_EVENT(aer_event,
338 TP_PROTO(const char *dev_name,
339 const u32 status,
340 const u8 severity,
341 const u8 tlp_header_valid,
342 struct pcie_tlp_log *tlp),
343
344 TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp),
345
346 TP_STRUCT__entry(
347 __string( dev_name, dev_name )
348 __field( u32, status )
349 __field( u8, severity )
350 __field( u8, tlp_header_valid)
351 __array( u32, tlp_header, PCIE_STD_MAX_TLP_HEADERLOG)
352 ),
353
354 TP_fast_assign(
355 __assign_str(dev_name);
356 __entry->status = status;
357 __entry->severity = severity;
358 __entry->tlp_header_valid = tlp_header_valid;
359 if (tlp_header_valid) {
360 int i;
361
362 for (i = 0; i < PCIE_STD_MAX_TLP_HEADERLOG; i++)
363 __entry->tlp_header[i] = tlp->dw[i];
364 }
365 ),
366
367 TP_printk("%s PCIe Bus Error: severity=%s, %s, TLP Header=%s\n",
368 __get_str(dev_name),
369 __entry->severity == AER_CORRECTABLE ? "Corrected" :
370 __entry->severity == AER_FATAL ?
371 "Fatal" : "Uncorrected, non-fatal",
372 __entry->severity == AER_CORRECTABLE ?
373 __print_flags(__entry->status, "|", aer_correctable_errors) :
374 __print_flags(__entry->status, "|", aer_uncorrectable_errors),
375 __entry->tlp_header_valid ?
376 __print_array(__entry->tlp_header, PCIE_STD_MAX_TLP_HEADERLOG, 4) :
377 "Not available")
378);
379#endif /* CONFIG_PCIEAER */
380#endif /* _TRACE_HW_EVENT_MC_H */
381
382/* This part must be outside protection */
383#include <trace/define_trace.h>