Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors

When PCIe AER is in FW-First mode, the OS should process CXL Protocol
errors from CPER records. Introduce support for handling and logging CXL
Protocol errors.

The defined trace events cxl_aer_uncorrectable_error and
cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse them
to trace FW-First Protocol errors.

Since the CXL code is required to be called from process context and
GHES is in interrupt context, use workqueues for processing.

Similar to CXL CPER event handling, use a kfifo to handle errors, as it
simplifies queue processing by providing lock-free FIFO operations.

Add the ability for the CXL sub-system to register a workqueue to
process CXL CPER protocol errors.

[DJ: return cxl_cper_register_prot_err_work() directly in cxl_ras_init()]

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Li Ming <ming.li@zohomail.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Link: https://patch.msgid.link/20250310223839.31342-2-Smita.KoralahalliChannabasappa@amd.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>

Authored by Smita Koralahalli and committed by Dave Jiang
36f257e3 315c2f0b

+158
+49
drivers/acpi/apei/ghes.c
··· 674 674 schedule_work(&entry->work); 675 675 } 676 676 677 + /* Room for 8 entries */ 678 + #define CXL_CPER_PROT_ERR_FIFO_DEPTH 8 679 + static DEFINE_KFIFO(cxl_cper_prot_err_fifo, struct cxl_cper_prot_err_work_data, 680 + CXL_CPER_PROT_ERR_FIFO_DEPTH); 681 + 682 + /* Synchronize schedule_work() with cxl_cper_prot_err_work changes */ 683 + static DEFINE_SPINLOCK(cxl_cper_prot_err_work_lock); 684 + struct work_struct *cxl_cper_prot_err_work; 685 + 677 686 static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err, 678 687 int severity) 679 688 { ··· 709 700 if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER)) 710 701 pr_warn(FW_WARN "CXL CPER no device serial number\n"); 711 702 703 + guard(spinlock_irqsave)(&cxl_cper_prot_err_work_lock); 704 + 705 + if (!cxl_cper_prot_err_work) 706 + return; 707 + 712 708 switch (prot_err->agent_type) { 713 709 case RCD: 714 710 case DEVICE: ··· 735 721 prot_err->agent_type); 736 722 return; 737 723 } 724 + 725 + if (!kfifo_put(&cxl_cper_prot_err_fifo, wd)) { 726 + pr_err_ratelimited("CXL CPER kfifo overflow\n"); 727 + return; 728 + } 729 + 730 + schedule_work(cxl_cper_prot_err_work); 738 731 #endif 739 732 } 733 + 734 + int cxl_cper_register_prot_err_work(struct work_struct *work) 735 + { 736 + if (cxl_cper_prot_err_work) 737 + return -EINVAL; 738 + 739 + guard(spinlock)(&cxl_cper_prot_err_work_lock); 740 + cxl_cper_prot_err_work = work; 741 + return 0; 742 + } 743 + EXPORT_SYMBOL_NS_GPL(cxl_cper_register_prot_err_work, "CXL"); 744 + 745 + int cxl_cper_unregister_prot_err_work(struct work_struct *work) 746 + { 747 + if (cxl_cper_prot_err_work != work) 748 + return -EINVAL; 749 + 750 + guard(spinlock)(&cxl_cper_prot_err_work_lock); 751 + cxl_cper_prot_err_work = NULL; 752 + return 0; 753 + } 754 + EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_prot_err_work, "CXL"); 755 + 756 + int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd) 757 + { 758 + return kfifo_get(&cxl_cper_prot_err_fifo, wd); 759 
+ } 760 + EXPORT_SYMBOL_NS_GPL(cxl_cper_prot_err_kfifo_get, "CXL"); 740 761 741 762 /* Room for 8 entries for each of the 4 event log queues */ 742 763 #define CXL_CPER_FIFO_DEPTH 32
+1
drivers/cxl/core/Makefile
··· 14 14 cxl_core-y += hdm.o 15 15 cxl_core-y += pmu.o 16 16 cxl_core-y += cdat.o 17 + cxl_core-y += ras.o 17 18 cxl_core-$(CONFIG_TRACING) += trace.o 18 19 cxl_core-$(CONFIG_CXL_REGION) += region.o
+3
drivers/cxl/core/core.h
··· 115 115 int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, 116 116 struct access_coordinate *c); 117 117 118 + int cxl_ras_init(void); 119 + void cxl_ras_exit(void); 120 + 118 121 #endif /* __CXL_CORE_H__ */
+7
drivers/cxl/core/port.c
··· 2339 2339 if (rc) 2340 2340 goto err_region; 2341 2341 2342 + rc = cxl_ras_init(); 2343 + if (rc) 2344 + goto err_ras; 2345 + 2342 2346 return 0; 2343 2347 2348 + err_ras: 2349 + cxl_region_exit(); 2344 2350 err_region: 2345 2351 bus_unregister(&cxl_bus_type); 2346 2352 err_bus: ··· 2358 2352 2359 2353 static void cxl_core_exit(void) 2360 2354 { 2355 + cxl_ras_exit(); 2361 2356 cxl_region_exit(); 2362 2357 bus_unregister(&cxl_bus_type); 2363 2358 destroy_workqueue(cxl_bus_wq);
+82
drivers/cxl/core/ras.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* Copyright(c) 2025 AMD Corporation. All rights reserved. */ 3 + 4 + #include <linux/pci.h> 5 + #include <linux/aer.h> 6 + #include <cxl/event.h> 7 + #include <cxlmem.h> 8 + #include "trace.h" 9 + 10 + static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev, 11 + struct cxl_ras_capability_regs ras_cap) 12 + { 13 + u32 status = ras_cap.cor_status & ~ras_cap.cor_mask; 14 + struct cxl_dev_state *cxlds; 15 + 16 + cxlds = pci_get_drvdata(pdev); 17 + if (!cxlds) 18 + return; 19 + 20 + trace_cxl_aer_correctable_error(cxlds->cxlmd, status); 21 + } 22 + 23 + static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev, 24 + struct cxl_ras_capability_regs ras_cap) 25 + { 26 + u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask; 27 + struct cxl_dev_state *cxlds; 28 + u32 fe; 29 + 30 + cxlds = pci_get_drvdata(pdev); 31 + if (!cxlds) 32 + return; 33 + 34 + if (hweight32(status) > 1) 35 + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, 36 + ras_cap.cap_control)); 37 + else 38 + fe = status; 39 + 40 + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, 41 + ras_cap.header_log); 42 + } 43 + 44 + static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data) 45 + { 46 + unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device, 47 + data->prot_err.agent_addr.function); 48 + struct pci_dev *pdev __free(pci_dev_put) = 49 + pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment, 50 + data->prot_err.agent_addr.bus, 51 + devfn); 52 + 53 + if (!pdev) 54 + return; 55 + 56 + guard(device)(&pdev->dev); 57 + 58 + if (data->severity == AER_CORRECTABLE) 59 + cxl_cper_trace_corr_prot_err(pdev, data->ras_cap); 60 + else 61 + cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap); 62 + } 63 + 64 + static void cxl_cper_prot_err_work_fn(struct work_struct *work) 65 + { 66 + struct cxl_cper_prot_err_work_data wd; 67 + 68 + while (cxl_cper_prot_err_kfifo_get(&wd)) 69 + cxl_cper_handle_prot_err(&wd); 
70 + } 71 + static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn); 72 + 73 + int cxl_ras_init(void) 74 + { 75 + return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work); 76 + } 77 + 78 + void cxl_ras_exit(void) 79 + { 80 + cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work); 81 + cancel_work_sync(&cxl_cper_prot_err_work); 82 + }
+15
include/cxl/event.h
··· 254 254 int cxl_cper_register_work(struct work_struct *work); 255 255 int cxl_cper_unregister_work(struct work_struct *work); 256 256 int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd); 257 + int cxl_cper_register_prot_err_work(struct work_struct *work); 258 + int cxl_cper_unregister_prot_err_work(struct work_struct *work); 259 + int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd); 257 260 #else 258 261 static inline int cxl_cper_register_work(struct work_struct *work) 259 262 { ··· 268 265 return 0; 269 266 } 270 267 static inline int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd) 268 + { 269 + return 0; 270 + } 271 + static inline int cxl_cper_register_prot_err_work(struct work_struct *work) 272 + { 273 + return 0; 274 + } 275 + static inline int cxl_cper_unregister_prot_err_work(struct work_struct *work) 276 + { 277 + return 0; 278 + } 279 + static inline int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd) 271 280 { 272 281 return 0; 273 282 }
+1
tools/testing/cxl/Kbuild
··· 61 61 cxl_core-y += $(CXL_CORE_SRC)/hdm.o 62 62 cxl_core-y += $(CXL_CORE_SRC)/pmu.o 63 63 cxl_core-y += $(CXL_CORE_SRC)/cdat.o 64 + cxl_core-y += $(CXL_CORE_SRC)/ras.o 64 65 cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o 65 66 cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o 66 67 cxl_core-y += config_check.o