drm/xe/xe_hw_error: Handle CSC Firmware reported Hardware errors

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

Add support to handle CSC firmware reported errors. When CSC firmware
errors are encountered, an error interrupt is received by the GFX device as
an MSI interrupt.

Device Source control registers indicate the source of the error as CSC.
The HEC error status register indicates that the error is firmware reported.
Depending on the type of error, the error cause is written to the HEC
Firmware error register.

On encountering such CSC firmware errors, the graphics device is
non-recoverable from driver context. The only way to recover from these
errors is firmware flash.

System admin/userspace is notified of the necessity of firmware flash
with a combination of vendor-specific drm device wedged uevent, dmesg logs
and runtime survivability sysfs. It is the responsibility of the consumer
to verify all the actions and then trigger a firmware flash using tools
like fwupd.

$ udevadm monitor --property --kernel
monitor will print the received events for:
KERNEL - the kernel uevent

KERNEL[754.709341] change /devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0/drm/card0 (drm)
ACTION=change
DEVPATH=/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0/drm/card0
SUBSYSTEM=drm
WEDGED=vendor-specific
DEVNAME=/dev/dri/card0
DEVTYPE=drm_minor
SEQNUM=5973
MAJOR=226
MINOR=0

Logs

xe 0000:03:00.0: [drm] *ERROR* [Hardware Error]: Tile0 reported NONFATAL error 0x20000
xe 0000:03:00.0: [drm] *ERROR* [Hardware Error]: NONFATAL: HEC Uncorrected FW FD Corruption error reported, bit[2] is set
xe 0000:03:00.0: Runtime Survivability mode enabled
xe 0000:03:00.0: [drm] *ERROR* CRITICAL: Xe has declared device 0000:03:00.0 as wedged.
IOCTLs and executions are blocked. Only a rebind may clear the failure
Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new
xe 0000:03:00.0: [drm] device wedged, needs recovery
xe 0000:03:00.0: Firmware flash required, Please refer to the userspace documentation for more details!

Runtime survivability Sysfs:

/sys/bus/pci/devices/<device>/survivability_mode

v2: use vendor recovery method with
runtime survivability (Christian, Rodrigo, Raag)
v3: move declare wedged to runtime survivability mode (Rodrigo)
v4: update commit message

Signed-off-by: Riana Tauro <riana.tauro@intel.com>
Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Link: https://lore.kernel.org/r/20250826063419.3022216-10-riana.tauro@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>

authored by

Riana Tauro and committed by

Rodrigo Vivi 7 months ago a7df563b 0a2a873d

+75 -3

5 changed files

expand all

drivers

gpu

drm

regs

xe_gsc_regs.h

xe_hw_error_regs.h

xe_device_types.h

xe_hw_error.c

xe_survivability_mode.c

drivers/gpu/drm/xe/regs/xe_gsc_regs.h

··· 13 13 14 14 /* Definitions of GSC H/W registers, bits, etc */ 15 15 16 + #define BMG_GSC_HECI1_BASE 0x373000 17 + 16 18 #define MTL_GSC_HECI1_BASE 0x00116000 17 19 #define MTL_GSC_HECI2_BASE 0x00117000 18 20

+6 -1

drivers/gpu/drm/xe/regs/xe_hw_error_regs.h

··· 6 6 #ifndef _XE_HW_ERROR_REGS_H_ 7 7 #define _XE_HW_ERROR_REGS_H_ 8 8 9 + #define HEC_UNCORR_ERR_STATUS(base) XE_REG((base) + 0x118) 10 + #define UNCORR_FW_REPORTED_ERR BIT(6) 11 + 12 + #define HEC_UNCORR_FW_ERR_DW0(base) XE_REG((base) + 0x124) 13 + 9 14 #define DEV_ERR_STAT_NONFATAL 0x100178 10 15 #define DEV_ERR_STAT_CORRECTABLE 0x10017c 11 16 #define DEV_ERR_STAT_REG(x) XE_REG(_PICK_EVEN((x), \ 12 17 DEV_ERR_STAT_CORRECTABLE, \ 13 18 DEV_ERR_STAT_NONFATAL)) 14 - 19 + #define XE_CSC_ERROR BIT(17) 15 20 #endif

drivers/gpu/drm/xe/xe_device_types.h

··· 192 192 /** @memirq: Memory Based Interrupts. */ 193 193 struct xe_memirq memirq; 194 194 195 + /** @csc_hw_error_work: worker to report CSC HW errors */ 196 + struct work_struct csc_hw_error_work; 197 + 195 198 /** @pcode: tile's PCODE */ 196 199 struct { 197 200 /** @pcode.lock: protecting tile's PCODE mailbox data */

+63 -1

drivers/gpu/drm/xe/xe_hw_error.c

··· 3 3 * Copyright © 2025 Intel Corporation 4 4 */ 5 5 6 + #include "regs/xe_gsc_regs.h" 6 7 #include "regs/xe_hw_error_regs.h" 7 8 #include "regs/xe_irq_regs.h" 8 9 9 10 #include "xe_device.h" 10 11 #include "xe_hw_error.h" 11 12 #include "xe_mmio.h" 13 + #include "xe_survivability_mode.h" 14 + 15 + #define HEC_UNCORR_FW_ERR_BITS 4 12 16 13 17 /* Error categories reported by hardware */ 14 18 enum hardware_error { ··· 20 16 HARDWARE_ERROR_NONFATAL = 1, 21 17 HARDWARE_ERROR_FATAL = 2, 22 18 HARDWARE_ERROR_MAX, 19 + }; 20 + 21 + static const char * const hec_uncorrected_fw_errors[] = { 22 + "Fatal", 23 + "CSE Disabled", 24 + "FD Corruption", 25 + "Data Corruption" 23 26 }; 24 27 25 28 static const char *hw_error_to_str(const enum hardware_error hw_err) ··· 41 30 default: 42 31 return "UNKNOWN"; 43 32 } 33 + } 34 + 35 + static void csc_hw_error_work(struct work_struct *work) 36 + { 37 + struct xe_tile *tile = container_of(work, typeof(*tile), csc_hw_error_work); 38 + struct xe_device *xe = tile_to_xe(tile); 39 + int ret; 40 + 41 + ret = xe_survivability_mode_runtime_enable(xe); 42 + if (ret) 43 + drm_err(&xe->drm, "Failed to enable runtime survivability mode\n"); 44 + } 45 + 46 + static void csc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err) 47 + { 48 + const char *hw_err_str = hw_error_to_str(hw_err); 49 + struct xe_device *xe = tile_to_xe(tile); 50 + struct xe_mmio *mmio = &tile->mmio; 51 + u32 base, err_bit, err_src; 52 + unsigned long fw_err; 53 + 54 + if (xe->info.platform != XE_BATTLEMAGE) 55 + return; 56 + 57 + base = BMG_GSC_HECI1_BASE; 58 + lockdep_assert_held(&xe->irq.lock); 59 + err_src = xe_mmio_read32(mmio, HEC_UNCORR_ERR_STATUS(base)); 60 + if (!err_src) { 61 + drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported HEC_ERR_STATUS_%s blank\n", 62 + tile->id, hw_err_str); 63 + return; 64 + } 65 + 66 + if (err_src & UNCORR_FW_REPORTED_ERR) { 67 + fw_err = xe_mmio_read32(mmio, HEC_UNCORR_FW_ERR_DW0(base)); 68 + 
for_each_set_bit(err_bit, &fw_err, HEC_UNCORR_FW_ERR_BITS) { 69 + drm_err_ratelimited(&xe->drm, HW_ERR 70 + "%s: HEC Uncorrected FW %s error reported, bit[%d] is set\n", 71 + hw_err_str, hec_uncorrected_fw_errors[err_bit], 72 + err_bit); 73 + 74 + schedule_work(&tile->csc_hw_error_work); 75 + } 76 + } 77 + 78 + xe_mmio_write32(mmio, HEC_UNCORR_ERR_STATUS(base), err_src); 44 79 } 45 80 46 81 static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err) ··· 107 50 goto unlock; 108 51 } 109 52 110 - /* TODO: Process errrors per source */ 53 + if (err_src & XE_CSC_ERROR) 54 + csc_hw_error_handler(tile, hw_err); 111 55 112 56 xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src); 113 57 ··· 160 102 */ 161 103 void xe_hw_error_init(struct xe_device *xe) 162 104 { 105 + struct xe_tile *tile = xe_device_get_root_tile(xe); 106 + 163 107 if (!IS_DGFX(xe) || IS_SRIOV_VF(xe)) 164 108 return; 109 + 110 + INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work); 165 111 166 112 process_hw_errors(xe); 167 113 }

+1 -1

drivers/gpu/drm/xe/xe_survivability_mode.c

··· 346 346 347 347 xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_VENDOR); 348 348 xe_device_declare_wedged(xe); 349 - dev_err(&pdev->dev, "Firmware flash required, Refer the userspace documentation for more details!\n"); 349 + dev_err(&pdev->dev, "Firmware flash required, Please refer to the userspace documentation for more details!\n"); 350 350 351 351 return 0; 352 352 }