Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe: Add support to handle hardware errors

Gfx device reports two classes of errors: uncorrectable and
correctable. Depending on the severity, uncorrectable errors are further
classified as Non-Fatal or Fatal.

Correctable and Non-Fatal errors: These errors are reported as MSI. Bits in
the Master Interrupt Register indicate the class of the error.
The source of the error is then read from the Device Error Source
Register.

Fatal errors: These are reported as PCIe errors.
When a PCIe error is asserted, the OS will perform an SBR (Secondary
Bus Reset), which causes the driver to reload. The error registers are
sticky and the values are maintained through SBR.

Add basic support to handle these errors.

Bspec: 50875, 53073, 53074, 53075, 53076

v2: Format commit message (Umesh)
v3: fix documentation (Stuart)

Cc: Stuart Summers <stuart.summers@intel.com>
Co-developed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Signed-off-by: Riana Tauro <riana.tauro@intel.com>
Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Link: https://lore.kernel.org/r/20250826063419.3022216-9-riana.tauro@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>

authored by

Riana Tauro and committed by
Rodrigo Vivi
0a2a873d f646c9f9

+145
+1
drivers/gpu/drm/xe/Makefile
··· 82 82 xe_hw_engine.o \ 83 83 xe_hw_engine_class_sysfs.o \ 84 84 xe_hw_engine_group.o \ 85 + xe_hw_error.o \ 85 86 xe_hw_fence.o \ 86 87 xe_irq.o \ 87 88 xe_lrc.o \
+15
drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
··· 1 + /* SPDX-License-Identifier: MIT */ 2 + /* 3 + * Copyright © 2025 Intel Corporation 4 + */ 5 + 6 + #ifndef _XE_HW_ERROR_REGS_H_ 7 + #define _XE_HW_ERROR_REGS_H_ 8 + 9 + #define DEV_ERR_STAT_NONFATAL 0x100178 10 + #define DEV_ERR_STAT_CORRECTABLE 0x10017c 11 + #define DEV_ERR_STAT_REG(x) XE_REG(_PICK_EVEN((x), \ 12 + DEV_ERR_STAT_CORRECTABLE, \ 13 + DEV_ERR_STAT_NONFATAL)) 14 + 15 + #endif
+1
drivers/gpu/drm/xe/regs/xe_irq_regs.h
··· 18 18 #define GFX_MSTR_IRQ XE_REG(0x190010, XE_REG_OPTION_VF) 19 19 #define MASTER_IRQ REG_BIT(31) 20 20 #define GU_MISC_IRQ REG_BIT(29) 21 + #define ERROR_IRQ(x) REG_BIT(26 + (x)) 21 22 #define DISPLAY_IRQ REG_BIT(16) 22 23 #define I2C_IRQ REG_BIT(12) 23 24 #define GT_DW_IRQ(x) REG_BIT(x)
+109
drivers/gpu/drm/xe/xe_hw_error.c
··· 1 + // SPDX-License-Identifier: MIT 2 + /* 3 + * Copyright © 2025 Intel Corporation 4 + */ 5 + 6 + #include "regs/xe_hw_error_regs.h" 7 + #include "regs/xe_irq_regs.h" 8 + 9 + #include "xe_device.h" 10 + #include "xe_hw_error.h" 11 + #include "xe_mmio.h" 12 + 13 + /* Error categories reported by hardware */ 14 + enum hardware_error { 15 + HARDWARE_ERROR_CORRECTABLE = 0, 16 + HARDWARE_ERROR_NONFATAL = 1, 17 + HARDWARE_ERROR_FATAL = 2, 18 + HARDWARE_ERROR_MAX, 19 + }; 20 + 21 + static const char *hw_error_to_str(const enum hardware_error hw_err) 22 + { 23 + switch (hw_err) { 24 + case HARDWARE_ERROR_CORRECTABLE: 25 + return "CORRECTABLE"; 26 + case HARDWARE_ERROR_NONFATAL: 27 + return "NONFATAL"; 28 + case HARDWARE_ERROR_FATAL: 29 + return "FATAL"; 30 + default: 31 + return "UNKNOWN"; 32 + } 33 + } 34 + 35 + static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err) 36 + { 37 + const char *hw_err_str = hw_error_to_str(hw_err); 38 + struct xe_device *xe = tile_to_xe(tile); 39 + unsigned long flags; 40 + u32 err_src; 41 + 42 + if (xe->info.platform != XE_BATTLEMAGE) 43 + return; 44 + 45 + spin_lock_irqsave(&xe->irq.lock, flags); 46 + err_src = xe_mmio_read32(&tile->mmio, DEV_ERR_STAT_REG(hw_err)); 47 + if (!err_src) { 48 + drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported DEV_ERR_STAT_%s blank!\n", 49 + tile->id, hw_err_str); 50 + goto unlock; 51 + } 52 + 53 + /* TODO: Process errors per source */ 54 + 55 + xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src); 56 + 57 + unlock: 58 + spin_unlock_irqrestore(&xe->irq.lock, flags); 59 + } 60 + 61 + /** 62 + * xe_hw_error_irq_handler - irq handling for hw errors 63 + * @tile: tile instance 64 + * @master_ctl: value read from master interrupt register 65 + * 66 + * Xe platforms add three error bits to the master interrupt register to support error handling. 67 + * These three bits are used to convey the class of error FATAL, NONFATAL, or CORRECTABLE. 
68 + * To process the interrupt, determine the source of error by reading the Device Error Source 69 + * Register that corresponds to the class of error being serviced. 70 + */ 71 + void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl) 72 + { 73 + enum hardware_error hw_err; 74 + 75 + for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++) 76 + if (master_ctl & ERROR_IRQ(hw_err)) 77 + hw_error_source_handler(tile, hw_err); 78 + } 79 + 80 + /* 81 + * Process hardware errors during boot 82 + */ 83 + static void process_hw_errors(struct xe_device *xe) 84 + { 85 + struct xe_tile *tile; 86 + u32 master_ctl; 87 + u8 id; 88 + 89 + for_each_tile(tile, xe, id) { 90 + master_ctl = xe_mmio_read32(&tile->mmio, GFX_MSTR_IRQ); 91 + xe_hw_error_irq_handler(tile, master_ctl); 92 + xe_mmio_write32(&tile->mmio, GFX_MSTR_IRQ, master_ctl); 93 + } 94 + } 95 + 96 + /** 97 + * xe_hw_error_init - Initialize hw errors 98 + * @xe: xe device instance 99 + * 100 + * Initialize and check for errors that occurred during boot 101 + * prior to driver load 102 + */ 103 + void xe_hw_error_init(struct xe_device *xe) 104 + { 105 + if (!IS_DGFX(xe) || IS_SRIOV_VF(xe)) 106 + return; 107 + 108 + process_hw_errors(xe); 109 + }
+15
drivers/gpu/drm/xe/xe_hw_error.h
··· 1 + /* SPDX-License-Identifier: MIT */ 2 + /* 3 + * Copyright © 2025 Intel Corporation 4 + */ 5 + #ifndef XE_HW_ERROR_H_ 6 + #define XE_HW_ERROR_H_ 7 + 8 + #include <linux/types.h> 9 + 10 + struct xe_tile; 11 + struct xe_device; 12 + 13 + void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl); 14 + void xe_hw_error_init(struct xe_device *xe); 15 + #endif
+4
drivers/gpu/drm/xe/xe_irq.c
··· 18 18 #include "xe_gt.h" 19 19 #include "xe_guc.h" 20 20 #include "xe_hw_engine.h" 21 + #include "xe_hw_error.h" 21 22 #include "xe_i2c.h" 22 23 #include "xe_memirq.h" 23 24 #include "xe_mmio.h" ··· 469 468 xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl); 470 469 471 470 gt_irq_handler(tile, master_ctl, intr_dw, identity); 471 + xe_hw_error_irq_handler(tile, master_ctl); 472 472 473 473 /* 474 474 * Display interrupts (including display backlight operations ··· 757 755 unsigned int irq_flags = PCI_IRQ_MSI; 758 756 int nvec = 1; 759 757 int err; 758 + 759 + xe_hw_error_init(xe); 760 760 761 761 xe_irq_reset(xe); 762 762