Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe/xe_hw_error: Add fault injection to trigger csc error handler

Add a fault-injection attribute, exposed through debugfs, that triggers
the CSC hardware error handler. The handler wedges the device and enables
runtime survivability mode.

v2: add debugfs only for bmg (Umesh)
v3: do not use csc_fault attribute if debugfs is not enabled
v4: rebase

Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Riana Tauro <riana.tauro@intel.com>
Reviewed-by: Raag Jadav <raag.jadav@intel.com>
Link: https://lore.kernel.org/r/20250826063419.3022216-11-riana.tauro@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>

Authored by Riana Tauro and committed by Rodrigo Vivi
d1f51a4f a7df563b

+16 -1
+5 -1
drivers/gpu/drm/xe/xe_debugfs.c
··· 35 35 #endif 36 36 37 37 DECLARE_FAULT_ATTR(gt_reset_failure); 38 + DECLARE_FAULT_ATTR(inject_csc_hw_error); 38 39 39 40 static void read_residency_counter(struct xe_device *xe, struct xe_mmio *mmio, 40 41 u32 offset, char *name, struct drm_printer *p) ··· 362 361 ARRAY_SIZE(debugfs_list), 363 362 root, minor); 364 363 365 - if (xe->info.platform == XE_BATTLEMAGE) 364 + if (xe->info.platform == XE_BATTLEMAGE) { 366 365 drm_debugfs_create_files(debugfs_residencies, 367 366 ARRAY_SIZE(debugfs_residencies), 368 367 root, minor); 368 + fault_create_debugfs_attr("inject_csc_hw_error", root, 369 + &inject_csc_hw_error); 370 + } 369 371 370 372 debugfs_create_file("forcewake_all", 0400, root, xe, 371 373 &forcewake_all_fops);
+11
drivers/gpu/drm/xe/xe_hw_error.c
··· 3 3 * Copyright © 2025 Intel Corporation 4 4 */ 5 5 6 + #include <linux/fault-inject.h> 7 + 6 8 #include "regs/xe_gsc_regs.h" 7 9 #include "regs/xe_hw_error_regs.h" 8 10 #include "regs/xe_irq_regs.h" ··· 15 13 #include "xe_survivability_mode.h" 16 14 17 15 #define HEC_UNCORR_FW_ERR_BITS 4 16 + extern struct fault_attr inject_csc_hw_error; 18 17 19 18 /* Error categories reported by hardware */ 20 19 enum hardware_error { ··· 44 41 default: 45 42 return "UNKNOWN"; 46 43 } 44 + } 45 + 46 + static bool fault_inject_csc_hw_error(void) 47 + { 48 + return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1); 47 49 } 48 50 49 51 static void csc_hw_error_work(struct work_struct *work) ··· 137 129 void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl) 138 130 { 139 131 enum hardware_error hw_err; 132 + 133 + if (fault_inject_csc_hw_error()) 134 + schedule_work(&tile->csc_hw_error_work); 140 135 141 136 for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++) 142 137 if (master_ctl & ERROR_IRQ(hw_err))