Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/xe: Use fault injection infrastructure to find issues at probe time

The kernel fault injection infrastructure is used to test proper error
handling during probe. The return code of the functions using
ALLOW_ERROR_INJECTION() can be conditionally modified at runtime by
tuning some debugfs entries. This requires CONFIG_FUNCTION_ERROR_INJECTION
(among others).

One way to use fault injection at probe time by making each of those
functions fail one at a time is:

FAILTYPE=fail_function
DEVICE="0000:00:08.0" # depends on the system
ERRNO=-12 # -ENOMEM, can depend on the function

echo N > /sys/kernel/debug/$FAILTYPE/task-filter
echo 100 > /sys/kernel/debug/$FAILTYPE/probability
echo 0 > /sys/kernel/debug/$FAILTYPE/interval
echo -1 > /sys/kernel/debug/$FAILTYPE/times
echo 0 > /sys/kernel/debug/$FAILTYPE/space
echo 1 > /sys/kernel/debug/$FAILTYPE/verbose

modprobe xe
echo $DEVICE > /sys/bus/pci/drivers/xe/unbind

grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | \
cut -d ' ' -f 1 | while read -r FUNCTION ; do
echo "Injecting fault in $FUNCTION"
echo "" > /sys/kernel/debug/$FAILTYPE/inject
echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
echo $DEVICE > /sys/bus/pci/drivers/xe/bind
done

rmmod xe

It will also be integrated into IGT for systematic execution by CI.

v2: Wrappers are not needed in the cases covered by this patch, so
remove them and use ALLOW_ERROR_INJECTION() directly.

v3: Document the use of fault injection at probe time in xe_pci_probe
and refer to it where ALLOW_ERROR_INJECTION() is used.

Signed-off-by: Francois Dugast <francois.dugast@intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240927151207.399354-1-francois.dugast@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>

authored by

Francois Dugast and committed by
Rodrigo Vivi
91b2c42c 11bfc4a2

+49
+3
drivers/gpu/drm/xe/xe_device.c
··· 6 6 #include "xe_device.h" 7 7 8 8 #include <linux/delay.h> 9 + #include <linux/fault-inject.h> 9 10 #include <linux/units.h> 10 11 11 12 #include <drm/drm_aperture.h> ··· 383 382 err: 384 383 return ERR_PTR(err); 385 384 } 385 + ALLOW_ERROR_INJECTION(xe_device_create, ERRNO); /* See xe_pci_probe() */ 386 386 387 387 static bool xe_driver_flr_disabled(struct xe_device *xe) 388 388 { ··· 552 550 553 551 return 0; 554 552 } 553 + ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO); /* See xe_pci_probe() */ 555 554 556 555 static void update_device_info(struct xe_device *xe) 557 556 {
+2
drivers/gpu/drm/xe/xe_ggtt.c
··· 5 5 6 6 #include "xe_ggtt.h" 7 7 8 + #include <linux/fault-inject.h> 8 9 #include <linux/io-64-nonatomic-lo-hi.h> 9 10 #include <linux/sizes.h> 10 11 ··· 265 264 266 265 return 0; 267 266 } 267 + ALLOW_ERROR_INJECTION(xe_ggtt_init_early, ERRNO); /* See xe_pci_probe() */ 268 268 269 269 static void xe_ggtt_invalidate(struct xe_ggtt *ggtt); 270 270
+3
drivers/gpu/drm/xe/xe_guc_ads.c
··· 5 5 6 6 #include "xe_guc_ads.h" 7 7 8 + #include <linux/fault-inject.h> 9 + 8 10 #include <drm/drm_managed.h> 9 11 10 12 #include <generated/xe_wa_oob.h> ··· 420 418 421 419 return 0; 422 420 } 421 + ALLOW_ERROR_INJECTION(xe_guc_ads_init, ERRNO); /* See xe_pci_probe() */ 423 422 424 423 /** 425 424 * xe_guc_ads_init_post_hwconfig - initialize ADS post hwconfig load
+2
drivers/gpu/drm/xe/xe_guc_ct.c
··· 8 8 #include <linux/bitfield.h> 9 9 #include <linux/circ_buf.h> 10 10 #include <linux/delay.h> 11 + #include <linux/fault-inject.h> 11 12 12 13 #include <kunit/static_stub.h> 13 14 ··· 210 209 ct->state = XE_GUC_CT_STATE_DISABLED; 211 210 return 0; 212 211 } 212 + ALLOW_ERROR_INJECTION(xe_guc_ct_init, ERRNO); /* See xe_pci_probe() */ 213 213 214 214 #define desc_read(xe_, guc_ctb__, field_) \ 215 215 xe_map_rd_field(xe_, &guc_ctb__->desc, 0, \
+3
drivers/gpu/drm/xe/xe_guc_log.c
··· 5 5 6 6 #include "xe_guc_log.h" 7 7 8 + #include <linux/fault-inject.h> 9 + 8 10 #include <drm/drm_managed.h> 9 11 10 12 #include "xe_bo.h" ··· 98 96 99 97 return 0; 100 98 } 99 + ALLOW_ERROR_INJECTION(xe_guc_log_init, ERRNO); /* See xe_pci_probe() */
+2
drivers/gpu/drm/xe/xe_guc_relay.c
··· 5 5 6 6 #include <linux/bitfield.h> 7 7 #include <linux/delay.h> 8 + #include <linux/fault-inject.h> 8 9 9 10 #include <drm/drm_managed.h> 10 11 ··· 356 355 357 356 return drmm_add_action_or_reset(&xe->drm, __fini_relay, relay); 358 357 } 358 + ALLOW_ERROR_INJECTION(xe_guc_relay_init, ERRNO); /* See xe_pci_probe() */ 359 359 360 360 static u32 to_relay_error(int err) 361 361 {
+19
drivers/gpu/drm/xe/xe_pci.c
··· 765 765 pci_set_drvdata(pdev, NULL); 766 766 } 767 767 768 + /* 769 + * Probe the PCI device, initialize various parts of the driver. 770 + * 771 + * Fault injection is used to test the error paths of some initialization 772 + * functions called either directly from xe_pci_probe() or indirectly for 773 + * example through xe_device_probe(). Those functions use the kernel fault 774 + * injection capabilities infrastructure, see 775 + * Documentation/fault-injection/fault-injection.rst for details. The macro 776 + * ALLOW_ERROR_INJECTION() is used to conditionally skip function execution 777 + * at runtime and use a provided return value. The first requirement for 778 + * error injectable functions is proper handling of the error code by the 779 + * caller for recovery, which is always the case here. The second 780 + * requirement is that no state is changed before the first error return. 781 + * It is not strictly fulfilled for all initialization functions using the 782 + * ALLOW_ERROR_INJECTION() macro but this is acceptable because for those 783 + * error cases at probe time, the error code is simply propagated up by the 784 + * caller. Therefore there is no consequence on those specific callers when 785 + * function error injection skips the whole function. 786 + */ 768 787 static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) 769 788 { 770 789 const struct xe_device_desc *desc = (const void *)ent->driver_data;
+2
drivers/gpu/drm/xe/xe_pm.c
··· 5 5 6 6 #include "xe_pm.h" 7 7 8 + #include <linux/fault-inject.h> 8 9 #include <linux/pm_runtime.h> 9 10 10 11 #include <drm/drm_managed.h> ··· 264 263 265 264 return 0; 266 265 } 266 + ALLOW_ERROR_INJECTION(xe_pm_init_early, ERRNO); /* See xe_pci_probe() */ 267 267 268 268 /** 269 269 * xe_pm_init - Initialize Xe Power Management
+3
drivers/gpu/drm/xe/xe_sriov.c
··· 3 3 * Copyright © 2023 Intel Corporation 4 4 */ 5 5 6 + #include <linux/fault-inject.h> 7 + 6 8 #include <drm/drm_managed.h> 7 9 8 10 #include "regs/xe_regs.h" ··· 121 119 122 120 return drmm_add_action_or_reset(&xe->drm, fini_sriov, xe); 123 121 } 122 + ALLOW_ERROR_INJECTION(xe_sriov_init, ERRNO); /* See xe_pci_probe() */ 124 123 125 124 /** 126 125 * xe_sriov_print_info - Print basic SR-IOV information.
+3
drivers/gpu/drm/xe/xe_tile.c
··· 3 3 * Copyright © 2023 Intel Corporation 4 4 */ 5 5 6 + #include <linux/fault-inject.h> 7 + 6 8 #include <drm/drm_managed.h> 7 9 8 10 #include "xe_device.h" ··· 131 129 132 130 return 0; 133 131 } 132 + ALLOW_ERROR_INJECTION(xe_tile_init_early, ERRNO); /* See xe_pci_probe() */ 134 133 135 134 static int tile_ttm_mgr_init(struct xe_tile *tile) 136 135 {
+2
drivers/gpu/drm/xe/xe_uc_fw.c
··· 4 4 */ 5 5 6 6 #include <linux/bitfield.h> 7 + #include <linux/fault-inject.h> 7 8 #include <linux/firmware.h> 8 9 9 10 #include <drm/drm_managed.h> ··· 797 796 798 797 return err; 799 798 } 799 + ALLOW_ERROR_INJECTION(xe_uc_fw_init, ERRNO); /* See xe_pci_probe() */ 800 800 801 801 static u32 uc_fw_ggtt_offset(struct xe_uc_fw *uc_fw) 802 802 {
+2
drivers/gpu/drm/xe/xe_wa.c
··· 8 8 #include <drm/drm_managed.h> 9 9 #include <kunit/visibility.h> 10 10 #include <linux/compiler_types.h> 11 + #include <linux/fault-inject.h> 11 12 12 13 #include <generated/xe_wa_oob.h> 13 14 ··· 851 850 852 851 return 0; 853 852 } 853 + ALLOW_ERROR_INJECTION(xe_wa_init, ERRNO); /* See xe_pci_probe() */ 854 854 855 855 void xe_wa_dump(struct xe_gt *gt, struct drm_printer *p) 856 856 {
+3
drivers/gpu/drm/xe/xe_wopcm.c
··· 5 5 6 6 #include "xe_wopcm.h" 7 7 8 + #include <linux/fault-inject.h> 9 + 8 10 #include "regs/xe_guc_regs.h" 9 11 #include "xe_device.h" 10 12 #include "xe_force_wake.h" ··· 270 268 271 269 return ret; 272 270 } 271 + ALLOW_ERROR_INJECTION(xe_wopcm_init, ERRNO); /* See xe_pci_probe() */