Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm: Create a task info option for wedge events

When a device gets wedged, it might be caused by a guilty application.
For userspace, knowing which task was involved can be useful in some
situations, such as implementing a policy, logging, or giving the
compositor a chance to let the user know which task was involved in the
problem. This is an optional argument; when the task info is not
available, the PID and TASK strings won't appear in the event string.

Sometimes just the PID isn't enough, given that the task might already be
dead by the time userspace tries to look up that PID's name, so to make
life easier, also report the task's name in the user
event.

Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-by: Raag Jadav <raag.jadav@intel.com>
Acked-by: Christian König <christian.koenig@amd.com>
Link: https://lore.kernel.org/r/20250617124949.2151549-4-andrealmeid@igalia.com
Signed-off-by: André Almeida <andrealmeid@igalia.com>

+34 -9
+1 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 6364 6364 atomic_set(&adev->reset_domain->reset_res, r); 6365 6365 6366 6366 if (!r) 6367 - drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6367 + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL); 6368 6368 6369 6369 return r; 6370 6370 }
+1 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
··· 164 164 if (amdgpu_ring_sched_ready(ring)) 165 165 drm_sched_start(&ring->sched, 0); 166 166 dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name); 167 - drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 167 + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL); 168 168 goto exit; 169 169 } 170 170 dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name);
+17 -4
drivers/gpu/drm/drm_drv.c
··· 35 35 #include <linux/moduleparam.h> 36 36 #include <linux/mount.h> 37 37 #include <linux/pseudo_fs.h> 38 + #include <linux/sched.h> 38 39 #include <linux/slab.h> 39 40 #include <linux/sprintf.h> 40 41 #include <linux/srcu.h> ··· 540 539 } 541 540 } 542 541 542 + #define WEDGE_STR_LEN 32 543 + #define PID_STR_LEN 15 544 + #define COMM_STR_LEN (TASK_COMM_LEN + 5) 545 + 543 546 /** 544 547 * drm_dev_wedged_event - generate a device wedged uevent 545 548 * @dev: DRM device 546 549 * @method: method(s) to be used for recovery 550 + * @info: optional information about the guilty task 547 551 * 548 552 * This generates a device wedged uevent for the DRM device specified by @dev. 549 553 * Recovery @method\(s) of choice will be sent in the uevent environment as ··· 561 555 * 562 556 * Returns: 0 on success, negative error code otherwise. 563 557 */ 564 - int drm_dev_wedged_event(struct drm_device *dev, unsigned long method) 558 + int drm_dev_wedged_event(struct drm_device *dev, unsigned long method, 559 + struct drm_wedge_task_info *info) 565 560 { 561 + char event_string[WEDGE_STR_LEN], pid_string[PID_STR_LEN], comm_string[COMM_STR_LEN]; 562 + char *envp[] = { event_string, NULL, NULL, NULL }; 566 563 const char *recovery = NULL; 567 564 unsigned int len, opt; 568 - /* Event string length up to 28+ characters with available methods */ 569 - char event_string[32]; 570 - char *envp[] = { event_string, NULL }; 571 565 572 566 len = scnprintf(event_string, sizeof(event_string), "%s", "WEDGED="); 573 567 ··· 588 582 589 583 drm_info(dev, "device wedged, %s\n", method == DRM_WEDGE_RECOVERY_NONE ? 
590 584 "but recovered through reset" : "needs recovery"); 585 + 586 + if (info && (info->comm[0] != '\0') && (info->pid >= 0)) { 587 + snprintf(pid_string, sizeof(pid_string), "PID=%u", info->pid); 588 + snprintf(comm_string, sizeof(comm_string), "TASK=%s", info->comm); 589 + envp[1] = pid_string; 590 + envp[2] = comm_string; 591 + } 591 592 592 593 return kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp); 593 594 }
+2 -1
drivers/gpu/drm/i915/gt/intel_reset.c
··· 1448 1448 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event); 1449 1449 else 1450 1450 drm_dev_wedged_event(&gt->i915->drm, 1451 - DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET); 1451 + DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET, 1452 + NULL); 1452 1453 } 1453 1454 1454 1455 /**
+2 -1
drivers/gpu/drm/xe/xe_device.c
··· 1168 1168 1169 1169 /* Notify userspace of wedged device */ 1170 1170 drm_dev_wedged_event(&xe->drm, 1171 - DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET); 1171 + DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET, 1172 + NULL); 1172 1173 } 1173 1174 1174 1175 for_each_gt(gt, xe, id)
+9
include/drm/drm_device.h
··· 5 5 #include <linux/kref.h> 6 6 #include <linux/mutex.h> 7 7 #include <linux/idr.h> 8 + #include <linux/sched.h> 8 9 9 10 #include <drm/drm_mode_config.h> 10 11 ··· 30 29 #define DRM_WEDGE_RECOVERY_NONE BIT(0) /* optional telemetry collection */ 31 30 #define DRM_WEDGE_RECOVERY_REBIND BIT(1) /* unbind + bind driver */ 32 31 #define DRM_WEDGE_RECOVERY_BUS_RESET BIT(2) /* unbind + reset bus device + bind */ 32 + 33 + /** 34 + * struct drm_wedge_task_info - information about the guilty task of a wedge dev 35 + */ 36 + struct drm_wedge_task_info { 37 + pid_t pid; 38 + char comm[TASK_COMM_LEN]; 39 + }; 33 40 34 41 /** 35 42 * enum switch_power_state - power state of drm device
+2 -1
include/drm/drm_drv.h
··· 487 487 bool drm_dev_enter(struct drm_device *dev, int *idx); 488 488 void drm_dev_exit(int idx); 489 489 void drm_dev_unplug(struct drm_device *dev); 490 - int drm_dev_wedged_event(struct drm_device *dev, unsigned long method); 490 + int drm_dev_wedged_event(struct drm_device *dev, unsigned long method, 491 + struct drm_wedge_task_info *info); 491 492 492 493 /** 493 494 * drm_dev_is_unplugged - is a DRM device unplugged