Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/msm: print offender task name on hangcheck recovery

Track the PID per submit so that we can print the name of the task that
submitted the batch that caused the GPU to hang.

Signed-off-by: Rob Clark <robdclark@gmail.com>

Rob Clark 4816b626 40e6815b

+22 -4
+1
drivers/gpu/drm/msm/msm_gem.h
··· 86 86 struct list_head bo_list; 87 87 struct ww_acquire_ctx ticket; 88 88 struct fence *fence; 89 + struct pid *pid; /* submitting process */ 89 90 bool valid; /* true if no cmdstream patching needed */ 90 91 unsigned int nr_cmds; 91 92 unsigned int nr_bos;
+2
drivers/gpu/drm/msm/msm_gem_submit.c
··· 45 45 46 46 submit->dev = dev; 47 47 submit->gpu = gpu; 48 + submit->pid = get_pid(task_pid(current)); 48 49 49 50 /* initially, until copy_from_user() and bo lookup succeeds: */ 50 51 submit->nr_bos = 0; ··· 61 60 { 62 61 fence_put(submit->fence); 63 62 list_del(&submit->node); 63 + put_pid(submit->pid); 64 64 kfree(submit); 65 65 } 66 66
+19 -4
drivers/gpu/drm/msm/msm_gpu.c
··· 272 272 { 273 273 struct msm_gpu *gpu = container_of(work, struct msm_gpu, recover_work); 274 274 struct drm_device *dev = gpu->dev; 275 + struct msm_gem_submit *submit; 275 276 uint32_t fence = gpu->funcs->last_fence(gpu); 276 - 277 - dev_err(dev->dev, "%s: hangcheck recover!\n", gpu->name); 278 277 279 278 msm_update_fence(gpu->fctx, fence + 1); 280 279 281 280 mutex_lock(&dev->struct_mutex); 282 - if (msm_gpu_active(gpu)) { 283 - struct msm_gem_submit *submit; 284 281 282 + dev_err(dev->dev, "%s: hangcheck recover!\n", gpu->name); 283 + list_for_each_entry(submit, &gpu->submit_list, node) { 284 + if (submit->fence->seqno == (fence + 1)) { 285 + struct task_struct *task; 286 + 287 + rcu_read_lock(); 288 + task = pid_task(submit->pid, PIDTYPE_PID); 289 + if (task) { 290 + dev_err(dev->dev, "%s: offending task: %s\n", 291 + gpu->name, task->comm); 292 + } 293 + rcu_read_unlock(); 294 + break; 295 + } 296 + } 297 + 298 + if (msm_gpu_active(gpu)) { 285 299 /* retire completed submits, plus the one that hung: */ 286 300 retire_submits(gpu); 287 301 ··· 307 293 gpu->funcs->submit(gpu, submit, NULL); 308 294 } 309 295 } 296 + 310 297 mutex_unlock(&dev->struct_mutex); 311 298 312 299 msm_gpu_retire(gpu);