Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: rework TDR in scheduler (v2)

Add two callbacks to the scheduler to maintain jobs; they are invoked
for job timeout calculation. TDR now measures the time gap starting
from when a job is processed by the hw.

v2:
fix typo

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Chunming Zhou <david1.zhou@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by Monk Liu and committed by Alex Deucher
0de2479c cccd9bce

+62 -1
+1
drivers/gpu/drm/amd/amdgpu/amdgpu.h
··· 754 754 int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring, 755 755 struct amd_sched_entity *entity, void *owner, 756 756 struct fence **f); 757 + void amdgpu_job_timeout_func(struct work_struct *work); 757 758 758 759 struct amdgpu_ring { 759 760 struct amdgpu_device *adev;
+1
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
··· 871 871 872 872 r = amd_sched_job_init(&job->base, &ring->sched, 873 873 &p->ctx->rings[ring->idx].entity, 874 + amdgpu_job_timeout_func, 874 875 p->filp, &fence); 875 876 if (r) { 876 877 amdgpu_job_free(job);
+15 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
··· 34 34 kfree(job); 35 35 } 36 36 37 + void amdgpu_job_timeout_func(struct work_struct *work) 38 + { 39 + struct amdgpu_job *job = container_of(work, struct amdgpu_job, base.work_tdr.work); 40 + DRM_ERROR("ring %s timeout, last signaled seq=%u, last emitted seq=%u\n", 41 + job->base.sched->name, 42 + (uint32_t)atomic_read(&job->ring->fence_drv.last_seq), 43 + job->ring->fence_drv.sync_seq); 44 + } 45 + 37 46 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, 38 47 struct amdgpu_job **job) 39 48 { ··· 112 103 if (!f) 113 104 return -EINVAL; 114 105 115 - r = amd_sched_job_init(&job->base, &ring->sched, entity, owner, &fence); 106 + r = amd_sched_job_init(&job->base, &ring->sched, 107 + entity, owner, 108 + amdgpu_job_timeout_func, 109 + &fence); 116 110 if (r) 117 111 return r; 118 112 ··· 192 180 struct amd_sched_backend_ops amdgpu_sched_ops = { 193 181 .dependency = amdgpu_job_dependency, 194 182 .run_job = amdgpu_job_run, 183 + .begin_job = amd_sched_job_begin, 184 + .finish_job = amd_sched_job_finish, 195 185 };
+37
drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
··· 324 324 schedule_work(&job->work_free_job); 325 325 } 326 326 327 + /* job_finish is called after hw fence signaled, and 328 + * the job had already been deleted from ring_mirror_list 329 + */ 330 + void amd_sched_job_finish(struct amd_sched_job *s_job) 331 + { 332 + struct amd_sched_job *next; 333 + struct amd_gpu_scheduler *sched = s_job->sched; 334 + 335 + if (sched->timeout != MAX_SCHEDULE_TIMEOUT) { 336 + cancel_delayed_work(&s_job->work_tdr); /*TODO: how to deal the case that tdr is running */ 337 + 338 + /* queue TDR for next job */ 339 + next = list_first_entry_or_null(&sched->ring_mirror_list, 340 + struct amd_sched_job, node); 341 + 342 + if (next) { 343 + INIT_DELAYED_WORK(&next->work_tdr, s_job->timeout_callback); 344 + schedule_delayed_work(&next->work_tdr, sched->timeout); 345 + } 346 + } 347 + } 348 + 349 + void amd_sched_job_begin(struct amd_sched_job *s_job) 350 + { 351 + struct amd_gpu_scheduler *sched = s_job->sched; 352 + 353 + if (sched->timeout != MAX_SCHEDULE_TIMEOUT && 354 + list_first_entry_or_null(&sched->ring_mirror_list, struct amd_sched_job, node) == s_job) 355 + { 356 + INIT_DELAYED_WORK(&s_job->work_tdr, s_job->timeout_callback); 357 + schedule_delayed_work(&s_job->work_tdr, sched->timeout); 358 + } 359 + } 360 + 327 361 /** 328 362 * Submit a job to the job queue 329 363 * ··· 381 347 int amd_sched_job_init(struct amd_sched_job *job, 382 348 struct amd_gpu_scheduler *sched, 383 349 struct amd_sched_entity *entity, 350 + void (*timeout_cb)(struct work_struct *work), 384 351 void *owner, struct fence **fence) 385 352 { 386 353 INIT_LIST_HEAD(&job->node); ··· 392 357 return -ENOMEM; 393 358 394 359 job->s_fence->s_job = job; 360 + job->timeout_callback = timeout_cb; 395 361 396 362 if (fence) 397 363 *fence = &job->s_fence->base; ··· 451 415 /* remove job from ring_mirror_list */ 452 416 spin_lock_irqsave(&sched->job_list_lock, flags); 453 417 list_del_init(&s_fence->s_job->node); 418 + sched->ops->finish_job(s_fence->s_job); 454 
419 spin_unlock_irqrestore(&sched->job_list_lock, flags); 455 420 456 421 amd_sched_fence_signal(s_fence);
+7
drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
··· 85 85 struct fence_cb cb_free_job; 86 86 struct work_struct work_free_job; 87 87 struct list_head node; 88 + struct delayed_work work_tdr; 89 + void (*timeout_callback) (struct work_struct *work); 88 90 }; 89 91 90 92 extern const struct fence_ops amd_sched_fence_ops; ··· 107 105 struct amd_sched_backend_ops { 108 106 struct fence *(*dependency)(struct amd_sched_job *sched_job); 109 107 struct fence *(*run_job)(struct amd_sched_job *sched_job); 108 + void (*begin_job)(struct amd_sched_job *sched_job); 109 + void (*finish_job)(struct amd_sched_job *sched_job); 110 110 }; 111 111 112 112 enum amd_sched_priority { ··· 154 150 int amd_sched_job_init(struct amd_sched_job *job, 155 151 struct amd_gpu_scheduler *sched, 156 152 struct amd_sched_entity *entity, 153 + void (*timeout_cb)(struct work_struct *work), 157 154 void *owner, struct fence **fence); 158 155 void amd_sched_job_pre_schedule(struct amd_gpu_scheduler *sched , 159 156 struct amd_sched_job *s_job); 157 + void amd_sched_job_finish(struct amd_sched_job *s_job); 158 + void amd_sched_job_begin(struct amd_sched_job *s_job); 160 159 #endif
+1
drivers/gpu/drm/amd/scheduler/sched_fence.c
··· 63 63 unsigned long flags; 64 64 spin_lock_irqsave(&sched->job_list_lock, flags); 65 65 list_add_tail(&s_job->node, &sched->ring_mirror_list); 66 + sched->ops->begin_job(s_job); 66 67 spin_unlock_irqrestore(&sched->job_list_lock, flags); 67 68 } 68 69