Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: rework TDR in scheduler (v2)

Add two callbacks to the scheduler to maintain jobs; they are invoked
for job timeout calculation. TDR now measures the time gap starting
from when a job is processed by the hw.

v2:
fix typo

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Chunming Zhou <david1.zhou@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by Monk Liu and committed by Alex Deucher
0de2479c cccd9bce

+62 -1
+1
drivers/gpu/drm/amd/amdgpu/amdgpu.h
··· 754 754 int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring, 755 755 struct amd_sched_entity *entity, void *owner, 756 756 struct fence **f); 757 + void amdgpu_job_timeout_func(struct work_struct *work); 757 758 758 759 struct amdgpu_ring { 759 760 struct amdgpu_device *adev;
+1
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
··· 871 871 872 872 r = amd_sched_job_init(&job->base, &ring->sched, 873 873 &p->ctx->rings[ring->idx].entity, 874 + amdgpu_job_timeout_func, 874 875 p->filp, &fence); 875 876 if (r) { 876 877 amdgpu_job_free(job);
+15 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
··· 34 34 kfree(job); 35 35 } 36 36 37 + void amdgpu_job_timeout_func(struct work_struct *work) 38 + { 39 + struct amdgpu_job *job = container_of(work, struct amdgpu_job, base.work_tdr.work); 40 + DRM_ERROR("ring %s timeout, last signaled seq=%u, last emitted seq=%u\n", 41 + job->base.sched->name, 42 + (uint32_t)atomic_read(&job->ring->fence_drv.last_seq), 43 + job->ring->fence_drv.sync_seq); 44 + } 45 + 37 46 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, 38 47 struct amdgpu_job **job) 39 48 { ··· 112 103 if (!f) 113 104 return -EINVAL; 114 105 115 - r = amd_sched_job_init(&job->base, &ring->sched, entity, owner, &fence); 106 + r = amd_sched_job_init(&job->base, &ring->sched, 107 + entity, owner, 108 + amdgpu_job_timeout_func, 109 + &fence); 116 110 if (r) 117 111 return r; 118 112 ··· 192 180 struct amd_sched_backend_ops amdgpu_sched_ops = { 193 181 .dependency = amdgpu_job_dependency, 194 182 .run_job = amdgpu_job_run, 183 + .begin_job = amd_sched_job_begin, 184 + .finish_job = amd_sched_job_finish, 195 185 };
+37
drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
··· 324 324 schedule_work(&job->work_free_job); 325 325 } 326 326 327 + /* job_finish is called after hw fence signaled, and 328 + * the job had already been deleted from ring_mirror_list 329 + */ 330 + void amd_sched_job_finish(struct amd_sched_job *s_job) 331 + { 332 + struct amd_sched_job *next; 333 + struct amd_gpu_scheduler *sched = s_job->sched; 334 + 335 + if (sched->timeout != MAX_SCHEDULE_TIMEOUT) { 336 + cancel_delayed_work(&s_job->work_tdr); /*TODO: how to deal the case that tdr is running */ 337 + 338 + /* queue TDR for next job */ 339 + next = list_first_entry_or_null(&sched->ring_mirror_list, 340 + struct amd_sched_job, node); 341 + 342 + if (next) { 343 + INIT_DELAYED_WORK(&next->work_tdr, s_job->timeout_callback); 344 + schedule_delayed_work(&next->work_tdr, sched->timeout); 345 + } 346 + } 347 + } 348 + 349 + void amd_sched_job_begin(struct amd_sched_job *s_job) 350 + { 351 + struct amd_gpu_scheduler *sched = s_job->sched; 352 + 353 + if (sched->timeout != MAX_SCHEDULE_TIMEOUT && 354 + list_first_entry_or_null(&sched->ring_mirror_list, struct amd_sched_job, node) == s_job) 355 + { 356 + INIT_DELAYED_WORK(&s_job->work_tdr, s_job->timeout_callback); 357 + schedule_delayed_work(&s_job->work_tdr, sched->timeout); 358 + } 359 + } 360 + 327 361 /** 328 362 * Submit a job to the job queue 329 363 * ··· 381 347 int amd_sched_job_init(struct amd_sched_job *job, 382 348 struct amd_gpu_scheduler *sched, 383 349 struct amd_sched_entity *entity, 350 + void (*timeout_cb)(struct work_struct *work), 384 351 void *owner, struct fence **fence) 385 352 { 386 353 INIT_LIST_HEAD(&job->node); ··· 392 357 return -ENOMEM; 393 358 394 359 job->s_fence->s_job = job; 360 + job->timeout_callback = timeout_cb; 395 361 396 362 if (fence) 397 363 *fence = &job->s_fence->base; ··· 451 415 /* remove job from ring_mirror_list */ 452 416 spin_lock_irqsave(&sched->job_list_lock, flags); 453 417 list_del_init(&s_fence->s_job->node); 418 + sched->ops->finish_job(s_fence->s_job); 454 
419 spin_unlock_irqrestore(&sched->job_list_lock, flags); 455 420 456 421 amd_sched_fence_signal(s_fence);
+7
drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
··· 85 85 struct fence_cb cb_free_job; 86 86 struct work_struct work_free_job; 87 87 struct list_head node; 88 + struct delayed_work work_tdr; 89 + void (*timeout_callback) (struct work_struct *work); 88 90 }; 89 91 90 92 extern const struct fence_ops amd_sched_fence_ops; ··· 107 105 struct amd_sched_backend_ops { 108 106 struct fence *(*dependency)(struct amd_sched_job *sched_job); 109 107 struct fence *(*run_job)(struct amd_sched_job *sched_job); 108 + void (*begin_job)(struct amd_sched_job *sched_job); 109 + void (*finish_job)(struct amd_sched_job *sched_job); 110 110 }; 111 111 112 112 enum amd_sched_priority { ··· 154 150 int amd_sched_job_init(struct amd_sched_job *job, 155 151 struct amd_gpu_scheduler *sched, 156 152 struct amd_sched_entity *entity, 153 + void (*timeout_cb)(struct work_struct *work), 157 154 void *owner, struct fence **fence); 158 155 void amd_sched_job_pre_schedule(struct amd_gpu_scheduler *sched , 159 156 struct amd_sched_job *s_job); 157 + void amd_sched_job_finish(struct amd_sched_job *s_job); 158 + void amd_sched_job_begin(struct amd_sched_job *s_job); 160 159 #endif
+1
drivers/gpu/drm/amd/scheduler/sched_fence.c
··· 63 63 unsigned long flags; 64 64 spin_lock_irqsave(&sched->job_list_lock, flags); 65 65 list_add_tail(&s_job->node, &sched->ring_mirror_list); 66 + sched->ops->begin_job(s_job); 66 67 spin_unlock_irqrestore(&sched->job_list_lock, flags); 67 68 } 68 69