Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

drm/scheduler: rework job destruction

We now destroy finished jobs from the worker thread to make sure that
we never destroy a job that is currently in timeout processing.
This way we avoid holding the lock around the ring mirror list in
drm_sched_stop, which should solve a deadlock reported by a user.

v2: Remove unused variable.
v4: Move guilty job free into sched code.
v5:
Move the sched->hw_rq_count increment to drm_sched_start to account for
the counter decrement in drm_sched_stop even when we don't call
drm_sched_resubmit_jobs because the guilty job already signaled.
v6: Remove unused variable.
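
The resulting driver-side recovery flow looks roughly like the sketch below
(illustrative only; my_driver_timedout_job and the reset step are placeholders
modeled on the amdgpu/panfrost/v3d hunks in this patch):

static void my_driver_timedout_job(struct drm_sched_job *bad_job)
{
        struct drm_gpu_scheduler *sched = bad_job->sched;

        /* Park the scheduler thread and reap already finished jobs;
         * bad_job itself is kept alive for use below. */
        drm_sched_stop(sched, bad_job);

        /* Mark the guilty job before resetting the hardware. */
        drm_sched_increase_karma(bad_job);

        /* ... driver-specific GPU/ASIC reset goes here ... */

        /* Re-push the remaining jobs to the HW and unpark the scheduler. */
        drm_sched_resubmit_jobs(sched);
        drm_sched_start(sched, true);
}

If the guilty job already signaled and was therefore removed from the mirror
list by drm_sched_stop, the scheduler core frees it after the timeout handler
returns (see the list_empty() check added to drm_sched_job_timedout below).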

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109692

Acked-by: Chunming Zhou <david1.zhou@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1555599624-12285-3-git-send-email-andrey.grodzovsky@amd.com

Authored by Christian König and committed by Alex Deucher
5918045c b3198c38

+102 -85
+3 -6
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
···
         if (!ring || !ring->sched.thread)
                 continue;

-        drm_sched_stop(&ring->sched);
+        drm_sched_stop(&ring->sched, &job->base);

         /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
         amdgpu_fence_driver_force_completion(ring);
···
         if(job)
                 drm_sched_increase_karma(&job->base);
-
-

         if (!amdgpu_sriov_vf(adev)) {
···
         return r;
 }

-static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
-                                          struct amdgpu_job *job)
+static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
 {
         int i;
···
         /* Post ASIC reset for all devs .*/
         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-                amdgpu_device_post_asic_reset(tmp_adev, tmp_adev == adev ? job : NULL);
+                amdgpu_device_post_asic_reset(tmp_adev);

                 if (r) {
                         /* bad news, how to tell it to userspace ? */
-5
drivers/gpu/drm/etnaviv/etnaviv_dump.c
···
         unsigned int n_obj, n_bomap_pages;
         size_t file_size, mmu_size;
         __le64 *bomap, *bomap_start;
-        unsigned long flags;

         /* Only catch the first event, or when manually re-armed */
         if (!etnaviv_dump_core)
···
                     mmu_size + gpu->buffer.size;

         /* Add in the active command buffers */
-        spin_lock_irqsave(&gpu->sched.job_list_lock, flags);
         list_for_each_entry(s_job, &gpu->sched.ring_mirror_list, node) {
                 submit = to_etnaviv_submit(s_job);
                 file_size += submit->cmdbuf.size;
                 n_obj++;
         }
-        spin_unlock_irqrestore(&gpu->sched.job_list_lock, flags);

         /* Add in the active buffer objects */
         list_for_each_entry(vram, &gpu->mmu->mappings, mmu_node) {
···
                               gpu->buffer.size,
                               etnaviv_cmdbuf_get_va(&gpu->buffer));

-        spin_lock_irqsave(&gpu->sched.job_list_lock, flags);
         list_for_each_entry(s_job, &gpu->sched.ring_mirror_list, node) {
                 submit = to_etnaviv_submit(s_job);
                 etnaviv_core_dump_mem(&iter, ETDUMP_BUF_CMD,
                                       submit->cmdbuf.vaddr, submit->cmdbuf.size,
                                       etnaviv_cmdbuf_get_va(&submit->cmdbuf));
         }
-        spin_unlock_irqrestore(&gpu->sched.job_list_lock, flags);

         /* Reserve space for the bomap */
         if (n_bomap_pages) {
+1 -1
drivers/gpu/drm/etnaviv/etnaviv_sched.c
···
         }

         /* block scheduler */
-        drm_sched_stop(&gpu->sched);
+        drm_sched_stop(&gpu->sched, sched_job);

         if(sched_job)
                 drm_sched_increase_karma(sched_job);
+1 -1
drivers/gpu/drm/lima/lima_sched.c
···
 static void lima_sched_handle_error_task(struct lima_sched_pipe *pipe,
                                          struct lima_sched_task *task)
 {
-        drm_sched_stop(&pipe->base);
+        drm_sched_stop(&pipe->base, &task->base);

         if (task)
                 drm_sched_increase_karma(&task->base);
+1 -1
drivers/gpu/drm/panfrost/panfrost_job.c
···
         mutex_lock(&pfdev->reset_lock);

         for (i = 0; i < NUM_JOB_SLOTS; i++)
-                drm_sched_stop(&pfdev->js->queue[i].sched);
+                drm_sched_stop(&pfdev->js->queue[i].sched, sched_job);

         if (sched_job)
                 drm_sched_increase_karma(sched_job);
+94 -65
drivers/gpu/drm/scheduler/sched_main.c
···
 }
 EXPORT_SYMBOL(drm_sched_resume_timeout);

-/* job_finish is called after hw fence signaled
- */
-static void drm_sched_job_finish(struct work_struct *work)
-{
-        struct drm_sched_job *s_job = container_of(work, struct drm_sched_job,
-                                                   finish_work);
-        struct drm_gpu_scheduler *sched = s_job->sched;
-        unsigned long flags;
-
-        /*
-         * Canceling the timeout without removing our job from the ring mirror
-         * list is safe, as we will only end up in this worker if our jobs
-         * finished fence has been signaled. So even if some another worker
-         * manages to find this job as the next job in the list, the fence
-         * signaled check below will prevent the timeout to be restarted.
-         */
-        cancel_delayed_work_sync(&sched->work_tdr);
-
-        spin_lock_irqsave(&sched->job_list_lock, flags);
-        /* queue TDR for next job */
-        drm_sched_start_timeout(sched);
-        spin_unlock_irqrestore(&sched->job_list_lock, flags);
-
-        sched->ops->free_job(s_job);
-}
-
 static void drm_sched_job_begin(struct drm_sched_job *s_job)
 {
         struct drm_gpu_scheduler *sched = s_job->sched;
···
         if (job)
                 job->sched->ops->timedout_job(job);
+
+        /*
+         * Guilty job did complete and hence needs to be manually removed
+         * See drm_sched_stop doc.
+         */
+        if (list_empty(&job->node))
+                job->sched->ops->free_job(job);

         spin_lock_irqsave(&sched->job_list_lock, flags);
         drm_sched_start_timeout(sched);
···
  * @sched: scheduler instance
  * @bad: bad scheduler job
  *
+ * Stop the scheduler and also removes and frees all completed jobs.
+ * Note: bad job will not be freed as it might be used later and so it's
+ * callers responsibility to release it manually if it's not part of the
+ * mirror list any more.
+ *
  */
-void drm_sched_stop(struct drm_gpu_scheduler *sched)
+void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 {
-        struct drm_sched_job *s_job;
+        struct drm_sched_job *s_job, *tmp;
         unsigned long flags;
-        struct dma_fence *last_fence = NULL;

         kthread_park(sched->thread);

         /*
-         * Verify all the signaled jobs in mirror list are removed from the ring
-         * by waiting for the latest job to enter the list. This should insure that
-         * also all the previous jobs that were in flight also already singaled
-         * and removed from the list.
+         * Iterate the job list from later to earlier one and either deactive
+         * their HW callbacks or remove them from mirror list if they already
+         * signaled.
+         * This iteration is thread safe as sched thread is stopped.
          */
-        spin_lock_irqsave(&sched->job_list_lock, flags);
-        list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
+        list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) {
                 if (s_job->s_fence->parent &&
                     dma_fence_remove_callback(s_job->s_fence->parent,
                                               &s_job->cb)) {
···
                         s_job->s_fence->parent = NULL;
                         atomic_dec(&sched->hw_rq_count);
                 } else {
-                        last_fence = dma_fence_get(&s_job->s_fence->finished);
-                        break;
-                }
-        }
-        spin_unlock_irqrestore(&sched->job_list_lock, flags);
+                        /*
+                         * remove job from ring_mirror_list.
+                         * Locking here is for concurrent resume timeout
+                         */
+                        spin_lock_irqsave(&sched->job_list_lock, flags);
+                        list_del_init(&s_job->node);
+                        spin_unlock_irqrestore(&sched->job_list_lock, flags);

-        if (last_fence) {
-                dma_fence_wait(last_fence, false);
-                dma_fence_put(last_fence);
+                        /*
+                         * Wait for job's HW fence callback to finish using s_job
+                         * before releasing it.
+                         *
+                         * Job is still alive so fence refcount at least 1
+                         */
+                        dma_fence_wait(&s_job->s_fence->finished, false);
+
+                        /*
+                         * We must keep bad job alive for later use during
+                         * recovery by some of the drivers
+                         */
+                        if (bad != s_job)
+                                sched->ops->free_job(s_job);
+                }
         }
 }
···
 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 {
         struct drm_sched_job *s_job, *tmp;
+        unsigned long flags;
         int r;
-
-        if (!full_recovery)
-                goto unpark;

         /*
          * Locking the list is not required here as the sched thread is parked
-         * so no new jobs are being pushed in to HW and in drm_sched_stop we
-         * flushed all the jobs who were still in mirror list but who already
-         * signaled and removed them self from the list. Also concurrent
+         * so no new jobs are being inserted or removed. Also concurrent
          * GPU recovers can't run in parallel.
          */
         list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
                 struct dma_fence *fence = s_job->s_fence->parent;
+
+                atomic_inc(&sched->hw_rq_count);
+
+                if (!full_recovery)
+                        continue;

                 if (fence) {
                         r = dma_fence_add_callback(fence, &s_job->cb,
···
                         drm_sched_process_job(NULL, &s_job->cb);
         }

-        drm_sched_start_timeout(sched);
+        if (full_recovery) {
+                spin_lock_irqsave(&sched->job_list_lock, flags);
+                drm_sched_start_timeout(sched);
+                spin_unlock_irqrestore(&sched->job_list_lock, flags);
+        }

-unpark:
         kthread_unpark(sched->thread);
 }
 EXPORT_SYMBOL(drm_sched_start);
···
         uint64_t guilty_context;
         bool found_guilty = false;

-        /*TODO DO we need spinlock here ? */
         list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
                 struct drm_sched_fence *s_fence = s_job->s_fence;
···
                         dma_fence_set_error(&s_fence->finished, -ECANCELED);

                 s_job->s_fence->parent = sched->ops->run_job(s_job);
-                atomic_inc(&sched->hw_rq_count);
         }
 }
 EXPORT_SYMBOL(drm_sched_resubmit_jobs);
···
                 return -ENOMEM;
         job->id = atomic64_inc_return(&sched->job_id_count);

-        INIT_WORK(&job->finish_work, drm_sched_job_finish);
         INIT_LIST_HEAD(&job->node);

         return 0;
···
         struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb);
         struct drm_sched_fence *s_fence = s_job->s_fence;
         struct drm_gpu_scheduler *sched = s_fence->sched;
-        unsigned long flags;
-
-        cancel_delayed_work(&sched->work_tdr);

         atomic_dec(&sched->hw_rq_count);
         atomic_dec(&sched->num_jobs);

-        spin_lock_irqsave(&sched->job_list_lock, flags);
-        /* remove job from ring_mirror_list */
-        list_del_init(&s_job->node);
-        spin_unlock_irqrestore(&sched->job_list_lock, flags);
+        trace_drm_sched_process_job(s_fence);

         drm_sched_fence_finished(s_fence);
-
-        trace_drm_sched_process_job(s_fence);
         wake_up_interruptible(&sched->wake_up_worker);
+}

-        schedule_work(&s_job->finish_work);
+/**
+ * drm_sched_cleanup_jobs - destroy finished jobs
+ *
+ * @sched: scheduler instance
+ *
+ * Remove all finished jobs from the mirror list and destroy them.
+ */
+static void drm_sched_cleanup_jobs(struct drm_gpu_scheduler *sched)
+{
+        unsigned long flags;
+
+        /* Don't destroy jobs while the timeout worker is running */
+        if (!cancel_delayed_work(&sched->work_tdr))
+                return;
+
+        while (!list_empty(&sched->ring_mirror_list)) {
+                struct drm_sched_job *job;
+
+                job = list_first_entry(&sched->ring_mirror_list,
+                                       struct drm_sched_job, node);
+                if (!dma_fence_is_signaled(&job->s_fence->finished))
+                        break;
+
+                spin_lock_irqsave(&sched->job_list_lock, flags);
+                /* remove job from ring_mirror_list */
+                list_del_init(&job->node);
+                spin_unlock_irqrestore(&sched->job_list_lock, flags);
+
+                sched->ops->free_job(job);
+        }
+
+        /* queue timeout for next job */
+        spin_lock_irqsave(&sched->job_list_lock, flags);
+        drm_sched_start_timeout(sched);
+        spin_unlock_irqrestore(&sched->job_list_lock, flags);
 }

 /**
···
                 struct dma_fence *fence;

                 wait_event_interruptible(sched->wake_up_worker,
+                                         (drm_sched_cleanup_jobs(sched),
                                          (!drm_sched_blocked(sched) &&
                                           (entity = drm_sched_select_entity(sched))) ||
-                                         kthread_should_stop());
+                                         kthread_should_stop()));

                 if (!entity)
                         continue;
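
The wait_event_interruptible() change above relies on the C comma operator:
the first expression in the condition is evaluated only for its side effect,
so drm_sched_cleanup_jobs() runs in the scheduler thread every time the wait
condition is re-checked. A minimal standalone sketch of that pattern follows
(cleanup_finished_jobs() and work_available() are made-up stand-ins):

#include <stdbool.h>
#include <stdio.h>

static void cleanup_finished_jobs(void) { puts("cleanup ran"); }
static bool work_available(void) { return true; }

int main(void)
{
        /* (a, b) evaluates a for its side effect and yields b as the value,
         * so the cleanup runs on every evaluation of the condition. */
        if ((cleanup_finished_jobs(), work_available()))
                puts("pick the next entity to run");
        return 0;
}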
+1 -1
drivers/gpu/drm/v3d/v3d_sched.c
···

         /* block scheduler */
         for (q = 0; q < V3D_MAX_QUEUES; q++)
-                drm_sched_stop(&v3d->queue[q].sched);
+                drm_sched_stop(&v3d->queue[q].sched, sched_job);

         if (sched_job)
                 drm_sched_increase_karma(sched_job);
+1 -5
include/drm/gpu_scheduler.h
···
  * @sched: the scheduler instance on which this job is scheduled.
  * @s_fence: contains the fences for the scheduling of job.
  * @finish_cb: the callback for the finished fence.
- * @finish_work: schedules the function @drm_sched_job_finish once the job has
- *               finished to remove the job from the
- *               @drm_gpu_scheduler.ring_mirror_list.
  * @node: used to append this struct to the @drm_gpu_scheduler.ring_mirror_list.
  * @id: a unique id assigned to each job scheduled on the scheduler.
  * @karma: increment on every hang caused by this job. If this exceeds the hang
···
         struct drm_gpu_scheduler *sched;
         struct drm_sched_fence *s_fence;
         struct dma_fence_cb finish_cb;
-        struct work_struct finish_work;
         struct list_head node;
         uint64_t id;
         atomic_t karma;
···
                        void *owner);
 void drm_sched_job_cleanup(struct drm_sched_job *job);
 void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
-void drm_sched_stop(struct drm_gpu_scheduler *sched);
+void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);
 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery);
 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched);
 void drm_sched_increase_karma(struct drm_sched_job *bad);