Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/sched: implement dynamic job-flow control

Currently, job flow control is implemented simply by limiting the number
of jobs in flight. Therefore, a scheduler is initialized with a credit
limit that corresponds to the number of jobs which can be sent to the
hardware.

This implies that for each job, drivers need to account for the maximum
job size possible in order to not overflow the ring buffer.

However, there are drivers, such as Nouveau, where the job size has a
rather large range. For such drivers it can easily happen that job
submissions not even filling 1% of the ring can block subsequent
submissions, which, in the worst case, can lead to the ring running dry.

In order to overcome this issue, allow for tracking the actual job size
instead of the number of jobs. Therefore, add a field to track a job's
credit count, which represents the number of credits a job contributes
to the scheduler's credit limit.

Signed-off-by: Danilo Krummrich <dakr@redhat.com>
Reviewed-by: Luben Tuikov <ltuikov89@gmail.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231110001638.71750-1-dakr@redhat.com

+175 -51
+6
Documentation/gpu/drm-mm.rst
··· 552 552 .. kernel-doc:: drivers/gpu/drm/scheduler/sched_main.c 553 553 :doc: Overview 554 554 555 + Flow Control 556 + ------------ 557 + 558 + .. kernel-doc:: drivers/gpu/drm/scheduler/sched_main.c 559 + :doc: Flow Control 560 + 555 561 Scheduler Function References 556 562 ----------------------------- 557 563
+1 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
··· 115 115 if (!entity) 116 116 return 0; 117 117 118 - return drm_sched_job_init(&(*job)->base, entity, owner); 118 + return drm_sched_job_init(&(*job)->base, entity, 1, owner); 119 119 } 120 120 121 121 int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev,
+1 -1
drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
··· 535 535 536 536 ret = drm_sched_job_init(&submit->sched_job, 537 537 &ctx->sched_entity[args->pipe], 538 - submit->ctx); 538 + 1, submit->ctx); 539 539 if (ret) 540 540 goto err_submit_put; 541 541
+1 -1
drivers/gpu/drm/etnaviv/etnaviv_gpu.c
··· 1917 1917 u32 idle, mask; 1918 1918 1919 1919 /* If there are any jobs in the HW queue, we're not idle */ 1920 - if (atomic_read(&gpu->sched.hw_rq_count)) 1920 + if (atomic_read(&gpu->sched.credit_count)) 1921 1921 return -EBUSY; 1922 1922 1923 1923 /* Check whether the hardware (except FE and MC) is idle */
+1 -1
drivers/gpu/drm/lima/lima_device.c
··· 514 514 515 515 /* check any task running */ 516 516 for (i = 0; i < lima_pipe_num; i++) { 517 - if (atomic_read(&ldev->pipe[i].base.hw_rq_count)) 517 + if (atomic_read(&ldev->pipe[i].base.credit_count)) 518 518 return -EBUSY; 519 519 } 520 520
+1 -1
drivers/gpu/drm/lima/lima_sched.c
··· 123 123 for (i = 0; i < num_bos; i++) 124 124 drm_gem_object_get(&bos[i]->base.base); 125 125 126 - err = drm_sched_job_init(&task->base, &context->base, vm); 126 + err = drm_sched_job_init(&task->base, &context->base, 1, vm); 127 127 if (err) { 128 128 kfree(task->bos); 129 129 return err;
+1 -1
drivers/gpu/drm/msm/msm_gem_submit.c
··· 48 48 return ERR_PTR(ret); 49 49 } 50 50 51 - ret = drm_sched_job_init(&submit->base, queue->entity, queue); 51 + ret = drm_sched_job_init(&submit->base, queue->entity, 1, queue); 52 52 if (ret) { 53 53 kfree(submit->hw_fence); 54 54 kfree(submit);
+1 -1
drivers/gpu/drm/nouveau/nouveau_sched.c
··· 89 89 90 90 } 91 91 92 - ret = drm_sched_job_init(&job->base, &entity->base, NULL); 92 + ret = drm_sched_job_init(&job->base, &entity->base, 1, NULL); 93 93 if (ret) 94 94 goto err_free_chains; 95 95
+1 -1
drivers/gpu/drm/panfrost/panfrost_drv.c
··· 274 274 275 275 ret = drm_sched_job_init(&job->base, 276 276 &file_priv->sched_entity[slot], 277 - NULL); 277 + 1, NULL); 278 278 if (ret) 279 279 goto out_put_job; 280 280
+1 -1
drivers/gpu/drm/panfrost/panfrost_job.c
··· 963 963 964 964 for (i = 0; i < NUM_JOB_SLOTS; i++) { 965 965 /* If there are any jobs in the HW queue, we're not idle */ 966 - if (atomic_read(&js->queue[i].sched.hw_rq_count)) 966 + if (atomic_read(&js->queue[i].sched.credit_count)) 967 967 return false; 968 968 } 969 969
+1 -1
drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
··· 51 51 __assign_str(name, sched_job->sched->name); 52 52 __entry->job_count = spsc_queue_count(&entity->job_queue); 53 53 __entry->hw_job_count = atomic_read( 54 - &sched_job->sched->hw_rq_count); 54 + &sched_job->sched->credit_count); 55 55 ), 56 56 TP_printk("entity=%p, id=%llu, fence=%p, ring=%s, job count:%u, hw job count:%d", 57 57 __entry->entity, __entry->id,
+136 -34
drivers/gpu/drm/scheduler/sched_main.c
··· 48 48 * through the jobs entity pointer. 49 49 */ 50 50 51 + /** 52 + * DOC: Flow Control 53 + * 54 + * The DRM GPU scheduler provides a flow control mechanism to regulate the rate 55 + * in which the jobs fetched from scheduler entities are executed. 56 + * 57 + * In this context the &drm_gpu_scheduler keeps track of a driver specified 58 + * credit limit representing the capacity of this scheduler and a credit count; 59 + * every &drm_sched_job carries a driver specified number of credits. 60 + * 61 + * Once a job is executed (but not yet finished), the job's credits contribute 62 + * to the scheduler's credit count until the job is finished. If by executing 63 + * one more job the scheduler's credit count would exceed the scheduler's 64 + * credit limit, the job won't be executed. Instead, the scheduler will wait 65 + * until the credit count has decreased enough to not overflow its credit limit. 66 + * This implies waiting for previously executed jobs. 67 + * 68 + * Optionally, drivers may register a callback (update_job_credits) provided by 69 + * struct drm_sched_backend_ops to update the job's credits dynamically. The 70 + * scheduler executes this callback every time the scheduler considers a job for 71 + * execution and subsequently checks whether the job fits the scheduler's credit 72 + * limit. 
73 + */ 74 + 51 75 #include <linux/wait.h> 52 76 #include <linux/sched.h> 53 77 #include <linux/completion.h> ··· 98 74 */ 99 75 MODULE_PARM_DESC(sched_policy, "Specify the scheduling policy for entities on a run-queue, " __stringify(DRM_SCHED_POLICY_RR) " = Round Robin, " __stringify(DRM_SCHED_POLICY_FIFO) " = FIFO (default)."); 100 76 module_param_named(sched_policy, drm_sched_policy, int, 0444); 77 + 78 + static u32 drm_sched_available_credits(struct drm_gpu_scheduler *sched) 79 + { 80 + u32 credits; 81 + 82 + drm_WARN_ON(sched, check_sub_overflow(sched->credit_limit, 83 + atomic_read(&sched->credit_count), 84 + &credits)); 85 + 86 + return credits; 87 + } 88 + 89 + /** 90 + * drm_sched_can_queue -- Can we queue more to the hardware? 91 + * @sched: scheduler instance 92 + * @entity: the scheduler entity 93 + * 94 + * Return true if we can push at least one more job from @entity, false 95 + * otherwise. 96 + */ 97 + static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched, 98 + struct drm_sched_entity *entity) 99 + { 100 + struct drm_sched_job *s_job; 101 + 102 + s_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue)); 103 + if (!s_job) 104 + return false; 105 + 106 + if (sched->ops->update_job_credits) { 107 + s_job->credits = sched->ops->update_job_credits(s_job); 108 + 109 + drm_WARN(sched, !s_job->credits, 110 + "Jobs with zero credits bypass job-flow control.\n"); 111 + } 112 + 113 + /* If a job exceeds the credit limit, truncate it to the credit limit 114 + * itself to guarantee forward progress. 
115 + */ 116 + if (drm_WARN(sched, s_job->credits > sched->credit_limit, 117 + "Jobs may not exceed the credit limit, truncate.\n")) 118 + s_job->credits = sched->credit_limit; 119 + 120 + return drm_sched_available_credits(sched) >= s_job->credits; 121 + } 101 122 102 123 static __always_inline bool drm_sched_entity_compare_before(struct rb_node *a, 103 124 const struct rb_node *b) ··· 255 186 /** 256 187 * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run 257 188 * 189 + * @sched: the gpu scheduler 258 190 * @rq: scheduler run queue to check. 259 191 * 260 - * Try to find a ready entity, returns NULL if none found. 192 + * Try to find the next ready entity. 193 + * 194 + * Return an entity if one is found; return an error-pointer (!NULL) if an 195 + * entity was ready, but the scheduler had insufficient credits to accommodate 196 + * its job; return NULL, if no ready entity was found. 261 197 */ 262 198 static struct drm_sched_entity * 263 - drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq) 199 + drm_sched_rq_select_entity_rr(struct drm_gpu_scheduler *sched, 200 + struct drm_sched_rq *rq) 264 201 { 265 202 struct drm_sched_entity *entity; 266 203 ··· 276 201 if (entity) { 277 202 list_for_each_entry_continue(entity, &rq->entities, list) { 278 203 if (drm_sched_entity_is_ready(entity)) { 204 + /* If we can't queue yet, preserve the current 205 + * entity in terms of fairness. 206 + */ 207 + if (!drm_sched_can_queue(sched, entity)) { 208 + spin_unlock(&rq->lock); 209 + return ERR_PTR(-ENOSPC); 210 + } 211 + 279 212 rq->current_entity = entity; 280 213 reinit_completion(&entity->entity_idle); 281 214 spin_unlock(&rq->lock); ··· 293 210 } 294 211 295 212 list_for_each_entry(entity, &rq->entities, list) { 296 - 297 213 if (drm_sched_entity_is_ready(entity)) { 214 + /* If we can't queue yet, preserve the current entity in 215 + * terms of fairness. 
216 + */ 217 + if (!drm_sched_can_queue(sched, entity)) { 218 + spin_unlock(&rq->lock); 219 + return ERR_PTR(-ENOSPC); 220 + } 221 + 298 222 rq->current_entity = entity; 299 223 reinit_completion(&entity->entity_idle); 300 224 spin_unlock(&rq->lock); ··· 320 230 /** 321 231 * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run 322 232 * 233 + * @sched: the gpu scheduler 323 234 * @rq: scheduler run queue to check. 324 235 * 325 - * Find oldest waiting ready entity, returns NULL if none found. 236 + * Find oldest waiting ready entity. 237 + * 238 + * Return an entity if one is found; return an error-pointer (!NULL) if an 239 + * entity was ready, but the scheduler had insufficient credits to accommodate 240 + * its job; return NULL, if no ready entity was found. 326 241 */ 327 242 static struct drm_sched_entity * 328 - drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq) 243 + drm_sched_rq_select_entity_fifo(struct drm_gpu_scheduler *sched, 244 + struct drm_sched_rq *rq) 329 245 { 330 246 struct rb_node *rb; 331 247 ··· 341 245 342 246 entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node); 343 247 if (drm_sched_entity_is_ready(entity)) { 248 + /* If we can't queue yet, preserve the current entity in 249 + * terms of fairness. 
250 + */ 251 + if (!drm_sched_can_queue(sched, entity)) { 252 + spin_unlock(&rq->lock); 253 + return ERR_PTR(-ENOSPC); 254 + } 255 + 344 256 rq->current_entity = entity; 345 257 reinit_completion(&entity->entity_idle); 346 258 break; ··· 406 302 struct drm_sched_fence *s_fence = s_job->s_fence; 407 303 struct drm_gpu_scheduler *sched = s_fence->sched; 408 304 409 - atomic_dec(&sched->hw_rq_count); 305 + atomic_sub(s_job->credits, &sched->credit_count); 410 306 atomic_dec(sched->score); 411 307 412 308 trace_drm_sched_process_job(s_fence); ··· 629 525 &s_job->cb)) { 630 526 dma_fence_put(s_job->s_fence->parent); 631 527 s_job->s_fence->parent = NULL; 632 - atomic_dec(&sched->hw_rq_count); 528 + atomic_sub(s_job->credits, &sched->credit_count); 633 529 } else { 634 530 /* 635 531 * remove job from pending_list. ··· 690 586 list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) { 691 587 struct dma_fence *fence = s_job->s_fence->parent; 692 588 693 - atomic_inc(&sched->hw_rq_count); 589 + atomic_add(s_job->credits, &sched->credit_count); 694 590 695 591 if (!full_recovery) 696 592 continue; ··· 771 667 * drm_sched_job_init - init a scheduler job 772 668 * @job: scheduler job to init 773 669 * @entity: scheduler entity to use 670 + * @credits: the number of credits this job contributes to the schedulers 671 + * credit limit 774 672 * @owner: job owner for debugging 775 673 * 776 674 * Refer to drm_sched_entity_push_job() documentation ··· 790 684 */ 791 685 int drm_sched_job_init(struct drm_sched_job *job, 792 686 struct drm_sched_entity *entity, 793 - void *owner) 687 + u32 credits, void *owner) 794 688 { 795 689 if (!entity->rq) { 796 690 /* This will most likely be followed by missing frames ··· 801 695 return -ENOENT; 802 696 } 803 697 698 + if (unlikely(!credits)) { 699 + pr_err("*ERROR* %s: credits cannot be 0!\n", __func__); 700 + return -EINVAL; 701 + } 702 + 804 703 job->entity = entity; 704 + job->credits = credits; 805 705 job->s_fence = 
drm_sched_fence_alloc(entity, owner); 806 706 if (!job->s_fence) 807 707 return -ENOMEM; ··· 1020 908 EXPORT_SYMBOL(drm_sched_job_cleanup); 1021 909 1022 910 /** 1023 - * drm_sched_can_queue -- Can we queue more to the hardware? 1024 - * @sched: scheduler instance 1025 - * 1026 - * Return true if we can push more jobs to the hw, otherwise false. 1027 - */ 1028 - static bool drm_sched_can_queue(struct drm_gpu_scheduler *sched) 1029 - { 1030 - return atomic_read(&sched->hw_rq_count) < 1031 - sched->hw_submission_limit; 1032 - } 1033 - 1034 - /** 1035 911 * drm_sched_wakeup - Wake up the scheduler if it is ready to queue 1036 912 * @sched: scheduler instance 913 + * @entity: the scheduler entity 1037 914 * 1038 915 * Wake up the scheduler if we can queue jobs. 1039 916 */ ··· 1030 929 struct drm_sched_entity *entity) 1031 930 { 1032 931 if (drm_sched_entity_is_ready(entity)) 1033 - if (drm_sched_can_queue(sched)) 932 + if (drm_sched_can_queue(sched, entity)) 1034 933 drm_sched_run_job_queue(sched); 1035 934 } 1036 935 ··· 1039 938 * 1040 939 * @sched: scheduler instance 1041 940 * 1042 - * Returns the entity to process or NULL if none are found. 941 + * Return an entity to process or NULL if none are found. 942 + * 943 + * Note, that we break out of the for-loop when "entity" is non-null, which can 944 + * also be an error-pointer--this assures we don't process lower priority 945 + * run-queues. See comments in the respectively called functions. 1043 946 */ 1044 947 static struct drm_sched_entity * 1045 948 drm_sched_select_entity(struct drm_gpu_scheduler *sched) ··· 1051 946 struct drm_sched_entity *entity; 1052 947 int i; 1053 948 1054 - if (!drm_sched_can_queue(sched)) 1055 - return NULL; 1056 - 1057 949 /* Kernel run queue has higher priority than normal run queue*/ 1058 950 for (i = sched->num_rqs - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) { 1059 951 entity = drm_sched_policy == DRM_SCHED_POLICY_FIFO ? 
1060 - drm_sched_rq_select_entity_fifo(sched->sched_rq[i]) : 1061 - drm_sched_rq_select_entity_rr(sched->sched_rq[i]); 952 + drm_sched_rq_select_entity_fifo(sched, sched->sched_rq[i]) : 953 + drm_sched_rq_select_entity_rr(sched, sched->sched_rq[i]); 1062 954 if (entity) 1063 955 break; 1064 956 } 1065 957 1066 - return entity; 958 + return IS_ERR(entity) ? NULL : entity; 1067 959 } 1068 960 1069 961 /** ··· 1196 1094 1197 1095 s_fence = sched_job->s_fence; 1198 1096 1199 - atomic_inc(&sched->hw_rq_count); 1097 + atomic_add(sched_job->credits, &sched->credit_count); 1200 1098 drm_sched_job_begin(sched_job); 1201 1099 1202 1100 trace_drm_run_job(sched_job, entity); ··· 1231 1129 * @submit_wq: workqueue to use for submission. If NULL, an ordered wq is 1232 1130 * allocated and used 1233 1131 * @num_rqs: number of runqueues, one for each priority, up to DRM_SCHED_PRIORITY_COUNT 1234 - * @hw_submission: number of hw submissions that can be in flight 1132 + * @credit_limit: the number of credits this scheduler can hold from all jobs 1235 1133 * @hang_limit: number of times to allow a job to hang before dropping it 1236 1134 * @timeout: timeout value in jiffies for the scheduler 1237 1135 * @timeout_wq: workqueue to use for timeout work. If NULL, the system_wq is ··· 1245 1143 int drm_sched_init(struct drm_gpu_scheduler *sched, 1246 1144 const struct drm_sched_backend_ops *ops, 1247 1145 struct workqueue_struct *submit_wq, 1248 - u32 num_rqs, uint32_t hw_submission, unsigned int hang_limit, 1146 + u32 num_rqs, u32 credit_limit, unsigned int hang_limit, 1249 1147 long timeout, struct workqueue_struct *timeout_wq, 1250 1148 atomic_t *score, const char *name, struct device *dev) 1251 1149 { 1252 1150 int i, ret; 1253 1151 1254 1152 sched->ops = ops; 1255 - sched->hw_submission_limit = hw_submission; 1153 + sched->credit_limit = credit_limit; 1256 1154 sched->name = name; 1257 1155 sched->timeout = timeout; 1258 1156 sched->timeout_wq = timeout_wq ? 
: system_wq; ··· 1301 1199 init_waitqueue_head(&sched->job_scheduled); 1302 1200 INIT_LIST_HEAD(&sched->pending_list); 1303 1201 spin_lock_init(&sched->job_list_lock); 1304 - atomic_set(&sched->hw_rq_count, 0); 1202 + atomic_set(&sched->credit_count, 0); 1305 1203 INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout); 1306 1204 INIT_WORK(&sched->work_run_job, drm_sched_run_job_work); 1307 1205 INIT_WORK(&sched->work_free_job, drm_sched_free_job_work);
+1 -1
drivers/gpu/drm/v3d/v3d_gem.c
··· 418 418 job->file = file_priv; 419 419 420 420 ret = drm_sched_job_init(&job->base, &v3d_priv->sched_entity[queue], 421 - v3d_priv); 421 + 1, v3d_priv); 422 422 if (ret) 423 423 goto fail; 424 424
+22 -6
include/drm/gpu_scheduler.h
··· 321 321 * @sched: the scheduler instance on which this job is scheduled. 322 322 * @s_fence: contains the fences for the scheduling of job. 323 323 * @finish_cb: the callback for the finished fence. 324 + * @credits: the number of credits this job contributes to the scheduler 324 325 * @work: Helper to reschdeule job kill to different context. 325 326 * @id: a unique id assigned to each job scheduled on the scheduler. 326 327 * @karma: increment on every hang caused by this job. If this exceeds the hang ··· 340 339 struct list_head list; 341 340 struct drm_gpu_scheduler *sched; 342 341 struct drm_sched_fence *s_fence; 342 + 343 + u32 credits; 343 344 344 345 /* 345 346 * work is used only after finish_cb has been used and will not be ··· 466 463 * and it's time to clean it up. 467 464 */ 468 465 void (*free_job)(struct drm_sched_job *sched_job); 466 + 467 + /** 468 + * @update_job_credits: Called when the scheduler is considering this 469 + * job for execution. 470 + * 471 + * This callback returns the number of credits the job would take if 472 + * pushed to the hardware. Drivers may use this to dynamically update 473 + * the job's credit count. For instance, deduct the number of credits 474 + * for already signalled native fences. 475 + * 476 + * This callback is optional. 477 + */ 478 + u32 (*update_job_credits)(struct drm_sched_job *sched_job); 469 479 }; 470 480 471 481 /** 472 482 * struct drm_gpu_scheduler - scheduler instance-specific data 473 483 * 474 484 * @ops: backend operations provided by the driver. 475 - * @hw_submission_limit: the max size of the hardware queue. 485 + * @credit_limit: the credit limit of this scheduler 486 + * @credit_count: the current credit count of this scheduler 476 487 * @timeout: the time after which a job is removed from the scheduler. 477 488 * @name: name of the ring for which this scheduler is being used. 478 489 * @num_rqs: Number of run-queues. 
This is at most DRM_SCHED_PRIORITY_COUNT, ··· 495 478 * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler 496 479 * waits on this wait queue until all the scheduled jobs are 497 480 * finished. 498 - * @hw_rq_count: the number of jobs currently in the hardware queue. 499 481 * @job_id_count: used to assign unique id to the each job. 500 482 * @submit_wq: workqueue used to queue @work_run_job and @work_free_job 501 483 * @timeout_wq: workqueue used to queue @work_tdr ··· 518 502 */ 519 503 struct drm_gpu_scheduler { 520 504 const struct drm_sched_backend_ops *ops; 521 - uint32_t hw_submission_limit; 505 + u32 credit_limit; 506 + atomic_t credit_count; 522 507 long timeout; 523 508 const char *name; 524 509 u32 num_rqs; 525 510 struct drm_sched_rq **sched_rq; 526 511 wait_queue_head_t job_scheduled; 527 - atomic_t hw_rq_count; 528 512 atomic64_t job_id_count; 529 513 struct workqueue_struct *submit_wq; 530 514 struct workqueue_struct *timeout_wq; ··· 546 530 int drm_sched_init(struct drm_gpu_scheduler *sched, 547 531 const struct drm_sched_backend_ops *ops, 548 532 struct workqueue_struct *submit_wq, 549 - u32 num_rqs, uint32_t hw_submission, unsigned int hang_limit, 533 + u32 num_rqs, u32 credit_limit, unsigned int hang_limit, 550 534 long timeout, struct workqueue_struct *timeout_wq, 551 535 atomic_t *score, const char *name, struct device *dev); 552 536 553 537 void drm_sched_fini(struct drm_gpu_scheduler *sched); 554 538 int drm_sched_job_init(struct drm_sched_job *job, 555 539 struct drm_sched_entity *entity, 556 - void *owner); 540 + u32 credits, void *owner); 557 541 void drm_sched_job_arm(struct drm_sched_job *job); 558 542 int drm_sched_job_add_dependency(struct drm_sched_job *job, 559 543 struct dma_fence *fence);