Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: use ref to keep job alive

This fixes a fatal page fault error that occurred if:
the job is signaled/released after its timeout work has already
been put on the global queue (in this case cancel_delayed_work
will return false), which leads to an NX-protection error
page fault during job_timeout_func.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Chunming Zhou <david1.zhou@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Monk Liu and committed by
Alex Deucher
b6723c8d 0de2479c

+35 -4
+2
drivers/gpu/drm/amd/amdgpu/amdgpu.h
··· 750 750 struct amdgpu_job **job); 751 751 int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size, 752 752 struct amdgpu_job **job); 753 + 753 754 void amdgpu_job_free(struct amdgpu_job *job); 755 + void amdgpu_job_free_func(struct kref *refcount); 754 756 int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring, 755 757 struct amd_sched_entity *entity, void *owner, 756 758 struct fence **f);
+1
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
··· 872 872 r = amd_sched_job_init(&job->base, &ring->sched, 873 873 &p->ctx->rings[ring->idx].entity, 874 874 amdgpu_job_timeout_func, 875 + amdgpu_job_free_func, 875 876 p->filp, &fence); 876 877 if (r) { 877 878 amdgpu_job_free(job);
+12 -3
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
··· 31 31 static void amdgpu_job_free_handler(struct work_struct *ws) 32 32 { 33 33 struct amdgpu_job *job = container_of(ws, struct amdgpu_job, base.work_free_job); 34 - kfree(job); 34 + amd_sched_job_put(&job->base); 35 35 } 36 36 37 37 void amdgpu_job_timeout_func(struct work_struct *work) ··· 41 41 job->base.sched->name, 42 42 (uint32_t)atomic_read(&job->ring->fence_drv.last_seq), 43 43 job->ring->fence_drv.sync_seq); 44 + 45 + amd_sched_job_put(&job->base); 44 46 } 45 47 46 48 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, ··· 103 101 kfree(job); 104 102 } 105 103 104 + void amdgpu_job_free_func(struct kref *refcount) 105 + { 106 + struct amdgpu_job *job = container_of(refcount, struct amdgpu_job, base.refcount); 107 + kfree(job); 108 + } 109 + 106 110 int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring, 107 111 struct amd_sched_entity *entity, void *owner, 108 112 struct fence **f) ··· 121 113 return -EINVAL; 122 114 123 115 r = amd_sched_job_init(&job->base, &ring->sched, 124 - entity, owner, 116 + entity, 125 117 amdgpu_job_timeout_func, 126 - &fence); 118 + amdgpu_job_free_func, 119 + owner, &fence); 127 120 if (r) 128 121 return r; 129 122
+7 -1
drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
··· 333 333 struct amd_gpu_scheduler *sched = s_job->sched; 334 334 335 335 if (sched->timeout != MAX_SCHEDULE_TIMEOUT) { 336 - cancel_delayed_work(&s_job->work_tdr); /*TODO: how to deal the case that tdr is running */ 336 + if (cancel_delayed_work(&s_job->work_tdr)) 337 + amd_sched_job_put(s_job); 337 338 338 339 /* queue TDR for next job */ 339 340 next = list_first_entry_or_null(&sched->ring_mirror_list, ··· 342 341 343 342 if (next) { 344 343 INIT_DELAYED_WORK(&next->work_tdr, s_job->timeout_callback); 344 + amd_sched_job_get(next); 345 345 schedule_delayed_work(&next->work_tdr, sched->timeout); 346 346 } 347 347 } ··· 356 354 list_first_entry_or_null(&sched->ring_mirror_list, struct amd_sched_job, node) == s_job) 357 355 { 358 356 INIT_DELAYED_WORK(&s_job->work_tdr, s_job->timeout_callback); 357 + amd_sched_job_get(s_job); 359 358 schedule_delayed_work(&s_job->work_tdr, sched->timeout); 360 359 } 361 360 } ··· 385 382 struct amd_gpu_scheduler *sched, 386 383 struct amd_sched_entity *entity, 387 384 void (*timeout_cb)(struct work_struct *work), 385 + void (*free_cb)(struct kref *refcount), 388 386 void *owner, struct fence **fence) 389 387 { 390 388 INIT_LIST_HEAD(&job->node); 389 + kref_init(&job->refcount); 391 390 job->sched = sched; 392 391 job->s_entity = entity; 393 392 job->s_fence = amd_sched_fence_create(entity, owner); ··· 398 393 399 394 job->s_fence->s_job = job; 400 395 job->timeout_callback = timeout_cb; 396 + job->free_callback = free_cb; 401 397 402 398 if (fence) 403 399 *fence = &job->s_fence->base;
+13
drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
··· 78 78 }; 79 79 80 80 struct amd_sched_job { 81 + struct kref refcount; 81 82 struct amd_gpu_scheduler *sched; 82 83 struct amd_sched_entity *s_entity; 83 84 struct amd_sched_fence *s_fence; ··· 88 87 struct list_head node; 89 88 struct delayed_work work_tdr; 90 89 void (*timeout_callback) (struct work_struct *work); 90 + void (*free_callback)(struct kref *refcount); 91 91 }; 92 92 93 93 extern const struct fence_ops amd_sched_fence_ops; ··· 157 155 struct amd_gpu_scheduler *sched, 158 156 struct amd_sched_entity *entity, 159 157 void (*timeout_cb)(struct work_struct *work), 158 + void (*free_cb)(struct kref* refcount), 160 159 void *owner, struct fence **fence); 161 160 void amd_sched_job_pre_schedule(struct amd_gpu_scheduler *sched , 162 161 struct amd_sched_job *s_job); 163 162 void amd_sched_job_finish(struct amd_sched_job *s_job); 164 163 void amd_sched_job_begin(struct amd_sched_job *s_job); 164 + static inline void amd_sched_job_get(struct amd_sched_job *job) { 165 + if (job) 166 + kref_get(&job->refcount); 167 + } 168 + 169 + static inline void amd_sched_job_put(struct amd_sched_job *job) { 170 + if (job) 171 + kref_put(&job->refcount, job->free_callback); 172 + } 173 + 165 174 #endif