Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: add VM generation token

Instead of using the VRAM lost counter, add a 64-bit token which indicates
whether a context or job is still valid to use.

Should the VRAM be lost or the page tables need re-creation, the token will
change, indicating that userspace needs to act and re-create the contexts
and re-submit the work.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Luben Tuikov <luben.tuikov@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by Christian König and committed by Alex Deucher
f88e295e 55bf196f

+37 -7
+1 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -309,7 +309,7 @@
 	}
 	p->gang_leader = p->jobs[p->gang_leader_idx];
 
-	if (p->ctx->vram_lost_counter != p->gang_leader->vram_lost_counter) {
+	if (p->ctx->generation != p->gang_leader->generation) {
 		ret = -ECANCELED;
 		goto free_all_kdata;
 	}
+2 -2
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -333,7 +333,7 @@
 
 	ctx->reset_counter = atomic_read(&mgr->adev->gpu_reset_counter);
 	ctx->reset_counter_query = ctx->reset_counter;
-	ctx->vram_lost_counter = atomic_read(&mgr->adev->vram_lost_counter);
+	ctx->generation = amdgpu_vm_generation(mgr->adev, &fpriv->vm);
 	ctx->init_priority = priority;
 	ctx->override_priority = AMDGPU_CTX_PRIORITY_UNSET;
 
@@ -586,7 +586,7 @@
 	if (ctx->reset_counter != atomic_read(&adev->gpu_reset_counter))
 		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RESET;
 
-	if (ctx->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
+	if (ctx->generation != amdgpu_vm_generation(adev, &fpriv->vm))
 		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST;
 
 	if (atomic_read(&ctx->guilty))
+1 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
@@ -47,7 +47,7 @@
 	struct amdgpu_ctx_mgr	*mgr;
 	unsigned		reset_counter;
 	unsigned		reset_counter_query;
-	uint32_t		vram_lost_counter;
+	uint64_t		generation;
 	spinlock_t		ring_lock;
 	struct amdgpu_ctx_entity *entities[AMDGPU_HW_IP_NUM][AMDGPU_MAX_ENTITY_NUM];
 	bool			preamble_presented;
+2 -2
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -109,7 +109,7 @@
 	(*job)->vm = vm;
 
 	amdgpu_sync_create(&(*job)->explicit_sync);
-	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
+	(*job)->generation = amdgpu_vm_generation(adev, vm);
 	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
 
 	if (!entity)
@@ -295,7 +295,7 @@
 	trace_amdgpu_sched_run_job(job);
 
 	/* Skip job if VRAM is lost and never resubmit gangs */
-	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter) ||
+	if (job->generation != amdgpu_vm_generation(adev, job->vm) ||
 	    (job->job_run_counter && job->gang_submit))
 		dma_fence_set_error(finished, -ECANCELED);
 
+1 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -61,7 +61,7 @@
 	uint32_t		gds_base, gds_size;
 	uint32_t		gws_base, gws_size;
 	uint32_t		oa_base, oa_size;
-	uint32_t		vram_lost_counter;
+	uint64_t		generation;
 
 	/* user fence handling */
 	uint64_t		uf_addr;
+26
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -406,6 +406,30 @@
 }
 
 /**
+ * amdgpu_vm_generation - return the page table re-generation counter
+ * @adev: the amdgpu_device
+ * @vm: optional VM to check, might be NULL
+ *
+ * Returns a page table re-generation token to allow checking if submissions
+ * are still valid to use this VM. The VM parameter might be NULL in which case
+ * just the VRAM lost counter will be used.
+ */
+uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm)
+{
+	uint64_t result = (u64)atomic_read(&adev->vram_lost_counter) << 32;
+
+	if (!vm)
+		return result;
+
+	result += vm->generation;
+	/* Add one if the page tables will be re-generated on next CS */
+	if (drm_sched_entity_error(&vm->delayed))
+		++result;
+
+	return result;
+}
+
+/**
  * amdgpu_vm_validate_pt_bos - validate the page table BOs
  *
  * @adev: amdgpu device pointer
@@ -428,6 +452,7 @@
 	int r;
 
 	if (drm_sched_entity_error(&vm->delayed)) {
+		++vm->generation;
 		amdgpu_vm_bo_reset_state_machine(vm);
 		amdgpu_vm_fini_entities(vm);
 		r = amdgpu_vm_init_entities(adev, vm);
@@ -2134,6 +2159,7 @@
 	vm->last_update = dma_fence_get_stub();
 	vm->last_unlocked = dma_fence_get_stub();
 	vm->last_tlb_flush = dma_fence_get_stub();
+	vm->generation = 0;
 
 	mutex_init(&vm->eviction_lock);
 	vm->evicting = false;
+4
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -295,6 +295,9 @@
 	atomic64_t		tlb_seq;
 	struct dma_fence	*last_tlb_flush;
 
+	/* How many times we had to re-generate the page tables */
+	uint64_t		generation;
+
 	/* Last unlocked submission to the scheduler entities */
 	struct dma_fence	*last_unlocked;
 
@@ -397,6 +400,7 @@
 			      struct list_head *validated,
 			      struct amdgpu_bo_list_entry *entry);
 bool amdgpu_vm_ready(struct amdgpu_vm *vm);
+uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm);
 int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 			      int (*callback)(void *p, struct amdgpu_bo *bo),
 			      void *param);