Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: remove all KFD fences from the BO on release

Remove all KFD eviction fences from the BO's private dma_resv object.

This prevents the KFD from being evicted unnecessarily when an exported BO
is released.

Signed-off-by: Christian König <christian.koenig@amd.com>
Signed-off-by: James Zhu <James.Zhu@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Reviewed-and-tested-by: James Zhu <James.Zhu@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Christian König and committed by
Alex Deucher
cb0de06d 3521276a

+47 -48
+2 -3
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
··· 193 193 #if IS_ENABLED(CONFIG_HSA_AMD) 194 194 bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm); 195 195 struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f); 196 - int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo); 196 + void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo); 197 197 int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni, 198 198 unsigned long cur_seq, struct kgd_mem *mem); 199 199 int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, ··· 213 213 } 214 214 215 215 static inline 216 - int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo) 216 + void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo) 217 217 { 218 - return 0; 219 218 } 220 219 221 220 static inline
+22 -30
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
··· 370 370 return 0; 371 371 } 372 372 373 - int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo) 373 + /** 374 + * amdgpu_amdkfd_remove_all_eviction_fences - Remove all eviction fences 375 + * @bo: the BO where to remove the evictions fences from. 376 + * 377 + * This functions should only be used on release when all references to the BO 378 + * are already dropped. We remove the eviction fence from the private copy of 379 + * the dma_resv object here since that is what is used during release to 380 + * determine of the BO is idle or not. 381 + */ 382 + void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo) 374 383 { 375 - struct amdgpu_bo *root = bo; 376 - struct amdgpu_vm_bo_base *vm_bo; 377 - struct amdgpu_vm *vm; 378 - struct amdkfd_process_info *info; 379 - struct amdgpu_amdkfd_fence *ef; 380 - int ret; 384 + struct dma_resv *resv = &bo->tbo.base._resv; 385 + struct dma_fence *fence, *stub; 386 + struct dma_resv_iter cursor; 381 387 382 - /* we can always get vm_bo from root PD bo.*/ 383 - while (root->parent) 384 - root = root->parent; 388 + dma_resv_assert_held(resv); 385 389 386 - vm_bo = root->vm_bo; 387 - if (!vm_bo) 388 - return 0; 390 + stub = dma_fence_get_stub(); 391 + dma_resv_for_each_fence(&cursor, resv, DMA_RESV_USAGE_BOOKKEEP, fence) { 392 + if (!to_amdgpu_amdkfd_fence(fence)) 393 + continue; 389 394 390 - vm = vm_bo->vm; 391 - if (!vm) 392 - return 0; 393 - 394 - info = vm->process_info; 395 - if (!info || !info->eviction_fence) 396 - return 0; 397 - 398 - ef = container_of(dma_fence_get(&info->eviction_fence->base), 399 - struct amdgpu_amdkfd_fence, base); 400 - 401 - BUG_ON(!dma_resv_trylock(bo->tbo.base.resv)); 402 - ret = amdgpu_amdkfd_remove_eviction_fence(bo, ef); 403 - dma_resv_unlock(bo->tbo.base.resv); 404 - 405 - dma_fence_put(&ef->base); 406 - return ret; 395 + dma_resv_replace_fences(resv, fence->context, stub, 396 + DMA_RESV_USAGE_BOOKKEEP); 397 + } 398 + dma_fence_put(stub); 407 399 } 408 400 409 401 
static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
+23 -15
drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
··· 1295 1295 if (abo->kfd_bo) 1296 1296 amdgpu_amdkfd_release_notify(abo); 1297 1297 1298 - /* We only remove the fence if the resv has individualized. */ 1299 - WARN_ON_ONCE(bo->type == ttm_bo_type_kernel 1300 - && bo->base.resv != &bo->base._resv); 1301 - if (bo->base.resv == &bo->base._resv) 1302 - amdgpu_amdkfd_remove_fence_on_pt_pd_bos(abo); 1298 + /* 1299 + * We lock the private dma_resv object here and since the BO is about to 1300 + * be released nobody else should have a pointer to it. 1301 + * So when this locking here fails something is wrong with the reference 1302 + * counting. 1303 + */ 1304 + if (WARN_ON_ONCE(!dma_resv_trylock(&bo->base._resv))) 1305 + return; 1306 + 1307 + amdgpu_amdkfd_remove_all_eviction_fences(abo); 1303 1308 1304 1309 if (!bo->resource || bo->resource->mem_type != TTM_PL_VRAM || 1305 1310 !(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE) || 1306 1311 adev->in_suspend || drm_dev_is_unplugged(adev_to_drm(adev))) 1307 - return; 1312 + goto out; 1308 1313 1309 - if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv))) 1310 - return; 1314 + r = dma_resv_reserve_fences(&bo->base._resv, 1); 1315 + if (r) 1316 + goto out; 1311 1317 1312 - r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true); 1313 - if (!WARN_ON(r)) { 1314 - amdgpu_vram_mgr_set_cleared(bo->resource); 1315 - amdgpu_bo_fence(abo, fence, false); 1316 - dma_fence_put(fence); 1317 - } 1318 + r = amdgpu_fill_buffer(abo, 0, &bo->base._resv, &fence, true); 1319 + if (WARN_ON(r)) 1320 + goto out; 1318 1321 1319 - dma_resv_unlock(bo->base.resv); 1322 + amdgpu_vram_mgr_set_cleared(bo->resource); 1323 + dma_resv_add_fence(&bo->base._resv, fence, DMA_RESV_USAGE_KERNEL); 1324 + dma_fence_put(fence); 1325 + 1326 + out: 1327 + dma_resv_unlock(&bo->base._resv); 1320 1328 } 1321 1329 1322 1330 /**