Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: nuke the VM PD/PT shadow handling

This was only used as a workaround for recovering the page tables after
VRAM was lost and is no longer necessary after the function
amdgpu_vm_bo_reset_state_machine() started to do the same.

Compute never used shadows either, so the only problematic case left is
SVM and that is most likely not recoverable in any way when VRAM is
lost.

Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Christian König and committed by
Alex Deucher
7181faaa c1de938f

+6 -265
-4
drivers/gpu/drm/amd/amdgpu/amdgpu.h
··· 1083 1083 1084 1084 struct amdgpu_virt virt; 1085 1085 1086 - /* link all shadow bo */ 1087 - struct list_head shadow_list; 1088 - struct mutex shadow_list_lock; 1089 - 1090 1086 /* record hw reset is performed */ 1091 1087 bool has_hw_reset; 1092 1088 u8 reset_magic[AMDGPU_RESET_MAGIC_NUM];
+2 -85
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 4107 4107 spin_lock_init(&adev->mm_stats.lock); 4108 4108 spin_lock_init(&adev->wb.lock); 4109 4109 4110 - INIT_LIST_HEAD(&adev->shadow_list); 4111 - mutex_init(&adev->shadow_list_lock); 4112 - 4113 4110 INIT_LIST_HEAD(&adev->reset_list); 4114 4111 4115 4112 INIT_LIST_HEAD(&adev->ras_list); ··· 5027 5030 } 5028 5031 5029 5032 /** 5030 - * amdgpu_device_recover_vram - Recover some VRAM contents 5031 - * 5032 - * @adev: amdgpu_device pointer 5033 - * 5034 - * Restores the contents of VRAM buffers from the shadows in GTT. Used to 5035 - * restore things like GPUVM page tables after a GPU reset where 5036 - * the contents of VRAM might be lost. 5037 - * 5038 - * Returns: 5039 - * 0 on success, negative error code on failure. 5040 - */ 5041 - static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 5042 - { 5043 - struct dma_fence *fence = NULL, *next = NULL; 5044 - struct amdgpu_bo *shadow; 5045 - struct amdgpu_bo_vm *vmbo; 5046 - long r = 1, tmo; 5047 - 5048 - if (amdgpu_sriov_runtime(adev)) 5049 - tmo = msecs_to_jiffies(8000); 5050 - else 5051 - tmo = msecs_to_jiffies(100); 5052 - 5053 - dev_info(adev->dev, "recover vram bo from shadow start\n"); 5054 - mutex_lock(&adev->shadow_list_lock); 5055 - list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 5056 - /* If vm is compute context or adev is APU, shadow will be NULL */ 5057 - if (!vmbo->shadow) 5058 - continue; 5059 - shadow = vmbo->shadow; 5060 - 5061 - /* No need to recover an evicted BO */ 5062 - if (!shadow->tbo.resource || 5063 - shadow->tbo.resource->mem_type != TTM_PL_TT || 5064 - shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 5065 - shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 5066 - continue; 5067 - 5068 - r = amdgpu_bo_restore_shadow(shadow, &next); 5069 - if (r) 5070 - break; 5071 - 5072 - if (fence) { 5073 - tmo = dma_fence_wait_timeout(fence, false, tmo); 5074 - dma_fence_put(fence); 5075 - fence = next; 5076 - if (tmo == 0) { 5077 - r = -ETIMEDOUT; 5078 - 
break; 5079 - } else if (tmo < 0) { 5080 - r = tmo; 5081 - break; 5082 - } 5083 - } else { 5084 - fence = next; 5085 - } 5086 - } 5087 - mutex_unlock(&adev->shadow_list_lock); 5088 - 5089 - if (fence) 5090 - tmo = dma_fence_wait_timeout(fence, false, tmo); 5091 - dma_fence_put(fence); 5092 - 5093 - if (r < 0 || tmo <= 0) { 5094 - dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 5095 - return -EIO; 5096 - } 5097 - 5098 - dev_info(adev->dev, "recover vram bo from shadow done\n"); 5099 - return 0; 5100 - } 5101 - 5102 - 5103 - /** 5104 5033 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5105 5034 * 5106 5035 * @adev: amdgpu_device pointer ··· 5088 5165 if (r) 5089 5166 return r; 5090 5167 5091 - if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 5168 + if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5092 5169 amdgpu_inc_vram_lost(adev); 5093 - r = amdgpu_device_recover_vram(adev); 5094 - } 5095 - if (r) 5096 - return r; 5097 5170 5098 5171 /* need to be called during full access so we can't do it later like 5099 5172 * bare-metal does. ··· 5488 5569 } 5489 5570 } 5490 5571 5491 - if (!r) 5492 - r = amdgpu_device_recover_vram(tmp_adev); 5493 - else 5572 + if (r) 5494 5573 tmp_adev->asic_reset_res = r; 5495 5574 } 5496 5575
+1 -66
drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
··· 77 77 amdgpu_bo_destroy(tbo); 78 78 } 79 79 80 - static void amdgpu_bo_vm_destroy(struct ttm_buffer_object *tbo) 81 - { 82 - struct amdgpu_device *adev = amdgpu_ttm_adev(tbo->bdev); 83 - struct amdgpu_bo *shadow_bo = ttm_to_amdgpu_bo(tbo), *bo; 84 - struct amdgpu_bo_vm *vmbo; 85 - 86 - bo = shadow_bo->parent; 87 - vmbo = to_amdgpu_bo_vm(bo); 88 - /* in case amdgpu_device_recover_vram got NULL of bo->parent */ 89 - if (!list_empty(&vmbo->shadow_list)) { 90 - mutex_lock(&adev->shadow_list_lock); 91 - list_del_init(&vmbo->shadow_list); 92 - mutex_unlock(&adev->shadow_list_lock); 93 - } 94 - 95 - amdgpu_bo_destroy(tbo); 96 - } 97 - 98 80 /** 99 81 * amdgpu_bo_is_amdgpu_bo - check if the buffer object is an &amdgpu_bo 100 82 * @bo: buffer object to be checked ··· 90 108 bool amdgpu_bo_is_amdgpu_bo(struct ttm_buffer_object *bo) 91 109 { 92 110 if (bo->destroy == &amdgpu_bo_destroy || 93 - bo->destroy == &amdgpu_bo_user_destroy || 94 - bo->destroy == &amdgpu_bo_vm_destroy) 111 + bo->destroy == &amdgpu_bo_user_destroy) 95 112 return true; 96 113 97 114 return false; ··· 701 720 702 721 *vmbo_ptr = to_amdgpu_bo_vm(bo_ptr); 703 722 return r; 704 - } 705 - 706 - /** 707 - * amdgpu_bo_add_to_shadow_list - add a BO to the shadow list 708 - * 709 - * @vmbo: BO that will be inserted into the shadow list 710 - * 711 - * Insert a BO to the shadow list. 
712 - */ 713 - void amdgpu_bo_add_to_shadow_list(struct amdgpu_bo_vm *vmbo) 714 - { 715 - struct amdgpu_device *adev = amdgpu_ttm_adev(vmbo->bo.tbo.bdev); 716 - 717 - mutex_lock(&adev->shadow_list_lock); 718 - list_add_tail(&vmbo->shadow_list, &adev->shadow_list); 719 - vmbo->shadow->parent = amdgpu_bo_ref(&vmbo->bo); 720 - vmbo->shadow->tbo.destroy = &amdgpu_bo_vm_destroy; 721 - mutex_unlock(&adev->shadow_list_lock); 722 - } 723 - 724 - /** 725 - * amdgpu_bo_restore_shadow - restore an &amdgpu_bo shadow 726 - * 727 - * @shadow: &amdgpu_bo shadow to be restored 728 - * @fence: dma_fence associated with the operation 729 - * 730 - * Copies a buffer object's shadow content back to the object. 731 - * This is used for recovering a buffer from its shadow in case of a gpu 732 - * reset where vram context may be lost. 733 - * 734 - * Returns: 735 - * 0 for success or a negative error code on failure. 736 - */ 737 - int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow, struct dma_fence **fence) 738 - 739 - { 740 - struct amdgpu_device *adev = amdgpu_ttm_adev(shadow->tbo.bdev); 741 - struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; 742 - uint64_t shadow_addr, parent_addr; 743 - 744 - shadow_addr = amdgpu_bo_gpu_offset(shadow); 745 - parent_addr = amdgpu_bo_gpu_offset(shadow->parent); 746 - 747 - return amdgpu_copy_buffer(ring, shadow_addr, parent_addr, 748 - amdgpu_bo_size(shadow), NULL, fence, 749 - true, false, 0); 750 723 } 751 724 752 725 /**
-21
drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
··· 136 136 137 137 struct amdgpu_bo_vm { 138 138 struct amdgpu_bo bo; 139 - struct amdgpu_bo *shadow; 140 - struct list_head shadow_list; 141 139 struct amdgpu_vm_bo_base entries[]; 142 140 }; 143 141 ··· 273 275 return bo->flags & AMDGPU_GEM_CREATE_ENCRYPTED; 274 276 } 275 277 276 - /** 277 - * amdgpu_bo_shadowed - check if the BO is shadowed 278 - * 279 - * @bo: BO to be tested. 280 - * 281 - * Returns: 282 - * NULL if not shadowed or else return a BO pointer. 283 - */ 284 - static inline struct amdgpu_bo *amdgpu_bo_shadowed(struct amdgpu_bo *bo) 285 - { 286 - if (bo->tbo.type == ttm_bo_type_kernel) 287 - return to_amdgpu_bo_vm(bo)->shadow; 288 - 289 - return NULL; 290 - } 291 - 292 278 bool amdgpu_bo_is_amdgpu_bo(struct ttm_buffer_object *bo); 293 279 void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain); 294 280 ··· 331 349 u64 amdgpu_bo_gpu_offset_no_check(struct amdgpu_bo *bo); 332 350 void amdgpu_bo_get_memory(struct amdgpu_bo *bo, 333 351 struct amdgpu_mem_stats *stats); 334 - void amdgpu_bo_add_to_shadow_list(struct amdgpu_bo_vm *vmbo); 335 - int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow, 336 - struct dma_fence **fence); 337 352 uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev, 338 353 uint32_t domain); 339 354
-17
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
··· 465 465 { 466 466 uint64_t new_vm_generation = amdgpu_vm_generation(adev, vm); 467 467 struct amdgpu_vm_bo_base *bo_base; 468 - struct amdgpu_bo *shadow; 469 468 struct amdgpu_bo *bo; 470 469 int r; 471 470 ··· 485 486 spin_unlock(&vm->status_lock); 486 487 487 488 bo = bo_base->bo; 488 - shadow = amdgpu_bo_shadowed(bo); 489 489 490 490 r = validate(param, bo); 491 491 if (r) 492 492 return r; 493 - if (shadow) { 494 - r = validate(param, shadow); 495 - if (r) 496 - return r; 497 - } 498 493 499 494 if (bo->tbo.type != ttm_bo_type_kernel) { 500 495 amdgpu_vm_bo_moved(bo_base); ··· 2142 2149 { 2143 2150 struct amdgpu_vm_bo_base *bo_base; 2144 2151 2145 - /* shadow bo doesn't have bo base, its validation needs its parent */ 2146 - if (bo->parent && (amdgpu_bo_shadowed(bo->parent) == bo)) 2147 - bo = bo->parent; 2148 - 2149 2152 for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) { 2150 2153 struct amdgpu_vm *vm = bo_base->vm; 2151 2154 ··· 2471 2482 root_bo = amdgpu_bo_ref(&root->bo); 2472 2483 r = amdgpu_bo_reserve(root_bo, true); 2473 2484 if (r) { 2474 - amdgpu_bo_unref(&root->shadow); 2475 2485 amdgpu_bo_unref(&root_bo); 2476 2486 goto error_free_delayed; 2477 2487 } ··· 2562 2574 dma_fence_put(vm->last_update); 2563 2575 vm->last_update = dma_fence_get_stub(); 2564 2576 vm->is_compute_context = true; 2565 - 2566 - /* Free the shadow bo for compute VM */ 2567 - amdgpu_bo_unref(&to_amdgpu_bo_vm(vm->root.bo)->shadow); 2568 - 2569 - goto unreserve_bo; 2570 2577 2571 2578 unreserve_bo: 2572 2579 amdgpu_bo_unreserve(vm->root.bo);
+1 -55
drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
··· 383 383 if (r) 384 384 return r; 385 385 386 - if (vmbo->shadow) { 387 - struct amdgpu_bo *shadow = vmbo->shadow; 388 - 389 - r = ttm_bo_validate(&shadow->tbo, &shadow->placement, &ctx); 390 - if (r) 391 - return r; 392 - } 393 - 394 386 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 395 387 return -ENODEV; 396 388 ··· 440 448 int32_t xcp_id) 441 449 { 442 450 struct amdgpu_bo_param bp; 443 - struct amdgpu_bo *bo; 444 - struct dma_resv *resv; 445 451 unsigned int num_entries; 446 - int r; 447 452 448 453 memset(&bp, 0, sizeof(bp)); 449 454 ··· 473 484 if (vm->root.bo) 474 485 bp.resv = vm->root.bo->tbo.base.resv; 475 486 476 - r = amdgpu_bo_create_vm(adev, &bp, vmbo); 477 - if (r) 478 - return r; 479 - 480 - bo = &(*vmbo)->bo; 481 - if (vm->is_compute_context || (adev->flags & AMD_IS_APU)) { 482 - (*vmbo)->shadow = NULL; 483 - return 0; 484 - } 485 - 486 - if (!bp.resv) 487 - WARN_ON(dma_resv_lock(bo->tbo.base.resv, 488 - NULL)); 489 - resv = bp.resv; 490 - memset(&bp, 0, sizeof(bp)); 491 - bp.size = amdgpu_vm_pt_size(adev, level); 492 - bp.domain = AMDGPU_GEM_DOMAIN_GTT; 493 - bp.flags = AMDGPU_GEM_CREATE_CPU_GTT_USWC; 494 - bp.type = ttm_bo_type_kernel; 495 - bp.resv = bo->tbo.base.resv; 496 - bp.bo_ptr_size = sizeof(struct amdgpu_bo); 497 - bp.xcp_id_plus1 = xcp_id + 1; 498 - 499 - r = amdgpu_bo_create(adev, &bp, &(*vmbo)->shadow); 500 - 501 - if (!resv) 502 - dma_resv_unlock(bo->tbo.base.resv); 503 - 504 - if (r) { 505 - amdgpu_bo_unref(&bo); 506 - return r; 507 - } 508 - 509 - amdgpu_bo_add_to_shadow_list(*vmbo); 510 - 511 - return 0; 487 + return amdgpu_bo_create_vm(adev, &bp, vmbo); 512 488 } 513 489 514 490 /** ··· 523 569 return 0; 524 570 525 571 error_free_pt: 526 - amdgpu_bo_unref(&pt->shadow); 527 572 amdgpu_bo_unref(&pt_bo); 528 573 return r; 529 574 } ··· 534 581 */ 535 582 static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry) 536 583 { 537 - struct amdgpu_bo *shadow; 538 - 539 584 if (!entry->bo) 540 585 return; 541 586 542 587 
entry->bo->vm_bo = NULL; 543 - shadow = amdgpu_bo_shadowed(entry->bo); 544 - if (shadow) { 545 - ttm_bo_set_bulk_move(&shadow->tbo, NULL); 546 - amdgpu_bo_unref(&shadow); 547 - } 548 588 ttm_bo_set_bulk_move(&entry->bo->tbo, NULL); 549 589 550 590 spin_lock(&entry->vm->status_lock);
+2 -17
drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
··· 35 35 */ 36 36 static int amdgpu_vm_sdma_map_table(struct amdgpu_bo_vm *table) 37 37 { 38 - int r; 39 - 40 - r = amdgpu_ttm_alloc_gart(&table->bo.tbo); 41 - if (r) 42 - return r; 43 - 44 - if (table->shadow) 45 - r = amdgpu_ttm_alloc_gart(&table->shadow->tbo); 46 - 47 - return r; 38 + return amdgpu_ttm_alloc_gart(&table->bo.tbo); 48 39 } 49 40 50 41 /* Allocate a new job for @count PTE updates */ ··· 256 265 257 266 if (!p->pages_addr) { 258 267 /* set page commands needed */ 259 - if (vmbo->shadow) 260 - amdgpu_vm_sdma_set_ptes(p, vmbo->shadow, pe, addr, 261 - count, incr, flags); 262 268 amdgpu_vm_sdma_set_ptes(p, bo, pe, addr, count, 263 269 incr, flags); 264 270 return 0; 265 271 } 266 272 267 273 /* copy commands needed */ 268 - ndw -= p->adev->vm_manager.vm_pte_funcs->copy_pte_num_dw * 269 - (vmbo->shadow ? 2 : 1); 274 + ndw -= p->adev->vm_manager.vm_pte_funcs->copy_pte_num_dw; 270 275 271 276 /* for padding */ 272 277 ndw -= 7; ··· 277 290 pte[i] |= flags; 278 291 } 279 292 280 - if (vmbo->shadow) 281 - amdgpu_vm_sdma_copy_ptes(p, vmbo->shadow, pe, nptes); 282 293 amdgpu_vm_sdma_copy_ptes(p, bo, pe, nptes); 283 294 284 295 pe += nptes * 8;