Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

drm/amdgpu: fix userptr HMM range handling v2

The basic problem here is that it's not allowed to page fault while
holding the reservation lock.

So it can happen that multiple processes try to validate the same
userptr at the same time.

Work around that by putting the HMM range object into the
mutex-protected bo list for now.

v2: make sure range is set to NULL in case of an error

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
CC: stable@vger.kernel.org
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by Christian König, committed by Alex Deucher
4458da0b b39df63b

7 files changed, 46 insertions(+), 51 deletions(-)
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c (+8 -4)
···
 	struct amdkfd_process_info *process_info = mem->process_info;
 	struct amdgpu_bo *bo = mem->bo;
 	struct ttm_operation_ctx ctx = { true, false };
+	struct hmm_range *range;
 	int ret = 0;
 
 	mutex_lock(&process_info->lock);
···
 		return 0;
 	}
 
-	ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
+	ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);
 	if (ret) {
 		pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
 		goto unregister_out;
···
 	amdgpu_bo_unreserve(bo);
 
 release_out:
-	amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
+	amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
 unregister_out:
 	if (ret)
 		amdgpu_mn_unregister(bo);
···
 	/* Go through userptr_inval_list and update any invalid user_pages */
 	list_for_each_entry(mem, &process_info->userptr_inval_list,
 			    validate_list.head) {
+		struct hmm_range *range;
+
 		invalid = atomic_read(&mem->invalid);
 		if (!invalid)
 			/* BO hasn't been invalidated since the last
···
 		bo = mem->bo;
 
 		/* Get updated user pages */
-		ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
+		ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
+						   &range);
 		if (ret) {
 			pr_debug("Failed %d to get user pages\n", ret);
 
···
 		 * FIXME: Cannot ignore the return code, must hold
 		 * notifier_lock
 		 */
-		amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
+		amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
 	}
 
 	/* Mark the BO as valid unless it was invalidated
drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c (+1)
···
 		list_add_tail(&e->tv.head, &bucket[priority]);
 
 		e->user_pages = NULL;
+		e->range = NULL;
 	}
 
 	/* Connect the sorted buckets in the output list. */
drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h (+3)
···
 #include <drm/ttm/ttm_execbuf_util.h>
 #include <drm/amdgpu_drm.h>
 
+struct hmm_range;
+
 struct amdgpu_device;
 struct amdgpu_bo;
 struct amdgpu_bo_va;
···
 	struct amdgpu_bo_va *bo_va;
 	uint32_t priority;
 	struct page **user_pages;
+	struct hmm_range *range;
 	bool user_invalidated;
 };
 
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c (+5 -3)
···
 			goto out_free_user_pages;
 		}
 
-		r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages);
+		r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages, &e->range);
 		if (r) {
 			kvfree(e->user_pages);
 			e->user_pages = NULL;
···
 
 		if (!e->user_pages)
 			continue;
-		amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
+		amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, e->range);
 		kvfree(e->user_pages);
 		e->user_pages = NULL;
+		e->range = NULL;
 	}
 	mutex_unlock(&p->bo_list->bo_list_mutex);
 	return r;
···
 	amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
 		struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
 
-		r |= !amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
+		r |= !amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, e->range);
+		e->range = NULL;
 	}
 	if (r) {
 		r = -EAGAIN;
drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c (+4 -2)
···
 	struct amdgpu_device *adev = drm_to_adev(dev);
 	struct drm_amdgpu_gem_userptr *args = data;
 	struct drm_gem_object *gobj;
+	struct hmm_range *range;
 	struct amdgpu_bo *bo;
 	uint32_t handle;
 	int r;
···
 		goto release_object;
 
 	if (args->flags & AMDGPU_GEM_USERPTR_VALIDATE) {
-		r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
+		r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
+						 &range);
 		if (r)
 			goto release_object;
 
···
 
 user_pages_done:
 	if (args->flags & AMDGPU_GEM_USERPTR_VALIDATE)
-		amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
+		amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range);
 
 release_object:
 	drm_gem_object_put(gobj);
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c (+15 -38)
···
 	struct task_struct *usertask;
 	uint32_t userflags;
 	bool bound;
-#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
-	struct hmm_range *range;
-#endif
 };
 
 #define ttm_to_amdgpu_ttm_tt(ptr) container_of(ptr, struct amdgpu_ttm_tt, ttm)
···
  * Calling function must call amdgpu_ttm_tt_userptr_range_done() once and only
  * once afterwards to stop HMM tracking
  */
-int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
+int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages,
+				 struct hmm_range **range)
 {
 	struct ttm_tt *ttm = bo->tbo.ttm;
 	struct amdgpu_ttm_tt *gtt = ttm_to_amdgpu_ttm_tt(ttm);
···
 	bool readonly;
 	int r = 0;
 
+	/* Make sure get_user_pages_done() can cleanup gracefully */
+	*range = NULL;
+
 	mm = bo->notifier.mm;
 	if (unlikely(!mm)) {
 		DRM_DEBUG_DRIVER("BO is not registered?\n");
 		return -EFAULT;
 	}
-
-	/* Another get_user_pages is running at the same time?? */
-	if (WARN_ON(gtt->range))
-		return -EFAULT;
 
 	if (!mmget_not_zero(mm)) /* Happens during process shutdown */
 		return -ESRCH;
···
 
 	readonly = amdgpu_ttm_tt_is_readonly(ttm);
 	r = amdgpu_hmm_range_get_pages(&bo->notifier, mm, pages, start,
-				       ttm->num_pages, &gtt->range, readonly,
+				       ttm->num_pages, range, readonly,
 				       true, NULL);
 out_unlock:
 	mmap_read_unlock(mm);
···
  *
  * Returns: true if pages are still valid
  */
-bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
+bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
+				       struct hmm_range *range)
 {
 	struct amdgpu_ttm_tt *gtt = ttm_to_amdgpu_ttm_tt(ttm);
-	bool r = false;
 
-	if (!gtt || !gtt->userptr)
+	if (!gtt || !gtt->userptr || !range)
 		return false;
 
 	DRM_DEBUG_DRIVER("user_pages_done 0x%llx pages 0x%x\n",
 			 gtt->userptr, ttm->num_pages);
 
-	WARN_ONCE(!gtt->range || !gtt->range->hmm_pfns,
-		  "No user pages to check\n");
+	WARN_ONCE(!range->hmm_pfns, "No user pages to check\n");
 
-	if (gtt->range) {
-		/*
-		 * FIXME: Must always hold notifier_lock for this, and must
-		 * not ignore the return code.
-		 */
-		r = amdgpu_hmm_range_get_pages_done(gtt->range);
-		gtt->range = NULL;
-	}
-
-	return !r;
+	/*
+	 * FIXME: Must always hold notifier_lock for this, and must
+	 * not ignore the return code.
+	 */
+	return !amdgpu_hmm_range_get_pages_done(range);
 }
 #endif
 
···
 	/* unmap the pages mapped to the device */
 	dma_unmap_sgtable(adev->dev, ttm->sg, direction, 0);
 	sg_free_table(ttm->sg);
-
-#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
-	if (gtt->range) {
-		unsigned long i;
-
-		for (i = 0; i < ttm->num_pages; i++) {
-			if (ttm->pages[i] !=
-				hmm_pfn_to_page(gtt->range->hmm_pfns[i]))
-				break;
-		}
-
-		WARN((i == ttm->num_pages), "Missing get_user_page_done\n");
-	}
-#endif
 }
 
 static void amdgpu_ttm_gart_bind(struct amdgpu_device *adev,
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h (+10 -4)
···
 
 #define AMDGPU_POISON 0xd0bed0be
 
+struct hmm_range;
+
 struct amdgpu_gtt_mgr {
 	struct ttm_resource_manager manager;
 	struct drm_mm mm;
···
 uint64_t amdgpu_ttm_domain_start(struct amdgpu_device *adev, uint32_t type);
 
 #if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
-int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages);
-bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm);
+int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages,
+				 struct hmm_range **range);
+bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
+				       struct hmm_range *range);
 #else
 static inline int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo,
-					       struct page **pages)
+					       struct page **pages,
+					       struct hmm_range **range)
 {
 	return -EPERM;
 }
-static inline bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
+static inline bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm,
+						     struct hmm_range *range)
 {
 	return false;
 }
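
To make the new calling convention easier to follow, here is a minimal caller-side sketch. It is not part of the patch; validate_userptr_bo() is a hypothetical helper used only for illustration. The point of the change is that the hmm_range now travels with the caller (for example in struct amdgpu_bo_list_entry) instead of living in the shared amdgpu_ttm_tt, so concurrent validations of the same userptr BO no longer collide on a single per-TTM field.

/* Hypothetical helper sketching the reworked userptr interface. */
static int validate_userptr_bo(struct amdgpu_bo *bo)
{
	struct hmm_range *range;
	int r;

	/* Grab the current user pages. With the v2 fix, *range is set to
	 * NULL on failure, so handing it to ..._get_user_pages_done() in a
	 * shared cleanup path is always safe.
	 */
	r = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);
	if (r)
		return r;

	/* ... reserve the BO and validate/populate it here ... */

	/* Stop HMM tracking. A false return means the range was
	 * invalidated while we worked on it and the caller must retry.
	 */
	if (!amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range))
		r = -EAGAIN;

	return r;
}

This mirrors the amdgpu_cs.c path above, where e->range lives in the mutex-protected bo list entry that the commit message refers to.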