Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: implement TLB flush fence

The problem is that when (for example) 4k pages are replaced
with a single 2M page, we need to wait for the change to be flushed
out by invalidating the TLB before the PT can be freed.

Solve this by moving the TLB flush into a DMA-fence object which
can be used to delay the freeing of the PT BOs until it is signaled.

V2: (Shashank)
- rebase
- set dma_fence_error only in case of error
- add tlb_flush fence only when PT/PD BO is locked (Felix)
- use vm->pasid when f is NULL (Mukul)

V4: - add a wait for (f->dependency) in tlb_fence_work (Christian)
- move the misplaced fence_create call to the end (Philip)

V5: - free the f->dependency properly

V6: (Shashank)
- light code movement, moved all the clean-up in previous patch
- introduce params.needs_flush and its usage in this patch
- rebase without TLB HW sequence patch

V7:
- Keep the vm->last_update_fence and tlb_cb code until
we can fix the HW sequencing (Christian)
- Move all the tlb_fence related code in a separate function so that
its easier to read and review

V9: Addressed review comments from Christian
- start PT update only when we have callback memory allocated

V10:
- handle device unlock in OOM case (Christian, Mukul)
- added Christian's R-B

Cc: Christian König <christian.koenig@amd.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Reviewed-by: Shashank Sharma <shashank.sharma@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Shashank Sharma <shashank.sharma@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Christian König and committed by
Alex Deucher
d8a3f0a0 e6136150

+175 -20
+2 -1
drivers/gpu/drm/amd/amdgpu/Makefile
··· 70 70 amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \ 71 71 atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \ 72 72 atombios_encoders.o amdgpu_sa.o atombios_i2c.o \ 73 - amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \ 73 + amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \ 74 + amdgpu_ib.o amdgpu_pll.o \ 74 75 amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \ 75 76 amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \ 76 77 amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
+47 -15
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
··· 886 886 } 887 887 888 888 /** 889 + * amdgpu_vm_tlb_flush - prepare TLB flush 890 + * 891 + * @params: parameters for update 892 + * @fence: input fence to sync TLB flush with 893 + * @tlb_cb: the callback structure 894 + * 895 + * Increments the tlb sequence to make sure that future CS execute a VM flush. 896 + */ 897 + static void 898 + amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params, 899 + struct dma_fence **fence, 900 + struct amdgpu_vm_tlb_seq_struct *tlb_cb) 901 + { 902 + struct amdgpu_vm *vm = params->vm; 903 + 904 + if (!fence || !*fence) 905 + return; 906 + 907 + tlb_cb->vm = vm; 908 + if (!dma_fence_add_callback(*fence, &tlb_cb->cb, 909 + amdgpu_vm_tlb_seq_cb)) { 910 + dma_fence_put(vm->last_tlb_flush); 911 + vm->last_tlb_flush = dma_fence_get(*fence); 912 + } else { 913 + amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb); 914 + } 915 + 916 + /* Prepare a TLB flush fence to be attached to PTs */ 917 + if (!params->unlocked && vm->is_compute_context) { 918 + amdgpu_vm_tlb_fence_create(params->adev, vm, fence); 919 + 920 + /* Makes sure no PD/PT is freed before the flush */ 921 + dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence, 922 + DMA_RESV_USAGE_BOOKKEEP); 923 + } 924 + } 925 + 926 + /** 889 927 * amdgpu_vm_update_range - update a range in the vm page table 890 928 * 891 929 * @adev: amdgpu_device pointer to use for commands ··· 954 916 struct ttm_resource *res, dma_addr_t *pages_addr, 955 917 struct dma_fence **fence) 956 918 { 957 - struct amdgpu_vm_update_params params; 958 919 struct amdgpu_vm_tlb_seq_struct *tlb_cb; 920 + struct amdgpu_vm_update_params params; 959 921 struct amdgpu_res_cursor cursor; 960 922 enum amdgpu_sync_mode sync_mode; 961 923 int r, idx; ··· 965 927 966 928 tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL); 967 929 if (!tlb_cb) { 968 - r = -ENOMEM; 969 - goto error_unlock; 930 + drm_dev_exit(idx); 931 + return -ENOMEM; 970 932 } 971 933 972 934 /* Vega20+XGMI where PTEs get inadvertently cached in L2 texture cache, ··· 
986 948 params.immediate = immediate; 987 949 params.pages_addr = pages_addr; 988 950 params.unlocked = unlocked; 951 + params.needs_flush = flush_tlb; 989 952 params.allow_override = allow_override; 990 953 991 954 /* Implicitly sync to command submissions in the same VM before ··· 1070 1031 } 1071 1032 1072 1033 r = vm->update_funcs->commit(&params, fence); 1034 + if (r) 1035 + goto error_free; 1073 1036 1074 - if (flush_tlb || params.table_freed) { 1075 - tlb_cb->vm = vm; 1076 - if (fence && *fence && 1077 - !dma_fence_add_callback(*fence, &tlb_cb->cb, 1078 - amdgpu_vm_tlb_seq_cb)) { 1079 - dma_fence_put(vm->last_tlb_flush); 1080 - vm->last_tlb_flush = dma_fence_get(*fence); 1081 - } else { 1082 - amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb); 1083 - } 1037 + if (params.needs_flush) { 1038 + amdgpu_vm_tlb_flush(&params, fence, tlb_cb); 1084 1039 tlb_cb = NULL; 1085 1040 } 1086 1041 1087 1042 error_free: 1088 1043 kfree(tlb_cb); 1089 - 1090 - error_unlock: 1091 1044 amdgpu_vm_eviction_unlock(vm); 1092 1045 drm_dev_exit(idx); 1093 1046 return r; ··· 2422 2391 2423 2392 mutex_init(&vm->eviction_lock); 2424 2393 vm->evicting = false; 2394 + vm->tlb_fence_context = dma_fence_context_alloc(1); 2425 2395 2426 2396 r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level, 2427 2397 false, &root, xcp_id);
+6 -2
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
··· 257 257 unsigned int num_dw_left; 258 258 259 259 /** 260 - * @table_freed: return true if page table is freed when updating 260 + * @needs_flush: true whenever we need to invalidate the TLB 261 261 */ 262 - bool table_freed; 262 + bool needs_flush; 263 263 264 264 /** 265 265 * @allow_override: true for memory that is not uncached: allows MTYPE ··· 342 342 atomic64_t tlb_seq; 343 343 struct dma_fence *last_tlb_flush; 344 344 atomic64_t kfd_last_flushed_seq; 345 + uint64_t tlb_fence_context; 345 346 346 347 /* How many times we had to re-generate the page tables */ 347 348 uint64_t generation; ··· 612 611 uint64_t addr, 613 612 uint32_t status, 614 613 unsigned int vmhub); 614 + void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, 615 + struct amdgpu_vm *vm, 616 + struct dma_fence **fence); 615 617 616 618 #endif
+3 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
··· 108 108 static int amdgpu_vm_cpu_commit(struct amdgpu_vm_update_params *p, 109 109 struct dma_fence **fence) 110 110 { 111 - /* Flush HDP */ 111 + if (p->needs_flush) 112 + atomic64_inc(&p->vm->tlb_seq); 113 + 112 114 mb(); 113 115 amdgpu_device_flush_hdp(p->adev, NULL); 114 116 return 0;
+1 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
··· 972 972 while (cursor.pfn < frag_start) { 973 973 /* Make sure previous mapping is freed */ 974 974 if (cursor.entry->bo) { 975 - params->table_freed = true; 975 + params->needs_flush = true; 976 976 amdgpu_vm_pt_free_dfs(adev, params->vm, 977 977 &cursor, 978 978 params->unlocked);
+4
drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
··· 126 126 127 127 WARN_ON(ib->length_dw == 0); 128 128 amdgpu_ring_pad_ib(ring, ib); 129 + 130 + if (p->needs_flush) 131 + atomic64_inc(&p->vm->tlb_seq); 132 + 129 133 WARN_ON(ib->length_dw > p->num_dw_left); 130 134 f = amdgpu_job_submit(p->job); 131 135
+112
drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 OR MIT 2 + /* 3 + * Copyright 2023 Advanced Micro Devices, Inc. 4 + * 5 + * Permission is hereby granted, free of charge, to any person obtaining a 6 + * copy of this software and associated documentation files (the "Software"), 7 + * to deal in the Software without restriction, including without limitation 8 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 + * and/or sell copies of the Software, and to permit persons to whom the 10 + * Software is furnished to do so, subject to the following conditions: 11 + * 12 + * The above copyright notice and this permission notice shall be included in 13 + * all copies or substantial portions of the Software. 14 + * 15 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 + * OTHER DEALINGS IN THE SOFTWARE. 
22 + */ 23 + 24 + #include <linux/dma-fence.h> 25 + #include <linux/workqueue.h> 26 + 27 + #include "amdgpu.h" 28 + #include "amdgpu_vm.h" 29 + #include "amdgpu_gmc.h" 30 + 31 + struct amdgpu_tlb_fence { 32 + struct dma_fence base; 33 + struct amdgpu_device *adev; 34 + struct dma_fence *dependency; 35 + struct work_struct work; 36 + spinlock_t lock; 37 + uint16_t pasid; 38 + 39 + }; 40 + 41 + static const char *amdgpu_tlb_fence_get_driver_name(struct dma_fence *fence) 42 + { 43 + return "amdgpu tlb fence"; 44 + } 45 + 46 + static const char *amdgpu_tlb_fence_get_timeline_name(struct dma_fence *f) 47 + { 48 + return "amdgpu tlb timeline"; 49 + } 50 + 51 + static void amdgpu_tlb_fence_work(struct work_struct *work) 52 + { 53 + struct amdgpu_tlb_fence *f = container_of(work, typeof(*f), work); 54 + int r; 55 + 56 + if (f->dependency) { 57 + dma_fence_wait(f->dependency, false); 58 + dma_fence_put(f->dependency); 59 + f->dependency = NULL; 60 + } 61 + 62 + r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid, 2, true, 0); 63 + if (r) { 64 + dev_err(f->adev->dev, "TLB flush failed for PASID %d.\n", 65 + f->pasid); 66 + dma_fence_set_error(&f->base, r); 67 + } 68 + 69 + dma_fence_signal(&f->base); 70 + dma_fence_put(&f->base); 71 + } 72 + 73 + static const struct dma_fence_ops amdgpu_tlb_fence_ops = { 74 + .use_64bit_seqno = true, 75 + .get_driver_name = amdgpu_tlb_fence_get_driver_name, 76 + .get_timeline_name = amdgpu_tlb_fence_get_timeline_name 77 + }; 78 + 79 + void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct amdgpu_vm *vm, 80 + struct dma_fence **fence) 81 + { 82 + struct amdgpu_tlb_fence *f; 83 + 84 + f = kmalloc(sizeof(*f), GFP_KERNEL); 85 + if (!f) { 86 + /* 87 + * We can't fail since the PDEs and PTEs are already updated, so 88 + * just block for the dependency and execute the TLB flush 89 + */ 90 + if (*fence) 91 + dma_fence_wait(*fence, false); 92 + 93 + amdgpu_gmc_flush_gpu_tlb_pasid(adev, vm->pasid, 2, true, 0); 94 + *fence = 
dma_fence_get_stub(); 95 + return; 96 + } 97 + 98 + f->adev = adev; 99 + f->dependency = *fence; 100 + f->pasid = vm->pasid; 101 + INIT_WORK(&f->work, amdgpu_tlb_fence_work); 102 + spin_lock_init(&f->lock); 103 + 104 + dma_fence_init(&f->base, &amdgpu_tlb_fence_ops, &f->lock, 105 + vm->tlb_fence_context, atomic64_read(&vm->tlb_seq)); 106 + 107 + /* TODO: We probably need a separate wq here */ 108 + dma_fence_get(&f->base); 109 + schedule_work(&f->work); 110 + 111 + *fence = &f->base; 112 + }