Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'drm-amdkfd-next-2018-05-14' of git://people.freedesktop.org/~gabbayo/linux into drm-next

This is the amdkfd pull request for 4.18. The major new features are:

- Add support for GFXv9 dGPUs (VEGA)
- Add support for userptr memory mapping

In addition, there are several small fixes and improvements, such as:
- Fix lock handling
- Fix rollback packet in kernel kfd_queue
- Optimize kfd signal handling
- Fix CP hang in APU

Signed-off-by: Dave Airlie <airlied@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180514070126.GA1827@odedg-x270

+6222 -858
+2
MAINTAINERS
··· 767 767 F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 768 768 F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c 769 769 F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c 770 + F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 770 771 F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c 771 772 F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 772 773 F: drivers/gpu/drm/amd/amdkfd/ 773 774 F: drivers/gpu/drm/amd/include/cik_structs.h 774 775 F: drivers/gpu/drm/amd/include/kgd_kfd_interface.h 775 776 F: drivers/gpu/drm/amd/include/vi_structs.h 777 + F: drivers/gpu/drm/amd/include/v9_structs.h 776 778 F: include/uapi/linux/kfd_ioctl.h 777 779 778 780 AMD SEATTLE DEVICE TREE SUPPORT
+2 -1
drivers/gpu/drm/amd/amdgpu/Makefile
··· 130 130 amdgpu_amdkfd.o \ 131 131 amdgpu_amdkfd_fence.o \ 132 132 amdgpu_amdkfd_gpuvm.o \ 133 - amdgpu_amdkfd_gfx_v8.o 133 + amdgpu_amdkfd_gfx_v8.o \ 134 + amdgpu_amdkfd_gfx_v9.o 134 135 135 136 # add cgs 136 137 amdgpu-y += amdgpu_cgs.o
+26
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
··· 92 92 case CHIP_POLARIS11: 93 93 kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions(); 94 94 break; 95 + case CHIP_VEGA10: 96 + case CHIP_RAVEN: 97 + kfd2kgd = amdgpu_amdkfd_gfx_9_0_get_functions(); 98 + break; 95 99 default: 96 100 dev_dbg(adev->dev, "kfd not supported on this ASIC\n"); 97 101 return; ··· 179 175 &gpu_resources.doorbell_physical_address, 180 176 &gpu_resources.doorbell_aperture_size, 181 177 &gpu_resources.doorbell_start_offset); 178 + if (adev->asic_type >= CHIP_VEGA10) { 179 + /* On SOC15 the BIF is involved in routing 180 + * doorbells using the low 12 bits of the 181 + * address. Communicate the assignments to 182 + * KFD. KFD uses two doorbell pages per 183 + * process in case of 64-bit doorbells so we 184 + * can use each doorbell assignment twice. 185 + */ 186 + gpu_resources.sdma_doorbell[0][0] = 187 + AMDGPU_DOORBELL64_sDMA_ENGINE0; 188 + gpu_resources.sdma_doorbell[0][1] = 189 + AMDGPU_DOORBELL64_sDMA_ENGINE0 + 0x200; 190 + gpu_resources.sdma_doorbell[1][0] = 191 + AMDGPU_DOORBELL64_sDMA_ENGINE1; 192 + gpu_resources.sdma_doorbell[1][1] = 193 + AMDGPU_DOORBELL64_sDMA_ENGINE1 + 0x200; 194 + /* Doorbells 0x0f0-0ff and 0x2f0-2ff are reserved for 195 + * SDMA, IH and VCN. So don't use them for the CP. 196 + */ 197 + gpu_resources.reserved_doorbell_mask = 0x1f0; 198 + gpu_resources.reserved_doorbell_val = 0x0f0; 199 + } 182 200 183 201 kgd2kfd->device_init(adev->kfd, &gpu_resources); 184 202 }
+13
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
··· 28 28 #include <linux/types.h> 29 29 #include <linux/mm.h> 30 30 #include <linux/mmu_context.h> 31 + #include <linux/workqueue.h> 31 32 #include <kgd_kfd_interface.h> 32 33 #include <drm/ttm/ttm_execbuf_util.h> 33 34 #include "amdgpu_sync.h" ··· 60 59 61 60 uint32_t mapping_flags; 62 61 62 + atomic_t invalid; 63 63 struct amdkfd_process_info *process_info; 64 + struct page **user_pages; 64 65 65 66 struct amdgpu_sync sync; 66 67 ··· 87 84 struct list_head vm_list_head; 88 85 /* List head for all KFD BOs that belong to a KFD process. */ 89 86 struct list_head kfd_bo_list; 87 + /* List of userptr BOs that are valid or invalid */ 88 + struct list_head userptr_valid_list; 89 + struct list_head userptr_inval_list; 90 90 /* Lock to protect kfd_bo_list */ 91 91 struct mutex lock; 92 92 ··· 97 91 unsigned int n_vms; 98 92 /* Eviction Fence */ 99 93 struct amdgpu_amdkfd_fence *eviction_fence; 94 + 95 + /* MMU-notifier related fields */ 96 + atomic_t evicted_bos; 97 + struct delayed_work restore_userptr_work; 98 + struct pid *pid; 100 99 }; 101 100 102 101 int amdgpu_amdkfd_init(void); ··· 115 104 void amdgpu_amdkfd_device_init(struct amdgpu_device *adev); 116 105 void amdgpu_amdkfd_device_fini(struct amdgpu_device *adev); 117 106 107 + int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm); 118 108 int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, 119 109 uint32_t vmid, uint64_t gpu_addr, 120 110 uint32_t *ib_cmd, uint32_t ib_len); 121 111 122 112 struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void); 123 113 struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void); 114 + struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void); 124 115 125 116 bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid); 126 117
-10
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
··· 98 98 static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, 99 99 unsigned int vmid); 100 100 101 - static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, 102 - uint32_t hpd_size, uint64_t hpd_gpu_addr); 103 101 static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); 104 102 static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, 105 103 uint32_t queue_id, uint32_t __user *wptr, ··· 181 183 .free_pasid = amdgpu_pasid_free, 182 184 .program_sh_mem_settings = kgd_program_sh_mem_settings, 183 185 .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, 184 - .init_pipeline = kgd_init_pipeline, 185 186 .init_interrupts = kgd_init_interrupts, 186 187 .hqd_load = kgd_hqd_load, 187 188 .hqd_sdma_load = kgd_hqd_sdma_load, ··· 303 306 /* Mapping vmid to pasid also for IH block */ 304 307 WREG32(mmIH_VMID_0_LUT + vmid, pasid_mapping); 305 308 306 - return 0; 307 - } 308 - 309 - static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, 310 - uint32_t hpd_size, uint64_t hpd_gpu_addr) 311 - { 312 - /* amdgpu owns the per-pipe state */ 313 309 return 0; 314 310 } 315 311
-10
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
··· 57 57 uint32_t sh_mem_bases); 58 58 static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, 59 59 unsigned int vmid); 60 - static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, 61 - uint32_t hpd_size, uint64_t hpd_gpu_addr); 62 60 static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); 63 61 static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, 64 62 uint32_t queue_id, uint32_t __user *wptr, ··· 139 141 .free_pasid = amdgpu_pasid_free, 140 142 .program_sh_mem_settings = kgd_program_sh_mem_settings, 141 143 .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, 142 - .init_pipeline = kgd_init_pipeline, 143 144 .init_interrupts = kgd_init_interrupts, 144 145 .hqd_load = kgd_hqd_load, 145 146 .hqd_sdma_load = kgd_hqd_sdma_load, ··· 264 267 /* Mapping vmid to pasid also for IH block */ 265 268 WREG32(mmIH_VMID_0_LUT + vmid, pasid_mapping); 266 269 267 - return 0; 268 - } 269 - 270 - static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, 271 - uint32_t hpd_size, uint64_t hpd_gpu_addr) 272 - { 273 - /* amdgpu owns the per-pipe state */ 274 270 return 0; 275 271 } 276 272
+1043
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
··· 1 + /* 2 + * Copyright 2014-2018 Advanced Micro Devices, Inc. 3 + * 4 + * Permission is hereby granted, free of charge, to any person obtaining a 5 + * copy of this software and associated documentation files (the "Software"), 6 + * to deal in the Software without restriction, including without limitation 7 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 + * and/or sell copies of the Software, and to permit persons to whom the 9 + * Software is furnished to do so, subject to the following conditions: 10 + * 11 + * The above copyright notice and this permission notice shall be included in 12 + * all copies or substantial portions of the Software. 13 + * 14 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 + * OTHER DEALINGS IN THE SOFTWARE. 
21 + */ 22 + 23 + #define pr_fmt(fmt) "kfd2kgd: " fmt 24 + 25 + #include <linux/module.h> 26 + #include <linux/fdtable.h> 27 + #include <linux/uaccess.h> 28 + #include <linux/firmware.h> 29 + #include <drm/drmP.h> 30 + #include "amdgpu.h" 31 + #include "amdgpu_amdkfd.h" 32 + #include "amdgpu_ucode.h" 33 + #include "soc15_hw_ip.h" 34 + #include "gc/gc_9_0_offset.h" 35 + #include "gc/gc_9_0_sh_mask.h" 36 + #include "vega10_enum.h" 37 + #include "sdma0/sdma0_4_0_offset.h" 38 + #include "sdma0/sdma0_4_0_sh_mask.h" 39 + #include "sdma1/sdma1_4_0_offset.h" 40 + #include "sdma1/sdma1_4_0_sh_mask.h" 41 + #include "athub/athub_1_0_offset.h" 42 + #include "athub/athub_1_0_sh_mask.h" 43 + #include "oss/osssys_4_0_offset.h" 44 + #include "oss/osssys_4_0_sh_mask.h" 45 + #include "soc15_common.h" 46 + #include "v9_structs.h" 47 + #include "soc15.h" 48 + #include "soc15d.h" 49 + 50 + /* HACK: MMHUB and GC both have VM-related register with the same 51 + * names but different offsets. Define the MMHUB register we need here 52 + * with a prefix. A proper solution would be to move the functions 53 + * programming these registers into gfx_v9_0.c and mmhub_v1_0.c 54 + * respectively. 
55 + */ 56 + #define mmMMHUB_VM_INVALIDATE_ENG16_REQ 0x06f3 57 + #define mmMMHUB_VM_INVALIDATE_ENG16_REQ_BASE_IDX 0 58 + 59 + #define mmMMHUB_VM_INVALIDATE_ENG16_ACK 0x0705 60 + #define mmMMHUB_VM_INVALIDATE_ENG16_ACK_BASE_IDX 0 61 + 62 + #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32 0x072b 63 + #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32_BASE_IDX 0 64 + #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32 0x072c 65 + #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32_BASE_IDX 0 66 + 67 + #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32 0x074b 68 + #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32_BASE_IDX 0 69 + #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32 0x074c 70 + #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32_BASE_IDX 0 71 + 72 + #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32 0x076b 73 + #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32_BASE_IDX 0 74 + #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32 0x076c 75 + #define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32_BASE_IDX 0 76 + 77 + #define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32 0x0727 78 + #define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32_BASE_IDX 0 79 + #define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32 0x0728 80 + #define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32_BASE_IDX 0 81 + 82 + #define V9_PIPE_PER_MEC (4) 83 + #define V9_QUEUES_PER_PIPE_MEC (8) 84 + 85 + enum hqd_dequeue_request_type { 86 + NO_ACTION = 0, 87 + DRAIN_PIPE, 88 + RESET_WAVES 89 + }; 90 + 91 + /* 92 + * Register access functions 93 + */ 94 + 95 + static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, 96 + uint32_t sh_mem_config, 97 + uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit, 98 + uint32_t sh_mem_bases); 99 + static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, 100 + unsigned int vmid); 101 + static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); 102 + static int kgd_hqd_load(struct 
kgd_dev *kgd, void *mqd, uint32_t pipe_id, 103 + uint32_t queue_id, uint32_t __user *wptr, 104 + uint32_t wptr_shift, uint32_t wptr_mask, 105 + struct mm_struct *mm); 106 + static int kgd_hqd_dump(struct kgd_dev *kgd, 107 + uint32_t pipe_id, uint32_t queue_id, 108 + uint32_t (**dump)[2], uint32_t *n_regs); 109 + static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, 110 + uint32_t __user *wptr, struct mm_struct *mm); 111 + static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, 112 + uint32_t engine_id, uint32_t queue_id, 113 + uint32_t (**dump)[2], uint32_t *n_regs); 114 + static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, 115 + uint32_t pipe_id, uint32_t queue_id); 116 + static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); 117 + static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, 118 + enum kfd_preempt_type reset_type, 119 + unsigned int utimeout, uint32_t pipe_id, 120 + uint32_t queue_id); 121 + static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, 122 + unsigned int utimeout); 123 + static int kgd_address_watch_disable(struct kgd_dev *kgd); 124 + static int kgd_address_watch_execute(struct kgd_dev *kgd, 125 + unsigned int watch_point_id, 126 + uint32_t cntl_val, 127 + uint32_t addr_hi, 128 + uint32_t addr_lo); 129 + static int kgd_wave_control_execute(struct kgd_dev *kgd, 130 + uint32_t gfx_index_val, 131 + uint32_t sq_cmd); 132 + static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, 133 + unsigned int watch_point_id, 134 + unsigned int reg_offset); 135 + 136 + static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, 137 + uint8_t vmid); 138 + static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, 139 + uint8_t vmid); 140 + static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, 141 + uint32_t page_table_base); 142 + static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); 143 + static void set_scratch_backing_va(struct 
kgd_dev *kgd, 144 + uint64_t va, uint32_t vmid); 145 + static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); 146 + static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid); 147 + 148 + /* Because of REG_GET_FIELD() being used, we put this function in the 149 + * asic specific file. 150 + */ 151 + static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, 152 + struct tile_config *config) 153 + { 154 + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; 155 + 156 + config->gb_addr_config = adev->gfx.config.gb_addr_config; 157 + 158 + config->tile_config_ptr = adev->gfx.config.tile_mode_array; 159 + config->num_tile_configs = 160 + ARRAY_SIZE(adev->gfx.config.tile_mode_array); 161 + config->macro_tile_config_ptr = 162 + adev->gfx.config.macrotile_mode_array; 163 + config->num_macro_tile_configs = 164 + ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); 165 + 166 + return 0; 167 + } 168 + 169 + static const struct kfd2kgd_calls kfd2kgd = { 170 + .init_gtt_mem_allocation = alloc_gtt_mem, 171 + .free_gtt_mem = free_gtt_mem, 172 + .get_local_mem_info = get_local_mem_info, 173 + .get_gpu_clock_counter = get_gpu_clock_counter, 174 + .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, 175 + .alloc_pasid = amdgpu_pasid_alloc, 176 + .free_pasid = amdgpu_pasid_free, 177 + .program_sh_mem_settings = kgd_program_sh_mem_settings, 178 + .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, 179 + .init_interrupts = kgd_init_interrupts, 180 + .hqd_load = kgd_hqd_load, 181 + .hqd_sdma_load = kgd_hqd_sdma_load, 182 + .hqd_dump = kgd_hqd_dump, 183 + .hqd_sdma_dump = kgd_hqd_sdma_dump, 184 + .hqd_is_occupied = kgd_hqd_is_occupied, 185 + .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, 186 + .hqd_destroy = kgd_hqd_destroy, 187 + .hqd_sdma_destroy = kgd_hqd_sdma_destroy, 188 + .address_watch_disable = kgd_address_watch_disable, 189 + .address_watch_execute = kgd_address_watch_execute, 190 + .wave_control_execute = kgd_wave_control_execute, 191 + 
.address_watch_get_offset = kgd_address_watch_get_offset, 192 + .get_atc_vmid_pasid_mapping_pasid = 193 + get_atc_vmid_pasid_mapping_pasid, 194 + .get_atc_vmid_pasid_mapping_valid = 195 + get_atc_vmid_pasid_mapping_valid, 196 + .get_fw_version = get_fw_version, 197 + .set_scratch_backing_va = set_scratch_backing_va, 198 + .get_tile_config = amdgpu_amdkfd_get_tile_config, 199 + .get_cu_info = get_cu_info, 200 + .get_vram_usage = amdgpu_amdkfd_get_vram_usage, 201 + .create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm, 202 + .acquire_process_vm = amdgpu_amdkfd_gpuvm_acquire_process_vm, 203 + .destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm, 204 + .get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir, 205 + .set_vm_context_page_table_base = set_vm_context_page_table_base, 206 + .alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu, 207 + .free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu, 208 + .map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu, 209 + .unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu, 210 + .sync_memory = amdgpu_amdkfd_gpuvm_sync_memory, 211 + .map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel, 212 + .restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos, 213 + .invalidate_tlbs = invalidate_tlbs, 214 + .invalidate_tlbs_vmid = invalidate_tlbs_vmid, 215 + .submit_ib = amdgpu_amdkfd_submit_ib, 216 + }; 217 + 218 + struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void) 219 + { 220 + return (struct kfd2kgd_calls *)&kfd2kgd; 221 + } 222 + 223 + static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) 224 + { 225 + return (struct amdgpu_device *)kgd; 226 + } 227 + 228 + static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe, 229 + uint32_t queue, uint32_t vmid) 230 + { 231 + struct amdgpu_device *adev = get_amdgpu_device(kgd); 232 + 233 + mutex_lock(&adev->srbm_mutex); 234 + soc15_grbm_select(adev, mec, pipe, queue, vmid); 
235 + } 236 + 237 + static void unlock_srbm(struct kgd_dev *kgd) 238 + { 239 + struct amdgpu_device *adev = get_amdgpu_device(kgd); 240 + 241 + soc15_grbm_select(adev, 0, 0, 0, 0); 242 + mutex_unlock(&adev->srbm_mutex); 243 + } 244 + 245 + static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, 246 + uint32_t queue_id) 247 + { 248 + struct amdgpu_device *adev = get_amdgpu_device(kgd); 249 + 250 + uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; 251 + uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); 252 + 253 + lock_srbm(kgd, mec, pipe, queue_id, 0); 254 + } 255 + 256 + static uint32_t get_queue_mask(struct amdgpu_device *adev, 257 + uint32_t pipe_id, uint32_t queue_id) 258 + { 259 + unsigned int bit = (pipe_id * adev->gfx.mec.num_queue_per_pipe + 260 + queue_id) & 31; 261 + 262 + return ((uint32_t)1) << bit; 263 + } 264 + 265 + static void release_queue(struct kgd_dev *kgd) 266 + { 267 + unlock_srbm(kgd); 268 + } 269 + 270 + static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, 271 + uint32_t sh_mem_config, 272 + uint32_t sh_mem_ape1_base, 273 + uint32_t sh_mem_ape1_limit, 274 + uint32_t sh_mem_bases) 275 + { 276 + struct amdgpu_device *adev = get_amdgpu_device(kgd); 277 + 278 + lock_srbm(kgd, 0, 0, 0, vmid); 279 + 280 + WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config); 281 + WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases); 282 + /* APE1 no longer exists on GFX9 */ 283 + 284 + unlock_srbm(kgd); 285 + } 286 + 287 + static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, 288 + unsigned int vmid) 289 + { 290 + struct amdgpu_device *adev = get_amdgpu_device(kgd); 291 + 292 + /* 293 + * We have to assume that there is no outstanding mapping. 294 + * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because 295 + * a mapping is in progress or because a mapping finished 296 + * and the SW cleared it. 297 + * So the protocol is to always wait & clear. 
298 + */ 299 + uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | 300 + ATC_VMID0_PASID_MAPPING__VALID_MASK; 301 + 302 + /* 303 + * need to do this twice, once for gfx and once for mmhub 304 + * for ATC add 16 to VMID for mmhub, for IH different registers. 305 + * ATC_VMID0..15 registers are separate from ATC_VMID16..31. 306 + */ 307 + 308 + WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid, 309 + pasid_mapping); 310 + 311 + while (!(RREG32(SOC15_REG_OFFSET( 312 + ATHUB, 0, 313 + mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & 314 + (1U << vmid))) 315 + cpu_relax(); 316 + 317 + WREG32(SOC15_REG_OFFSET(ATHUB, 0, 318 + mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), 319 + 1U << vmid); 320 + 321 + /* Mapping vmid to pasid also for IH block */ 322 + WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid, 323 + pasid_mapping); 324 + 325 + WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid, 326 + pasid_mapping); 327 + 328 + while (!(RREG32(SOC15_REG_OFFSET( 329 + ATHUB, 0, 330 + mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & 331 + (1U << (vmid + 16)))) 332 + cpu_relax(); 333 + 334 + WREG32(SOC15_REG_OFFSET(ATHUB, 0, 335 + mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), 336 + 1U << (vmid + 16)); 337 + 338 + /* Mapping vmid to pasid also for IH block */ 339 + WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid, 340 + pasid_mapping); 341 + return 0; 342 + } 343 + 344 + /* TODO - RING0 form of field is obsolete, seems to date back to SI 345 + * but still works 346 + */ 347 + 348 + static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) 349 + { 350 + struct amdgpu_device *adev = get_amdgpu_device(kgd); 351 + uint32_t mec; 352 + uint32_t pipe; 353 + 354 + mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; 355 + pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); 356 + 357 + lock_srbm(kgd, mec, pipe, 0, 0); 358 + 359 + WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL), 360 + CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | 361 + 
CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); 362 + 363 + unlock_srbm(kgd); 364 + 365 + return 0; 366 + } 367 + 368 + static uint32_t get_sdma_base_addr(struct amdgpu_device *adev, 369 + unsigned int engine_id, 370 + unsigned int queue_id) 371 + { 372 + uint32_t base[2] = { 373 + SOC15_REG_OFFSET(SDMA0, 0, 374 + mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL, 375 + SOC15_REG_OFFSET(SDMA1, 0, 376 + mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL 377 + }; 378 + uint32_t retval; 379 + 380 + retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL - 381 + mmSDMA0_RLC0_RB_CNTL); 382 + 383 + pr_debug("sdma base address: 0x%x\n", retval); 384 + 385 + return retval; 386 + } 387 + 388 + static inline struct v9_mqd *get_mqd(void *mqd) 389 + { 390 + return (struct v9_mqd *)mqd; 391 + } 392 + 393 + static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) 394 + { 395 + return (struct v9_sdma_mqd *)mqd; 396 + } 397 + 398 + static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, 399 + uint32_t queue_id, uint32_t __user *wptr, 400 + uint32_t wptr_shift, uint32_t wptr_mask, 401 + struct mm_struct *mm) 402 + { 403 + struct amdgpu_device *adev = get_amdgpu_device(kgd); 404 + struct v9_mqd *m; 405 + uint32_t *mqd_hqd; 406 + uint32_t reg, hqd_base, data; 407 + 408 + m = get_mqd(mqd); 409 + 410 + acquire_queue(kgd, pipe_id, queue_id); 411 + 412 + /* HIQ is set during driver init period with vmid set to 0*/ 413 + if (m->cp_hqd_vmid == 0) { 414 + uint32_t value, mec, pipe; 415 + 416 + mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; 417 + pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); 418 + 419 + pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n", 420 + mec, pipe, queue_id); 421 + value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS)); 422 + value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, 423 + ((mec << 5) | (pipe << 3) | queue_id | 0x80)); 424 + WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value); 425 + } 426 + 427 + /* HQD 
registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ 428 + mqd_hqd = &m->cp_mqd_base_addr_lo; 429 + hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); 430 + 431 + for (reg = hqd_base; 432 + reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) 433 + WREG32(reg, mqd_hqd[reg - hqd_base]); 434 + 435 + 436 + /* Activate doorbell logic before triggering WPTR poll. */ 437 + data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, 438 + CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); 439 + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data); 440 + 441 + if (wptr) { 442 + /* Don't read wptr with get_user because the user 443 + * context may not be accessible (if this function 444 + * runs in a work queue). Instead trigger a one-shot 445 + * polling read from memory in the CP. This assumes 446 + * that wptr is GPU-accessible in the queue's VMID via 447 + * ATC or SVM. WPTR==RPTR before starting the poll so 448 + * the CP starts fetching new commands from the right 449 + * place. 450 + * 451 + * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit 452 + * tricky. Assume that the queue didn't overflow. The 453 + * number of valid bits in the 32-bit RPTR depends on 454 + * the queue size. The remaining bits are taken from 455 + * the saved 64-bit WPTR. If the WPTR wrapped, add the 456 + * queue size. 
457 + */ 458 + uint32_t queue_size = 459 + 2 << REG_GET_FIELD(m->cp_hqd_pq_control, 460 + CP_HQD_PQ_CONTROL, QUEUE_SIZE); 461 + uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1); 462 + 463 + if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr) 464 + guessed_wptr += queue_size; 465 + guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1); 466 + guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32; 467 + 468 + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO), 469 + lower_32_bits(guessed_wptr)); 470 + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI), 471 + upper_32_bits(guessed_wptr)); 472 + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR), 473 + lower_32_bits((uint64_t)wptr)); 474 + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI), 475 + upper_32_bits((uint64_t)wptr)); 476 + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1), 477 + get_queue_mask(adev, pipe_id, queue_id)); 478 + } 479 + 480 + /* Start the EOP fetcher */ 481 + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR), 482 + REG_SET_FIELD(m->cp_hqd_eop_rptr, 483 + CP_HQD_EOP_RPTR, INIT_FETCHER, 1)); 484 + 485 + data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); 486 + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data); 487 + 488 + release_queue(kgd); 489 + 490 + return 0; 491 + } 492 + 493 + static int kgd_hqd_dump(struct kgd_dev *kgd, 494 + uint32_t pipe_id, uint32_t queue_id, 495 + uint32_t (**dump)[2], uint32_t *n_regs) 496 + { 497 + struct amdgpu_device *adev = get_amdgpu_device(kgd); 498 + uint32_t i = 0, reg; 499 + #define HQD_N_REGS 56 500 + #define DUMP_REG(addr) do { \ 501 + if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ 502 + break; \ 503 + (*dump)[i][0] = (addr) << 2; \ 504 + (*dump)[i++][1] = RREG32(addr); \ 505 + } while (0) 506 + 507 + *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); 508 + if (*dump == NULL) 509 + return -ENOMEM; 510 + 511 + acquire_queue(kgd, pipe_id, queue_id); 512 + 513 + for (reg = SOC15_REG_OFFSET(GC, 
0, mmCP_MQD_BASE_ADDR); 514 + reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) 515 + DUMP_REG(reg); 516 + 517 + release_queue(kgd); 518 + 519 + WARN_ON_ONCE(i != HQD_N_REGS); 520 + *n_regs = i; 521 + 522 + return 0; 523 + } 524 + 525 + static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, 526 + uint32_t __user *wptr, struct mm_struct *mm) 527 + { 528 + struct amdgpu_device *adev = get_amdgpu_device(kgd); 529 + struct v9_sdma_mqd *m; 530 + uint32_t sdma_base_addr, sdmax_gfx_context_cntl; 531 + unsigned long end_jiffies; 532 + uint32_t data; 533 + uint64_t data64; 534 + uint64_t __user *wptr64 = (uint64_t __user *)wptr; 535 + 536 + m = get_sdma_mqd(mqd); 537 + sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id, 538 + m->sdma_queue_id); 539 + sdmax_gfx_context_cntl = m->sdma_engine_id ? 540 + SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) : 541 + SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL); 542 + 543 + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, 544 + m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); 545 + 546 + end_jiffies = msecs_to_jiffies(2000) + jiffies; 547 + while (true) { 548 + data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); 549 + if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) 550 + break; 551 + if (time_after(jiffies, end_jiffies)) 552 + return -ETIME; 553 + usleep_range(500, 1000); 554 + } 555 + data = RREG32(sdmax_gfx_context_cntl); 556 + data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, 557 + RESUME_CTX, 0); 558 + WREG32(sdmax_gfx_context_cntl, data); 559 + 560 + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET, 561 + m->sdmax_rlcx_doorbell_offset); 562 + 563 + data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, 564 + ENABLE, 1); 565 + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); 566 + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); 567 + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI, 568 + m->sdmax_rlcx_rb_rptr_hi); 569 + 570 + 
WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1); 571 + if (read_user_wptr(mm, wptr64, data64)) { 572 + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 573 + lower_32_bits(data64)); 574 + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, 575 + upper_32_bits(data64)); 576 + } else { 577 + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 578 + m->sdmax_rlcx_rb_rptr); 579 + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, 580 + m->sdmax_rlcx_rb_rptr_hi); 581 + } 582 + WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0); 583 + 584 + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); 585 + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, 586 + m->sdmax_rlcx_rb_base_hi); 587 + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, 588 + m->sdmax_rlcx_rb_rptr_addr_lo); 589 + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, 590 + m->sdmax_rlcx_rb_rptr_addr_hi); 591 + 592 + data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, 593 + RB_ENABLE, 1); 594 + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); 595 + 596 + return 0; 597 + } 598 + 599 + static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, 600 + uint32_t engine_id, uint32_t queue_id, 601 + uint32_t (**dump)[2], uint32_t *n_regs) 602 + { 603 + struct amdgpu_device *adev = get_amdgpu_device(kgd); 604 + uint32_t sdma_base_addr = get_sdma_base_addr(adev, engine_id, queue_id); 605 + uint32_t i = 0, reg; 606 + #undef HQD_N_REGS 607 + #define HQD_N_REGS (19+6+7+10) 608 + 609 + *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); 610 + if (*dump == NULL) 611 + return -ENOMEM; 612 + 613 + for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) 614 + DUMP_REG(sdma_base_addr + reg); 615 + for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++) 616 + DUMP_REG(sdma_base_addr + reg); 617 + for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; 618 + reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++) 619 + DUMP_REG(sdma_base_addr + reg); 620 + for (reg = 
mmSDMA0_RLC0_MIDCMD_DATA0; 621 + reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++) 622 + DUMP_REG(sdma_base_addr + reg); 623 + 624 + WARN_ON_ONCE(i != HQD_N_REGS); 625 + *n_regs = i; 626 + 627 + return 0; 628 + } 629 + 630 + static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, 631 + uint32_t pipe_id, uint32_t queue_id) 632 + { 633 + struct amdgpu_device *adev = get_amdgpu_device(kgd); 634 + uint32_t act; 635 + bool retval = false; 636 + uint32_t low, high; 637 + 638 + acquire_queue(kgd, pipe_id, queue_id); 639 + act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); 640 + if (act) { 641 + low = lower_32_bits(queue_address >> 8); 642 + high = upper_32_bits(queue_address >> 8); 643 + 644 + if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) && 645 + high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI))) 646 + retval = true; 647 + } 648 + release_queue(kgd); 649 + return retval; 650 + } 651 + 652 + static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) 653 + { 654 + struct amdgpu_device *adev = get_amdgpu_device(kgd); 655 + struct v9_sdma_mqd *m; 656 + uint32_t sdma_base_addr; 657 + uint32_t sdma_rlc_rb_cntl; 658 + 659 + m = get_sdma_mqd(mqd); 660 + sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id, 661 + m->sdma_queue_id); 662 + 663 + sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); 664 + 665 + if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) 666 + return true; 667 + 668 + return false; 669 + } 670 + 671 + static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, 672 + enum kfd_preempt_type reset_type, 673 + unsigned int utimeout, uint32_t pipe_id, 674 + uint32_t queue_id) 675 + { 676 + struct amdgpu_device *adev = get_amdgpu_device(kgd); 677 + enum hqd_dequeue_request_type type; 678 + unsigned long end_jiffies; 679 + uint32_t temp; 680 + struct v9_mqd *m = get_mqd(mqd); 681 + 682 + acquire_queue(kgd, pipe_id, queue_id); 683 + 684 + if (m->cp_hqd_vmid == 0) 685 + WREG32_FIELD15(GC, 0, 
RLC_CP_SCHEDULERS, scheduler1, 0); 686 + 687 + switch (reset_type) { 688 + case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: 689 + type = DRAIN_PIPE; 690 + break; 691 + case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: 692 + type = RESET_WAVES; 693 + break; 694 + default: 695 + type = DRAIN_PIPE; 696 + break; 697 + } 698 + 699 + WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type); 700 + 701 + end_jiffies = (utimeout * HZ / 1000) + jiffies; 702 + while (true) { 703 + temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); 704 + if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) 705 + break; 706 + if (time_after(jiffies, end_jiffies)) { 707 + pr_err("cp queue preemption time out.\n"); 708 + release_queue(kgd); 709 + return -ETIME; 710 + } 711 + usleep_range(500, 1000); 712 + } 713 + 714 + release_queue(kgd); 715 + return 0; 716 + } 717 + 718 + static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, 719 + unsigned int utimeout) 720 + { 721 + struct amdgpu_device *adev = get_amdgpu_device(kgd); 722 + struct v9_sdma_mqd *m; 723 + uint32_t sdma_base_addr; 724 + uint32_t temp; 725 + unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; 726 + 727 + m = get_sdma_mqd(mqd); 728 + sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id, 729 + m->sdma_queue_id); 730 + 731 + temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); 732 + temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK; 733 + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp); 734 + 735 + while (true) { 736 + temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); 737 + if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) 738 + break; 739 + if (time_after(jiffies, end_jiffies)) 740 + return -ETIME; 741 + usleep_range(500, 1000); 742 + } 743 + 744 + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); 745 + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, 746 + RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | 747 + SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); 748 + 749 + m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + 
mmSDMA0_RLC0_RB_RPTR); 750 + m->sdmax_rlcx_rb_rptr_hi = 751 + RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI); 752 + 753 + return 0; 754 + } 755 + 756 + static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, 757 + uint8_t vmid) 758 + { 759 + uint32_t reg; 760 + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; 761 + 762 + reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) 763 + + vmid); 764 + return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; 765 + } 766 + 767 + static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, 768 + uint8_t vmid) 769 + { 770 + uint32_t reg; 771 + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; 772 + 773 + reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) 774 + + vmid); 775 + return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; 776 + } 777 + 778 + static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) 779 + { 780 + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; 781 + uint32_t req = (1 << vmid) | 782 + (0 << VM_INVALIDATE_ENG16_REQ__FLUSH_TYPE__SHIFT) | /* legacy */ 783 + VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PTES_MASK | 784 + VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE0_MASK | 785 + VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE1_MASK | 786 + VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE2_MASK | 787 + VM_INVALIDATE_ENG16_REQ__INVALIDATE_L1_PTES_MASK; 788 + 789 + mutex_lock(&adev->srbm_mutex); 790 + 791 + /* Use legacy mode tlb invalidation. 792 + * 793 + * Currently on Raven the code below is broken for anything but 794 + * legacy mode due to a MMHUB power gating problem. A workaround 795 + * is for MMHUB to wait until the condition PER_VMID_INVALIDATE_REQ 796 + * == PER_VMID_INVALIDATE_ACK instead of simply waiting for the ack 797 + * bit. 798 + * 799 + * TODO 1: agree on the right set of invalidation registers for 800 + * KFD use. Use the last one for now. Invalidate both GC and 801 + * MMHUB. 
802 + * 803 + * TODO 2: support range-based invalidation, requires kfg2kgd 804 + * interface change 805 + */ 806 + WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_LO32), 807 + 0xffffffff); 808 + WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_HI32), 809 + 0x0000001f); 810 + 811 + WREG32(SOC15_REG_OFFSET(MMHUB, 0, 812 + mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32), 813 + 0xffffffff); 814 + WREG32(SOC15_REG_OFFSET(MMHUB, 0, 815 + mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32), 816 + 0x0000001f); 817 + 818 + WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_REQ), req); 819 + 820 + WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_REQ), 821 + req); 822 + 823 + while (!(RREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ACK)) & 824 + (1 << vmid))) 825 + cpu_relax(); 826 + 827 + while (!(RREG32(SOC15_REG_OFFSET(MMHUB, 0, 828 + mmMMHUB_VM_INVALIDATE_ENG16_ACK)) & 829 + (1 << vmid))) 830 + cpu_relax(); 831 + 832 + mutex_unlock(&adev->srbm_mutex); 833 + 834 + } 835 + 836 + static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid) 837 + { 838 + signed long r; 839 + uint32_t seq; 840 + struct amdgpu_ring *ring = &adev->gfx.kiq.ring; 841 + 842 + spin_lock(&adev->gfx.kiq.ring_lock); 843 + amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/ 844 + amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0)); 845 + amdgpu_ring_write(ring, 846 + PACKET3_INVALIDATE_TLBS_DST_SEL(1) | 847 + PACKET3_INVALIDATE_TLBS_ALL_HUB(1) | 848 + PACKET3_INVALIDATE_TLBS_PASID(pasid) | 849 + PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(0)); /* legacy */ 850 + amdgpu_fence_emit_polling(ring, &seq); 851 + amdgpu_ring_commit(ring); 852 + spin_unlock(&adev->gfx.kiq.ring_lock); 853 + 854 + r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout); 855 + if (r < 1) { 856 + DRM_ERROR("wait for kiq fence error: %ld.\n", r); 857 + return -ETIME; 858 + } 859 + 860 + return 0; 861 + } 862 + 863 + static int invalidate_tlbs(struct 
kgd_dev *kgd, uint16_t pasid) 864 + { 865 + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; 866 + int vmid; 867 + struct amdgpu_ring *ring = &adev->gfx.kiq.ring; 868 + 869 + if (ring->ready) 870 + return invalidate_tlbs_with_kiq(adev, pasid); 871 + 872 + for (vmid = 0; vmid < 16; vmid++) { 873 + if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) 874 + continue; 875 + if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) { 876 + if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid) 877 + == pasid) { 878 + write_vmid_invalidate_request(kgd, vmid); 879 + break; 880 + } 881 + } 882 + } 883 + 884 + return 0; 885 + } 886 + 887 + static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid) 888 + { 889 + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; 890 + 891 + if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) { 892 + pr_err("non kfd vmid %d\n", vmid); 893 + return 0; 894 + } 895 + 896 + write_vmid_invalidate_request(kgd, vmid); 897 + return 0; 898 + } 899 + 900 + static int kgd_address_watch_disable(struct kgd_dev *kgd) 901 + { 902 + return 0; 903 + } 904 + 905 + static int kgd_address_watch_execute(struct kgd_dev *kgd, 906 + unsigned int watch_point_id, 907 + uint32_t cntl_val, 908 + uint32_t addr_hi, 909 + uint32_t addr_lo) 910 + { 911 + return 0; 912 + } 913 + 914 + static int kgd_wave_control_execute(struct kgd_dev *kgd, 915 + uint32_t gfx_index_val, 916 + uint32_t sq_cmd) 917 + { 918 + struct amdgpu_device *adev = get_amdgpu_device(kgd); 919 + uint32_t data = 0; 920 + 921 + mutex_lock(&adev->grbm_idx_mutex); 922 + 923 + WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val); 924 + WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd); 925 + 926 + data = REG_SET_FIELD(data, GRBM_GFX_INDEX, 927 + INSTANCE_BROADCAST_WRITES, 1); 928 + data = REG_SET_FIELD(data, GRBM_GFX_INDEX, 929 + SH_BROADCAST_WRITES, 1); 930 + data = REG_SET_FIELD(data, GRBM_GFX_INDEX, 931 + SE_BROADCAST_WRITES, 1); 932 + 933 + WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data); 934 
+ mutex_unlock(&adev->grbm_idx_mutex); 935 + 936 + return 0; 937 + } 938 + 939 + static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, 940 + unsigned int watch_point_id, 941 + unsigned int reg_offset) 942 + { 943 + return 0; 944 + } 945 + 946 + static void set_scratch_backing_va(struct kgd_dev *kgd, 947 + uint64_t va, uint32_t vmid) 948 + { 949 + /* No longer needed on GFXv9. The scratch base address is 950 + * passed to the shader by the CP. It's the user mode driver's 951 + * responsibility. 952 + */ 953 + } 954 + 955 + /* FIXME: Does this need to be ASIC-specific code? */ 956 + static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) 957 + { 958 + struct amdgpu_device *adev = (struct amdgpu_device *) kgd; 959 + const union amdgpu_firmware_header *hdr; 960 + 961 + switch (type) { 962 + case KGD_ENGINE_PFP: 963 + hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; 964 + break; 965 + 966 + case KGD_ENGINE_ME: 967 + hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; 968 + break; 969 + 970 + case KGD_ENGINE_CE: 971 + hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; 972 + break; 973 + 974 + case KGD_ENGINE_MEC1: 975 + hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data; 976 + break; 977 + 978 + case KGD_ENGINE_MEC2: 979 + hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; 980 + break; 981 + 982 + case KGD_ENGINE_RLC: 983 + hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; 984 + break; 985 + 986 + case KGD_ENGINE_SDMA1: 987 + hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; 988 + break; 989 + 990 + case KGD_ENGINE_SDMA2: 991 + hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; 992 + break; 993 + 994 + default: 995 + return 0; 996 + } 997 + 998 + if (hdr == NULL) 999 + return 0; 1000 + 1001 + /* Only 12 bit in use*/ 1002 + return hdr->common.ucode_version; 1003 + } 1004 + 1005 + 
/* Program the page-table base and address range for a KFD VMID in both the
 * MMHUB and GC VM contexts. page_table_base is a page-frame number; it is
 * shifted into a byte address and tagged with AMDGPU_PTE_VALID.
 */
static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
		uint32_t page_table_base)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint64_t base = (uint64_t)page_table_base << PAGE_SHIFT |
		AMDGPU_PTE_VALID;

	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
		pr_err("trying to set page table base for wrong VMID %u\n",
		       vmid);
		return;
	}

	/* TODO: take advantage of per-process address space size. For
	 * now, all processes share the same address space size, like
	 * on GFX8 and older.
	 */
	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0);
	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0);

	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2),
			lower_32_bits(adev->vm_manager.max_pfn - 1));
	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2),
			upper_32_bits(adev->vm_manager.max_pfn - 1));

	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base));
	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base));

	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2),
			lower_32_bits(adev->vm_manager.max_pfn - 1));
	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2),
			upper_32_bits(adev->vm_manager.max_pfn - 1));

	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base));
	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base));
}
+549 -23
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
··· 23 23 #define pr_fmt(fmt) "kfd2kgd: " fmt 24 24 25 25 #include <linux/list.h> 26 + #include <linux/sched/mm.h> 26 27 #include <drm/drmP.h> 27 28 #include "amdgpu_object.h" 28 29 #include "amdgpu_vm.h" ··· 34 33 */ 35 34 #define VI_BO_SIZE_ALIGN (0x8000) 36 35 36 + /* BO flag to indicate a KFD userptr BO */ 37 + #define AMDGPU_AMDKFD_USERPTR_BO (1ULL << 63) 38 + 39 + /* Userptr restore delay, just long enough to allow consecutive VM 40 + * changes to accumulate 41 + */ 42 + #define AMDGPU_USERPTR_RESTORE_DELAY_MS 1 43 + 37 44 /* Impose limit on how much memory KFD can use */ 38 45 static struct { 39 46 uint64_t max_system_mem_limit; 47 + uint64_t max_userptr_mem_limit; 40 48 int64_t system_mem_used; 49 + int64_t userptr_mem_used; 41 50 spinlock_t mem_limit_lock; 42 51 } kfd_mem_limit; 43 52 ··· 68 57 69 58 #define domain_string(domain) domain_bit_to_string[ffs(domain)-1] 70 59 60 + static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work); 71 61 72 62 73 63 static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) ··· 90 78 91 79 /* Set memory usage limits. 
Current, limits are 92 80 * System (kernel) memory - 3/8th System RAM 81 + * Userptr memory - 3/4th System RAM 93 82 */ 94 83 void amdgpu_amdkfd_gpuvm_init_mem_limits(void) 95 84 { ··· 103 90 104 91 spin_lock_init(&kfd_mem_limit.mem_limit_lock); 105 92 kfd_mem_limit.max_system_mem_limit = (mem >> 1) - (mem >> 3); 106 - pr_debug("Kernel memory limit %lluM\n", 107 - (kfd_mem_limit.max_system_mem_limit >> 20)); 93 + kfd_mem_limit.max_userptr_mem_limit = mem - (mem >> 2); 94 + pr_debug("Kernel memory limit %lluM, userptr limit %lluM\n", 95 + (kfd_mem_limit.max_system_mem_limit >> 20), 96 + (kfd_mem_limit.max_userptr_mem_limit >> 20)); 108 97 } 109 98 110 99 static int amdgpu_amdkfd_reserve_system_mem_limit(struct amdgpu_device *adev, ··· 126 111 goto err_no_mem; 127 112 } 128 113 kfd_mem_limit.system_mem_used += (acc_size + size); 114 + } else if (domain == AMDGPU_GEM_DOMAIN_CPU) { 115 + if ((kfd_mem_limit.system_mem_used + acc_size > 116 + kfd_mem_limit.max_system_mem_limit) || 117 + (kfd_mem_limit.userptr_mem_used + (size + acc_size) > 118 + kfd_mem_limit.max_userptr_mem_limit)) { 119 + ret = -ENOMEM; 120 + goto err_no_mem; 121 + } 122 + kfd_mem_limit.system_mem_used += acc_size; 123 + kfd_mem_limit.userptr_mem_used += size; 129 124 } 130 125 err_no_mem: 131 126 spin_unlock(&kfd_mem_limit.mem_limit_lock); ··· 151 126 sizeof(struct amdgpu_bo)); 152 127 153 128 spin_lock(&kfd_mem_limit.mem_limit_lock); 154 - if (domain == AMDGPU_GEM_DOMAIN_GTT) 129 + if (domain == AMDGPU_GEM_DOMAIN_GTT) { 155 130 kfd_mem_limit.system_mem_used -= (acc_size + size); 131 + } else if (domain == AMDGPU_GEM_DOMAIN_CPU) { 132 + kfd_mem_limit.system_mem_used -= acc_size; 133 + kfd_mem_limit.userptr_mem_used -= size; 134 + } 156 135 WARN_ONCE(kfd_mem_limit.system_mem_used < 0, 157 136 "kfd system memory accounting unbalanced"); 137 + WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0, 138 + "kfd userptr memory accounting unbalanced"); 158 139 159 140 spin_unlock(&kfd_mem_limit.mem_limit_lock); 160 
141 } ··· 169 138 { 170 139 spin_lock(&kfd_mem_limit.mem_limit_lock); 171 140 172 - if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT) { 141 + if (bo->flags & AMDGPU_AMDKFD_USERPTR_BO) { 142 + kfd_mem_limit.system_mem_used -= bo->tbo.acc_size; 143 + kfd_mem_limit.userptr_mem_used -= amdgpu_bo_size(bo); 144 + } else if (bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT) { 173 145 kfd_mem_limit.system_mem_used -= 174 146 (bo->tbo.acc_size + amdgpu_bo_size(bo)); 175 147 } 176 148 WARN_ONCE(kfd_mem_limit.system_mem_used < 0, 177 149 "kfd system memory accounting unbalanced"); 150 + WARN_ONCE(kfd_mem_limit.userptr_mem_used < 0, 151 + "kfd userptr memory accounting unbalanced"); 178 152 179 153 spin_unlock(&kfd_mem_limit.mem_limit_lock); 180 154 } ··· 542 506 } 543 507 544 508 static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem, 545 - struct amdkfd_process_info *process_info) 509 + struct amdkfd_process_info *process_info, 510 + bool userptr) 546 511 { 547 512 struct ttm_validate_buffer *entry = &mem->validate_list; 548 513 struct amdgpu_bo *bo = mem->bo; ··· 552 515 entry->shared = true; 553 516 entry->bo = &bo->tbo; 554 517 mutex_lock(&process_info->lock); 555 - list_add_tail(&entry->head, &process_info->kfd_bo_list); 518 + if (userptr) 519 + list_add_tail(&entry->head, &process_info->userptr_valid_list); 520 + else 521 + list_add_tail(&entry->head, &process_info->kfd_bo_list); 556 522 mutex_unlock(&process_info->lock); 523 + } 524 + 525 + /* Initializes user pages. It registers the MMU notifier and validates 526 + * the userptr BO in the GTT domain. 527 + * 528 + * The BO must already be on the userptr_valid_list. Otherwise an 529 + * eviction and restore may happen that leaves the new BO unmapped 530 + * with the user mode queues running. 531 + * 532 + * Takes the process_info->lock to protect against concurrent restore 533 + * workers. 534 + * 535 + * Returns 0 for success, negative errno for errors. 
536 + */ 537 + static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm, 538 + uint64_t user_addr) 539 + { 540 + struct amdkfd_process_info *process_info = mem->process_info; 541 + struct amdgpu_bo *bo = mem->bo; 542 + struct ttm_operation_ctx ctx = { true, false }; 543 + int ret = 0; 544 + 545 + mutex_lock(&process_info->lock); 546 + 547 + ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, 0); 548 + if (ret) { 549 + pr_err("%s: Failed to set userptr: %d\n", __func__, ret); 550 + goto out; 551 + } 552 + 553 + ret = amdgpu_mn_register(bo, user_addr); 554 + if (ret) { 555 + pr_err("%s: Failed to register MMU notifier: %d\n", 556 + __func__, ret); 557 + goto out; 558 + } 559 + 560 + /* If no restore worker is running concurrently, user_pages 561 + * should not be allocated 562 + */ 563 + WARN(mem->user_pages, "Leaking user_pages array"); 564 + 565 + mem->user_pages = kvmalloc_array(bo->tbo.ttm->num_pages, 566 + sizeof(struct page *), 567 + GFP_KERNEL | __GFP_ZERO); 568 + if (!mem->user_pages) { 569 + pr_err("%s: Failed to allocate pages array\n", __func__); 570 + ret = -ENOMEM; 571 + goto unregister_out; 572 + } 573 + 574 + ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, mem->user_pages); 575 + if (ret) { 576 + pr_err("%s: Failed to get user pages: %d\n", __func__, ret); 577 + goto free_out; 578 + } 579 + 580 + amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, mem->user_pages); 581 + 582 + ret = amdgpu_bo_reserve(bo, true); 583 + if (ret) { 584 + pr_err("%s: Failed to reserve BO\n", __func__); 585 + goto release_out; 586 + } 587 + amdgpu_ttm_placement_from_domain(bo, mem->domain); 588 + ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); 589 + if (ret) 590 + pr_err("%s: failed to validate BO\n", __func__); 591 + amdgpu_bo_unreserve(bo); 592 + 593 + release_out: 594 + if (ret) 595 + release_pages(mem->user_pages, bo->tbo.ttm->num_pages); 596 + free_out: 597 + kvfree(mem->user_pages); 598 + mem->user_pages = NULL; 599 + unregister_out: 600 + if (ret) 601 + 
amdgpu_mn_unregister(bo); 602 + out: 603 + mutex_unlock(&process_info->lock); 604 + return ret; 557 605 } 558 606 559 607 /* Reserving a BO and its page table BOs must happen atomically to ··· 870 748 } 871 749 872 750 static int map_bo_to_gpuvm(struct amdgpu_device *adev, 873 - struct kfd_bo_va_list *entry, struct amdgpu_sync *sync) 751 + struct kfd_bo_va_list *entry, struct amdgpu_sync *sync, 752 + bool no_update_pte) 874 753 { 875 754 int ret; 876 755 ··· 884 761 entry->va, ret); 885 762 return ret; 886 763 } 764 + 765 + if (no_update_pte) 766 + return 0; 887 767 888 768 ret = update_gpuvm_pte(adev, entry, sync); 889 769 if (ret) { ··· 946 820 mutex_init(&info->lock); 947 821 INIT_LIST_HEAD(&info->vm_list_head); 948 822 INIT_LIST_HEAD(&info->kfd_bo_list); 823 + INIT_LIST_HEAD(&info->userptr_valid_list); 824 + INIT_LIST_HEAD(&info->userptr_inval_list); 949 825 950 826 info->eviction_fence = 951 827 amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), ··· 957 829 ret = -ENOMEM; 958 830 goto create_evict_fence_fail; 959 831 } 832 + 833 + info->pid = get_task_pid(current->group_leader, PIDTYPE_PID); 834 + atomic_set(&info->evicted_bos, 0); 835 + INIT_DELAYED_WORK(&info->restore_userptr_work, 836 + amdgpu_amdkfd_restore_userptr_worker); 960 837 961 838 *process_info = info; 962 839 *ef = dma_fence_get(&info->eviction_fence->base); ··· 1005 872 dma_fence_put(*ef); 1006 873 *ef = NULL; 1007 874 *process_info = NULL; 875 + put_pid(info->pid); 1008 876 create_evict_fence_fail: 1009 877 mutex_destroy(&info->lock); 1010 878 kfree(info); ··· 1101 967 /* Release per-process resources when last compute VM is destroyed */ 1102 968 if (!process_info->n_vms) { 1103 969 WARN_ON(!list_empty(&process_info->kfd_bo_list)); 970 + WARN_ON(!list_empty(&process_info->userptr_valid_list)); 971 + WARN_ON(!list_empty(&process_info->userptr_inval_list)); 1104 972 1105 973 dma_fence_put(&process_info->eviction_fence->base); 974 + 
cancel_delayed_work_sync(&process_info->restore_userptr_work); 975 + put_pid(process_info->pid); 1106 976 mutex_destroy(&process_info->lock); 1107 977 kfree(process_info); 1108 978 } ··· 1141 1003 { 1142 1004 struct amdgpu_device *adev = get_amdgpu_device(kgd); 1143 1005 struct amdgpu_vm *avm = (struct amdgpu_vm *)vm; 1006 + uint64_t user_addr = 0; 1144 1007 struct amdgpu_bo *bo; 1145 1008 int byte_align; 1146 - u32 alloc_domain; 1009 + u32 domain, alloc_domain; 1147 1010 u64 alloc_flags; 1148 1011 uint32_t mapping_flags; 1149 1012 int ret; ··· 1153 1014 * Check on which domain to allocate BO 1154 1015 */ 1155 1016 if (flags & ALLOC_MEM_FLAGS_VRAM) { 1156 - alloc_domain = AMDGPU_GEM_DOMAIN_VRAM; 1017 + domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM; 1157 1018 alloc_flags = AMDGPU_GEM_CREATE_VRAM_CLEARED; 1158 1019 alloc_flags |= (flags & ALLOC_MEM_FLAGS_PUBLIC) ? 1159 1020 AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 1160 1021 AMDGPU_GEM_CREATE_NO_CPU_ACCESS; 1161 1022 } else if (flags & ALLOC_MEM_FLAGS_GTT) { 1162 - alloc_domain = AMDGPU_GEM_DOMAIN_GTT; 1023 + domain = alloc_domain = AMDGPU_GEM_DOMAIN_GTT; 1163 1024 alloc_flags = 0; 1025 + } else if (flags & ALLOC_MEM_FLAGS_USERPTR) { 1026 + domain = AMDGPU_GEM_DOMAIN_GTT; 1027 + alloc_domain = AMDGPU_GEM_DOMAIN_CPU; 1028 + alloc_flags = 0; 1029 + if (!offset || !*offset) 1030 + return -EINVAL; 1031 + user_addr = *offset; 1164 1032 } else { 1165 1033 return -EINVAL; 1166 1034 } ··· 1224 1078 } 1225 1079 bo->kfd_bo = *mem; 1226 1080 (*mem)->bo = bo; 1081 + if (user_addr) 1082 + bo->flags |= AMDGPU_AMDKFD_USERPTR_BO; 1227 1083 1228 1084 (*mem)->va = va; 1229 - (*mem)->domain = alloc_domain; 1085 + (*mem)->domain = domain; 1230 1086 (*mem)->mapped_to_gpu_memory = 0; 1231 1087 (*mem)->process_info = avm->process_info; 1232 - add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info); 1088 + add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, user_addr); 1089 + 1090 + if (user_addr) { 1091 + ret = init_user_pages(*mem, 
current->mm, user_addr); 1092 + if (ret) { 1093 + mutex_lock(&avm->process_info->lock); 1094 + list_del(&(*mem)->validate_list.head); 1095 + mutex_unlock(&avm->process_info->lock); 1096 + goto allocate_init_user_pages_failed; 1097 + } 1098 + } 1233 1099 1234 1100 if (offset) 1235 1101 *offset = amdgpu_bo_mmap_offset(bo); 1236 1102 1237 1103 return 0; 1238 1104 1105 + allocate_init_user_pages_failed: 1106 + amdgpu_bo_unref(&bo); 1107 + /* Don't unreserve system mem limit twice */ 1108 + goto err_reserve_system_mem; 1239 1109 err_bo_create: 1240 1110 unreserve_system_mem_limit(adev, size, alloc_domain); 1241 1111 err_reserve_system_mem: ··· 1284 1122 * be freed anyway 1285 1123 */ 1286 1124 1125 + /* No more MMU notifiers */ 1126 + amdgpu_mn_unregister(mem->bo); 1127 + 1287 1128 /* Make sure restore workers don't access the BO any more */ 1288 1129 bo_list_entry = &mem->validate_list; 1289 1130 mutex_lock(&process_info->lock); 1290 1131 list_del(&bo_list_entry->head); 1291 1132 mutex_unlock(&process_info->lock); 1133 + 1134 + /* Free user pages if necessary */ 1135 + if (mem->user_pages) { 1136 + pr_debug("%s: Freeing user_pages array\n", __func__); 1137 + if (mem->user_pages[0]) 1138 + release_pages(mem->user_pages, 1139 + mem->bo->tbo.ttm->num_pages); 1140 + kvfree(mem->user_pages); 1141 + } 1292 1142 1293 1143 ret = reserve_bo_and_cond_vms(mem, NULL, BO_VM_ALL, &ctx); 1294 1144 if (unlikely(ret)) ··· 1347 1173 struct kfd_bo_va_list *bo_va_entry = NULL; 1348 1174 struct kfd_bo_va_list *bo_va_entry_aql = NULL; 1349 1175 unsigned long bo_size; 1176 + bool is_invalid_userptr = false; 1350 1177 1351 - /* Make sure restore is not running concurrently. 1178 + bo = mem->bo; 1179 + if (!bo) { 1180 + pr_err("Invalid BO when mapping memory to GPU\n"); 1181 + return -EINVAL; 1182 + } 1183 + 1184 + /* Make sure restore is not running concurrently. 
Since we 1185 + * don't map invalid userptr BOs, we rely on the next restore 1186 + * worker to do the mapping 1352 1187 */ 1353 1188 mutex_lock(&mem->process_info->lock); 1354 1189 1355 - mutex_lock(&mem->lock); 1356 - 1357 - bo = mem->bo; 1358 - 1359 - if (!bo) { 1360 - pr_err("Invalid BO when mapping memory to GPU\n"); 1361 - ret = -EINVAL; 1362 - goto out; 1190 + /* Lock mmap-sem. If we find an invalid userptr BO, we can be 1191 + * sure that the MMU notifier is no longer running 1192 + * concurrently and the queues are actually stopped 1193 + */ 1194 + if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { 1195 + down_write(&current->mm->mmap_sem); 1196 + is_invalid_userptr = atomic_read(&mem->invalid); 1197 + up_write(&current->mm->mmap_sem); 1363 1198 } 1199 + 1200 + mutex_lock(&mem->lock); 1364 1201 1365 1202 domain = mem->domain; 1366 1203 bo_size = bo->tbo.mem.size; ··· 1384 1199 ret = reserve_bo_and_vm(mem, vm, &ctx); 1385 1200 if (unlikely(ret)) 1386 1201 goto out; 1202 + 1203 + /* Userptr can be marked as "not invalid", but not actually be 1204 + * validated yet (still in the system domain). In that case 1205 + * the queues are still stopped and we can leave mapping for 1206 + * the next restore worker 1207 + */ 1208 + if (bo->tbo.mem.mem_type == TTM_PL_SYSTEM) 1209 + is_invalid_userptr = true; 1387 1210 1388 1211 if (check_if_add_bo_to_vm(avm, mem)) { 1389 1212 ret = add_bo_to_vm(adev, mem, avm, false, ··· 1410 1217 goto add_bo_to_vm_failed; 1411 1218 } 1412 1219 1413 - if (mem->mapped_to_gpu_memory == 0) { 1220 + if (mem->mapped_to_gpu_memory == 0 && 1221 + !amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { 1414 1222 /* Validate BO only once. The eviction fence gets added to BO 1415 1223 * the first time it is mapped. Validate will wait for all 1416 1224 * background evictions to complete. 
··· 1429 1235 entry->va, entry->va + bo_size, 1430 1236 entry); 1431 1237 1432 - ret = map_bo_to_gpuvm(adev, entry, ctx.sync); 1238 + ret = map_bo_to_gpuvm(adev, entry, ctx.sync, 1239 + is_invalid_userptr); 1433 1240 if (ret) { 1434 1241 pr_err("Failed to map radeon bo to gpuvm\n"); 1435 1242 goto map_bo_to_gpuvm_failed; ··· 1611 1416 mutex_unlock(&mem->process_info->lock); 1612 1417 1613 1418 return ret; 1419 + } 1420 + 1421 + /* Evict a userptr BO by stopping the queues if necessary 1422 + * 1423 + * Runs in MMU notifier, may be in RECLAIM_FS context. This means it 1424 + * cannot do any memory allocations, and cannot take any locks that 1425 + * are held elsewhere while allocating memory. Therefore this is as 1426 + * simple as possible, using atomic counters. 1427 + * 1428 + * It doesn't do anything to the BO itself. The real work happens in 1429 + * restore, where we get updated page addresses. This function only 1430 + * ensures that GPU access to the BO is stopped. 1431 + */ 1432 + int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, 1433 + struct mm_struct *mm) 1434 + { 1435 + struct amdkfd_process_info *process_info = mem->process_info; 1436 + int invalid, evicted_bos; 1437 + int r = 0; 1438 + 1439 + invalid = atomic_inc_return(&mem->invalid); 1440 + evicted_bos = atomic_inc_return(&process_info->evicted_bos); 1441 + if (evicted_bos == 1) { 1442 + /* First eviction, stop the queues */ 1443 + r = kgd2kfd->quiesce_mm(mm); 1444 + if (r) 1445 + pr_err("Failed to quiesce KFD\n"); 1446 + schedule_delayed_work(&process_info->restore_userptr_work, 1447 + msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS)); 1448 + } 1449 + 1450 + return r; 1451 + } 1452 + 1453 + /* Update invalid userptr BOs 1454 + * 1455 + * Moves invalidated (evicted) userptr BOs from userptr_valid_list to 1456 + * userptr_inval_list and updates user pages for all BOs that have 1457 + * been invalidated since their last update. 
1458 + */ 1459 + static int update_invalid_user_pages(struct amdkfd_process_info *process_info, 1460 + struct mm_struct *mm) 1461 + { 1462 + struct kgd_mem *mem, *tmp_mem; 1463 + struct amdgpu_bo *bo; 1464 + struct ttm_operation_ctx ctx = { false, false }; 1465 + int invalid, ret; 1466 + 1467 + /* Move all invalidated BOs to the userptr_inval_list and 1468 + * release their user pages by migration to the CPU domain 1469 + */ 1470 + list_for_each_entry_safe(mem, tmp_mem, 1471 + &process_info->userptr_valid_list, 1472 + validate_list.head) { 1473 + if (!atomic_read(&mem->invalid)) 1474 + continue; /* BO is still valid */ 1475 + 1476 + bo = mem->bo; 1477 + 1478 + if (amdgpu_bo_reserve(bo, true)) 1479 + return -EAGAIN; 1480 + amdgpu_ttm_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); 1481 + ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); 1482 + amdgpu_bo_unreserve(bo); 1483 + if (ret) { 1484 + pr_err("%s: Failed to invalidate userptr BO\n", 1485 + __func__); 1486 + return -EAGAIN; 1487 + } 1488 + 1489 + list_move_tail(&mem->validate_list.head, 1490 + &process_info->userptr_inval_list); 1491 + } 1492 + 1493 + if (list_empty(&process_info->userptr_inval_list)) 1494 + return 0; /* All evicted userptr BOs were freed */ 1495 + 1496 + /* Go through userptr_inval_list and update any invalid user_pages */ 1497 + list_for_each_entry(mem, &process_info->userptr_inval_list, 1498 + validate_list.head) { 1499 + invalid = atomic_read(&mem->invalid); 1500 + if (!invalid) 1501 + /* BO hasn't been invalidated since the last 1502 + * revalidation attempt. Keep its BO list. 
1503 + */ 1504 + continue; 1505 + 1506 + bo = mem->bo; 1507 + 1508 + if (!mem->user_pages) { 1509 + mem->user_pages = 1510 + kvmalloc_array(bo->tbo.ttm->num_pages, 1511 + sizeof(struct page *), 1512 + GFP_KERNEL | __GFP_ZERO); 1513 + if (!mem->user_pages) { 1514 + pr_err("%s: Failed to allocate pages array\n", 1515 + __func__); 1516 + return -ENOMEM; 1517 + } 1518 + } else if (mem->user_pages[0]) { 1519 + release_pages(mem->user_pages, bo->tbo.ttm->num_pages); 1520 + } 1521 + 1522 + /* Get updated user pages */ 1523 + ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, 1524 + mem->user_pages); 1525 + if (ret) { 1526 + mem->user_pages[0] = NULL; 1527 + pr_info("%s: Failed to get user pages: %d\n", 1528 + __func__, ret); 1529 + /* Pretend it succeeded. It will fail later 1530 + * with a VM fault if the GPU tries to access 1531 + * it. Better than hanging indefinitely with 1532 + * stalled user mode queues. 1533 + */ 1534 + } 1535 + 1536 + /* Mark the BO as valid unless it was invalidated 1537 + * again concurrently 1538 + */ 1539 + if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid) 1540 + return -EAGAIN; 1541 + } 1542 + 1543 + return 0; 1544 + } 1545 + 1546 + /* Validate invalid userptr BOs 1547 + * 1548 + * Validates BOs on the userptr_inval_list, and moves them back to the 1549 + * userptr_valid_list. Also updates GPUVM page tables with new page 1550 + * addresses and waits for the page table updates to complete. 
1551 + */ 1552 + static int validate_invalid_user_pages(struct amdkfd_process_info *process_info) 1553 + { 1554 + struct amdgpu_bo_list_entry *pd_bo_list_entries; 1555 + struct list_head resv_list, duplicates; 1556 + struct ww_acquire_ctx ticket; 1557 + struct amdgpu_sync sync; 1558 + 1559 + struct amdgpu_vm *peer_vm; 1560 + struct kgd_mem *mem, *tmp_mem; 1561 + struct amdgpu_bo *bo; 1562 + struct ttm_operation_ctx ctx = { false, false }; 1563 + int i, ret; 1564 + 1565 + pd_bo_list_entries = kcalloc(process_info->n_vms, 1566 + sizeof(struct amdgpu_bo_list_entry), 1567 + GFP_KERNEL); 1568 + if (!pd_bo_list_entries) { 1569 + pr_err("%s: Failed to allocate PD BO list entries\n", __func__); 1570 + return -ENOMEM; 1571 + } 1572 + 1573 + INIT_LIST_HEAD(&resv_list); 1574 + INIT_LIST_HEAD(&duplicates); 1575 + 1576 + /* Get all the page directory BOs that need to be reserved */ 1577 + i = 0; 1578 + list_for_each_entry(peer_vm, &process_info->vm_list_head, 1579 + vm_list_node) 1580 + amdgpu_vm_get_pd_bo(peer_vm, &resv_list, 1581 + &pd_bo_list_entries[i++]); 1582 + /* Add the userptr_inval_list entries to resv_list */ 1583 + list_for_each_entry(mem, &process_info->userptr_inval_list, 1584 + validate_list.head) { 1585 + list_add_tail(&mem->resv_list.head, &resv_list); 1586 + mem->resv_list.bo = mem->validate_list.bo; 1587 + mem->resv_list.shared = mem->validate_list.shared; 1588 + } 1589 + 1590 + /* Reserve all BOs and page tables for validation */ 1591 + ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates); 1592 + WARN(!list_empty(&duplicates), "Duplicates should be empty"); 1593 + if (ret) 1594 + goto out; 1595 + 1596 + amdgpu_sync_create(&sync); 1597 + 1598 + /* Avoid triggering eviction fences when unmapping invalid 1599 + * userptr BOs (waits for all fences, doesn't use 1600 + * FENCE_OWNER_VM) 1601 + */ 1602 + list_for_each_entry(peer_vm, &process_info->vm_list_head, 1603 + vm_list_node) 1604 + amdgpu_amdkfd_remove_eviction_fence(peer_vm->root.base.bo, 
1605 + process_info->eviction_fence, 1606 + NULL, NULL); 1607 + 1608 + ret = process_validate_vms(process_info); 1609 + if (ret) 1610 + goto unreserve_out; 1611 + 1612 + /* Validate BOs and update GPUVM page tables */ 1613 + list_for_each_entry_safe(mem, tmp_mem, 1614 + &process_info->userptr_inval_list, 1615 + validate_list.head) { 1616 + struct kfd_bo_va_list *bo_va_entry; 1617 + 1618 + bo = mem->bo; 1619 + 1620 + /* Copy pages array and validate the BO if we got user pages */ 1621 + if (mem->user_pages[0]) { 1622 + amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, 1623 + mem->user_pages); 1624 + amdgpu_ttm_placement_from_domain(bo, mem->domain); 1625 + ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); 1626 + if (ret) { 1627 + pr_err("%s: failed to validate BO\n", __func__); 1628 + goto unreserve_out; 1629 + } 1630 + } 1631 + 1632 + /* Validate succeeded, now the BO owns the pages, free 1633 + * our copy of the pointer array. Put this BO back on 1634 + * the userptr_valid_list. If we need to revalidate 1635 + * it, we need to start from scratch. 1636 + */ 1637 + kvfree(mem->user_pages); 1638 + mem->user_pages = NULL; 1639 + list_move_tail(&mem->validate_list.head, 1640 + &process_info->userptr_valid_list); 1641 + 1642 + /* Update mapping. If the BO was not validated 1643 + * (because we couldn't get user pages), this will 1644 + * clear the page table entries, which will result in 1645 + * VM faults if the GPU tries to access the invalid 1646 + * memory. 
1647 + */ 1648 + list_for_each_entry(bo_va_entry, &mem->bo_va_list, bo_list) { 1649 + if (!bo_va_entry->is_mapped) 1650 + continue; 1651 + 1652 + ret = update_gpuvm_pte((struct amdgpu_device *) 1653 + bo_va_entry->kgd_dev, 1654 + bo_va_entry, &sync); 1655 + if (ret) { 1656 + pr_err("%s: update PTE failed\n", __func__); 1657 + /* make sure this gets validated again */ 1658 + atomic_inc(&mem->invalid); 1659 + goto unreserve_out; 1660 + } 1661 + } 1662 + } 1663 + 1664 + /* Update page directories */ 1665 + ret = process_update_pds(process_info, &sync); 1666 + 1667 + unreserve_out: 1668 + list_for_each_entry(peer_vm, &process_info->vm_list_head, 1669 + vm_list_node) 1670 + amdgpu_bo_fence(peer_vm->root.base.bo, 1671 + &process_info->eviction_fence->base, true); 1672 + ttm_eu_backoff_reservation(&ticket, &resv_list); 1673 + amdgpu_sync_wait(&sync, false); 1674 + amdgpu_sync_free(&sync); 1675 + out: 1676 + kfree(pd_bo_list_entries); 1677 + 1678 + return ret; 1679 + } 1680 + 1681 + /* Worker callback to restore evicted userptr BOs 1682 + * 1683 + * Tries to update and validate all userptr BOs. If successful and no 1684 + * concurrent evictions happened, the queues are restarted. Otherwise, 1685 + * reschedule for another attempt later. 
1686 + */ 1687 + static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) 1688 + { 1689 + struct delayed_work *dwork = to_delayed_work(work); 1690 + struct amdkfd_process_info *process_info = 1691 + container_of(dwork, struct amdkfd_process_info, 1692 + restore_userptr_work); 1693 + struct task_struct *usertask; 1694 + struct mm_struct *mm; 1695 + int evicted_bos; 1696 + 1697 + evicted_bos = atomic_read(&process_info->evicted_bos); 1698 + if (!evicted_bos) 1699 + return; 1700 + 1701 + /* Reference task and mm in case of concurrent process termination */ 1702 + usertask = get_pid_task(process_info->pid, PIDTYPE_PID); 1703 + if (!usertask) 1704 + return; 1705 + mm = get_task_mm(usertask); 1706 + if (!mm) { 1707 + put_task_struct(usertask); 1708 + return; 1709 + } 1710 + 1711 + mutex_lock(&process_info->lock); 1712 + 1713 + if (update_invalid_user_pages(process_info, mm)) 1714 + goto unlock_out; 1715 + /* userptr_inval_list can be empty if all evicted userptr BOs 1716 + * have been freed. In that case there is nothing to validate 1717 + * and we can just restart the queues. 1718 + */ 1719 + if (!list_empty(&process_info->userptr_inval_list)) { 1720 + if (atomic_read(&process_info->evicted_bos) != evicted_bos) 1721 + goto unlock_out; /* Concurrent eviction, try again */ 1722 + 1723 + if (validate_invalid_user_pages(process_info)) 1724 + goto unlock_out; 1725 + } 1726 + /* Final check for concurrent evicton and atomic update. If 1727 + * another eviction happens after successful update, it will 1728 + * be a first eviction that calls quiesce_mm. The eviction 1729 + * reference counting inside KFD will handle this case. 1730 + */ 1731 + if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) != 1732 + evicted_bos) 1733 + goto unlock_out; 1734 + evicted_bos = 0; 1735 + if (kgd2kfd->resume_mm(mm)) { 1736 + pr_err("%s: Failed to resume KFD\n", __func__); 1737 + /* No recovery from this failure. Probably the CP is 1738 + * hanging. 
No point trying again. 1739 + */ 1740 + } 1741 + unlock_out: 1742 + mutex_unlock(&process_info->lock); 1743 + mmput(mm); 1744 + put_task_struct(usertask); 1745 + 1746 + /* If validation failed, reschedule another attempt */ 1747 + if (evicted_bos) 1748 + schedule_delayed_work(&process_info->restore_userptr_work, 1749 + msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS)); 1614 1750 } 1615 1751 1616 1752 /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
+1 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
··· 536 536 if (p->bo_list) { 537 537 amdgpu_bo_list_get_list(p->bo_list, &p->validated); 538 538 if (p->bo_list->first_userptr != p->bo_list->num_entries) 539 - p->mn = amdgpu_mn_get(p->adev); 539 + p->mn = amdgpu_mn_get(p->adev, AMDGPU_MN_TYPE_GFX); 540 540 } 541 541 542 542 INIT_LIST_HEAD(&duplicates);
+88 -23
drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
··· 36 36 #include <drm/drm.h> 37 37 38 38 #include "amdgpu.h" 39 + #include "amdgpu_amdkfd.h" 39 40 40 41 struct amdgpu_mn { 41 42 /* constant after initialisation */ 42 43 struct amdgpu_device *adev; 43 44 struct mm_struct *mm; 44 45 struct mmu_notifier mn; 46 + enum amdgpu_mn_type type; 45 47 46 48 /* only used on destruction */ 47 49 struct work_struct work; ··· 187 185 } 188 186 189 187 /** 190 - * amdgpu_mn_invalidate_range_start - callback to notify about mm change 188 + * amdgpu_mn_invalidate_range_start_gfx - callback to notify about mm change 191 189 * 192 190 * @mn: our notifier 193 191 * @mn: the mm this callback is about ··· 197 195 * We block for all BOs between start and end to be idle and 198 196 * unmap them by move them into system domain again. 199 197 */ 200 - static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn, 201 - struct mm_struct *mm, 202 - unsigned long start, 203 - unsigned long end) 198 + static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, 199 + struct mm_struct *mm, 200 + unsigned long start, 201 + unsigned long end) 204 202 { 205 203 struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn); 206 204 struct interval_tree_node *it; ··· 218 216 it = interval_tree_iter_next(it, start, end); 219 217 220 218 amdgpu_mn_invalidate_node(node, start, end); 219 + } 220 + } 221 + 222 + /** 223 + * amdgpu_mn_invalidate_range_start_hsa - callback to notify about mm change 224 + * 225 + * @mn: our notifier 226 + * @mn: the mm this callback is about 227 + * @start: start of updated range 228 + * @end: end of updated range 229 + * 230 + * We temporarily evict all BOs between start and end. This 231 + * necessitates evicting all user-mode queues of the process. The BOs 232 + * are restorted in amdgpu_mn_invalidate_range_end_hsa. 
233 + */ 234 + static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, 235 + struct mm_struct *mm, 236 + unsigned long start, 237 + unsigned long end) 238 + { 239 + struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn); 240 + struct interval_tree_node *it; 241 + 242 + /* notification is exclusive, but interval is inclusive */ 243 + end -= 1; 244 + 245 + amdgpu_mn_read_lock(rmn); 246 + 247 + it = interval_tree_iter_first(&rmn->objects, start, end); 248 + while (it) { 249 + struct amdgpu_mn_node *node; 250 + struct amdgpu_bo *bo; 251 + 252 + node = container_of(it, struct amdgpu_mn_node, it); 253 + it = interval_tree_iter_next(it, start, end); 254 + 255 + list_for_each_entry(bo, &node->bos, mn_list) { 256 + struct kgd_mem *mem = bo->kfd_bo; 257 + 258 + if (amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, 259 + start, end)) 260 + amdgpu_amdkfd_evict_userptr(mem, mm); 261 + } 221 262 } 222 263 } 223 264 ··· 284 239 amdgpu_mn_read_unlock(rmn); 285 240 } 286 241 287 - static const struct mmu_notifier_ops amdgpu_mn_ops = { 288 - .release = amdgpu_mn_release, 289 - .invalidate_range_start = amdgpu_mn_invalidate_range_start, 290 - .invalidate_range_end = amdgpu_mn_invalidate_range_end, 242 + static const struct mmu_notifier_ops amdgpu_mn_ops[] = { 243 + [AMDGPU_MN_TYPE_GFX] = { 244 + .release = amdgpu_mn_release, 245 + .invalidate_range_start = amdgpu_mn_invalidate_range_start_gfx, 246 + .invalidate_range_end = amdgpu_mn_invalidate_range_end, 247 + }, 248 + [AMDGPU_MN_TYPE_HSA] = { 249 + .release = amdgpu_mn_release, 250 + .invalidate_range_start = amdgpu_mn_invalidate_range_start_hsa, 251 + .invalidate_range_end = amdgpu_mn_invalidate_range_end, 252 + }, 291 253 }; 254 + 255 + /* Low bits of any reasonable mm pointer will be unused due to struct 256 + * alignment. Use these bits to make a unique key from the mm pointer 257 + * and notifier type. 
258 + */ 259 + #define AMDGPU_MN_KEY(mm, type) ((unsigned long)(mm) + (type)) 292 260 293 261 /** 294 262 * amdgpu_mn_get - create notifier context 295 263 * 296 264 * @adev: amdgpu device pointer 265 + * @type: type of MMU notifier context 297 266 * 298 267 * Creates a notifier context for current->mm. 299 268 */ 300 - struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) 269 + struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev, 270 + enum amdgpu_mn_type type) 301 271 { 302 272 struct mm_struct *mm = current->mm; 303 273 struct amdgpu_mn *rmn; 274 + unsigned long key = AMDGPU_MN_KEY(mm, type); 304 275 int r; 305 276 306 277 mutex_lock(&adev->mn_lock); ··· 325 264 return ERR_PTR(-EINTR); 326 265 } 327 266 328 - hash_for_each_possible(adev->mn_hash, rmn, node, (unsigned long)mm) 329 - if (rmn->mm == mm) 267 + hash_for_each_possible(adev->mn_hash, rmn, node, key) 268 + if (AMDGPU_MN_KEY(rmn->mm, rmn->type) == key) 330 269 goto release_locks; 331 270 332 271 rmn = kzalloc(sizeof(*rmn), GFP_KERNEL); ··· 337 276 338 277 rmn->adev = adev; 339 278 rmn->mm = mm; 340 - rmn->mn.ops = &amdgpu_mn_ops; 341 279 init_rwsem(&rmn->lock); 280 + rmn->type = type; 281 + rmn->mn.ops = &amdgpu_mn_ops[type]; 342 282 rmn->objects = RB_ROOT_CACHED; 343 283 mutex_init(&rmn->read_lock); 344 284 atomic_set(&rmn->recursion, 0); ··· 348 286 if (r) 349 287 goto free_rmn; 350 288 351 - hash_add(adev->mn_hash, &rmn->node, (unsigned long)mm); 289 + hash_add(adev->mn_hash, &rmn->node, AMDGPU_MN_KEY(mm, type)); 352 290 353 291 release_locks: 354 292 up_write(&mm->mmap_sem); ··· 377 315 { 378 316 unsigned long end = addr + amdgpu_bo_size(bo) - 1; 379 317 struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); 318 + enum amdgpu_mn_type type = 319 + bo->kfd_bo ? 
AMDGPU_MN_TYPE_HSA : AMDGPU_MN_TYPE_GFX; 380 320 struct amdgpu_mn *rmn; 381 - struct amdgpu_mn_node *node = NULL; 321 + struct amdgpu_mn_node *node = NULL, *new_node; 382 322 struct list_head bos; 383 323 struct interval_tree_node *it; 384 324 385 - rmn = amdgpu_mn_get(adev); 325 + rmn = amdgpu_mn_get(adev, type); 386 326 if (IS_ERR(rmn)) 387 327 return PTR_ERR(rmn); 328 + 329 + new_node = kmalloc(sizeof(*new_node), GFP_KERNEL); 330 + if (!new_node) 331 + return -ENOMEM; 388 332 389 333 INIT_LIST_HEAD(&bos); 390 334 ··· 405 337 list_splice(&node->bos, &bos); 406 338 } 407 339 408 - if (!node) { 409 - node = kmalloc(sizeof(struct amdgpu_mn_node), GFP_KERNEL); 410 - if (!node) { 411 - up_write(&rmn->lock); 412 - return -ENOMEM; 413 - } 414 - } 340 + if (!node) 341 + node = new_node; 342 + else 343 + kfree(new_node); 415 344 416 345 bo->mn = rmn; 417 346
+9 -2
drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h
··· 29 29 */ 30 30 struct amdgpu_mn; 31 31 32 + enum amdgpu_mn_type { 33 + AMDGPU_MN_TYPE_GFX, 34 + AMDGPU_MN_TYPE_HSA, 35 + }; 36 + 32 37 #if defined(CONFIG_MMU_NOTIFIER) 33 38 void amdgpu_mn_lock(struct amdgpu_mn *mn); 34 39 void amdgpu_mn_unlock(struct amdgpu_mn *mn); 35 - struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev); 40 + struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev, 41 + enum amdgpu_mn_type type); 36 42 int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr); 37 43 void amdgpu_mn_unregister(struct amdgpu_bo *bo); 38 44 #else 39 45 static inline void amdgpu_mn_lock(struct amdgpu_mn *mn) {} 40 46 static inline void amdgpu_mn_unlock(struct amdgpu_mn *mn) {} 41 - static inline struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) 47 + static inline struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev, 48 + enum amdgpu_mn_type type) 42 49 { 43 50 return NULL; 44 51 }
+29 -9
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
··· 695 695 struct ttm_dma_tt ttm; 696 696 u64 offset; 697 697 uint64_t userptr; 698 - struct mm_struct *usermm; 698 + struct task_struct *usertask; 699 699 uint32_t userflags; 700 700 spinlock_t guptasklock; 701 701 struct list_head guptasks; ··· 706 706 int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) 707 707 { 708 708 struct amdgpu_ttm_tt *gtt = (void *)ttm; 709 + struct mm_struct *mm = gtt->usertask->mm; 709 710 unsigned int flags = 0; 710 711 unsigned pinned = 0; 711 712 int r; 712 713 714 + if (!mm) /* Happens during process shutdown */ 715 + return -ESRCH; 716 + 713 717 if (!(gtt->userflags & AMDGPU_GEM_USERPTR_READONLY)) 714 718 flags |= FOLL_WRITE; 715 719 716 - down_read(&current->mm->mmap_sem); 720 + down_read(&mm->mmap_sem); 717 721 718 722 if (gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) { 719 723 /* check that we only use anonymous memory ··· 725 721 unsigned long end = gtt->userptr + ttm->num_pages * PAGE_SIZE; 726 722 struct vm_area_struct *vma; 727 723 728 - vma = find_vma(gtt->usermm, gtt->userptr); 724 + vma = find_vma(mm, gtt->userptr); 729 725 if (!vma || vma->vm_file || vma->vm_end < end) { 730 - up_read(&current->mm->mmap_sem); 726 + up_read(&mm->mmap_sem); 731 727 return -EPERM; 732 728 } 733 729 } ··· 743 739 list_add(&guptask.list, &gtt->guptasks); 744 740 spin_unlock(&gtt->guptasklock); 745 741 746 - r = get_user_pages(userptr, num_pages, flags, p, NULL); 742 + if (mm == current->mm) 743 + r = get_user_pages(userptr, num_pages, flags, p, NULL); 744 + else 745 + r = get_user_pages_remote(gtt->usertask, 746 + mm, userptr, num_pages, 747 + flags, p, NULL, NULL); 747 748 748 749 spin_lock(&gtt->guptasklock); 749 750 list_del(&guptask.list); ··· 761 752 762 753 } while (pinned < ttm->num_pages); 763 754 764 - up_read(&current->mm->mmap_sem); 755 + up_read(&mm->mmap_sem); 765 756 return 0; 766 757 767 758 release_pages: 768 759 release_pages(pages, pinned); 769 - up_read(&current->mm->mmap_sem); 760 + 
up_read(&mm->mmap_sem); 770 761 return r; 771 762 } 772 763 ··· 987 978 { 988 979 struct amdgpu_ttm_tt *gtt = (void *)ttm; 989 980 981 + if (gtt->usertask) 982 + put_task_struct(gtt->usertask); 983 + 990 984 ttm_dma_tt_fini(&gtt->ttm); 991 985 kfree(gtt); 992 986 } ··· 1091 1079 return -EINVAL; 1092 1080 1093 1081 gtt->userptr = addr; 1094 - gtt->usermm = current->mm; 1095 1082 gtt->userflags = flags; 1083 + 1084 + if (gtt->usertask) 1085 + put_task_struct(gtt->usertask); 1086 + gtt->usertask = current->group_leader; 1087 + get_task_struct(gtt->usertask); 1088 + 1096 1089 spin_lock_init(&gtt->guptasklock); 1097 1090 INIT_LIST_HEAD(&gtt->guptasks); 1098 1091 atomic_set(&gtt->mmu_invalidations, 0); ··· 1113 1096 if (gtt == NULL) 1114 1097 return NULL; 1115 1098 1116 - return gtt->usermm; 1099 + if (gtt->usertask == NULL) 1100 + return NULL; 1101 + 1102 + return gtt->usertask->mm; 1117 1103 } 1118 1104 1119 1105 bool amdgpu_ttm_tt_affect_userptr(struct ttm_tt *ttm, unsigned long start,
+1
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
··· 4686 4686 4687 4687 cu_info->number = active_cu_number; 4688 4688 cu_info->ao_cu_mask = ao_cu_mask; 4689 + cu_info->simd_per_cu = NUM_SIMD_PER_CU; 4689 4690 4690 4691 return 0; 4691 4692 }
+5
drivers/gpu/drm/amd/amdgpu/soc15d.h
··· 268 268 * x=1: tmz_end 269 269 */ 270 270 271 + #define PACKET3_INVALIDATE_TLBS 0x98 272 + # define PACKET3_INVALIDATE_TLBS_DST_SEL(x) ((x) << 0) 273 + # define PACKET3_INVALIDATE_TLBS_ALL_HUB(x) ((x) << 4) 274 + # define PACKET3_INVALIDATE_TLBS_PASID(x) ((x) << 5) 275 + # define PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(x) ((x) << 29) 271 276 #define PACKET3_SET_RESOURCES 0xA0 272 277 /* 1. header 273 278 * 2. CONTROL
+6 -4
drivers/gpu/drm/amd/amdkfd/Makefile
··· 30 30 kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ 31 31 kfd_process.o kfd_queue.o kfd_mqd_manager.o \ 32 32 kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ 33 + kfd_mqd_manager_v9.o \ 33 34 kfd_kernel_queue.o kfd_kernel_queue_cik.o \ 34 - kfd_kernel_queue_vi.o kfd_packet_manager.o \ 35 - kfd_process_queue_manager.o kfd_device_queue_manager.o \ 36 - kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \ 35 + kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \ 36 + kfd_packet_manager.o kfd_process_queue_manager.o \ 37 + kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \ 38 + kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \ 37 39 kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ 38 - kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o 40 + kfd_int_process_v9.o kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o 39 41 40 42 ifneq ($(CONFIG_AMD_IOMMU_V2),) 41 43 amdkfd-y += kfd_iommu.o
+15 -5
drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
··· 27 27 static bool cik_event_interrupt_isr(struct kfd_dev *dev, 28 28 const uint32_t *ih_ring_entry) 29 29 { 30 - unsigned int pasid; 31 30 const struct cik_ih_ring_entry *ihre = 32 31 (const struct cik_ih_ring_entry *)ih_ring_entry; 32 + unsigned int vmid, pasid; 33 33 34 + /* Only handle interrupts from KFD VMIDs */ 35 + vmid = (ihre->ring_id & 0x0000ff00) >> 8; 36 + if (vmid < dev->vm_info.first_vmid_kfd || 37 + vmid > dev->vm_info.last_vmid_kfd) 38 + return 0; 39 + 40 + /* If there is no valid PASID, it's likely a firmware bug */ 34 41 pasid = (ihre->ring_id & 0xffff0000) >> 16; 42 + if (WARN_ONCE(pasid == 0, "FW bug: No PASID in KFD interrupt")) 43 + return 0; 35 44 36 - /* Do not process in ISR, just request it to be forwarded to WQ. */ 37 - return (pasid != 0) && 38 - (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || 45 + /* Interrupt types we care about: various signals and faults. 46 + * They will be forwarded to a work queue (see below). 47 + */ 48 + return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || 39 49 ihre->source_id == CIK_INTSRC_SDMA_TRAP || 40 50 ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || 41 - ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE); 51 + ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE; 42 52 } 43 53 44 54 static void cik_event_interrupt_wq(struct kfd_dev *dev,
+2 -1
drivers/gpu/drm/amd/amdkfd/cik_regs.h
··· 33 33 #define APE1_MTYPE(x) ((x) << 7) 34 34 35 35 /* valid for both DEFAULT_MTYPE and APE1_MTYPE */ 36 - #define MTYPE_CACHED 0 36 + #define MTYPE_CACHED_NV 0 37 + #define MTYPE_CACHED 1 37 38 #define MTYPE_NONCACHED 3 38 39 39 40 #define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8)
+560
drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
··· 1 + /* 2 + * Copyright 2018 Advanced Micro Devices, Inc. 3 + * 4 + * Permission is hereby granted, free of charge, to any person obtaining a 5 + * copy of this software and associated documentation files (the "Software"), 6 + * to deal in the Software without restriction, including without limitation 7 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 + * and/or sell copies of the Software, and to permit persons to whom the 9 + * Software is furnished to do so, subject to the following conditions: 10 + * 11 + * The above copyright notice and this permission notice shall be included in 12 + * all copies or substantial portions of the Software. 13 + * 14 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 + * OTHER DEALINGS IN THE SOFTWARE. 
21 + */ 22 + 23 + static const uint32_t cwsr_trap_gfx8_hex[] = { 24 + 0xbf820001, 0xbf820125, 25 + 0xb8f4f802, 0x89748674, 26 + 0xb8f5f803, 0x8675ff75, 27 + 0x00000400, 0xbf850011, 28 + 0xc00a1e37, 0x00000000, 29 + 0xbf8c007f, 0x87777978, 30 + 0xbf840002, 0xb974f802, 31 + 0xbe801d78, 0xb8f5f803, 32 + 0x8675ff75, 0x000001ff, 33 + 0xbf850002, 0x80708470, 34 + 0x82718071, 0x8671ff71, 35 + 0x0000ffff, 0xb974f802, 36 + 0xbe801f70, 0xb8f5f803, 37 + 0x8675ff75, 0x00000100, 38 + 0xbf840006, 0xbefa0080, 39 + 0xb97a0203, 0x8671ff71, 40 + 0x0000ffff, 0x80f08870, 41 + 0x82f18071, 0xbefa0080, 42 + 0xb97a0283, 0xbef60068, 43 + 0xbef70069, 0xb8fa1c07, 44 + 0x8e7a9c7a, 0x87717a71, 45 + 0xb8fa03c7, 0x8e7a9b7a, 46 + 0x87717a71, 0xb8faf807, 47 + 0x867aff7a, 0x00007fff, 48 + 0xb97af807, 0xbef2007e, 49 + 0xbef3007f, 0xbefe0180, 50 + 0xbf900004, 0x877a8474, 51 + 0xb97af802, 0xbf8e0002, 52 + 0xbf88fffe, 0xbef8007e, 53 + 0x8679ff7f, 0x0000ffff, 54 + 0x8779ff79, 0x00040000, 55 + 0xbefa0080, 0xbefb00ff, 56 + 0x00807fac, 0x867aff7f, 57 + 0x08000000, 0x8f7a837a, 58 + 0x877b7a7b, 0x867aff7f, 59 + 0x70000000, 0x8f7a817a, 60 + 0x877b7a7b, 0xbeef007c, 61 + 0xbeee0080, 0xb8ee2a05, 62 + 0x806e816e, 0x8e6e8a6e, 63 + 0xb8fa1605, 0x807a817a, 64 + 0x8e7a867a, 0x806e7a6e, 65 + 0xbefa0084, 0xbefa00ff, 66 + 0x01000000, 0xbefe007c, 67 + 0xbefc006e, 0xc0611bfc, 68 + 0x0000007c, 0x806e846e, 69 + 0xbefc007e, 0xbefe007c, 70 + 0xbefc006e, 0xc0611c3c, 71 + 0x0000007c, 0x806e846e, 72 + 0xbefc007e, 0xbefe007c, 73 + 0xbefc006e, 0xc0611c7c, 74 + 0x0000007c, 0x806e846e, 75 + 0xbefc007e, 0xbefe007c, 76 + 0xbefc006e, 0xc0611cbc, 77 + 0x0000007c, 0x806e846e, 78 + 0xbefc007e, 0xbefe007c, 79 + 0xbefc006e, 0xc0611cfc, 80 + 0x0000007c, 0x806e846e, 81 + 0xbefc007e, 0xbefe007c, 82 + 0xbefc006e, 0xc0611d3c, 83 + 0x0000007c, 0x806e846e, 84 + 0xbefc007e, 0xb8f5f803, 85 + 0xbefe007c, 0xbefc006e, 86 + 0xc0611d7c, 0x0000007c, 87 + 0x806e846e, 0xbefc007e, 88 + 0xbefe007c, 0xbefc006e, 89 + 0xc0611dbc, 0x0000007c, 90 + 0x806e846e, 
0xbefc007e, 91 + 0xbefe007c, 0xbefc006e, 92 + 0xc0611dfc, 0x0000007c, 93 + 0x806e846e, 0xbefc007e, 94 + 0xb8eff801, 0xbefe007c, 95 + 0xbefc006e, 0xc0611bfc, 96 + 0x0000007c, 0x806e846e, 97 + 0xbefc007e, 0xbefe007c, 98 + 0xbefc006e, 0xc0611b3c, 99 + 0x0000007c, 0x806e846e, 100 + 0xbefc007e, 0xbefe007c, 101 + 0xbefc006e, 0xc0611b7c, 102 + 0x0000007c, 0x806e846e, 103 + 0xbefc007e, 0x867aff7f, 104 + 0x04000000, 0xbef30080, 105 + 0x8773737a, 0xb8ee2a05, 106 + 0x806e816e, 0x8e6e8a6e, 107 + 0xb8f51605, 0x80758175, 108 + 0x8e758475, 0x8e7a8275, 109 + 0xbefa00ff, 0x01000000, 110 + 0xbef60178, 0x80786e78, 111 + 0x82798079, 0xbefc0080, 112 + 0xbe802b00, 0xbe822b02, 113 + 0xbe842b04, 0xbe862b06, 114 + 0xbe882b08, 0xbe8a2b0a, 115 + 0xbe8c2b0c, 0xbe8e2b0e, 116 + 0xc06b003c, 0x00000000, 117 + 0xc06b013c, 0x00000010, 118 + 0xc06b023c, 0x00000020, 119 + 0xc06b033c, 0x00000030, 120 + 0x8078c078, 0x82798079, 121 + 0x807c907c, 0xbf0a757c, 122 + 0xbf85ffeb, 0xbef80176, 123 + 0xbeee0080, 0xbefe00c1, 124 + 0xbeff00c1, 0xbefa00ff, 125 + 0x01000000, 0xe0724000, 126 + 0x6e1e0000, 0xe0724100, 127 + 0x6e1e0100, 0xe0724200, 128 + 0x6e1e0200, 0xe0724300, 129 + 0x6e1e0300, 0xbefe00c1, 130 + 0xbeff00c1, 0xb8f54306, 131 + 0x8675c175, 0xbf84002c, 132 + 0xbf8a0000, 0x867aff73, 133 + 0x04000000, 0xbf840028, 134 + 0x8e758675, 0x8e758275, 135 + 0xbefa0075, 0xb8ee2a05, 136 + 0x806e816e, 0x8e6e8a6e, 137 + 0xb8fa1605, 0x807a817a, 138 + 0x8e7a867a, 0x806e7a6e, 139 + 0x806eff6e, 0x00000080, 140 + 0xbefa00ff, 0x01000000, 141 + 0xbefc0080, 0xd28c0002, 142 + 0x000100c1, 0xd28d0003, 143 + 0x000204c1, 0xd1060002, 144 + 0x00011103, 0x7e0602ff, 145 + 0x00000200, 0xbefc00ff, 146 + 0x00010000, 0xbe80007b, 147 + 0x867bff7b, 0xff7fffff, 148 + 0x877bff7b, 0x00058000, 149 + 0xd8ec0000, 0x00000002, 150 + 0xbf8c007f, 0xe0765000, 151 + 0x6e1e0002, 0x32040702, 152 + 0xd0c9006a, 0x0000eb02, 153 + 0xbf87fff7, 0xbefb0000, 154 + 0xbeee00ff, 0x00000400, 155 + 0xbefe00c1, 0xbeff00c1, 156 + 0xb8f52a05, 0x80758175, 157 + 
0x8e758275, 0x8e7a8875, 158 + 0xbefa00ff, 0x01000000, 159 + 0xbefc0084, 0xbf0a757c, 160 + 0xbf840015, 0xbf11017c, 161 + 0x8075ff75, 0x00001000, 162 + 0x7e000300, 0x7e020301, 163 + 0x7e040302, 0x7e060303, 164 + 0xe0724000, 0x6e1e0000, 165 + 0xe0724100, 0x6e1e0100, 166 + 0xe0724200, 0x6e1e0200, 167 + 0xe0724300, 0x6e1e0300, 168 + 0x807c847c, 0x806eff6e, 169 + 0x00000400, 0xbf0a757c, 170 + 0xbf85ffef, 0xbf9c0000, 171 + 0xbf8200ca, 0xbef8007e, 172 + 0x8679ff7f, 0x0000ffff, 173 + 0x8779ff79, 0x00040000, 174 + 0xbefa0080, 0xbefb00ff, 175 + 0x00807fac, 0x8676ff7f, 176 + 0x08000000, 0x8f768376, 177 + 0x877b767b, 0x8676ff7f, 178 + 0x70000000, 0x8f768176, 179 + 0x877b767b, 0x8676ff7f, 180 + 0x04000000, 0xbf84001e, 181 + 0xbefe00c1, 0xbeff00c1, 182 + 0xb8f34306, 0x8673c173, 183 + 0xbf840019, 0x8e738673, 184 + 0x8e738273, 0xbefa0073, 185 + 0xb8f22a05, 0x80728172, 186 + 0x8e728a72, 0xb8f61605, 187 + 0x80768176, 0x8e768676, 188 + 0x80727672, 0x8072ff72, 189 + 0x00000080, 0xbefa00ff, 190 + 0x01000000, 0xbefc0080, 191 + 0xe0510000, 0x721e0000, 192 + 0xe0510100, 0x721e0000, 193 + 0x807cff7c, 0x00000200, 194 + 0x8072ff72, 0x00000200, 195 + 0xbf0a737c, 0xbf85fff6, 196 + 0xbef20080, 0xbefe00c1, 197 + 0xbeff00c1, 0xb8f32a05, 198 + 0x80738173, 0x8e738273, 199 + 0x8e7a8873, 0xbefa00ff, 200 + 0x01000000, 0xbef60072, 201 + 0x8072ff72, 0x00000400, 202 + 0xbefc0084, 0xbf11087c, 203 + 0x8073ff73, 0x00008000, 204 + 0xe0524000, 0x721e0000, 205 + 0xe0524100, 0x721e0100, 206 + 0xe0524200, 0x721e0200, 207 + 0xe0524300, 0x721e0300, 208 + 0xbf8c0f70, 0x7e000300, 209 + 0x7e020301, 0x7e040302, 210 + 0x7e060303, 0x807c847c, 211 + 0x8072ff72, 0x00000400, 212 + 0xbf0a737c, 0xbf85ffee, 213 + 0xbf9c0000, 0xe0524000, 214 + 0x761e0000, 0xe0524100, 215 + 0x761e0100, 0xe0524200, 216 + 0x761e0200, 0xe0524300, 217 + 0x761e0300, 0xb8f22a05, 218 + 0x80728172, 0x8e728a72, 219 + 0xb8f61605, 0x80768176, 220 + 0x8e768676, 0x80727672, 221 + 0x80f2c072, 0xb8f31605, 222 + 0x80738173, 0x8e738473, 223 + 0x8e7a8273, 
0xbefa00ff, 224 + 0x01000000, 0xbefc0073, 225 + 0xc031003c, 0x00000072, 226 + 0x80f2c072, 0xbf8c007f, 227 + 0x80fc907c, 0xbe802d00, 228 + 0xbe822d02, 0xbe842d04, 229 + 0xbe862d06, 0xbe882d08, 230 + 0xbe8a2d0a, 0xbe8c2d0c, 231 + 0xbe8e2d0e, 0xbf06807c, 232 + 0xbf84fff1, 0xb8f22a05, 233 + 0x80728172, 0x8e728a72, 234 + 0xb8f61605, 0x80768176, 235 + 0x8e768676, 0x80727672, 236 + 0xbefa0084, 0xbefa00ff, 237 + 0x01000000, 0xc0211cfc, 238 + 0x00000072, 0x80728472, 239 + 0xc0211c3c, 0x00000072, 240 + 0x80728472, 0xc0211c7c, 241 + 0x00000072, 0x80728472, 242 + 0xc0211bbc, 0x00000072, 243 + 0x80728472, 0xc0211bfc, 244 + 0x00000072, 0x80728472, 245 + 0xc0211d3c, 0x00000072, 246 + 0x80728472, 0xc0211d7c, 247 + 0x00000072, 0x80728472, 248 + 0xc0211a3c, 0x00000072, 249 + 0x80728472, 0xc0211a7c, 250 + 0x00000072, 0x80728472, 251 + 0xc0211dfc, 0x00000072, 252 + 0x80728472, 0xc0211b3c, 253 + 0x00000072, 0x80728472, 254 + 0xc0211b7c, 0x00000072, 255 + 0x80728472, 0xbf8c007f, 256 + 0xbefc0073, 0xbefe006e, 257 + 0xbeff006f, 0x867375ff, 258 + 0x000003ff, 0xb9734803, 259 + 0x867375ff, 0xfffff800, 260 + 0x8f738b73, 0xb973a2c3, 261 + 0xb977f801, 0x8673ff71, 262 + 0xf0000000, 0x8f739c73, 263 + 0x8e739073, 0xbef60080, 264 + 0x87767376, 0x8673ff71, 265 + 0x08000000, 0x8f739b73, 266 + 0x8e738f73, 0x87767376, 267 + 0x8673ff74, 0x00800000, 268 + 0x8f739773, 0xb976f807, 269 + 0x8671ff71, 0x0000ffff, 270 + 0x86fe7e7e, 0x86ea6a6a, 271 + 0xb974f802, 0xbf8a0000, 272 + 0x95807370, 0xbf810000, 273 + }; 274 + 275 + 276 + static const uint32_t cwsr_trap_gfx9_hex[] = { 277 + 0xbf820001, 0xbf82015a, 278 + 0xb8f8f802, 0x89788678, 279 + 0xb8f1f803, 0x866eff71, 280 + 0x00000400, 0xbf850034, 281 + 0x866eff71, 0x00000800, 282 + 0xbf850003, 0x866eff71, 283 + 0x00000100, 0xbf840008, 284 + 0x866eff78, 0x00002000, 285 + 0xbf840001, 0xbf810000, 286 + 0x8778ff78, 0x00002000, 287 + 0x80ec886c, 0x82ed806d, 288 + 0xb8eef807, 0x866fff6e, 289 + 0x001f8000, 0x8e6f8b6f, 290 + 0x8977ff77, 0xfc000000, 291 + 0x87776f77, 
0x896eff6e, 292 + 0x001f8000, 0xb96ef807, 293 + 0xb8f0f812, 0xb8f1f813, 294 + 0x8ef08870, 0xc0071bb8, 295 + 0x00000000, 0xbf8cc07f, 296 + 0xc0071c38, 0x00000008, 297 + 0xbf8cc07f, 0x86ee6e6e, 298 + 0xbf840001, 0xbe801d6e, 299 + 0xb8f1f803, 0x8671ff71, 300 + 0x000001ff, 0xbf850002, 301 + 0x806c846c, 0x826d806d, 302 + 0x866dff6d, 0x0000ffff, 303 + 0x8f6e8b77, 0x866eff6e, 304 + 0x001f8000, 0xb96ef807, 305 + 0x86fe7e7e, 0x86ea6a6a, 306 + 0xb978f802, 0xbe801f6c, 307 + 0x866dff6d, 0x0000ffff, 308 + 0xbef00080, 0xb9700283, 309 + 0xb8f02407, 0x8e709c70, 310 + 0x876d706d, 0xb8f003c7, 311 + 0x8e709b70, 0x876d706d, 312 + 0xb8f0f807, 0x8670ff70, 313 + 0x00007fff, 0xb970f807, 314 + 0xbeee007e, 0xbeef007f, 315 + 0xbefe0180, 0xbf900004, 316 + 0x87708478, 0xb970f802, 317 + 0xbf8e0002, 0xbf88fffe, 318 + 0xb8f02a05, 0x80708170, 319 + 0x8e708a70, 0xb8f11605, 320 + 0x80718171, 0x8e718671, 321 + 0x80707170, 0x80707e70, 322 + 0x8271807f, 0x8671ff71, 323 + 0x0000ffff, 0xc0471cb8, 324 + 0x00000040, 0xbf8cc07f, 325 + 0xc04b1d38, 0x00000048, 326 + 0xbf8cc07f, 0xc0431e78, 327 + 0x00000058, 0xbf8cc07f, 328 + 0xc0471eb8, 0x0000005c, 329 + 0xbf8cc07f, 0xbef4007e, 330 + 0x8675ff7f, 0x0000ffff, 331 + 0x8775ff75, 0x00040000, 332 + 0xbef60080, 0xbef700ff, 333 + 0x00807fac, 0x8670ff7f, 334 + 0x08000000, 0x8f708370, 335 + 0x87777077, 0x8670ff7f, 336 + 0x70000000, 0x8f708170, 337 + 0x87777077, 0xbefb007c, 338 + 0xbefa0080, 0xb8fa2a05, 339 + 0x807a817a, 0x8e7a8a7a, 340 + 0xb8f01605, 0x80708170, 341 + 0x8e708670, 0x807a707a, 342 + 0xbef60084, 0xbef600ff, 343 + 0x01000000, 0xbefe007c, 344 + 0xbefc007a, 0xc0611efa, 345 + 0x0000007c, 0xbf8cc07f, 346 + 0x807a847a, 0xbefc007e, 347 + 0xbefe007c, 0xbefc007a, 348 + 0xc0611b3a, 0x0000007c, 349 + 0xbf8cc07f, 0x807a847a, 350 + 0xbefc007e, 0xbefe007c, 351 + 0xbefc007a, 0xc0611b7a, 352 + 0x0000007c, 0xbf8cc07f, 353 + 0x807a847a, 0xbefc007e, 354 + 0xbefe007c, 0xbefc007a, 355 + 0xc0611bba, 0x0000007c, 356 + 0xbf8cc07f, 0x807a847a, 357 + 0xbefc007e, 0xbefe007c, 358 + 
0xbefc007a, 0xc0611bfa, 359 + 0x0000007c, 0xbf8cc07f, 360 + 0x807a847a, 0xbefc007e, 361 + 0xbefe007c, 0xbefc007a, 362 + 0xc0611e3a, 0x0000007c, 363 + 0xbf8cc07f, 0x807a847a, 364 + 0xbefc007e, 0xb8f1f803, 365 + 0xbefe007c, 0xbefc007a, 366 + 0xc0611c7a, 0x0000007c, 367 + 0xbf8cc07f, 0x807a847a, 368 + 0xbefc007e, 0xbefe007c, 369 + 0xbefc007a, 0xc0611a3a, 370 + 0x0000007c, 0xbf8cc07f, 371 + 0x807a847a, 0xbefc007e, 372 + 0xbefe007c, 0xbefc007a, 373 + 0xc0611a7a, 0x0000007c, 374 + 0xbf8cc07f, 0x807a847a, 375 + 0xbefc007e, 0xb8fbf801, 376 + 0xbefe007c, 0xbefc007a, 377 + 0xc0611efa, 0x0000007c, 378 + 0xbf8cc07f, 0x807a847a, 379 + 0xbefc007e, 0x8670ff7f, 380 + 0x04000000, 0xbeef0080, 381 + 0x876f6f70, 0xb8fa2a05, 382 + 0x807a817a, 0x8e7a8a7a, 383 + 0xb8f11605, 0x80718171, 384 + 0x8e718471, 0x8e768271, 385 + 0xbef600ff, 0x01000000, 386 + 0xbef20174, 0x80747a74, 387 + 0x82758075, 0xbefc0080, 388 + 0xbf800000, 0xbe802b00, 389 + 0xbe822b02, 0xbe842b04, 390 + 0xbe862b06, 0xbe882b08, 391 + 0xbe8a2b0a, 0xbe8c2b0c, 392 + 0xbe8e2b0e, 0xc06b003a, 393 + 0x00000000, 0xbf8cc07f, 394 + 0xc06b013a, 0x00000010, 395 + 0xbf8cc07f, 0xc06b023a, 396 + 0x00000020, 0xbf8cc07f, 397 + 0xc06b033a, 0x00000030, 398 + 0xbf8cc07f, 0x8074c074, 399 + 0x82758075, 0x807c907c, 400 + 0xbf0a717c, 0xbf85ffe7, 401 + 0xbef40172, 0xbefa0080, 402 + 0xbefe00c1, 0xbeff00c1, 403 + 0xbee80080, 0xbee90080, 404 + 0xbef600ff, 0x01000000, 405 + 0xe0724000, 0x7a1d0000, 406 + 0xe0724100, 0x7a1d0100, 407 + 0xe0724200, 0x7a1d0200, 408 + 0xe0724300, 0x7a1d0300, 409 + 0xbefe00c1, 0xbeff00c1, 410 + 0xb8f14306, 0x8671c171, 411 + 0xbf84002c, 0xbf8a0000, 412 + 0x8670ff6f, 0x04000000, 413 + 0xbf840028, 0x8e718671, 414 + 0x8e718271, 0xbef60071, 415 + 0xb8fa2a05, 0x807a817a, 416 + 0x8e7a8a7a, 0xb8f01605, 417 + 0x80708170, 0x8e708670, 418 + 0x807a707a, 0x807aff7a, 419 + 0x00000080, 0xbef600ff, 420 + 0x01000000, 0xbefc0080, 421 + 0xd28c0002, 0x000100c1, 422 + 0xd28d0003, 0x000204c1, 423 + 0xd1060002, 0x00011103, 424 + 0x7e0602ff, 
0x00000200, 425 + 0xbefc00ff, 0x00010000, 426 + 0xbe800077, 0x8677ff77, 427 + 0xff7fffff, 0x8777ff77, 428 + 0x00058000, 0xd8ec0000, 429 + 0x00000002, 0xbf8cc07f, 430 + 0xe0765000, 0x7a1d0002, 431 + 0x68040702, 0xd0c9006a, 432 + 0x0000e302, 0xbf87fff7, 433 + 0xbef70000, 0xbefa00ff, 434 + 0x00000400, 0xbefe00c1, 435 + 0xbeff00c1, 0xb8f12a05, 436 + 0x80718171, 0x8e718271, 437 + 0x8e768871, 0xbef600ff, 438 + 0x01000000, 0xbefc0084, 439 + 0xbf0a717c, 0xbf840015, 440 + 0xbf11017c, 0x8071ff71, 441 + 0x00001000, 0x7e000300, 442 + 0x7e020301, 0x7e040302, 443 + 0x7e060303, 0xe0724000, 444 + 0x7a1d0000, 0xe0724100, 445 + 0x7a1d0100, 0xe0724200, 446 + 0x7a1d0200, 0xe0724300, 447 + 0x7a1d0300, 0x807c847c, 448 + 0x807aff7a, 0x00000400, 449 + 0xbf0a717c, 0xbf85ffef, 450 + 0xbf9c0000, 0xbf8200d9, 451 + 0xbef4007e, 0x8675ff7f, 452 + 0x0000ffff, 0x8775ff75, 453 + 0x00040000, 0xbef60080, 454 + 0xbef700ff, 0x00807fac, 455 + 0x866eff7f, 0x08000000, 456 + 0x8f6e836e, 0x87776e77, 457 + 0x866eff7f, 0x70000000, 458 + 0x8f6e816e, 0x87776e77, 459 + 0x866eff7f, 0x04000000, 460 + 0xbf84001e, 0xbefe00c1, 461 + 0xbeff00c1, 0xb8ef4306, 462 + 0x866fc16f, 0xbf840019, 463 + 0x8e6f866f, 0x8e6f826f, 464 + 0xbef6006f, 0xb8f82a05, 465 + 0x80788178, 0x8e788a78, 466 + 0xb8ee1605, 0x806e816e, 467 + 0x8e6e866e, 0x80786e78, 468 + 0x8078ff78, 0x00000080, 469 + 0xbef600ff, 0x01000000, 470 + 0xbefc0080, 0xe0510000, 471 + 0x781d0000, 0xe0510100, 472 + 0x781d0000, 0x807cff7c, 473 + 0x00000200, 0x8078ff78, 474 + 0x00000200, 0xbf0a6f7c, 475 + 0xbf85fff6, 0xbef80080, 476 + 0xbefe00c1, 0xbeff00c1, 477 + 0xb8ef2a05, 0x806f816f, 478 + 0x8e6f826f, 0x8e76886f, 479 + 0xbef600ff, 0x01000000, 480 + 0xbeee0078, 0x8078ff78, 481 + 0x00000400, 0xbefc0084, 482 + 0xbf11087c, 0x806fff6f, 483 + 0x00008000, 0xe0524000, 484 + 0x781d0000, 0xe0524100, 485 + 0x781d0100, 0xe0524200, 486 + 0x781d0200, 0xe0524300, 487 + 0x781d0300, 0xbf8c0f70, 488 + 0x7e000300, 0x7e020301, 489 + 0x7e040302, 0x7e060303, 490 + 0x807c847c, 0x8078ff78, 491 + 
0x00000400, 0xbf0a6f7c, 492 + 0xbf85ffee, 0xbf9c0000, 493 + 0xe0524000, 0x6e1d0000, 494 + 0xe0524100, 0x6e1d0100, 495 + 0xe0524200, 0x6e1d0200, 496 + 0xe0524300, 0x6e1d0300, 497 + 0xb8f82a05, 0x80788178, 498 + 0x8e788a78, 0xb8ee1605, 499 + 0x806e816e, 0x8e6e866e, 500 + 0x80786e78, 0x80f8c078, 501 + 0xb8ef1605, 0x806f816f, 502 + 0x8e6f846f, 0x8e76826f, 503 + 0xbef600ff, 0x01000000, 504 + 0xbefc006f, 0xc031003a, 505 + 0x00000078, 0x80f8c078, 506 + 0xbf8cc07f, 0x80fc907c, 507 + 0xbf800000, 0xbe802d00, 508 + 0xbe822d02, 0xbe842d04, 509 + 0xbe862d06, 0xbe882d08, 510 + 0xbe8a2d0a, 0xbe8c2d0c, 511 + 0xbe8e2d0e, 0xbf06807c, 512 + 0xbf84fff0, 0xb8f82a05, 513 + 0x80788178, 0x8e788a78, 514 + 0xb8ee1605, 0x806e816e, 515 + 0x8e6e866e, 0x80786e78, 516 + 0xbef60084, 0xbef600ff, 517 + 0x01000000, 0xc0211bfa, 518 + 0x00000078, 0x80788478, 519 + 0xc0211b3a, 0x00000078, 520 + 0x80788478, 0xc0211b7a, 521 + 0x00000078, 0x80788478, 522 + 0xc0211eba, 0x00000078, 523 + 0x80788478, 0xc0211efa, 524 + 0x00000078, 0x80788478, 525 + 0xc0211c3a, 0x00000078, 526 + 0x80788478, 0xc0211c7a, 527 + 0x00000078, 0x80788478, 528 + 0xc0211a3a, 0x00000078, 529 + 0x80788478, 0xc0211a7a, 530 + 0x00000078, 0x80788478, 531 + 0xc0211cfa, 0x00000078, 532 + 0x80788478, 0xbf8cc07f, 533 + 0xbefc006f, 0xbefe007a, 534 + 0xbeff007b, 0x866f71ff, 535 + 0x000003ff, 0xb96f4803, 536 + 0x866f71ff, 0xfffff800, 537 + 0x8f6f8b6f, 0xb96fa2c3, 538 + 0xb973f801, 0xb8ee2a05, 539 + 0x806e816e, 0x8e6e8a6e, 540 + 0xb8ef1605, 0x806f816f, 541 + 0x8e6f866f, 0x806e6f6e, 542 + 0x806e746e, 0x826f8075, 543 + 0x866fff6f, 0x0000ffff, 544 + 0xc0071cb7, 0x00000040, 545 + 0xc00b1d37, 0x00000048, 546 + 0xc0031e77, 0x00000058, 547 + 0xc0071eb7, 0x0000005c, 548 + 0xbf8cc07f, 0x866fff6d, 549 + 0xf0000000, 0x8f6f9c6f, 550 + 0x8e6f906f, 0xbeee0080, 551 + 0x876e6f6e, 0x866fff6d, 552 + 0x08000000, 0x8f6f9b6f, 553 + 0x8e6f8f6f, 0x876e6f6e, 554 + 0x866fff70, 0x00800000, 555 + 0x8f6f976f, 0xb96ef807, 556 + 0x866dff6d, 0x0000ffff, 557 + 0x86fe7e7e, 
0x86ea6a6a, 558 + 0xb970f802, 0xbf8a0000, 559 + 0x95806f6c, 0xbf810000, 560 + };
+13 -261
drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
··· 20 20 * OTHER DEALINGS IN THE SOFTWARE. 21 21 */ 22 22 23 - #if 0 24 - HW (VI) source code for CWSR trap handler 25 - #Version 18 + multiple trap handler 23 + /* To compile this assembly code: 24 + * PROJECT=vi ./sp3 cwsr_trap_handler_gfx8.asm -hex tmp.hex 25 + */ 26 + 27 + /* HW (VI) source code for CWSR trap handler */ 28 + /* Version 18 + multiple trap handler */ 26 29 27 30 // this performance-optimal version was originally from Seven Xu at SRDC 28 31 ··· 101 98 /**************************************************************************/ 102 99 var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 103 100 var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 101 + var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1 104 102 var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 105 103 106 104 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 ··· 153 149 var s_save_spi_init_hi = exec_hi 154 150 155 151 //tba_lo and tba_hi need to be saved/restored 156 - var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3??h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} 152 + var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} 157 153 var s_save_pc_hi = ttmp1 158 154 var s_save_exec_lo = ttmp2 159 155 var s_save_exec_hi = ttmp3 ··· 322 318 else 323 319 s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC 324 320 end 321 + 322 + // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for. 
323 + s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT) 324 + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp 325 325 326 326 L_SLEEP: 327 327 s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 ··· 1015 1007 1016 1008 s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS 1017 1009 1018 - s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS 1019 - 1020 1010 //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: 1021 1011 if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) 1022 1012 s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) ··· 1050 1044 s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT 1051 1045 s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp 1052 1046 1047 + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS 1053 1048 s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 1054 1049 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 1055 1050 s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu ··· 1134 1127 function get_hwreg_size_bytes 1135 1128 return 128 //HWREG size 128 bytes 1136 1129 end 1137 - 1138 - 1139 - #endif 1140 - 1141 - static const uint32_t cwsr_trap_gfx8_hex[] = { 1142 - 0xbf820001, 0xbf820123, 1143 - 0xb8f4f802, 0x89748674, 1144 - 0xb8f5f803, 0x8675ff75, 1145 - 0x00000400, 0xbf850011, 1146 - 0xc00a1e37, 0x00000000, 1147 - 0xbf8c007f, 0x87777978, 1148 - 0xbf840002, 0xb974f802, 1149 - 0xbe801d78, 0xb8f5f803, 1150 - 0x8675ff75, 0x000001ff, 1151 - 
0xbf850002, 0x80708470, 1152 - 0x82718071, 0x8671ff71, 1153 - 0x0000ffff, 0xb974f802, 1154 - 0xbe801f70, 0xb8f5f803, 1155 - 0x8675ff75, 0x00000100, 1156 - 0xbf840006, 0xbefa0080, 1157 - 0xb97a0203, 0x8671ff71, 1158 - 0x0000ffff, 0x80f08870, 1159 - 0x82f18071, 0xbefa0080, 1160 - 0xb97a0283, 0xbef60068, 1161 - 0xbef70069, 0xb8fa1c07, 1162 - 0x8e7a9c7a, 0x87717a71, 1163 - 0xb8fa03c7, 0x8e7a9b7a, 1164 - 0x87717a71, 0xb8faf807, 1165 - 0x867aff7a, 0x00007fff, 1166 - 0xb97af807, 0xbef2007e, 1167 - 0xbef3007f, 0xbefe0180, 1168 - 0xbf900004, 0xbf8e0002, 1169 - 0xbf88fffe, 0xbef8007e, 1170 - 0x8679ff7f, 0x0000ffff, 1171 - 0x8779ff79, 0x00040000, 1172 - 0xbefa0080, 0xbefb00ff, 1173 - 0x00807fac, 0x867aff7f, 1174 - 0x08000000, 0x8f7a837a, 1175 - 0x877b7a7b, 0x867aff7f, 1176 - 0x70000000, 0x8f7a817a, 1177 - 0x877b7a7b, 0xbeef007c, 1178 - 0xbeee0080, 0xb8ee2a05, 1179 - 0x806e816e, 0x8e6e8a6e, 1180 - 0xb8fa1605, 0x807a817a, 1181 - 0x8e7a867a, 0x806e7a6e, 1182 - 0xbefa0084, 0xbefa00ff, 1183 - 0x01000000, 0xbefe007c, 1184 - 0xbefc006e, 0xc0611bfc, 1185 - 0x0000007c, 0x806e846e, 1186 - 0xbefc007e, 0xbefe007c, 1187 - 0xbefc006e, 0xc0611c3c, 1188 - 0x0000007c, 0x806e846e, 1189 - 0xbefc007e, 0xbefe007c, 1190 - 0xbefc006e, 0xc0611c7c, 1191 - 0x0000007c, 0x806e846e, 1192 - 0xbefc007e, 0xbefe007c, 1193 - 0xbefc006e, 0xc0611cbc, 1194 - 0x0000007c, 0x806e846e, 1195 - 0xbefc007e, 0xbefe007c, 1196 - 0xbefc006e, 0xc0611cfc, 1197 - 0x0000007c, 0x806e846e, 1198 - 0xbefc007e, 0xbefe007c, 1199 - 0xbefc006e, 0xc0611d3c, 1200 - 0x0000007c, 0x806e846e, 1201 - 0xbefc007e, 0xb8f5f803, 1202 - 0xbefe007c, 0xbefc006e, 1203 - 0xc0611d7c, 0x0000007c, 1204 - 0x806e846e, 0xbefc007e, 1205 - 0xbefe007c, 0xbefc006e, 1206 - 0xc0611dbc, 0x0000007c, 1207 - 0x806e846e, 0xbefc007e, 1208 - 0xbefe007c, 0xbefc006e, 1209 - 0xc0611dfc, 0x0000007c, 1210 - 0x806e846e, 0xbefc007e, 1211 - 0xb8eff801, 0xbefe007c, 1212 - 0xbefc006e, 0xc0611bfc, 1213 - 0x0000007c, 0x806e846e, 1214 - 0xbefc007e, 0xbefe007c, 1215 - 0xbefc006e, 
0xc0611b3c, 1216 - 0x0000007c, 0x806e846e, 1217 - 0xbefc007e, 0xbefe007c, 1218 - 0xbefc006e, 0xc0611b7c, 1219 - 0x0000007c, 0x806e846e, 1220 - 0xbefc007e, 0x867aff7f, 1221 - 0x04000000, 0xbef30080, 1222 - 0x8773737a, 0xb8ee2a05, 1223 - 0x806e816e, 0x8e6e8a6e, 1224 - 0xb8f51605, 0x80758175, 1225 - 0x8e758475, 0x8e7a8275, 1226 - 0xbefa00ff, 0x01000000, 1227 - 0xbef60178, 0x80786e78, 1228 - 0x82798079, 0xbefc0080, 1229 - 0xbe802b00, 0xbe822b02, 1230 - 0xbe842b04, 0xbe862b06, 1231 - 0xbe882b08, 0xbe8a2b0a, 1232 - 0xbe8c2b0c, 0xbe8e2b0e, 1233 - 0xc06b003c, 0x00000000, 1234 - 0xc06b013c, 0x00000010, 1235 - 0xc06b023c, 0x00000020, 1236 - 0xc06b033c, 0x00000030, 1237 - 0x8078c078, 0x82798079, 1238 - 0x807c907c, 0xbf0a757c, 1239 - 0xbf85ffeb, 0xbef80176, 1240 - 0xbeee0080, 0xbefe00c1, 1241 - 0xbeff00c1, 0xbefa00ff, 1242 - 0x01000000, 0xe0724000, 1243 - 0x6e1e0000, 0xe0724100, 1244 - 0x6e1e0100, 0xe0724200, 1245 - 0x6e1e0200, 0xe0724300, 1246 - 0x6e1e0300, 0xbefe00c1, 1247 - 0xbeff00c1, 0xb8f54306, 1248 - 0x8675c175, 0xbf84002c, 1249 - 0xbf8a0000, 0x867aff73, 1250 - 0x04000000, 0xbf840028, 1251 - 0x8e758675, 0x8e758275, 1252 - 0xbefa0075, 0xb8ee2a05, 1253 - 0x806e816e, 0x8e6e8a6e, 1254 - 0xb8fa1605, 0x807a817a, 1255 - 0x8e7a867a, 0x806e7a6e, 1256 - 0x806eff6e, 0x00000080, 1257 - 0xbefa00ff, 0x01000000, 1258 - 0xbefc0080, 0xd28c0002, 1259 - 0x000100c1, 0xd28d0003, 1260 - 0x000204c1, 0xd1060002, 1261 - 0x00011103, 0x7e0602ff, 1262 - 0x00000200, 0xbefc00ff, 1263 - 0x00010000, 0xbe80007b, 1264 - 0x867bff7b, 0xff7fffff, 1265 - 0x877bff7b, 0x00058000, 1266 - 0xd8ec0000, 0x00000002, 1267 - 0xbf8c007f, 0xe0765000, 1268 - 0x6e1e0002, 0x32040702, 1269 - 0xd0c9006a, 0x0000eb02, 1270 - 0xbf87fff7, 0xbefb0000, 1271 - 0xbeee00ff, 0x00000400, 1272 - 0xbefe00c1, 0xbeff00c1, 1273 - 0xb8f52a05, 0x80758175, 1274 - 0x8e758275, 0x8e7a8875, 1275 - 0xbefa00ff, 0x01000000, 1276 - 0xbefc0084, 0xbf0a757c, 1277 - 0xbf840015, 0xbf11017c, 1278 - 0x8075ff75, 0x00001000, 1279 - 0x7e000300, 0x7e020301, 
1280 - 0x7e040302, 0x7e060303, 1281 - 0xe0724000, 0x6e1e0000, 1282 - 0xe0724100, 0x6e1e0100, 1283 - 0xe0724200, 0x6e1e0200, 1284 - 0xe0724300, 0x6e1e0300, 1285 - 0x807c847c, 0x806eff6e, 1286 - 0x00000400, 0xbf0a757c, 1287 - 0xbf85ffef, 0xbf9c0000, 1288 - 0xbf8200ca, 0xbef8007e, 1289 - 0x8679ff7f, 0x0000ffff, 1290 - 0x8779ff79, 0x00040000, 1291 - 0xbefa0080, 0xbefb00ff, 1292 - 0x00807fac, 0x8676ff7f, 1293 - 0x08000000, 0x8f768376, 1294 - 0x877b767b, 0x8676ff7f, 1295 - 0x70000000, 0x8f768176, 1296 - 0x877b767b, 0x8676ff7f, 1297 - 0x04000000, 0xbf84001e, 1298 - 0xbefe00c1, 0xbeff00c1, 1299 - 0xb8f34306, 0x8673c173, 1300 - 0xbf840019, 0x8e738673, 1301 - 0x8e738273, 0xbefa0073, 1302 - 0xb8f22a05, 0x80728172, 1303 - 0x8e728a72, 0xb8f61605, 1304 - 0x80768176, 0x8e768676, 1305 - 0x80727672, 0x8072ff72, 1306 - 0x00000080, 0xbefa00ff, 1307 - 0x01000000, 0xbefc0080, 1308 - 0xe0510000, 0x721e0000, 1309 - 0xe0510100, 0x721e0000, 1310 - 0x807cff7c, 0x00000200, 1311 - 0x8072ff72, 0x00000200, 1312 - 0xbf0a737c, 0xbf85fff6, 1313 - 0xbef20080, 0xbefe00c1, 1314 - 0xbeff00c1, 0xb8f32a05, 1315 - 0x80738173, 0x8e738273, 1316 - 0x8e7a8873, 0xbefa00ff, 1317 - 0x01000000, 0xbef60072, 1318 - 0x8072ff72, 0x00000400, 1319 - 0xbefc0084, 0xbf11087c, 1320 - 0x8073ff73, 0x00008000, 1321 - 0xe0524000, 0x721e0000, 1322 - 0xe0524100, 0x721e0100, 1323 - 0xe0524200, 0x721e0200, 1324 - 0xe0524300, 0x721e0300, 1325 - 0xbf8c0f70, 0x7e000300, 1326 - 0x7e020301, 0x7e040302, 1327 - 0x7e060303, 0x807c847c, 1328 - 0x8072ff72, 0x00000400, 1329 - 0xbf0a737c, 0xbf85ffee, 1330 - 0xbf9c0000, 0xe0524000, 1331 - 0x761e0000, 0xe0524100, 1332 - 0x761e0100, 0xe0524200, 1333 - 0x761e0200, 0xe0524300, 1334 - 0x761e0300, 0xb8f22a05, 1335 - 0x80728172, 0x8e728a72, 1336 - 0xb8f61605, 0x80768176, 1337 - 0x8e768676, 0x80727672, 1338 - 0x80f2c072, 0xb8f31605, 1339 - 0x80738173, 0x8e738473, 1340 - 0x8e7a8273, 0xbefa00ff, 1341 - 0x01000000, 0xbefc0073, 1342 - 0xc031003c, 0x00000072, 1343 - 0x80f2c072, 0xbf8c007f, 1344 - 
0x80fc907c, 0xbe802d00, 1345 - 0xbe822d02, 0xbe842d04, 1346 - 0xbe862d06, 0xbe882d08, 1347 - 0xbe8a2d0a, 0xbe8c2d0c, 1348 - 0xbe8e2d0e, 0xbf06807c, 1349 - 0xbf84fff1, 0xb8f22a05, 1350 - 0x80728172, 0x8e728a72, 1351 - 0xb8f61605, 0x80768176, 1352 - 0x8e768676, 0x80727672, 1353 - 0xbefa0084, 0xbefa00ff, 1354 - 0x01000000, 0xc0211cfc, 1355 - 0x00000072, 0x80728472, 1356 - 0xc0211c3c, 0x00000072, 1357 - 0x80728472, 0xc0211c7c, 1358 - 0x00000072, 0x80728472, 1359 - 0xc0211bbc, 0x00000072, 1360 - 0x80728472, 0xc0211bfc, 1361 - 0x00000072, 0x80728472, 1362 - 0xc0211d3c, 0x00000072, 1363 - 0x80728472, 0xc0211d7c, 1364 - 0x00000072, 0x80728472, 1365 - 0xc0211a3c, 0x00000072, 1366 - 0x80728472, 0xc0211a7c, 1367 - 0x00000072, 0x80728472, 1368 - 0xc0211dfc, 0x00000072, 1369 - 0x80728472, 0xc0211b3c, 1370 - 0x00000072, 0x80728472, 1371 - 0xc0211b7c, 0x00000072, 1372 - 0x80728472, 0xbf8c007f, 1373 - 0x8671ff71, 0x0000ffff, 1374 - 0xbefc0073, 0xbefe006e, 1375 - 0xbeff006f, 0x867375ff, 1376 - 0x000003ff, 0xb9734803, 1377 - 0x867375ff, 0xfffff800, 1378 - 0x8f738b73, 0xb973a2c3, 1379 - 0xb977f801, 0x8673ff71, 1380 - 0xf0000000, 0x8f739c73, 1381 - 0x8e739073, 0xbef60080, 1382 - 0x87767376, 0x8673ff71, 1383 - 0x08000000, 0x8f739b73, 1384 - 0x8e738f73, 0x87767376, 1385 - 0x8673ff74, 0x00800000, 1386 - 0x8f739773, 0xb976f807, 1387 - 0x86fe7e7e, 0x86ea6a6a, 1388 - 0xb974f802, 0xbf8a0000, 1389 - 0x95807370, 0xbf810000, 1390 - }; 1391 -
+1214
drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
··· 1 + /* 2 + * Copyright 2016 Advanced Micro Devices, Inc. 3 + * 4 + * Permission is hereby granted, free of charge, to any person obtaining a 5 + * copy of this software and associated documentation files (the "Software"), 6 + * to deal in the Software without restriction, including without limitation 7 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 + * and/or sell copies of the Software, and to permit persons to whom the 9 + * Software is furnished to do so, subject to the following conditions: 10 + * 11 + * The above copyright notice and this permission notice shall be included in 12 + * all copies or substantial portions of the Software. 13 + * 14 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 + * OTHER DEALINGS IN THE SOFTWARE. 21 + */ 22 + 23 + /* To compile this assembly code: 24 + * PROJECT=greenland ./sp3 cwsr_trap_handler_gfx9.asm -hex tmp.hex 25 + */ 26 + 27 + /* HW (GFX9) source code for CWSR trap handler */ 28 + /* Version 18 + multiple trap handler */ 29 + 30 + // this performance-optimal version was originally from Seven Xu at SRDC 31 + 32 + // Revison #18 --... 33 + /* Rev History 34 + ** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) 35 + ** #4. SR Memory Layout: 36 + ** 1. VGPR-SGPR-HWREG-{LDS} 37 + ** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. 38 + ** #5. Update: 1. 
Accurate g8sr_ts_save_d timestamp 39 + ** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) 40 + ** #7. Update: 1. don't barrier if noLDS 41 + ** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version 42 + ** 2. Fix SQ issue by s_sleep 2 43 + ** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last 44 + ** 2. optimize s_buffer save by burst 16sgprs... 45 + ** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. 46 + ** #11. Update 1. Add 2 more timestamp for debug version 47 + ** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance 48 + ** #13. Integ 1. Always use MUBUF for PV trap shader... 49 + ** #14. Update 1. s_buffer_store soft clause... 50 + ** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. 51 + ** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree 52 + ** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] 53 + ** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... 54 + ** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 55 + ** 2. FUNC - Handle non-CWSR traps 56 + */ 57 + 58 + var G8SR_WDMEM_HWREG_OFFSET = 0 59 + var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes 60 + 61 + // Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. 
62 + 63 + var G8SR_DEBUG_TIMESTAMP = 0 64 + var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset 65 + var s_g8sr_ts_save_s = s[34:35] // save start 66 + var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi 67 + var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ 68 + var s_g8sr_ts_save_d = s[40:41] // save end 69 + var s_g8sr_ts_restore_s = s[42:43] // restore start 70 + var s_g8sr_ts_restore_d = s[44:45] // restore end 71 + 72 + var G8SR_VGPR_SR_IN_DWX4 = 0 73 + var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes 74 + var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 75 + 76 + 77 + /*************************************************************************/ 78 + /* control on how to run the shader */ 79 + /*************************************************************************/ 80 + //any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) 81 + var EMU_RUN_HACK = 0 82 + var EMU_RUN_HACK_RESTORE_NORMAL = 0 83 + var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 84 + var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 85 + var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK 86 + var SAVE_LDS = 1 87 + var WG_BASE_ADDR_LO = 0x9000a000 88 + var WG_BASE_ADDR_HI = 0x0 89 + var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem 90 + var CTX_SAVE_CONTROL = 0x0 91 + var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL 92 + var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) 93 + var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write 94 + var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of 
stride for MUBUF opcodes 95 + var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing 96 + var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency 97 + 98 + /**************************************************************************/ 99 + /* variables */ 100 + /**************************************************************************/ 101 + var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 102 + var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 103 + var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1 104 + var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 105 + var SQ_WAVE_STATUS_HALT_MASK = 0x2000 106 + 107 + var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 108 + var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 109 + var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 110 + var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 111 + var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 112 + var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits 113 + 114 + var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 115 + var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask 116 + var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 117 + var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 118 + var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 119 + var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF 120 + var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 121 + var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 122 + var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 123 + var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 124 + var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 125 + var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 126 + 127 + var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME 128 + var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME 129 + var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x1F8000 130 + var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME 131 + 132 + var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 133 + var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 134 + 135 + var 
TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 26 // bits [31:26] unused by SPI debug data 136 + var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0xFC000000 137 + 138 + /* Save */ 139 + var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes 140 + var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE 141 + 142 + var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit 143 + var S_SAVE_SPI_INIT_ATC_SHIFT = 27 144 + var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype 145 + var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 146 + var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG 147 + var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 148 + 149 + var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used 150 + var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME 151 + var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME 152 + var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME 153 + 154 + var s_save_spi_init_lo = exec_lo 155 + var s_save_spi_init_hi = exec_hi 156 + 157 + var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} 158 + var s_save_pc_hi = ttmp1 159 + var s_save_exec_lo = ttmp2 160 + var s_save_exec_hi = ttmp3 161 + var s_save_tmp = ttmp4 162 + var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine 163 + var s_save_xnack_mask_lo = ttmp6 164 + var s_save_xnack_mask_hi = ttmp7 165 + var s_save_buf_rsrc0 = ttmp8 166 + var s_save_buf_rsrc1 = ttmp9 167 + var s_save_buf_rsrc2 = ttmp10 168 + var s_save_buf_rsrc3 = ttmp11 169 + var s_save_status = ttmp12 170 + var s_save_mem_offset = ttmp14 171 + var s_save_alloc_size = s_save_trapsts //conflict 172 + var s_save_m0 = ttmp15 173 + var s_save_ttmps_lo = s_save_tmp //no conflict 174 + var s_save_ttmps_hi = s_save_trapsts //no conflict 175 + 176 + /* Restore */ 177 + var 
S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE 178 + var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC 179 + 180 + var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit 181 + var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 182 + var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype 183 + var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 184 + var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG 185 + var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 186 + 187 + var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT 188 + var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK 189 + var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT 190 + var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK 191 + 192 + var s_restore_spi_init_lo = exec_lo 193 + var s_restore_spi_init_hi = exec_hi 194 + 195 + var s_restore_mem_offset = ttmp12 196 + var s_restore_alloc_size = ttmp3 197 + var s_restore_tmp = ttmp2 198 + var s_restore_mem_offset_save = s_restore_tmp //no conflict 199 + 200 + var s_restore_m0 = s_restore_alloc_size //no conflict 201 + 202 + var s_restore_mode = ttmp7 203 + 204 + var s_restore_pc_lo = ttmp0 205 + var s_restore_pc_hi = ttmp1 206 + var s_restore_exec_lo = ttmp14 207 + var s_restore_exec_hi = ttmp15 208 + var s_restore_status = ttmp4 209 + var s_restore_trapsts = ttmp5 210 + var s_restore_xnack_mask_lo = xnack_mask_lo 211 + var s_restore_xnack_mask_hi = xnack_mask_hi 212 + var s_restore_buf_rsrc0 = ttmp8 213 + var s_restore_buf_rsrc1 = ttmp9 214 + var s_restore_buf_rsrc2 = ttmp10 215 + var s_restore_buf_rsrc3 = ttmp11 216 + var s_restore_ttmps_lo = s_restore_tmp //no conflict 217 + var s_restore_ttmps_hi = s_restore_alloc_size //no conflict 218 + 219 + /**************************************************************************/ 220 + /* trap handler entry points */ 221 + /**************************************************************************/ 222 + /* Shader Main*/ 
223 + 224 + shader main 225 + asic(GFX9) 226 + type(CS) 227 + 228 + 229 + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore 230 + //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC 231 + s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC 232 + s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. 233 + s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE 234 + //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE 235 + s_branch L_SKIP_RESTORE //NOT restore, SAVE actually 236 + else 237 + s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save 238 + end 239 + 240 + L_JUMP_TO_RESTORE: 241 + s_branch L_RESTORE //restore 242 + 243 + L_SKIP_RESTORE: 244 + 245 + s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC 246 + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save 247 + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) 248 + s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save 249 + s_cbranch_scc1 L_SAVE //this is the operation for save 250 + 251 + // ********* Handle non-CWSR traps ******************* 252 + if (!EMU_RUN_HACK) 253 + // Illegal instruction is a non-maskable exception which blocks context save. 254 + // Halt the wavefront and return from the trap. 255 + s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK 256 + s_cbranch_scc1 L_HALT_WAVE 257 + 258 + // If STATUS.MEM_VIOL is asserted then we cannot fetch from the TMA. 259 + // Instead, halt the wavefront and return from the trap. 
260 + s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK 261 + s_cbranch_scc0 L_FETCH_2ND_TRAP 262 + 263 + L_HALT_WAVE: 264 + // If STATUS.HALT is set then this fault must come from SQC instruction fetch. 265 + // We cannot prevent further faults so just terminate the wavefront. 266 + s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK 267 + s_cbranch_scc0 L_NOT_ALREADY_HALTED 268 + s_endpgm 269 + L_NOT_ALREADY_HALTED: 270 + s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK 271 + 272 + // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set. 273 + // Rewind the PC to prevent this from occurring. The debugger compensates for this. 274 + s_sub_u32 ttmp0, ttmp0, 0x8 275 + s_subb_u32 ttmp1, ttmp1, 0x0 276 + 277 + L_FETCH_2ND_TRAP: 278 + // Preserve and clear scalar XNACK state before issuing scalar reads. 279 + // Save IB_STS.FIRST_REPLAY[15] and IB_STS.RCNT[20:16] into unused space ttmp11[31:26]. 280 + s_getreg_b32 ttmp2, hwreg(HW_REG_IB_STS) 281 + s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK 282 + s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) 283 + s_andn2_b32 ttmp11, ttmp11, TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK 284 + s_or_b32 ttmp11, ttmp11, ttmp3 285 + 286 + s_andn2_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK 287 + s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 288 + 289 + // Read second-level TBA/TMA from first-level TMA and jump if available. 
290 + // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) 291 + // ttmp12 holds SQ_WAVE_STATUS 292 + s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_SHADER_TMA_LO) 293 + s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_SHADER_TMA_HI) 294 + s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 295 + s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA 296 + s_waitcnt lgkmcnt(0) 297 + s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA 298 + s_waitcnt lgkmcnt(0) 299 + s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] 300 + s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set 301 + s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler 302 + 303 + L_NO_NEXT_TRAP: 304 + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) 305 + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception 306 + s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. 307 + s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 308 + s_addc_u32 ttmp1, ttmp1, 0 309 + L_EXCP_CASE: 310 + s_and_b32 ttmp1, ttmp1, 0xFFFF 311 + 312 + // Restore SQ_WAVE_IB_STS. 313 + s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) 314 + s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK 315 + s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 316 + 317 + // Restore SQ_WAVE_STATUS. 
318 + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 319 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 320 + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status 321 + 322 + s_rfe_b64 [ttmp0, ttmp1] 323 + end 324 + // ********* End handling of non-CWSR traps ******************* 325 + 326 + /**************************************************************************/ 327 + /* save routine */ 328 + /**************************************************************************/ 329 + 330 + L_SAVE: 331 + 332 + if G8SR_DEBUG_TIMESTAMP 333 + s_memrealtime s_g8sr_ts_save_s 334 + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? 335 + end 336 + 337 + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] 338 + 339 + s_mov_b32 s_save_tmp, 0 //clear saveCtx bit 340 + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit 341 + 342 + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT 343 + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT 344 + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp 345 + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY 346 + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT 347 + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp 348 + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS 349 + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG 350 + 351 + s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp 352 + 353 + /* inform SPI the readiness and wait for SPI's go signal */ 354 + s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI 355 + s_mov_b32 s_save_exec_hi, exec_hi 356 + s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive 357 + 358 + if G8SR_DEBUG_TIMESTAMP 359 + s_memrealtime 
s_g8sr_ts_sq_save_msg 360 + s_waitcnt lgkmcnt(0) 361 + end 362 + 363 + if (EMU_RUN_HACK) 364 + 365 + else 366 + s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC 367 + end 368 + 369 + // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for. 370 + s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT) 371 + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp 372 + 373 + L_SLEEP: 374 + s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 375 + 376 + if (EMU_RUN_HACK) 377 + 378 + else 379 + s_cbranch_execz L_SLEEP 380 + end 381 + 382 + if G8SR_DEBUG_TIMESTAMP 383 + s_memrealtime s_g8sr_ts_spi_wrexec 384 + s_waitcnt lgkmcnt(0) 385 + end 386 + 387 + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) 388 + //calculate wd_addr using absolute thread id 389 + v_readlane_b32 s_save_tmp, v9, 0 390 + s_lshr_b32 s_save_tmp, s_save_tmp, 6 391 + s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE 392 + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO 393 + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI 394 + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL 395 + else 396 + end 397 + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) 398 + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO 399 + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI 400 + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL 401 + else 402 + end 403 + 404 + // Save trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic 405 + // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 406 + get_vgpr_size_bytes(s_save_ttmps_lo) 407 + get_sgpr_size_bytes(s_save_ttmps_hi) 408 + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi 409 + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo 410 + 
s_addc_u32 s_save_ttmps_hi, s_save_spi_init_hi, 0x0
411 + s_and_b32 s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF
412 + s_store_dwordx2 [ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x40 glc:1
413 + ack_sqc_store_workaround()
414 + s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x48 glc:1
415 + ack_sqc_store_workaround()
416 + s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x58 glc:1
417 + ack_sqc_store_workaround()
418 + s_store_dwordx2 [ttmp14, ttmp15], [s_save_ttmps_lo, s_save_ttmps_hi], 0x5C glc:1
419 + ack_sqc_store_workaround()
420 +
421 + /* setup Resource Constants */
422 + s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
423 + s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
424 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
425 + s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not necessarily initialized
426 + s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
427 + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
428 + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
429 + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
430 + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
431 + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
432 + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
433 +
434 + //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
435 + s_mov_b32 s_save_m0, m0 //save M0 436 + 437 + /* global mem offset */ 438 + s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 439 + 440 + 441 + 442 + 443 + /* save HW registers */ 444 + ////////////////////////////// 445 + 446 + L_SAVE_HWREG: 447 + // HWREG SR memory offset : size(VGPR)+size(SGPR) 448 + get_vgpr_size_bytes(s_save_mem_offset) 449 + get_sgpr_size_bytes(s_save_tmp) 450 + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp 451 + 452 + 453 + s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes 454 + if (SWIZZLE_EN) 455 + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 456 + else 457 + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 458 + end 459 + 460 + 461 + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 462 + 463 + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) 464 + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 465 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over 466 + end 467 + 468 + write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC 469 + write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) 470 + write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC 471 + write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) 472 + write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS 473 + 474 + //s_save_trapsts conflicts with s_save_alloc_size 475 + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) 476 + write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS 477 + 478 + write_hwreg_to_mem(xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO 479 + write_hwreg_to_mem(xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI 480 + 481 + //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 482 + s_getreg_b32 s_save_m0, 
hwreg(HW_REG_MODE) //MODE
483 + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
484 +
485 +
486 +
487 + /* the first wave in the threadgroup */
488 + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract first wave bit
489 + s_mov_b32 s_save_exec_hi, 0x0
490 + s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26]
491 +
492 +
493 + /* save SGPRs */
494 + // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
495 + //////////////////////////////
496 +
497 + // SGPR SR memory offset : size(VGPR)
498 + get_vgpr_size_bytes(s_save_mem_offset)
499 + // TODO, change RSRC word to rearrange memory layout for SGPRS
500 +
501 + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //sgpr_size
502 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
503 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
504 +
505 + if (SGPR_SAVE_USE_SQC)
506 + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes
507 + else
508 + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
509 + end
510 +
511 + if (SWIZZLE_EN)
512 + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
513 + else 514 + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 515 + end 516 + 517 + 518 + // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 519 + //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 520 + s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 521 + s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset 522 + s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 523 + 524 + s_mov_b32 m0, 0x0 //SGPR initial index value =0 525 + s_nop 0x0 //Manually inserted wait states 526 + L_SAVE_SGPR_LOOP: 527 + // SGPR is allocated in 16 SGPR granularity 528 + s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] 529 + s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] 530 + s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] 531 + s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] 532 + s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] 533 + s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] 534 + s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] 535 + s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] 536 + 537 + write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 538 + s_add_u32 m0, m0, 16 //next sgpr index 539 + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 540 + s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? 541 + // restore s_save_buf_rsrc0,1 542 + //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo 543 + s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo 544 + 545 + 546 + 547 + 548 + /* save first 4 VGPR, then LDS save could use */ 549 + // each wave will alloc 4 vgprs at least... 
550 + ///////////////////////////////////////////////////////////////////////////////////// 551 + 552 + s_mov_b32 s_save_mem_offset, 0 553 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on 554 + s_mov_b32 exec_hi, 0xFFFFFFFF 555 + s_mov_b32 xnack_mask_lo, 0x0 556 + s_mov_b32 xnack_mask_hi, 0x0 557 + 558 + if (SWIZZLE_EN) 559 + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 560 + else 561 + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 562 + end 563 + 564 + 565 + // VGPR Allocated in 4-GPR granularity 566 + 567 + if G8SR_VGPR_SR_IN_DWX4 568 + // the const stride for DWx4 is 4*4 bytes 569 + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 570 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes 571 + 572 + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 573 + 574 + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 575 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes 576 + else 577 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 578 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 579 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 580 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 581 + end 582 + 583 + 584 + 585 + /* save LDS */ 586 + ////////////////////////////// 587 + 588 + L_SAVE_LDS: 589 + 590 + // Change EXEC to all threads... 
591 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on 592 + s_mov_b32 exec_hi, 0xFFFFFFFF 593 + 594 + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size 595 + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? 596 + s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE 597 + 598 + s_barrier //LDS is used? wait for other waves in the same TG 599 + s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here 600 + s_cbranch_scc0 L_SAVE_LDS_DONE 601 + 602 + // first wave do LDS save; 603 + 604 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw 605 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes 606 + s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes 607 + 608 + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) 609 + // 610 + get_vgpr_size_bytes(s_save_mem_offset) 611 + get_sgpr_size_bytes(s_save_tmp) 612 + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp 613 + s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() 614 + 615 + 616 + if (SWIZZLE_EN) 617 + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 618 + else 619 + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 620 + end 621 + 622 + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 623 + 624 + 625 + var LDS_DMA_ENABLE = 0 626 + var UNROLL = 0 627 + if UNROLL==0 && LDS_DMA_ENABLE==1 628 + s_mov_b32 s3, 256*2 629 + s_nop 0 630 + s_nop 0 631 + s_nop 0 632 + L_SAVE_LDS_LOOP: 633 + //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? 
634 + if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity
635 + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW
636 + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
637 + end
638 +
639 + s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
640 + s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes
641 + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
642 + s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete?
643 +
644 + elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROLL, has icache miss
645 + // store from highest LDS address to lowest
646 + s_mov_b32 s3, 256*2
647 + s_sub_u32 m0, s_save_alloc_size, s3
648 + s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
649 + s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 chunks...
650 + s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from highest addr to lowest
651 + s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction
652 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2 is the below 2 inst... //s_addc and s_setpc
653 + s_nop 0
654 + s_nop 0
655 + s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes
656 + s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved
657 + s_add_u32 s0, s0,s_save_alloc_size
658 + s_addc_u32 s1, s1, 0
659 + s_setpc_b64 s[0:1]
660 +
661 +
662 + for var i =0; i< 128; i++
663 + // be careful to make here a 64Byte aligned address, which could improve performance...
664 + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW
665 + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
666 +
667 + if i!=127
668 + s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e.
pack more LDS_DMA inst to one Cacheline 669 + s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 670 + end 671 + end 672 + 673 + else // BUFFER_STORE 674 + v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 675 + v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid 676 + v_mul_i32_i24 v2, v3, 8 // tid*8 677 + v_mov_b32 v3, 256*2 678 + s_mov_b32 m0, 0x10000 679 + s_mov_b32 s0, s_save_buf_rsrc3 680 + s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid 681 + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT 682 + 683 + L_SAVE_LDS_LOOP_VECTOR: 684 + ds_read_b64 v[0:1], v2 //x =LDS[a], byte address 685 + s_waitcnt lgkmcnt(0) 686 + buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 687 + // s_waitcnt vmcnt(0) 688 + // v_add_u32 v2, vcc[0:1], v2, v3 689 + v_add_u32 v2, v2, v3 690 + v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size 691 + s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR 692 + 693 + // restore rsrc3 694 + s_mov_b32 s_save_buf_rsrc3, s0 695 + 696 + end 697 + 698 + L_SAVE_LDS_DONE: 699 + 700 + 701 + /* save VGPRs - set the Rest VGPRs */ 702 + ////////////////////////////////////////////////////////////////////////////////////// 703 + L_SAVE_VGPR: 704 + // VGPR SR memory offset: 0 705 + // TODO rearrange the RSRC words to use swizzle for VGPR save... 
706 + 707 + s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs 708 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on 709 + s_mov_b32 exec_hi, 0xFFFFFFFF 710 + 711 + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size 712 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 713 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible 714 + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) 715 + if (SWIZZLE_EN) 716 + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 717 + else 718 + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 719 + end 720 + 721 + 722 + // VGPR Allocated in 4-GPR granularity 723 + 724 + if G8SR_VGPR_SR_IN_DWX4 725 + // the const stride for DWx4 is 4*4 bytes 726 + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 727 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes 728 + 729 + s_mov_b32 m0, 4 // skip first 4 VGPRs 730 + s_cmp_lt_u32 m0, s_save_alloc_size 731 + s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs 732 + 733 + s_set_gpr_idx_on m0, 0x1 // This will change M0 734 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 735 + L_SAVE_VGPR_LOOP: 736 + v_mov_b32 v0, v0 // v0 = v[0+m0] 737 + v_mov_b32 v1, v1 738 + v_mov_b32 v2, v2 739 + v_mov_b32 v3, v3 740 + 741 + 742 + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 743 + s_add_u32 m0, m0, 4 744 + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 745 + s_cmp_lt_u32 m0, s_save_alloc_size 746 + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? 
747 + s_set_gpr_idx_off 748 + L_SAVE_VGPR_LOOP_END: 749 + 750 + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 751 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes 752 + else 753 + // VGPR store using dw burst 754 + s_mov_b32 m0, 0x4 //VGPR initial index value =0 755 + s_cmp_lt_u32 m0, s_save_alloc_size 756 + s_cbranch_scc0 L_SAVE_VGPR_END 757 + 758 + 759 + s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 760 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later 761 + 762 + L_SAVE_VGPR_LOOP: 763 + v_mov_b32 v0, v0 //v0 = v[0+m0] 764 + v_mov_b32 v1, v1 //v0 = v[0+m0] 765 + v_mov_b32 v2, v2 //v0 = v[0+m0] 766 + v_mov_b32 v3, v3 //v0 = v[0+m0] 767 + 768 + if(USE_MTBUF_INSTEAD_OF_MUBUF) 769 + tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 770 + else 771 + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 772 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 773 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 774 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 775 + end 776 + 777 + s_add_u32 m0, m0, 4 //next vgpr index 778 + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes 779 + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 780 + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? 
781 + s_set_gpr_idx_off 782 + end 783 + 784 + L_SAVE_VGPR_END: 785 + 786 + 787 + 788 + 789 + 790 + 791 + /* S_PGM_END_SAVED */ //FIXME graphics ONLY 792 + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) 793 + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] 794 + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 795 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over 796 + s_rfe_b64 s_save_pc_lo //Return to the main shader program 797 + else 798 + end 799 + 800 + // Save Done timestamp 801 + if G8SR_DEBUG_TIMESTAMP 802 + s_memrealtime s_g8sr_ts_save_d 803 + // SGPR SR memory offset : size(VGPR) 804 + get_vgpr_size_bytes(s_save_mem_offset) 805 + s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET 806 + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? 807 + // Need reset rsrc2?? 808 + s_mov_b32 m0, s_save_mem_offset 809 + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 810 + s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 811 + end 812 + 813 + 814 + s_branch L_END_PGM 815 + 816 + 817 + 818 + /**************************************************************************/ 819 + /* restore routine */ 820 + /**************************************************************************/ 821 + 822 + L_RESTORE: 823 + /* Setup Resource Contants */ 824 + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) 825 + //calculate wd_addr using absolute thread id 826 + v_readlane_b32 s_restore_tmp, v9, 0 827 + s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 828 + s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE 829 + s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO 830 + s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI 831 + s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL 832 + else 833 + end 834 + 835 + if G8SR_DEBUG_TIMESTAMP 836 + s_memrealtime s_g8sr_ts_restore_s 837 + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? 
838 + // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... 839 + s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] 840 + s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. 841 + end 842 + 843 + 844 + 845 + s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo 846 + s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi 847 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE 848 + s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) 849 + s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC 850 + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK 851 + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position 852 + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC 853 + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK 854 + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position 855 + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE 856 + 857 + /* global mem offset */ 858 + // s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 859 + 860 + /* the first wave in the threadgroup */ 861 + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK 862 + s_cbranch_scc0 L_RESTORE_VGPR 863 + 864 + /* restore LDS */ 865 + ////////////////////////////// 866 + L_RESTORE_LDS: 867 + 868 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead 869 + s_mov_b32 exec_hi, 0xFFFFFFFF 870 + 871 + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size 872 + s_and_b32 
s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? 873 + s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR 874 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw 875 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes 876 + s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes 877 + 878 + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) 879 + // 880 + get_vgpr_size_bytes(s_restore_mem_offset) 881 + get_sgpr_size_bytes(s_restore_tmp) 882 + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp 883 + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? 884 + 885 + 886 + if (SWIZZLE_EN) 887 + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 888 + else 889 + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 890 + end 891 + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 892 + 893 + L_RESTORE_LDS_LOOP: 894 + if (SAVE_LDS) 895 + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW 896 + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW 897 + end 898 + s_add_u32 m0, m0, 256*2 // 128 DW 899 + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW 900 + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 901 + s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? 
902 + 903 + 904 + /* restore VGPRs */ 905 + ////////////////////////////// 906 + L_RESTORE_VGPR: 907 + // VGPR SR memory offset : 0 908 + s_mov_b32 s_restore_mem_offset, 0x0 909 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead 910 + s_mov_b32 exec_hi, 0xFFFFFFFF 911 + 912 + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size 913 + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 914 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) 915 + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) 916 + if (SWIZZLE_EN) 917 + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 918 + else 919 + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 920 + end 921 + 922 + if G8SR_VGPR_SR_IN_DWX4 923 + get_vgpr_size_bytes(s_restore_mem_offset) 924 + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 925 + 926 + // the const stride for DWx4 is 4*4 bytes 927 + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 928 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes 929 + 930 + s_mov_b32 m0, s_restore_alloc_size 931 + s_set_gpr_idx_on m0, 0x8 // Note.. 
This will change m0 932 + 933 + L_RESTORE_VGPR_LOOP: 934 + buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 935 + s_waitcnt vmcnt(0) 936 + s_sub_u32 m0, m0, 4 937 + v_mov_b32 v0, v0 // v[0+m0] = v0 938 + v_mov_b32 v1, v1 939 + v_mov_b32 v2, v2 940 + v_mov_b32 v3, v3 941 + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 942 + s_cmp_eq_u32 m0, 0x8000 943 + s_cbranch_scc0 L_RESTORE_VGPR_LOOP 944 + s_set_gpr_idx_off 945 + 946 + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 947 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes 948 + 949 + else 950 + // VGPR load using dw burst 951 + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last 952 + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 953 + s_mov_b32 m0, 4 //VGPR initial index value = 1 954 + s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 955 + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later 956 + 957 + L_RESTORE_VGPR_LOOP: 958 + if(USE_MTBUF_INSTEAD_OF_MUBUF) 959 + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 960 + else 961 + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 962 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 963 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 964 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 965 + end 966 + s_waitcnt vmcnt(0) //ensure data ready 967 + v_mov_b32 v0, v0 //v[0+m0] = v0 968 + v_mov_b32 v1, v1 969 + v_mov_b32 v2, v2 970 + v_mov_b32 v3, v3 971 + s_add_u32 m0, m0, 4 //next vgpr index 972 + s_add_u32 s_restore_mem_offset, 
s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes 973 + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 974 + s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 975 + s_set_gpr_idx_off 976 + /* VGPR restore on v0 */ 977 + if(USE_MTBUF_INSTEAD_OF_MUBUF) 978 + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 979 + else 980 + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 981 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 982 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 983 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 984 + end 985 + 986 + end 987 + 988 + /* restore SGPRs */ 989 + ////////////////////////////// 990 + 991 + // SGPR SR memory offset : size(VGPR) 992 + get_vgpr_size_bytes(s_restore_mem_offset) 993 + get_sgpr_size_bytes(s_restore_tmp) 994 + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp 995 + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group 996 + // TODO, change RSRC word to rearrange memory layout for SGPRS 997 + 998 + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size 999 + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 1000 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) 1001 + 1002 + if (SGPR_SAVE_USE_SQC) 1003 + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes 1004 + else 1005 + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) 1006 + end 1007 + if (SWIZZLE_EN) 1008 + 
s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 1009 + else 1010 + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 1011 + end 1012 + 1013 + s_mov_b32 m0, s_restore_alloc_size 1014 + 1015 + L_RESTORE_SGPR_LOOP: 1016 + read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made 1017 + s_waitcnt lgkmcnt(0) //ensure data ready 1018 + 1019 + s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] 1020 + s_nop 0 // hazard SALU M0=> S_MOVREL 1021 + 1022 + s_movreld_b64 s0, s0 //s[0+m0] = s0 1023 + s_movreld_b64 s2, s2 1024 + s_movreld_b64 s4, s4 1025 + s_movreld_b64 s6, s6 1026 + s_movreld_b64 s8, s8 1027 + s_movreld_b64 s10, s10 1028 + s_movreld_b64 s12, s12 1029 + s_movreld_b64 s14, s14 1030 + 1031 + s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 1032 + s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? 1033 + 1034 + /* restore HW registers */ 1035 + ////////////////////////////// 1036 + L_RESTORE_HWREG: 1037 + 1038 + 1039 + if G8SR_DEBUG_TIMESTAMP 1040 + s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo 1041 + s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi 1042 + end 1043 + 1044 + // HWREG SR memory offset : size(VGPR)+size(SGPR) 1045 + get_vgpr_size_bytes(s_restore_mem_offset) 1046 + get_sgpr_size_bytes(s_restore_tmp) 1047 + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp 1048 + 1049 + 1050 + s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes 1051 + if (SWIZZLE_EN) 1052 + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
1053 + else 1054 + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes 1055 + end 1056 + 1057 + read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 1058 + read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC 1059 + read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) 1060 + read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC 1061 + read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) 1062 + read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS 1063 + read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS 1064 + read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO 1065 + read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI 1066 + read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE 1067 + 1068 + s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS 1069 + 1070 + //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: 1071 + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) 1072 + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) 1073 + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over 1074 + end 1075 + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) 1076 + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal 1077 + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over 1078 + end 1079 + 1080 + s_mov_b32 m0, s_restore_m0 1081 + s_mov_b32 exec_lo, s_restore_exec_lo 1082 + s_mov_b32 exec_hi, s_restore_exec_hi 1083 + 1084 + s_and_b32 s_restore_m0, 
SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts 1085 + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 1086 + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts 1087 + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT 1088 + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 1089 + //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore 1090 + s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode 1091 + 1092 + // Restore trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic 1093 + // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 1094 + get_vgpr_size_bytes(s_restore_ttmps_lo) 1095 + get_sgpr_size_bytes(s_restore_ttmps_hi) 1096 + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi 1097 + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 1098 + s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 1099 + s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF 1100 + s_load_dwordx2 [ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x40 glc:1 1101 + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x48 glc:1 1102 + s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x58 glc:1 1103 + s_load_dwordx2 [ttmp14, ttmp15], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x5C glc:1 1104 + s_waitcnt lgkmcnt(0) 1105 + 1106 + //reuse s_restore_m0 as a temp register 1107 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK 1108 + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT 1109 + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT 1110 + s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero 1111 + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 1112 + s_and_b32 
s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK 1113 + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT 1114 + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT 1115 + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 1116 + s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK 1117 + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT 1118 + s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp 1119 + 1120 + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS 1121 + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 1122 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 1123 + s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu 1124 + 1125 + s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time 1126 + 1127 + if G8SR_DEBUG_TIMESTAMP 1128 + s_memrealtime s_g8sr_ts_restore_d 1129 + s_waitcnt lgkmcnt(0) 1130 + end 1131 + 1132 + // s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution 1133 + s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc 1134 + 1135 + 1136 + /**************************************************************************/ 1137 + /* the END */ 1138 + /**************************************************************************/ 1139 + L_END_PGM: 1140 + s_endpgm 1141 + 1142 + end 1143 + 1144 + 1145 + /**************************************************************************/ 1146 + /* the helper functions */ 1147 + /**************************************************************************/ 1148 + 1149 + //Only for save hwreg to mem 1150 + function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) 1151 + s_mov_b32 exec_lo, m0 
//assuming exec_lo is not needed anymore from this point on 1152 + s_mov_b32 m0, s_mem_offset 1153 + s_buffer_store_dword s, s_rsrc, m0 glc:1 1154 + ack_sqc_store_workaround() 1155 + s_add_u32 s_mem_offset, s_mem_offset, 4 1156 + s_mov_b32 m0, exec_lo 1157 + end 1158 + 1159 + 1160 + // HWREG are saved before SGPRs, so all HWREG could be use. 1161 + function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) 1162 + 1163 + s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 1164 + ack_sqc_store_workaround() 1165 + s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 1166 + ack_sqc_store_workaround() 1167 + s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 1168 + ack_sqc_store_workaround() 1169 + s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 1170 + ack_sqc_store_workaround() 1171 + s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 1172 + s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc 1173 + end 1174 + 1175 + 1176 + function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) 1177 + s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 1178 + s_add_u32 s_mem_offset, s_mem_offset, 4 1179 + end 1180 + 1181 + function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) 1182 + s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 1183 + s_sub_u32 s_mem_offset, s_mem_offset, 4*16 1184 + end 1185 + 1186 + 1187 + 1188 + function get_lds_size_bytes(s_lds_size_byte) 1189 + // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW 1190 + s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size 1191 + s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW 1192 + end 1193 + 1194 + function get_vgpr_size_bytes(s_vgpr_size_byte) 1195 + s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size 1196 + s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 1197 + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) 
//Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible 1198 + end 1199 + 1200 + function get_sgpr_size_bytes(s_sgpr_size_byte) 1201 + s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size 1202 + s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 1203 + s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) 1204 + end 1205 + 1206 + function get_hwreg_size_bytes 1207 + return 128 //HWREG size 128 bytes 1208 + end 1209 + 1210 + function ack_sqc_store_workaround 1211 + if ACK_SQC_STORE 1212 + s_waitcnt lgkmcnt(0) 1213 + end 1214 + end
+35 -17
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
··· 233 233 pr_debug("Queue Size: 0x%llX, %u\n", 234 234 q_properties->queue_size, args->ring_size); 235 235 236 - pr_debug("Queue r/w Pointers: %p, %p\n", 236 + pr_debug("Queue r/w Pointers: %px, %px\n", 237 237 q_properties->read_ptr, 238 238 q_properties->write_ptr); 239 239 ··· 292 292 293 293 294 294 /* Return gpu_id as doorbell offset for mmap usage */ 295 - args->doorbell_offset = (KFD_MMAP_DOORBELL_MASK | args->gpu_id); 295 + args->doorbell_offset = KFD_MMAP_TYPE_DOORBELL; 296 + args->doorbell_offset |= KFD_MMAP_GPU_ID(args->gpu_id); 296 297 args->doorbell_offset <<= PAGE_SHIFT; 298 + if (KFD_IS_SOC15(dev->device_info->asic_family)) 299 + /* On SOC15 ASICs, doorbell allocation must be 300 + * per-device, and independent from the per-process 301 + * queue_id. Return the doorbell offset within the 302 + * doorbell aperture to user mode. 303 + */ 304 + args->doorbell_offset |= q_properties.doorbell_off; 297 305 298 306 mutex_unlock(&p->mutex); 299 307 ··· 1304 1296 return -EINVAL; 1305 1297 } 1306 1298 1307 - devices_arr = kmalloc(args->n_devices * sizeof(*devices_arr), 1308 - GFP_KERNEL); 1299 + devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr), 1300 + GFP_KERNEL); 1309 1301 if (!devices_arr) 1310 1302 return -ENOMEM; 1311 1303 ··· 1413 1405 return -EINVAL; 1414 1406 } 1415 1407 1416 - devices_arr = kmalloc(args->n_devices * sizeof(*devices_arr), 1417 - GFP_KERNEL); 1408 + devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr), 1409 + GFP_KERNEL); 1418 1410 if (!devices_arr) 1419 1411 return -ENOMEM; 1420 1412 ··· 1653 1645 static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) 1654 1646 { 1655 1647 struct kfd_process *process; 1648 + struct kfd_dev *dev = NULL; 1649 + unsigned long vm_pgoff; 1650 + unsigned int gpu_id; 1656 1651 1657 1652 process = kfd_get_process(current); 1658 1653 if (IS_ERR(process)) 1659 1654 return PTR_ERR(process); 1660 1655 1661 - if ((vma->vm_pgoff & KFD_MMAP_DOORBELL_MASK) == 1662 - 
KFD_MMAP_DOORBELL_MASK) { 1663 - vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_DOORBELL_MASK; 1664 - return kfd_doorbell_mmap(process, vma); 1665 - } else if ((vma->vm_pgoff & KFD_MMAP_EVENTS_MASK) == 1666 - KFD_MMAP_EVENTS_MASK) { 1667 - vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK; 1656 + vm_pgoff = vma->vm_pgoff; 1657 + vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vm_pgoff); 1658 + gpu_id = KFD_MMAP_GPU_ID_GET(vm_pgoff); 1659 + if (gpu_id) 1660 + dev = kfd_device_by_id(gpu_id); 1661 + 1662 + switch (vm_pgoff & KFD_MMAP_TYPE_MASK) { 1663 + case KFD_MMAP_TYPE_DOORBELL: 1664 + if (!dev) 1665 + return -ENODEV; 1666 + return kfd_doorbell_mmap(dev, process, vma); 1667 + 1668 + case KFD_MMAP_TYPE_EVENTS: 1668 1669 return kfd_event_mmap(process, vma); 1669 - } else if ((vma->vm_pgoff & KFD_MMAP_RESERVED_MEM_MASK) == 1670 - KFD_MMAP_RESERVED_MEM_MASK) { 1671 - vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_RESERVED_MEM_MASK; 1672 - return kfd_reserved_mem_mmap(process, vma); 1670 + 1671 + case KFD_MMAP_TYPE_RESERVED_MEM: 1672 + if (!dev) 1673 + return -ENODEV; 1674 + return kfd_reserved_mem_mmap(dev, process, vma); 1673 1675 } 1674 1676 1675 1677 return -EFAULT;
+11
drivers/gpu/drm/amd/amdkfd/kfd_crat.c
··· 132 132 #define fiji_cache_info carrizo_cache_info 133 133 #define polaris10_cache_info carrizo_cache_info 134 134 #define polaris11_cache_info carrizo_cache_info 135 + /* TODO - check & update Vega10 cache details */ 136 + #define vega10_cache_info carrizo_cache_info 137 + #define raven_cache_info carrizo_cache_info 135 138 136 139 static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, 137 140 struct crat_subtype_computeunit *cu) ··· 605 602 case CHIP_POLARIS11: 606 603 pcache_info = polaris11_cache_info; 607 604 num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); 605 + break; 606 + case CHIP_VEGA10: 607 + pcache_info = vega10_cache_info; 608 + num_of_cache_types = ARRAY_SIZE(vega10_cache_info); 609 + break; 610 + case CHIP_RAVEN: 611 + pcache_info = raven_cache_info; 612 + num_of_cache_types = ARRAY_SIZE(raven_cache_info); 608 613 break; 609 614 default: 610 615 return -EINVAL;
+109 -22
drivers/gpu/drm/amd/amdkfd/kfd_device.c
··· 20 20 * OTHER DEALINGS IN THE SOFTWARE. 21 21 */ 22 22 23 - #if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) 24 - #include <linux/amd-iommu.h> 25 - #endif 26 23 #include <linux/bsearch.h> 27 24 #include <linux/pci.h> 28 25 #include <linux/slab.h> 29 26 #include "kfd_priv.h" 30 27 #include "kfd_device_queue_manager.h" 31 28 #include "kfd_pm4_headers_vi.h" 32 - #include "cwsr_trap_handler_gfx8.asm" 29 + #include "cwsr_trap_handler.h" 33 30 #include "kfd_iommu.h" 34 31 35 32 #define MQD_SIZE_ALIGNED 768 ··· 38 41 .max_pasid_bits = 16, 39 42 /* max num of queues for KV.TODO should be a dynamic value */ 40 43 .max_no_of_hqd = 24, 44 + .doorbell_size = 4, 41 45 .ih_ring_entry_size = 4 * sizeof(uint32_t), 42 46 .event_interrupt_class = &event_interrupt_class_cik, 43 47 .num_of_watch_points = 4, ··· 53 55 .max_pasid_bits = 16, 54 56 /* max num of queues for CZ.TODO should be a dynamic value */ 55 57 .max_no_of_hqd = 24, 58 + .doorbell_size = 4, 56 59 .ih_ring_entry_size = 4 * sizeof(uint32_t), 57 60 .event_interrupt_class = &event_interrupt_class_cik, 58 61 .num_of_watch_points = 4, ··· 69 70 .max_pasid_bits = 16, 70 71 /* max num of queues for KV.TODO should be a dynamic value */ 71 72 .max_no_of_hqd = 24, 73 + .doorbell_size = 4, 72 74 .ih_ring_entry_size = 4 * sizeof(uint32_t), 73 75 .event_interrupt_class = &event_interrupt_class_cik, 74 76 .num_of_watch_points = 4, ··· 83 83 .asic_family = CHIP_TONGA, 84 84 .max_pasid_bits = 16, 85 85 .max_no_of_hqd = 24, 86 + .doorbell_size = 4, 86 87 .ih_ring_entry_size = 4 * sizeof(uint32_t), 87 88 .event_interrupt_class = &event_interrupt_class_cik, 88 89 .num_of_watch_points = 4, ··· 97 96 .asic_family = CHIP_TONGA, 98 97 .max_pasid_bits = 16, 99 98 .max_no_of_hqd = 24, 99 + .doorbell_size = 4, 100 100 .ih_ring_entry_size = 4 * sizeof(uint32_t), 101 101 .event_interrupt_class = &event_interrupt_class_cik, 102 102 .num_of_watch_points = 4, ··· 111 109 .asic_family = CHIP_FIJI, 112 110 .max_pasid_bits = 
16, 113 111 .max_no_of_hqd = 24, 112 + .doorbell_size = 4, 114 113 .ih_ring_entry_size = 4 * sizeof(uint32_t), 115 114 .event_interrupt_class = &event_interrupt_class_cik, 116 115 .num_of_watch_points = 4, ··· 125 122 .asic_family = CHIP_FIJI, 126 123 .max_pasid_bits = 16, 127 124 .max_no_of_hqd = 24, 125 + .doorbell_size = 4, 128 126 .ih_ring_entry_size = 4 * sizeof(uint32_t), 129 127 .event_interrupt_class = &event_interrupt_class_cik, 130 128 .num_of_watch_points = 4, ··· 140 136 .asic_family = CHIP_POLARIS10, 141 137 .max_pasid_bits = 16, 142 138 .max_no_of_hqd = 24, 139 + .doorbell_size = 4, 143 140 .ih_ring_entry_size = 4 * sizeof(uint32_t), 144 141 .event_interrupt_class = &event_interrupt_class_cik, 145 142 .num_of_watch_points = 4, ··· 154 149 .asic_family = CHIP_POLARIS10, 155 150 .max_pasid_bits = 16, 156 151 .max_no_of_hqd = 24, 152 + .doorbell_size = 4, 157 153 .ih_ring_entry_size = 4 * sizeof(uint32_t), 158 154 .event_interrupt_class = &event_interrupt_class_cik, 159 155 .num_of_watch_points = 4, ··· 168 162 .asic_family = CHIP_POLARIS11, 169 163 .max_pasid_bits = 16, 170 164 .max_no_of_hqd = 24, 165 + .doorbell_size = 4, 171 166 .ih_ring_entry_size = 4 * sizeof(uint32_t), 172 167 .event_interrupt_class = &event_interrupt_class_cik, 173 168 .num_of_watch_points = 4, ··· 176 169 .supports_cwsr = true, 177 170 .needs_iommu_device = false, 178 171 .needs_pci_atomics = true, 172 + }; 173 + 174 + static const struct kfd_device_info vega10_device_info = { 175 + .asic_family = CHIP_VEGA10, 176 + .max_pasid_bits = 16, 177 + .max_no_of_hqd = 24, 178 + .doorbell_size = 8, 179 + .ih_ring_entry_size = 8 * sizeof(uint32_t), 180 + .event_interrupt_class = &event_interrupt_class_v9, 181 + .num_of_watch_points = 4, 182 + .mqd_size_aligned = MQD_SIZE_ALIGNED, 183 + .supports_cwsr = true, 184 + .needs_iommu_device = false, 185 + .needs_pci_atomics = false, 186 + }; 187 + 188 + static const struct kfd_device_info vega10_vf_device_info = { 189 + .asic_family = 
CHIP_VEGA10, 190 + .max_pasid_bits = 16, 191 + .max_no_of_hqd = 24, 192 + .doorbell_size = 8, 193 + .ih_ring_entry_size = 8 * sizeof(uint32_t), 194 + .event_interrupt_class = &event_interrupt_class_v9, 195 + .num_of_watch_points = 4, 196 + .mqd_size_aligned = MQD_SIZE_ALIGNED, 197 + .supports_cwsr = true, 198 + .needs_iommu_device = false, 199 + .needs_pci_atomics = false, 179 200 }; 180 201 181 202 ··· 285 250 { 0x67EB, &polaris11_device_info }, /* Polaris11 */ 286 251 { 0x67EF, &polaris11_device_info }, /* Polaris11 */ 287 252 { 0x67FF, &polaris11_device_info }, /* Polaris11 */ 253 + { 0x6860, &vega10_device_info }, /* Vega10 */ 254 + { 0x6861, &vega10_device_info }, /* Vega10 */ 255 + { 0x6862, &vega10_device_info }, /* Vega10 */ 256 + { 0x6863, &vega10_device_info }, /* Vega10 */ 257 + { 0x6864, &vega10_device_info }, /* Vega10 */ 258 + { 0x6867, &vega10_device_info }, /* Vega10 */ 259 + { 0x6868, &vega10_device_info }, /* Vega10 */ 260 + { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/ 261 + { 0x687F, &vega10_device_info }, /* Vega10 */ 288 262 }; 289 263 290 264 static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, ··· 323 279 struct pci_dev *pdev, const struct kfd2kgd_calls *f2g) 324 280 { 325 281 struct kfd_dev *kfd; 326 - 282 + int ret; 327 283 const struct kfd_device_info *device_info = 328 284 lookup_device_info(pdev->device); 329 285 ··· 332 288 return NULL; 333 289 } 334 290 335 - if (device_info->needs_pci_atomics) { 336 - /* Allow BIF to recode atomics to PCIe 3.0 337 - * AtomicOps. 32 and 64-bit requests are possible and 338 - * must be supported. 339 - */ 340 - if (pci_enable_atomic_ops_to_root(pdev, 341 - PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 342 - PCI_EXP_DEVCAP2_ATOMIC_COMP64) < 0) { 343 - dev_info(kfd_device, 344 - "skipped device %x:%x, PCI rejects atomics", 345 - pdev->vendor, pdev->device); 346 - return NULL; 347 - } 291 + /* Allow BIF to recode atomics to PCIe 3.0 AtomicOps. 
292 + * 32 and 64-bit requests are possible and must be 293 + * supported. 294 + */ 295 + ret = pci_enable_atomic_ops_to_root(pdev, 296 + PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 297 + PCI_EXP_DEVCAP2_ATOMIC_COMP64); 298 + if (device_info->needs_pci_atomics && ret < 0) { 299 + dev_info(kfd_device, 300 + "skipped device %x:%x, PCI rejects atomics\n", 301 + pdev->vendor, pdev->device); 302 + return NULL; 348 303 } 349 304 350 305 kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); ··· 366 323 static void kfd_cwsr_init(struct kfd_dev *kfd) 367 324 { 368 325 if (cwsr_enable && kfd->device_info->supports_cwsr) { 369 - BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); 326 + if (kfd->device_info->asic_family < CHIP_VEGA10) { 327 + BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); 328 + kfd->cwsr_isa = cwsr_trap_gfx8_hex; 329 + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); 330 + } else { 331 + BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE); 332 + kfd->cwsr_isa = cwsr_trap_gfx9_hex; 333 + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex); 334 + } 370 335 371 - kfd->cwsr_isa = cwsr_trap_gfx8_hex; 372 - kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); 373 336 kfd->cwsr_enabled = true; 374 337 } 375 338 } ··· 590 541 spin_unlock(&kfd->interrupt_lock); 591 542 } 592 543 544 + int kgd2kfd_quiesce_mm(struct mm_struct *mm) 545 + { 546 + struct kfd_process *p; 547 + int r; 548 + 549 + /* Because we are called from arbitrary context (workqueue) as opposed 550 + * to process context, kfd_process could attempt to exit while we are 551 + * running so the lookup function increments the process ref count. 
552 + */ 553 + p = kfd_lookup_process_by_mm(mm); 554 + if (!p) 555 + return -ESRCH; 556 + 557 + r = kfd_process_evict_queues(p); 558 + 559 + kfd_unref_process(p); 560 + return r; 561 + } 562 + 563 + int kgd2kfd_resume_mm(struct mm_struct *mm) 564 + { 565 + struct kfd_process *p; 566 + int r; 567 + 568 + /* Because we are called from arbitrary context (workqueue) as opposed 569 + * to process context, kfd_process could attempt to exit while we are 570 + * running so the lookup function increments the process ref count. 571 + */ 572 + p = kfd_lookup_process_by_mm(mm); 573 + if (!p) 574 + return -ESRCH; 575 + 576 + r = kfd_process_restore_queues(p); 577 + 578 + kfd_unref_process(p); 579 + return r; 580 + } 581 + 593 582 /** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will 594 583 * prepare for safe eviction of KFD BOs that belong to the specified 595 584 * process. ··· 739 652 if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) 740 653 return -ENOMEM; 741 654 742 - *mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); 655 + *mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); 743 656 if ((*mem_obj) == NULL) 744 657 return -ENOMEM; 745 658
+106 -8
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
··· 110 110 qpd->sh_mem_bases); 111 111 } 112 112 113 + static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q) 114 + { 115 + struct kfd_dev *dev = qpd->dqm->dev; 116 + 117 + if (!KFD_IS_SOC15(dev->device_info->asic_family)) { 118 + /* On pre-SOC15 chips we need to use the queue ID to 119 + * preserve the user mode ABI. 120 + */ 121 + q->doorbell_id = q->properties.queue_id; 122 + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { 123 + /* For SDMA queues on SOC15, use static doorbell 124 + * assignments based on the engine and queue. 125 + */ 126 + q->doorbell_id = dev->shared_resources.sdma_doorbell 127 + [q->properties.sdma_engine_id] 128 + [q->properties.sdma_queue_id]; 129 + } else { 130 + /* For CP queues on SOC15 reserve a free doorbell ID */ 131 + unsigned int found; 132 + 133 + found = find_first_zero_bit(qpd->doorbell_bitmap, 134 + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); 135 + if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { 136 + pr_debug("No doorbells available"); 137 + return -EBUSY; 138 + } 139 + set_bit(found, qpd->doorbell_bitmap); 140 + q->doorbell_id = found; 141 + } 142 + 143 + q->properties.doorbell_off = 144 + kfd_doorbell_id_to_offset(dev, q->process, 145 + q->doorbell_id); 146 + 147 + return 0; 148 + } 149 + 150 + static void deallocate_doorbell(struct qcm_process_device *qpd, 151 + struct queue *q) 152 + { 153 + unsigned int old; 154 + struct kfd_dev *dev = qpd->dqm->dev; 155 + 156 + if (!KFD_IS_SOC15(dev->device_info->asic_family) || 157 + q->properties.type == KFD_QUEUE_TYPE_SDMA) 158 + return; 159 + 160 + old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap); 161 + WARN_ON(!old); 162 + } 163 + 113 164 static int allocate_vmid(struct device_queue_manager *dqm, 114 165 struct qcm_process_device *qpd, 115 166 struct queue *q) ··· 196 145 static int flush_texture_cache_nocpsch(struct kfd_dev *kdev, 197 146 struct qcm_process_device *qpd) 198 147 { 199 - uint32_t len; 148 + const struct packet_manager_funcs 
*pmf = qpd->dqm->packets.pmf; 149 + int ret; 200 150 201 151 if (!qpd->ib_kaddr) 202 152 return -ENOMEM; 203 153 204 - len = pm_create_release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr); 154 + ret = pmf->release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr); 155 + if (ret) 156 + return ret; 205 157 206 158 return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid, 207 - qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len); 159 + qpd->ib_base, (uint32_t *)qpd->ib_kaddr, 160 + pmf->release_mem_size / sizeof(uint32_t)); 208 161 } 209 162 210 163 static void deallocate_vmid(struct device_queue_manager *dqm, ··· 356 301 if (retval) 357 302 return retval; 358 303 304 + retval = allocate_doorbell(qpd, q); 305 + if (retval) 306 + goto out_deallocate_hqd; 307 + 359 308 retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, 360 309 &q->gart_mqd_addr, &q->properties); 361 310 if (retval) 362 - goto out_deallocate_hqd; 311 + goto out_deallocate_doorbell; 363 312 364 313 pr_debug("Loading mqd to hqd on pipe %d, queue %d\n", 365 314 q->pipe, q->queue); ··· 383 324 384 325 out_uninit_mqd: 385 326 mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); 327 + out_deallocate_doorbell: 328 + deallocate_doorbell(qpd, q); 386 329 out_deallocate_hqd: 387 330 deallocate_hqd(dqm, q); 388 331 ··· 417 356 return -EINVAL; 418 357 } 419 358 dqm->total_queue_count--; 359 + 360 + deallocate_doorbell(qpd, q); 420 361 421 362 retval = mqd->destroy_mqd(mqd, q->mqd, 422 363 KFD_PREEMPT_TYPE_WAVEFRONT_RESET, ··· 924 861 q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; 925 862 q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; 926 863 864 + retval = allocate_doorbell(qpd, q); 865 + if (retval) 866 + goto out_deallocate_sdma_queue; 867 + 927 868 pr_debug("SDMA id is: %d\n", q->sdma_id); 928 869 pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); 929 870 pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); ··· 936 869 retval = 
mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, 937 870 &q->gart_mqd_addr, &q->properties); 938 871 if (retval) 939 - goto out_deallocate_sdma_queue; 872 + goto out_deallocate_doorbell; 940 873 941 874 retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL); 942 875 if (retval) ··· 946 879 947 880 out_uninit_mqd: 948 881 mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); 882 + out_deallocate_doorbell: 883 + deallocate_doorbell(qpd, q); 949 884 out_deallocate_sdma_queue: 950 885 deallocate_sdma_queue(dqm, q->sdma_id); 951 886 ··· 1139 1070 q->properties.sdma_engine_id = 1140 1071 q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; 1141 1072 } 1073 + 1074 + retval = allocate_doorbell(qpd, q); 1075 + if (retval) 1076 + goto out_deallocate_sdma_queue; 1077 + 1142 1078 mqd = dqm->ops.get_mqd_manager(dqm, 1143 1079 get_mqd_type_from_queue_type(q->properties.type)); 1144 1080 1145 1081 if (!mqd) { 1146 1082 retval = -ENOMEM; 1147 - goto out_deallocate_sdma_queue; 1083 + goto out_deallocate_doorbell; 1148 1084 } 1149 1085 /* 1150 1086 * Eviction state logic: we only mark active queues as evicted ··· 1167 1093 retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, 1168 1094 &q->gart_mqd_addr, &q->properties); 1169 1095 if (retval) 1170 - goto out_deallocate_sdma_queue; 1096 + goto out_deallocate_doorbell; 1171 1097 1172 1098 list_add(&q->list, &qpd->queues_list); 1173 1099 qpd->queue_count++; ··· 1191 1117 mutex_unlock(&dqm->lock); 1192 1118 return retval; 1193 1119 1120 + out_deallocate_doorbell: 1121 + deallocate_doorbell(qpd, q); 1194 1122 out_deallocate_sdma_queue: 1195 1123 if (q->properties.type == KFD_QUEUE_TYPE_SDMA) 1196 1124 deallocate_sdma_queue(dqm, q->sdma_id); ··· 1333 1257 goto failed; 1334 1258 } 1335 1259 1260 + deallocate_doorbell(qpd, q); 1261 + 1336 1262 if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { 1337 1263 dqm->sdma_queue_count--; 1338 1264 deallocate_sdma_queue(dqm, q->sdma_id); ··· 1386 1308 void __user *alternate_aperture_base, 1387 1309 uint64_t 
alternate_aperture_size) 1388 1310 { 1389 - bool retval; 1311 + bool retval = true; 1312 + 1313 + if (!dqm->asic_ops.set_cache_memory_policy) 1314 + return retval; 1390 1315 1391 1316 mutex_lock(&dqm->lock); 1392 1317 ··· 1658 1577 case CHIP_POLARIS11: 1659 1578 device_queue_manager_init_vi_tonga(&dqm->asic_ops); 1660 1579 break; 1580 + 1581 + case CHIP_VEGA10: 1582 + case CHIP_RAVEN: 1583 + device_queue_manager_init_v9(&dqm->asic_ops); 1584 + break; 1661 1585 default: 1662 1586 WARN(1, "Unexpected ASIC family %u", 1663 1587 dev->device_info->asic_family); ··· 1712 1626 uint32_t (*dump)[2], n_regs; 1713 1627 int pipe, queue; 1714 1628 int r = 0; 1629 + 1630 + r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd, 1631 + KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE, &dump, &n_regs); 1632 + if (!r) { 1633 + seq_printf(m, " HIQ on MEC %d Pipe %d Queue %d\n", 1634 + KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1, 1635 + KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm), 1636 + KFD_CIK_HIQ_QUEUE); 1637 + seq_reg_dump(m, dump, n_regs); 1638 + 1639 + kfree(dump); 1640 + } 1715 1641 1716 1642 for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { 1717 1643 int pipe_offset = pipe * get_queues_per_pipe(dqm);
+2
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
··· 200 200 struct device_queue_manager_asic_ops *asic_ops); 201 201 void device_queue_manager_init_vi_tonga( 202 202 struct device_queue_manager_asic_ops *asic_ops); 203 + void device_queue_manager_init_v9( 204 + struct device_queue_manager_asic_ops *asic_ops); 203 205 void program_sh_mem_settings(struct device_queue_manager *dqm, 204 206 struct qcm_process_device *qpd); 205 207 unsigned int get_queues_num(struct device_queue_manager *dqm);
+84
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
··· 1 + /* 2 + * Copyright 2016-2018 Advanced Micro Devices, Inc. 3 + * 4 + * Permission is hereby granted, free of charge, to any person obtaining a 5 + * copy of this software and associated documentation files (the "Software"), 6 + * to deal in the Software without restriction, including without limitation 7 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 + * and/or sell copies of the Software, and to permit persons to whom the 9 + * Software is furnished to do so, subject to the following conditions: 10 + * 11 + * The above copyright notice and this permission notice shall be included in 12 + * all copies or substantial portions of the Software. 13 + * 14 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 + * OTHER DEALINGS IN THE SOFTWARE. 
21 + * 22 + */ 23 + 24 + #include "kfd_device_queue_manager.h" 25 + #include "vega10_enum.h" 26 + #include "gc/gc_9_0_offset.h" 27 + #include "gc/gc_9_0_sh_mask.h" 28 + #include "sdma0/sdma0_4_0_sh_mask.h" 29 + 30 + static int update_qpd_v9(struct device_queue_manager *dqm, 31 + struct qcm_process_device *qpd); 32 + static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, 33 + struct qcm_process_device *qpd); 34 + 35 + void device_queue_manager_init_v9( 36 + struct device_queue_manager_asic_ops *asic_ops) 37 + { 38 + asic_ops->update_qpd = update_qpd_v9; 39 + asic_ops->init_sdma_vm = init_sdma_vm_v9; 40 + } 41 + 42 + static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) 43 + { 44 + uint32_t shared_base = pdd->lds_base >> 48; 45 + uint32_t private_base = pdd->scratch_base >> 48; 46 + 47 + return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | 48 + private_base; 49 + } 50 + 51 + static int update_qpd_v9(struct device_queue_manager *dqm, 52 + struct qcm_process_device *qpd) 53 + { 54 + struct kfd_process_device *pdd; 55 + 56 + pdd = qpd_to_pdd(qpd); 57 + 58 + /* check if sh_mem_config register already configured */ 59 + if (qpd->sh_mem_config == 0) { 60 + qpd->sh_mem_config = 61 + SH_MEM_ALIGNMENT_MODE_UNALIGNED << 62 + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; 63 + if (vega10_noretry && 64 + !dqm->dev->device_info->needs_iommu_device) 65 + qpd->sh_mem_config |= 66 + 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; 67 + 68 + qpd->sh_mem_ape1_limit = 0; 69 + qpd->sh_mem_ape1_base = 0; 70 + } 71 + 72 + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); 73 + 74 + pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); 75 + 76 + return 0; 77 + } 78 + 79 + static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, 80 + struct qcm_process_device *qpd) 81 + { 82 + /* Not needed on SDMAv4 any more */ 83 + q->properties.sdma_vm_addr = 0; 84 + }
+34 -31
drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
··· 33 33 34 34 static DEFINE_IDA(doorbell_ida); 35 35 static unsigned int max_doorbell_slices; 36 - #define KFD_SIZE_OF_DOORBELL_IN_BYTES 4 37 36 38 37 /* 39 38 * Each device exposes a doorbell aperture, a PCI MMIO aperture that ··· 49 50 */ 50 51 51 52 /* # of doorbell bytes allocated for each process. */ 52 - static inline size_t doorbell_process_allocation(void) 53 + size_t kfd_doorbell_process_slice(struct kfd_dev *kfd) 53 54 { 54 - return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES * 55 + return roundup(kfd->device_info->doorbell_size * 55 56 KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, 56 57 PAGE_SIZE); 57 58 } ··· 71 72 72 73 doorbell_start_offset = 73 74 roundup(kfd->shared_resources.doorbell_start_offset, 74 - doorbell_process_allocation()); 75 + kfd_doorbell_process_slice(kfd)); 75 76 76 77 doorbell_aperture_size = 77 78 rounddown(kfd->shared_resources.doorbell_aperture_size, 78 - doorbell_process_allocation()); 79 + kfd_doorbell_process_slice(kfd)); 79 80 80 81 if (doorbell_aperture_size > doorbell_start_offset) 81 82 doorbell_process_limit = 82 83 (doorbell_aperture_size - doorbell_start_offset) / 83 - doorbell_process_allocation(); 84 + kfd_doorbell_process_slice(kfd); 84 85 else 85 86 return -ENOSPC; 86 87 ··· 94 95 kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32); 95 96 96 97 kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, 97 - doorbell_process_allocation()); 98 + kfd_doorbell_process_slice(kfd)); 98 99 99 100 if (!kfd->doorbell_kernel_ptr) 100 101 return -ENOMEM; ··· 126 127 iounmap(kfd->doorbell_kernel_ptr); 127 128 } 128 129 129 - int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) 130 + int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, 131 + struct vm_area_struct *vma) 130 132 { 131 133 phys_addr_t address; 132 - struct kfd_dev *dev; 133 134 134 135 /* 135 136 * For simplicitly we only allow mapping of the entire doorbell 136 137 * allocation of a single device & process. 
137 138 */ 138 - if (vma->vm_end - vma->vm_start != doorbell_process_allocation()) 139 - return -EINVAL; 140 - 141 - /* Find kfd device according to gpu id */ 142 - dev = kfd_device_by_id(vma->vm_pgoff); 143 - if (!dev) 139 + if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice(dev)) 144 140 return -EINVAL; 145 141 146 142 /* Calculate physical address of doorbell */ ··· 152 158 " vm_flags == 0x%04lX\n" 153 159 " size == 0x%04lX\n", 154 160 (unsigned long long) vma->vm_start, address, vma->vm_flags, 155 - doorbell_process_allocation()); 161 + kfd_doorbell_process_slice(dev)); 156 162 157 163 158 164 return io_remap_pfn_range(vma, 159 165 vma->vm_start, 160 166 address >> PAGE_SHIFT, 161 - doorbell_process_allocation(), 167 + kfd_doorbell_process_slice(dev), 162 168 vma->vm_page_prot); 163 169 } 164 170 165 171 166 172 /* get kernel iomem pointer for a doorbell */ 167 - u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, 173 + void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, 168 174 unsigned int *doorbell_off) 169 175 { 170 176 u32 inx; ··· 178 184 179 185 if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) 180 186 return NULL; 187 + 188 + inx *= kfd->device_info->doorbell_size / sizeof(u32); 181 189 182 190 /* 183 191 * Calculating the kernel doorbell offset using the first ··· 206 210 mutex_unlock(&kfd->doorbell_mutex); 207 211 } 208 212 209 - inline void write_kernel_doorbell(u32 __iomem *db, u32 value) 213 + void write_kernel_doorbell(void __iomem *db, u32 value) 210 214 { 211 215 if (db) { 212 216 writel(value, db); ··· 214 218 } 215 219 } 216 220 217 - /* 218 - * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1 219 - * to doorbells with the process's doorbell page 220 - */ 221 - unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, 221 + void write_kernel_doorbell64(void __iomem *db, u64 value) 222 + { 223 + if (db) { 224 + WARN(((unsigned long)db & 7) != 0, 225 + "Unaligned 64-bit doorbell"); 226 + writeq(value, 
(u64 __iomem *)db); 227 + pr_debug("writing %llu to doorbell address %p\n", value, db); 228 + } 229 + } 230 + 231 + unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, 222 232 struct kfd_process *process, 223 - unsigned int queue_id) 233 + unsigned int doorbell_id) 224 234 { 225 235 /* 226 236 * doorbell_id_offset accounts for doorbells taken by KGD. 227 - * index * doorbell_process_allocation/sizeof(u32) adjusts to 228 - * the process's doorbells. 237 + * index * kfd_doorbell_process_slice/sizeof(u32) adjusts to 238 + * the process's doorbells. The offset returned is in dword 239 + * units regardless of the ASIC-dependent doorbell size. 229 240 */ 230 241 return kfd->doorbell_id_offset + 231 242 process->doorbell_index 232 - * doorbell_process_allocation() / sizeof(u32) + 233 - queue_id; 243 + * kfd_doorbell_process_slice(kfd) / sizeof(u32) + 244 + doorbell_id * kfd->device_info->doorbell_size / sizeof(u32); 234 245 } 235 246 236 247 uint64_t kfd_get_number_elems(struct kfd_dev *kfd) 237 248 { 238 249 uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size - 239 250 kfd->shared_resources.doorbell_start_offset) / 240 - doorbell_process_allocation() + 1; 251 + kfd_doorbell_process_slice(kfd) + 1; 241 252 242 253 return num_of_elems; 243 254 ··· 254 251 struct kfd_process *process) 255 252 { 256 253 return dev->doorbell_base + 257 - process->doorbell_index * doorbell_process_allocation(); 254 + process->doorbell_index * kfd_doorbell_process_slice(dev); 258 255 } 259 256 260 257 int kfd_alloc_process_doorbells(struct kfd_process *process)
+2 -2
drivers/gpu/drm/amd/amdkfd/kfd_events.c
··· 345 345 case KFD_EVENT_TYPE_DEBUG: 346 346 ret = create_signal_event(devkfd, p, ev); 347 347 if (!ret) { 348 - *event_page_offset = KFD_MMAP_EVENTS_MASK; 348 + *event_page_offset = KFD_MMAP_TYPE_EVENTS; 349 349 *event_page_offset <<= PAGE_SHIFT; 350 350 *event_slot_index = ev->event_id; 351 351 } ··· 496 496 pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n", 497 497 partial_id, valid_id_bits); 498 498 499 - if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT/2) { 499 + if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT / 64) { 500 500 /* With relatively few events, it's faster to 501 501 * iterate over the event IDR 502 502 */
+88 -31
drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
··· 275 275 * for FLAT_* / S_LOAD operations. 276 276 */ 277 277 278 - #define MAKE_GPUVM_APP_BASE(gpu_num) \ 278 + #define MAKE_GPUVM_APP_BASE_VI(gpu_num) \ 279 279 (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) 280 280 281 281 #define MAKE_GPUVM_APP_LIMIT(base, size) \ 282 282 (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) 283 283 284 - #define MAKE_SCRATCH_APP_BASE() \ 284 + #define MAKE_SCRATCH_APP_BASE_VI() \ 285 285 (((uint64_t)(0x1UL) << 61) + 0x100000000L) 286 286 287 287 #define MAKE_SCRATCH_APP_LIMIT(base) \ 288 288 (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) 289 289 290 - #define MAKE_LDS_APP_BASE() \ 290 + #define MAKE_LDS_APP_BASE_VI() \ 291 291 (((uint64_t)(0x1UL) << 61) + 0x0) 292 292 #define MAKE_LDS_APP_LIMIT(base) \ 293 293 (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) 294 + 295 + /* On GFXv9 the LDS and scratch apertures are programmed independently 296 + * using the high 16 bits of the 64-bit virtual address. They must be 297 + * in the hole, which will be the case as long as the high 16 bits are 298 + * not 0. 299 + * 300 + * The aperture sizes are still 4GB implicitly. 301 + * 302 + * A GPUVM aperture is not applicable on GFXv9. 303 + */ 304 + #define MAKE_LDS_APP_BASE_V9() ((uint64_t)(0x1UL) << 48) 305 + #define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48) 294 306 295 307 /* User mode manages most of the SVM aperture address space. 
The low 296 308 * 16MB are reserved for kernel use (CWSR trap handler and kernel IB ··· 312 300 #define SVM_CWSR_BASE (SVM_USER_BASE - KFD_CWSR_TBA_TMA_SIZE) 313 301 #define SVM_IB_BASE (SVM_CWSR_BASE - PAGE_SIZE) 314 302 303 + static void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id) 304 + { 305 + /* 306 + * node id couldn't be 0 - the three MSB bits of 307 + * aperture shoudn't be 0 308 + */ 309 + pdd->lds_base = MAKE_LDS_APP_BASE_VI(); 310 + pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); 311 + 312 + if (!pdd->dev->device_info->needs_iommu_device) { 313 + /* dGPUs: SVM aperture starting at 0 314 + * with small reserved space for kernel. 315 + * Set them to CANONICAL addresses. 316 + */ 317 + pdd->gpuvm_base = SVM_USER_BASE; 318 + pdd->gpuvm_limit = 319 + pdd->dev->shared_resources.gpuvm_size - 1; 320 + } else { 321 + /* set them to non CANONICAL addresses, and no SVM is 322 + * allocated. 323 + */ 324 + pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1); 325 + pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base, 326 + pdd->dev->shared_resources.gpuvm_size); 327 + } 328 + 329 + pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI(); 330 + pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); 331 + } 332 + 333 + static void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id) 334 + { 335 + pdd->lds_base = MAKE_LDS_APP_BASE_V9(); 336 + pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); 337 + 338 + /* Raven needs SVM to support graphic handle, etc. Leave the small 339 + * reserved space before SVM on Raven as well, even though we don't 340 + * have to. 341 + * Set gpuvm_base and gpuvm_limit to CANONICAL addresses so that they 342 + * are used in Thunk to reserve SVM. 
343 + */ 344 + pdd->gpuvm_base = SVM_USER_BASE; 345 + pdd->gpuvm_limit = 346 + pdd->dev->shared_resources.gpuvm_size - 1; 347 + 348 + pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9(); 349 + pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); 350 + } 351 + 315 352 int kfd_init_apertures(struct kfd_process *process) 316 353 { 317 354 uint8_t id = 0; ··· 368 307 struct kfd_process_device *pdd; 369 308 370 309 /*Iterating over all devices*/ 371 - while (kfd_topology_enum_kfd_devices(id, &dev) == 0 && 372 - id < NUM_OF_SUPPORTED_GPUS) { 373 - 310 + while (kfd_topology_enum_kfd_devices(id, &dev) == 0) { 374 311 if (!dev) { 375 312 id++; /* Skip non GPU devices */ 376 313 continue; ··· 377 318 pdd = kfd_create_process_device_data(dev, process); 378 319 if (!pdd) { 379 320 pr_err("Failed to create process device data\n"); 380 - return -1; 321 + return -ENOMEM; 381 322 } 382 323 /* 383 324 * For 64 bit process apertures will be statically reserved in ··· 389 330 pdd->gpuvm_base = pdd->gpuvm_limit = 0; 390 331 pdd->scratch_base = pdd->scratch_limit = 0; 391 332 } else { 392 - /* Same LDS and scratch apertures can be used 393 - * on all GPUs. This allows using more dGPUs 394 - * than placement options for apertures. 
395 - */ 396 - pdd->lds_base = MAKE_LDS_APP_BASE(); 397 - pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); 333 + switch (dev->device_info->asic_family) { 334 + case CHIP_KAVERI: 335 + case CHIP_HAWAII: 336 + case CHIP_CARRIZO: 337 + case CHIP_TONGA: 338 + case CHIP_FIJI: 339 + case CHIP_POLARIS10: 340 + case CHIP_POLARIS11: 341 + kfd_init_apertures_vi(pdd, id); 342 + break; 343 + case CHIP_VEGA10: 344 + case CHIP_RAVEN: 345 + kfd_init_apertures_v9(pdd, id); 346 + break; 347 + default: 348 + WARN(1, "Unexpected ASIC family %u", 349 + dev->device_info->asic_family); 350 + return -EINVAL; 351 + } 398 352 399 - pdd->scratch_base = MAKE_SCRATCH_APP_BASE(); 400 - pdd->scratch_limit = 401 - MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); 402 - 403 - if (dev->device_info->needs_iommu_device) { 404 - /* APUs: GPUVM aperture in 405 - * non-canonical address space 353 + if (!dev->device_info->needs_iommu_device) { 354 + /* dGPUs: the reserved space for kernel 355 + * before SVM 406 356 */ 407 - pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1); 408 - pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT( 409 - pdd->gpuvm_base, 410 - dev->shared_resources.gpuvm_size); 411 - } else { 412 - /* dGPUs: SVM aperture starting at 0 413 - * with small reserved space for kernel 414 - */ 415 - pdd->gpuvm_base = SVM_USER_BASE; 416 - pdd->gpuvm_limit = 417 - dev->shared_resources.gpuvm_size - 1; 418 357 pdd->qpd.cwsr_base = SVM_CWSR_BASE; 419 358 pdd->qpd.ib_base = SVM_IB_BASE; 420 359 }
+92
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
··· 1 + /* 2 + * Copyright 2016-2018 Advanced Micro Devices, Inc. 3 + * 4 + * Permission is hereby granted, free of charge, to any person obtaining a 5 + * copy of this software and associated documentation files (the "Software"), 6 + * to deal in the Software without restriction, including without limitation 7 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 + * and/or sell copies of the Software, and to permit persons to whom the 9 + * Software is furnished to do so, subject to the following conditions: 10 + * 11 + * The above copyright notice and this permission notice shall be included in 12 + * all copies or substantial portions of the Software. 13 + * 14 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 + * OTHER DEALINGS IN THE SOFTWARE. 
21 + */ 22 + 23 + #include "kfd_priv.h" 24 + #include "kfd_events.h" 25 + #include "soc15_int.h" 26 + 27 + 28 + static bool event_interrupt_isr_v9(struct kfd_dev *dev, 29 + const uint32_t *ih_ring_entry) 30 + { 31 + uint16_t source_id, client_id, pasid, vmid; 32 + const uint32_t *data = ih_ring_entry; 33 + 34 + /* Only handle interrupts from KFD VMIDs */ 35 + vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); 36 + if (vmid < dev->vm_info.first_vmid_kfd || 37 + vmid > dev->vm_info.last_vmid_kfd) 38 + return 0; 39 + 40 + /* If there is no valid PASID, it's likely a firmware bug */ 41 + pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); 42 + if (WARN_ONCE(pasid == 0, "FW bug: No PASID in KFD interrupt")) 43 + return 0; 44 + 45 + source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); 46 + client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); 47 + 48 + pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n", 49 + client_id, source_id, pasid); 50 + pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", 51 + data[0], data[1], data[2], data[3], 52 + data[4], data[5], data[6], data[7]); 53 + 54 + /* Interrupt types we care about: various signals and faults. 55 + * They will be forwarded to a work queue (see below). 
56 + */ 57 + return source_id == SOC15_INTSRC_CP_END_OF_PIPE || 58 + source_id == SOC15_INTSRC_SDMA_TRAP || 59 + source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || 60 + source_id == SOC15_INTSRC_CP_BAD_OPCODE; 61 + } 62 + 63 + static void event_interrupt_wq_v9(struct kfd_dev *dev, 64 + const uint32_t *ih_ring_entry) 65 + { 66 + uint16_t source_id, client_id, pasid, vmid; 67 + uint32_t context_id; 68 + 69 + source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); 70 + client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); 71 + pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); 72 + vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); 73 + context_id = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry); 74 + 75 + if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) 76 + kfd_signal_event_interrupt(pasid, context_id, 32); 77 + else if (source_id == SOC15_INTSRC_SDMA_TRAP) 78 + kfd_signal_event_interrupt(pasid, context_id & 0xfffffff, 28); 79 + else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) 80 + kfd_signal_event_interrupt(pasid, context_id & 0xffffff, 24); 81 + else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) 82 + kfd_signal_hw_exception_event(pasid); 83 + else if (client_id == SOC15_IH_CLIENTID_VMC || 84 + client_id == SOC15_IH_CLIENTID_UTCL2) { 85 + /* TODO */ 86 + } 87 + } 88 + 89 + const struct kfd_event_interrupt_class event_interrupt_class_v9 = { 90 + .interrupt_isr = event_interrupt_isr_v9, 91 + .interrupt_wq = event_interrupt_wq_v9, 92 + };
+5 -3
drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
··· 139 139 { 140 140 struct kfd_dev *dev = container_of(work, struct kfd_dev, 141 141 interrupt_work); 142 + uint32_t ih_ring_entry[KFD_MAX_RING_ENTRY_SIZE]; 142 143 143 - uint32_t ih_ring_entry[DIV_ROUND_UP( 144 - dev->device_info->ih_ring_entry_size, 145 - sizeof(uint32_t))]; 144 + if (dev->device_info->ih_ring_entry_size > sizeof(ih_ring_entry)) { 145 + dev_err_once(kfd_chardev(), "Ring entry too small\n"); 146 + return; 147 + } 146 148 147 149 while (dequeue_ih_ring_entry(dev, ih_ring_entry)) 148 150 dev->device_info->event_interrupt_class->interrupt_wq(dev,
+27 -12
drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
··· 99 99 kq->rptr_kernel = kq->rptr_mem->cpu_ptr; 100 100 kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr; 101 101 102 - retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->wptr_kernel), 102 + retval = kfd_gtt_sa_allocate(dev, dev->device_info->doorbell_size, 103 103 &kq->wptr_mem); 104 104 105 105 if (retval != 0) ··· 208 208 size_t available_size; 209 209 size_t queue_size_dwords; 210 210 uint32_t wptr, rptr; 211 + uint64_t wptr64; 211 212 unsigned int *queue_address; 212 213 213 214 /* When rptr == wptr, the buffer is empty. ··· 217 216 * the opposite. So we can only use up to queue_size_dwords - 1 dwords. 218 217 */ 219 218 rptr = *kq->rptr_kernel; 220 - wptr = *kq->wptr_kernel; 219 + wptr = kq->pending_wptr; 220 + wptr64 = kq->pending_wptr64; 221 221 queue_address = (unsigned int *)kq->pq_kernel_addr; 222 222 queue_size_dwords = kq->queue->properties.queue_size / 4; 223 223 ··· 234 232 * make sure calling functions know 235 233 * acquire_packet_buffer() failed 236 234 */ 237 - *buffer_ptr = NULL; 238 - return -ENOMEM; 235 + goto err_no_space; 239 236 } 240 237 241 238 if (wptr + packet_size_in_dwords >= queue_size_dwords) { 242 239 /* make sure after rolling back to position 0, there is 243 240 * still enough space. 
244 241 */ 245 - if (packet_size_in_dwords >= rptr) { 246 - *buffer_ptr = NULL; 247 - return -ENOMEM; 248 - } 242 + if (packet_size_in_dwords >= rptr) 243 + goto err_no_space; 244 + 249 245 /* fill nops, roll back and start at position 0 */ 250 246 while (wptr > 0) { 251 247 queue_address[wptr] = kq->nop_packet; 252 248 wptr = (wptr + 1) % queue_size_dwords; 249 + wptr64++; 253 250 } 254 251 } 255 252 256 253 *buffer_ptr = &queue_address[wptr]; 257 254 kq->pending_wptr = wptr + packet_size_in_dwords; 255 + kq->pending_wptr64 = wptr64 + packet_size_in_dwords; 258 256 259 257 return 0; 258 + 259 + err_no_space: 260 + *buffer_ptr = NULL; 261 + return -ENOMEM; 260 262 } 261 263 262 264 static void submit_packet(struct kernel_queue *kq) ··· 276 270 pr_debug("\n"); 277 271 #endif 278 272 279 - *kq->wptr_kernel = kq->pending_wptr; 280 - write_kernel_doorbell(kq->queue->properties.doorbell_ptr, 281 - kq->pending_wptr); 273 + kq->ops_asic_specific.submit_packet(kq); 282 274 } 283 275 284 276 static void rollback_packet(struct kernel_queue *kq) 285 277 { 286 - kq->pending_wptr = *kq->queue->properties.write_ptr; 278 + if (kq->dev->device_info->doorbell_size == 8) { 279 + kq->pending_wptr64 = *kq->wptr64_kernel; 280 + kq->pending_wptr = *kq->wptr_kernel % 281 + (kq->queue->properties.queue_size / 4); 282 + } else { 283 + kq->pending_wptr = *kq->wptr_kernel; 284 + } 287 285 } 288 286 289 287 struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, ··· 317 307 case CHIP_KAVERI: 318 308 case CHIP_HAWAII: 319 309 kernel_queue_init_cik(&kq->ops_asic_specific); 310 + break; 311 + 312 + case CHIP_VEGA10: 313 + case CHIP_RAVEN: 314 + kernel_queue_init_v9(&kq->ops_asic_specific); 320 315 break; 321 316 default: 322 317 WARN(1, "Unexpected ASIC family %u",
+6 -1
drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
··· 72 72 struct kfd_dev *dev; 73 73 struct mqd_manager *mqd; 74 74 struct queue *queue; 75 + uint64_t pending_wptr64; 75 76 uint32_t pending_wptr; 76 77 unsigned int nop_packet; 77 78 ··· 80 79 uint32_t *rptr_kernel; 81 80 uint64_t rptr_gpu_addr; 82 81 struct kfd_mem_obj *wptr_mem; 83 - uint32_t *wptr_kernel; 82 + union { 83 + uint64_t *wptr64_kernel; 84 + uint32_t *wptr_kernel; 85 + }; 84 86 uint64_t wptr_gpu_addr; 85 87 struct kfd_mem_obj *pq; 86 88 uint64_t pq_gpu_addr; ··· 101 97 102 98 void kernel_queue_init_cik(struct kernel_queue_ops *ops); 103 99 void kernel_queue_init_vi(struct kernel_queue_ops *ops); 100 + void kernel_queue_init_v9(struct kernel_queue_ops *ops); 104 101 105 102 #endif /* KFD_KERNEL_QUEUE_H_ */
+9
drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c
··· 26 26 static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, 27 27 enum kfd_queue_type type, unsigned int queue_size); 28 28 static void uninitialize_cik(struct kernel_queue *kq); 29 + static void submit_packet_cik(struct kernel_queue *kq); 29 30 30 31 void kernel_queue_init_cik(struct kernel_queue_ops *ops) 31 32 { 32 33 ops->initialize = initialize_cik; 33 34 ops->uninitialize = uninitialize_cik; 35 + ops->submit_packet = submit_packet_cik; 34 36 } 35 37 36 38 static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, ··· 43 41 44 42 static void uninitialize_cik(struct kernel_queue *kq) 45 43 { 44 + } 45 + 46 + static void submit_packet_cik(struct kernel_queue *kq) 47 + { 48 + *kq->wptr_kernel = kq->pending_wptr; 49 + write_kernel_doorbell(kq->queue->properties.doorbell_ptr, 50 + kq->pending_wptr); 46 51 }
+340
drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
··· 1 + /* 2 + * Copyright 2016-2018 Advanced Micro Devices, Inc. 3 + * 4 + * Permission is hereby granted, free of charge, to any person obtaining a 5 + * copy of this software and associated documentation files (the "Software"), 6 + * to deal in the Software without restriction, including without limitation 7 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 + * and/or sell copies of the Software, and to permit persons to whom the 9 + * Software is furnished to do so, subject to the following conditions: 10 + * 11 + * The above copyright notice and this permission notice shall be included in 12 + * all copies or substantial portions of the Software. 13 + * 14 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 + * OTHER DEALINGS IN THE SOFTWARE. 
21 + * 22 + */ 23 + 24 + #include "kfd_kernel_queue.h" 25 + #include "kfd_device_queue_manager.h" 26 + #include "kfd_pm4_headers_ai.h" 27 + #include "kfd_pm4_opcodes.h" 28 + 29 + static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, 30 + enum kfd_queue_type type, unsigned int queue_size); 31 + static void uninitialize_v9(struct kernel_queue *kq); 32 + static void submit_packet_v9(struct kernel_queue *kq); 33 + 34 + void kernel_queue_init_v9(struct kernel_queue_ops *ops) 35 + { 36 + ops->initialize = initialize_v9; 37 + ops->uninitialize = uninitialize_v9; 38 + ops->submit_packet = submit_packet_v9; 39 + } 40 + 41 + static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, 42 + enum kfd_queue_type type, unsigned int queue_size) 43 + { 44 + int retval; 45 + 46 + retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); 47 + if (retval) 48 + return false; 49 + 50 + kq->eop_gpu_addr = kq->eop_mem->gpu_addr; 51 + kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; 52 + 53 + memset(kq->eop_kernel_addr, 0, PAGE_SIZE); 54 + 55 + return true; 56 + } 57 + 58 + static void uninitialize_v9(struct kernel_queue *kq) 59 + { 60 + kfd_gtt_sa_free(kq->dev, kq->eop_mem); 61 + } 62 + 63 + static void submit_packet_v9(struct kernel_queue *kq) 64 + { 65 + *kq->wptr64_kernel = kq->pending_wptr64; 66 + write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, 67 + kq->pending_wptr64); 68 + } 69 + 70 + static int pm_map_process_v9(struct packet_manager *pm, 71 + uint32_t *buffer, struct qcm_process_device *qpd) 72 + { 73 + struct pm4_mes_map_process *packet; 74 + uint64_t vm_page_table_base_addr = 75 + (uint64_t)(qpd->page_table_base) << 12; 76 + 77 + packet = (struct pm4_mes_map_process *)buffer; 78 + memset(buffer, 0, sizeof(struct pm4_mes_map_process)); 79 + 80 + packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, 81 + sizeof(struct pm4_mes_map_process)); 82 + packet->bitfields2.diq_enable = (qpd->is_debug) ? 
1 : 0; 83 + packet->bitfields2.process_quantum = 1; 84 + packet->bitfields2.pasid = qpd->pqm->process->pasid; 85 + packet->bitfields14.gds_size = qpd->gds_size; 86 + packet->bitfields14.num_gws = qpd->num_gws; 87 + packet->bitfields14.num_oac = qpd->num_oac; 88 + packet->bitfields14.sdma_enable = 1; 89 + packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; 90 + 91 + packet->sh_mem_config = qpd->sh_mem_config; 92 + packet->sh_mem_bases = qpd->sh_mem_bases; 93 + packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8); 94 + packet->sq_shader_tba_hi = upper_32_bits(qpd->tba_addr >> 8); 95 + packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8); 96 + packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8); 97 + 98 + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); 99 + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); 100 + 101 + packet->vm_context_page_table_base_addr_lo32 = 102 + lower_32_bits(vm_page_table_base_addr); 103 + packet->vm_context_page_table_base_addr_hi32 = 104 + upper_32_bits(vm_page_table_base_addr); 105 + 106 + return 0; 107 + } 108 + 109 + static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, 110 + uint64_t ib, size_t ib_size_in_dwords, bool chain) 111 + { 112 + struct pm4_mes_runlist *packet; 113 + 114 + int concurrent_proc_cnt = 0; 115 + struct kfd_dev *kfd = pm->dqm->dev; 116 + 117 + /* Determine the number of processes to map together to HW: 118 + * it can not exceed the number of VMIDs available to the 119 + * scheduler, and it is determined by the smaller of the number 120 + * of processes in the runlist and kfd module parameter 121 + * hws_max_conc_proc. 122 + * Note: the arbitration between the number of VMIDs and 123 + * hws_max_conc_proc has been done in 124 + * kgd2kfd_device_init(). 
125 + */ 126 + concurrent_proc_cnt = min(pm->dqm->processes_count, 127 + kfd->max_proc_per_quantum); 128 + 129 + packet = (struct pm4_mes_runlist *)buffer; 130 + 131 + memset(buffer, 0, sizeof(struct pm4_mes_runlist)); 132 + packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, 133 + sizeof(struct pm4_mes_runlist)); 134 + 135 + packet->bitfields4.ib_size = ib_size_in_dwords; 136 + packet->bitfields4.chain = chain ? 1 : 0; 137 + packet->bitfields4.offload_polling = 0; 138 + packet->bitfields4.valid = 1; 139 + packet->bitfields4.process_cnt = concurrent_proc_cnt; 140 + packet->ordinal2 = lower_32_bits(ib); 141 + packet->ib_base_hi = upper_32_bits(ib); 142 + 143 + return 0; 144 + } 145 + 146 + static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, 147 + struct queue *q, bool is_static) 148 + { 149 + struct pm4_mes_map_queues *packet; 150 + bool use_static = is_static; 151 + 152 + packet = (struct pm4_mes_map_queues *)buffer; 153 + memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); 154 + 155 + packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, 156 + sizeof(struct pm4_mes_map_queues)); 157 + packet->bitfields2.alloc_format = 158 + alloc_format__mes_map_queues__one_per_pipe_vi; 159 + packet->bitfields2.num_queues = 1; 160 + packet->bitfields2.queue_sel = 161 + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; 162 + 163 + packet->bitfields2.engine_sel = 164 + engine_sel__mes_map_queues__compute_vi; 165 + packet->bitfields2.queue_type = 166 + queue_type__mes_map_queues__normal_compute_vi; 167 + 168 + switch (q->properties.type) { 169 + case KFD_QUEUE_TYPE_COMPUTE: 170 + if (use_static) 171 + packet->bitfields2.queue_type = 172 + queue_type__mes_map_queues__normal_latency_static_queue_vi; 173 + break; 174 + case KFD_QUEUE_TYPE_DIQ: 175 + packet->bitfields2.queue_type = 176 + queue_type__mes_map_queues__debug_interface_queue_vi; 177 + break; 178 + case KFD_QUEUE_TYPE_SDMA: 179 + packet->bitfields2.engine_sel = 
q->properties.sdma_engine_id + 180 + engine_sel__mes_map_queues__sdma0_vi; 181 + use_static = false; /* no static queues under SDMA */ 182 + break; 183 + default: 184 + WARN(1, "queue type %d", q->properties.type); 185 + return -EINVAL; 186 + } 187 + packet->bitfields3.doorbell_offset = 188 + q->properties.doorbell_off; 189 + 190 + packet->mqd_addr_lo = 191 + lower_32_bits(q->gart_mqd_addr); 192 + 193 + packet->mqd_addr_hi = 194 + upper_32_bits(q->gart_mqd_addr); 195 + 196 + packet->wptr_addr_lo = 197 + lower_32_bits((uint64_t)q->properties.write_ptr); 198 + 199 + packet->wptr_addr_hi = 200 + upper_32_bits((uint64_t)q->properties.write_ptr); 201 + 202 + return 0; 203 + } 204 + 205 + static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer, 206 + enum kfd_queue_type type, 207 + enum kfd_unmap_queues_filter filter, 208 + uint32_t filter_param, bool reset, 209 + unsigned int sdma_engine) 210 + { 211 + struct pm4_mes_unmap_queues *packet; 212 + 213 + packet = (struct pm4_mes_unmap_queues *)buffer; 214 + memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); 215 + 216 + packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, 217 + sizeof(struct pm4_mes_unmap_queues)); 218 + switch (type) { 219 + case KFD_QUEUE_TYPE_COMPUTE: 220 + case KFD_QUEUE_TYPE_DIQ: 221 + packet->bitfields2.engine_sel = 222 + engine_sel__mes_unmap_queues__compute; 223 + break; 224 + case KFD_QUEUE_TYPE_SDMA: 225 + packet->bitfields2.engine_sel = 226 + engine_sel__mes_unmap_queues__sdma0 + sdma_engine; 227 + break; 228 + default: 229 + WARN(1, "queue type %d", type); 230 + return -EINVAL; 231 + } 232 + 233 + if (reset) 234 + packet->bitfields2.action = 235 + action__mes_unmap_queues__reset_queues; 236 + else 237 + packet->bitfields2.action = 238 + action__mes_unmap_queues__preempt_queues; 239 + 240 + switch (filter) { 241 + case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: 242 + packet->bitfields2.queue_sel = 243 + queue_sel__mes_unmap_queues__perform_request_on_specified_queues; 244 
+ packet->bitfields2.num_queues = 1; 245 + packet->bitfields3b.doorbell_offset0 = filter_param; 246 + break; 247 + case KFD_UNMAP_QUEUES_FILTER_BY_PASID: 248 + packet->bitfields2.queue_sel = 249 + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; 250 + packet->bitfields3a.pasid = filter_param; 251 + break; 252 + case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: 253 + packet->bitfields2.queue_sel = 254 + queue_sel__mes_unmap_queues__unmap_all_queues; 255 + break; 256 + case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: 257 + /* in this case, we do not preempt static queues */ 258 + packet->bitfields2.queue_sel = 259 + queue_sel__mes_unmap_queues__unmap_all_non_static_queues; 260 + break; 261 + default: 262 + WARN(1, "filter %d", filter); 263 + return -EINVAL; 264 + } 265 + 266 + return 0; 267 + 268 + } 269 + 270 + static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer, 271 + uint64_t fence_address, uint32_t fence_value) 272 + { 273 + struct pm4_mes_query_status *packet; 274 + 275 + packet = (struct pm4_mes_query_status *)buffer; 276 + memset(buffer, 0, sizeof(struct pm4_mes_query_status)); 277 + 278 + 279 + packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, 280 + sizeof(struct pm4_mes_query_status)); 281 + 282 + packet->bitfields2.context_id = 0; 283 + packet->bitfields2.interrupt_sel = 284 + interrupt_sel__mes_query_status__completion_status; 285 + packet->bitfields2.command = 286 + command__mes_query_status__fence_only_after_write_ack; 287 + 288 + packet->addr_hi = upper_32_bits((uint64_t)fence_address); 289 + packet->addr_lo = lower_32_bits((uint64_t)fence_address); 290 + packet->data_hi = upper_32_bits((uint64_t)fence_value); 291 + packet->data_lo = lower_32_bits((uint64_t)fence_value); 292 + 293 + return 0; 294 + } 295 + 296 + 297 + static int pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer) 298 + { 299 + struct pm4_mec_release_mem *packet; 300 + 301 + packet = (struct pm4_mec_release_mem *)buffer; 302 + memset(buffer, 0, 
sizeof(struct pm4_mec_release_mem)); 303 + 304 + packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, 305 + sizeof(struct pm4_mec_release_mem)); 306 + 307 + packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; 308 + packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe; 309 + packet->bitfields2.tcl1_action_ena = 1; 310 + packet->bitfields2.tc_action_ena = 1; 311 + packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru; 312 + 313 + packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low; 314 + packet->bitfields3.int_sel = 315 + int_sel__mec_release_mem__send_interrupt_after_write_confirm; 316 + 317 + packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; 318 + packet->address_hi = upper_32_bits(gpu_addr); 319 + 320 + packet->data_lo = 0; 321 + 322 + return 0; 323 + } 324 + 325 + const struct packet_manager_funcs kfd_v9_pm_funcs = { 326 + .map_process = pm_map_process_v9, 327 + .runlist = pm_runlist_v9, 328 + .set_resources = pm_set_resources_vi, 329 + .map_queues = pm_map_queues_v9, 330 + .unmap_queues = pm_unmap_queues_v9, 331 + .query_status = pm_query_status_v9, 332 + .release_mem = pm_release_mem_v9, 333 + .map_process_size = sizeof(struct pm4_mes_map_process), 334 + .runlist_size = sizeof(struct pm4_mes_runlist), 335 + .set_resources_size = sizeof(struct pm4_mes_set_resources), 336 + .map_queues_size = sizeof(struct pm4_mes_map_queues), 337 + .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), 338 + .query_status_size = sizeof(struct pm4_mes_query_status), 339 + .release_mem_size = sizeof(struct pm4_mec_release_mem) 340 + };
+319
drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
··· 22 22 */ 23 23 24 24 #include "kfd_kernel_queue.h" 25 + #include "kfd_device_queue_manager.h" 26 + #include "kfd_pm4_headers_vi.h" 27 + #include "kfd_pm4_opcodes.h" 25 28 26 29 static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, 27 30 enum kfd_queue_type type, unsigned int queue_size); 28 31 static void uninitialize_vi(struct kernel_queue *kq); 32 + static void submit_packet_vi(struct kernel_queue *kq); 29 33 30 34 void kernel_queue_init_vi(struct kernel_queue_ops *ops) 31 35 { 32 36 ops->initialize = initialize_vi; 33 37 ops->uninitialize = uninitialize_vi; 38 + ops->submit_packet = submit_packet_vi; 34 39 } 35 40 36 41 static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, ··· 59 54 { 60 55 kfd_gtt_sa_free(kq->dev, kq->eop_mem); 61 56 } 57 + 58 + static void submit_packet_vi(struct kernel_queue *kq) 59 + { 60 + *kq->wptr_kernel = kq->pending_wptr; 61 + write_kernel_doorbell(kq->queue->properties.doorbell_ptr, 62 + kq->pending_wptr); 63 + } 64 + 65 + unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size) 66 + { 67 + union PM4_MES_TYPE_3_HEADER header; 68 + 69 + header.u32All = 0; 70 + header.opcode = opcode; 71 + header.count = packet_size / 4 - 2; 72 + header.type = PM4_TYPE_3; 73 + 74 + return header.u32All; 75 + } 76 + 77 + static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer, 78 + struct qcm_process_device *qpd) 79 + { 80 + struct pm4_mes_map_process *packet; 81 + 82 + packet = (struct pm4_mes_map_process *)buffer; 83 + 84 + memset(buffer, 0, sizeof(struct pm4_mes_map_process)); 85 + 86 + packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, 87 + sizeof(struct pm4_mes_map_process)); 88 + packet->bitfields2.diq_enable = (qpd->is_debug) ? 
1 : 0; 89 + packet->bitfields2.process_quantum = 1; 90 + packet->bitfields2.pasid = qpd->pqm->process->pasid; 91 + packet->bitfields3.page_table_base = qpd->page_table_base; 92 + packet->bitfields10.gds_size = qpd->gds_size; 93 + packet->bitfields10.num_gws = qpd->num_gws; 94 + packet->bitfields10.num_oac = qpd->num_oac; 95 + packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; 96 + 97 + packet->sh_mem_config = qpd->sh_mem_config; 98 + packet->sh_mem_bases = qpd->sh_mem_bases; 99 + packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; 100 + packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; 101 + 102 + packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; 103 + 104 + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); 105 + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); 106 + 107 + return 0; 108 + } 109 + 110 + static int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, 111 + uint64_t ib, size_t ib_size_in_dwords, bool chain) 112 + { 113 + struct pm4_mes_runlist *packet; 114 + int concurrent_proc_cnt = 0; 115 + struct kfd_dev *kfd = pm->dqm->dev; 116 + 117 + if (WARN_ON(!ib)) 118 + return -EFAULT; 119 + 120 + /* Determine the number of processes to map together to HW: 121 + * it can not exceed the number of VMIDs available to the 122 + * scheduler, and it is determined by the smaller of the number 123 + * of processes in the runlist and kfd module parameter 124 + * hws_max_conc_proc. 125 + * Note: the arbitration between the number of VMIDs and 126 + * hws_max_conc_proc has been done in 127 + * kgd2kfd_device_init(). 
128 + */ 129 + concurrent_proc_cnt = min(pm->dqm->processes_count, 130 + kfd->max_proc_per_quantum); 131 + 132 + packet = (struct pm4_mes_runlist *)buffer; 133 + 134 + memset(buffer, 0, sizeof(struct pm4_mes_runlist)); 135 + packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, 136 + sizeof(struct pm4_mes_runlist)); 137 + 138 + packet->bitfields4.ib_size = ib_size_in_dwords; 139 + packet->bitfields4.chain = chain ? 1 : 0; 140 + packet->bitfields4.offload_polling = 0; 141 + packet->bitfields4.valid = 1; 142 + packet->bitfields4.process_cnt = concurrent_proc_cnt; 143 + packet->ordinal2 = lower_32_bits(ib); 144 + packet->bitfields3.ib_base_hi = upper_32_bits(ib); 145 + 146 + return 0; 147 + } 148 + 149 + int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, 150 + struct scheduling_resources *res) 151 + { 152 + struct pm4_mes_set_resources *packet; 153 + 154 + packet = (struct pm4_mes_set_resources *)buffer; 155 + memset(buffer, 0, sizeof(struct pm4_mes_set_resources)); 156 + 157 + packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES, 158 + sizeof(struct pm4_mes_set_resources)); 159 + 160 + packet->bitfields2.queue_type = 161 + queue_type__mes_set_resources__hsa_interface_queue_hiq; 162 + packet->bitfields2.vmid_mask = res->vmid_mask; 163 + packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; 164 + packet->bitfields7.oac_mask = res->oac_mask; 165 + packet->bitfields8.gds_heap_base = res->gds_heap_base; 166 + packet->bitfields8.gds_heap_size = res->gds_heap_size; 167 + 168 + packet->gws_mask_lo = lower_32_bits(res->gws_mask); 169 + packet->gws_mask_hi = upper_32_bits(res->gws_mask); 170 + 171 + packet->queue_mask_lo = lower_32_bits(res->queue_mask); 172 + packet->queue_mask_hi = upper_32_bits(res->queue_mask); 173 + 174 + return 0; 175 + } 176 + 177 + static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, 178 + struct queue *q, bool is_static) 179 + { 180 + struct pm4_mes_map_queues *packet; 181 + bool use_static = 
is_static; 182 + 183 + packet = (struct pm4_mes_map_queues *)buffer; 184 + memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); 185 + 186 + packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, 187 + sizeof(struct pm4_mes_map_queues)); 188 + packet->bitfields2.alloc_format = 189 + alloc_format__mes_map_queues__one_per_pipe_vi; 190 + packet->bitfields2.num_queues = 1; 191 + packet->bitfields2.queue_sel = 192 + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; 193 + 194 + packet->bitfields2.engine_sel = 195 + engine_sel__mes_map_queues__compute_vi; 196 + packet->bitfields2.queue_type = 197 + queue_type__mes_map_queues__normal_compute_vi; 198 + 199 + switch (q->properties.type) { 200 + case KFD_QUEUE_TYPE_COMPUTE: 201 + if (use_static) 202 + packet->bitfields2.queue_type = 203 + queue_type__mes_map_queues__normal_latency_static_queue_vi; 204 + break; 205 + case KFD_QUEUE_TYPE_DIQ: 206 + packet->bitfields2.queue_type = 207 + queue_type__mes_map_queues__debug_interface_queue_vi; 208 + break; 209 + case KFD_QUEUE_TYPE_SDMA: 210 + packet->bitfields2.engine_sel = q->properties.sdma_engine_id + 211 + engine_sel__mes_map_queues__sdma0_vi; 212 + use_static = false; /* no static queues under SDMA */ 213 + break; 214 + default: 215 + WARN(1, "queue type %d", q->properties.type); 216 + return -EINVAL; 217 + } 218 + packet->bitfields3.doorbell_offset = 219 + q->properties.doorbell_off; 220 + 221 + packet->mqd_addr_lo = 222 + lower_32_bits(q->gart_mqd_addr); 223 + 224 + packet->mqd_addr_hi = 225 + upper_32_bits(q->gart_mqd_addr); 226 + 227 + packet->wptr_addr_lo = 228 + lower_32_bits((uint64_t)q->properties.write_ptr); 229 + 230 + packet->wptr_addr_hi = 231 + upper_32_bits((uint64_t)q->properties.write_ptr); 232 + 233 + return 0; 234 + } 235 + 236 + static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, 237 + enum kfd_queue_type type, 238 + enum kfd_unmap_queues_filter filter, 239 + uint32_t filter_param, bool reset, 240 + unsigned int 
sdma_engine) 241 + { 242 + struct pm4_mes_unmap_queues *packet; 243 + 244 + packet = (struct pm4_mes_unmap_queues *)buffer; 245 + memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); 246 + 247 + packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, 248 + sizeof(struct pm4_mes_unmap_queues)); 249 + switch (type) { 250 + case KFD_QUEUE_TYPE_COMPUTE: 251 + case KFD_QUEUE_TYPE_DIQ: 252 + packet->bitfields2.engine_sel = 253 + engine_sel__mes_unmap_queues__compute; 254 + break; 255 + case KFD_QUEUE_TYPE_SDMA: 256 + packet->bitfields2.engine_sel = 257 + engine_sel__mes_unmap_queues__sdma0 + sdma_engine; 258 + break; 259 + default: 260 + WARN(1, "queue type %d", type); 261 + return -EINVAL; 262 + } 263 + 264 + if (reset) 265 + packet->bitfields2.action = 266 + action__mes_unmap_queues__reset_queues; 267 + else 268 + packet->bitfields2.action = 269 + action__mes_unmap_queues__preempt_queues; 270 + 271 + switch (filter) { 272 + case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: 273 + packet->bitfields2.queue_sel = 274 + queue_sel__mes_unmap_queues__perform_request_on_specified_queues; 275 + packet->bitfields2.num_queues = 1; 276 + packet->bitfields3b.doorbell_offset0 = filter_param; 277 + break; 278 + case KFD_UNMAP_QUEUES_FILTER_BY_PASID: 279 + packet->bitfields2.queue_sel = 280 + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; 281 + packet->bitfields3a.pasid = filter_param; 282 + break; 283 + case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: 284 + packet->bitfields2.queue_sel = 285 + queue_sel__mes_unmap_queues__unmap_all_queues; 286 + break; 287 + case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: 288 + /* in this case, we do not preempt static queues */ 289 + packet->bitfields2.queue_sel = 290 + queue_sel__mes_unmap_queues__unmap_all_non_static_queues; 291 + break; 292 + default: 293 + WARN(1, "filter %d", filter); 294 + return -EINVAL; 295 + } 296 + 297 + return 0; 298 + 299 + } 300 + 301 + static int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, 302 
+ uint64_t fence_address, uint32_t fence_value) 303 + { 304 + struct pm4_mes_query_status *packet; 305 + 306 + packet = (struct pm4_mes_query_status *)buffer; 307 + memset(buffer, 0, sizeof(struct pm4_mes_query_status)); 308 + 309 + packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, 310 + sizeof(struct pm4_mes_query_status)); 311 + 312 + packet->bitfields2.context_id = 0; 313 + packet->bitfields2.interrupt_sel = 314 + interrupt_sel__mes_query_status__completion_status; 315 + packet->bitfields2.command = 316 + command__mes_query_status__fence_only_after_write_ack; 317 + 318 + packet->addr_hi = upper_32_bits((uint64_t)fence_address); 319 + packet->addr_lo = lower_32_bits((uint64_t)fence_address); 320 + packet->data_hi = upper_32_bits((uint64_t)fence_value); 321 + packet->data_lo = lower_32_bits((uint64_t)fence_value); 322 + 323 + return 0; 324 + } 325 + 326 + static int pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer) 327 + { 328 + struct pm4_mec_release_mem *packet; 329 + 330 + packet = (struct pm4_mec_release_mem *)buffer; 331 + memset(buffer, 0, sizeof(*packet)); 332 + 333 + packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, 334 + sizeof(*packet)); 335 + 336 + packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; 337 + packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; 338 + packet->bitfields2.tcl1_action_ena = 1; 339 + packet->bitfields2.tc_action_ena = 1; 340 + packet->bitfields2.cache_policy = cache_policy___release_mem__lru; 341 + packet->bitfields2.atc = 0; 342 + 343 + packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low; 344 + packet->bitfields3.int_sel = 345 + int_sel___release_mem__send_interrupt_after_write_confirm; 346 + 347 + packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; 348 + packet->address_hi = upper_32_bits(gpu_addr); 349 + 350 + packet->data_lo = 0; 351 + 352 + return 0; 353 + } 354 + 355 + const struct packet_manager_funcs kfd_vi_pm_funcs = { 356 + 
.map_process = pm_map_process_vi, 357 + .runlist = pm_runlist_vi, 358 + .set_resources = pm_set_resources_vi, 359 + .map_queues = pm_map_queues_vi, 360 + .unmap_queues = pm_unmap_queues_vi, 361 + .query_status = pm_query_status_vi, 362 + .release_mem = pm_release_mem_vi, 363 + .map_process_size = sizeof(struct pm4_mes_map_process), 364 + .runlist_size = sizeof(struct pm4_mes_runlist), 365 + .set_resources_size = sizeof(struct pm4_mes_set_resources), 366 + .map_queues_size = sizeof(struct pm4_mes_map_queues), 367 + .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), 368 + .query_status_size = sizeof(struct pm4_mes_query_status), 369 + .release_mem_size = sizeof(struct pm4_mec_release_mem) 370 + };
+7
drivers/gpu/drm/amd/amdkfd/kfd_module.c
··· 43 43 .interrupt = kgd2kfd_interrupt, 44 44 .suspend = kgd2kfd_suspend, 45 45 .resume = kgd2kfd_resume, 46 + .quiesce_mm = kgd2kfd_quiesce_mm, 47 + .resume_mm = kgd2kfd_resume_mm, 46 48 .schedule_evict_and_restore_process = 47 49 kgd2kfd_schedule_evict_and_restore_process, 48 50 }; ··· 82 80 module_param(ignore_crat, int, 0444); 83 81 MODULE_PARM_DESC(ignore_crat, 84 82 "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); 83 + 84 + int vega10_noretry; 85 + module_param_named(noretry, vega10_noretry, int, 0644); 86 + MODULE_PARM_DESC(noretry, 87 + "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)"); 85 88 86 89 static int amdkfd_init_completed; 87 90
+3
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
··· 38 38 case CHIP_POLARIS10: 39 39 case CHIP_POLARIS11: 40 40 return mqd_manager_init_vi_tonga(type, dev); 41 + case CHIP_VEGA10: 42 + case CHIP_RAVEN: 43 + return mqd_manager_init_v9(type, dev); 41 44 default: 42 45 WARN(1, "Unexpected ASIC family %u", 43 46 dev->device_info->asic_family);
+1 -5
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
··· 79 79 m->cp_mqd_base_addr_lo = lower_32_bits(addr); 80 80 m->cp_mqd_base_addr_hi = upper_32_bits(addr); 81 81 82 - m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE | IB_ATC_EN; 83 - /* Although WinKFD writes this, I suspect it should not be necessary */ 84 - m->cp_hqd_ib_control = IB_ATC_EN | DEFAULT_MIN_IB_AVAIL_SIZE; 85 - 86 82 m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | 87 83 QUANTUM_DURATION(10); 88 84 ··· 408 412 if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) 409 413 return NULL; 410 414 411 - mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); 415 + mqd = kzalloc(sizeof(*mqd), GFP_NOIO); 412 416 if (!mqd) 413 417 return NULL; 414 418
+443
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
··· 1 + /* 2 + * Copyright 2016-2018 Advanced Micro Devices, Inc. 3 + * 4 + * Permission is hereby granted, free of charge, to any person obtaining a 5 + * copy of this software and associated documentation files (the "Software"), 6 + * to deal in the Software without restriction, including without limitation 7 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 + * and/or sell copies of the Software, and to permit persons to whom the 9 + * Software is furnished to do so, subject to the following conditions: 10 + * 11 + * The above copyright notice and this permission notice shall be included in 12 + * all copies or substantial portions of the Software. 13 + * 14 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 + * OTHER DEALINGS IN THE SOFTWARE. 
21 + * 22 + */ 23 + 24 + #include <linux/printk.h> 25 + #include <linux/slab.h> 26 + #include <linux/uaccess.h> 27 + #include "kfd_priv.h" 28 + #include "kfd_mqd_manager.h" 29 + #include "v9_structs.h" 30 + #include "gc/gc_9_0_offset.h" 31 + #include "gc/gc_9_0_sh_mask.h" 32 + #include "sdma0/sdma0_4_0_sh_mask.h" 33 + 34 + static inline struct v9_mqd *get_mqd(void *mqd) 35 + { 36 + return (struct v9_mqd *)mqd; 37 + } 38 + 39 + static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) 40 + { 41 + return (struct v9_sdma_mqd *)mqd; 42 + } 43 + 44 + static int init_mqd(struct mqd_manager *mm, void **mqd, 45 + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, 46 + struct queue_properties *q) 47 + { 48 + int retval; 49 + uint64_t addr; 50 + struct v9_mqd *m; 51 + struct kfd_dev *kfd = mm->dev; 52 + 53 + /* From V9, for CWSR, the control stack is located on the next page 54 + * boundary after the mqd, we will use the gtt allocation function 55 + * instead of sub-allocation function. 56 + */ 57 + if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { 58 + *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); 59 + if (!*mqd_mem_obj) 60 + return -ENOMEM; 61 + retval = kfd->kfd2kgd->init_gtt_mem_allocation(kfd->kgd, 62 + ALIGN(q->ctl_stack_size, PAGE_SIZE) + 63 + ALIGN(sizeof(struct v9_mqd), PAGE_SIZE), 64 + &((*mqd_mem_obj)->gtt_mem), 65 + &((*mqd_mem_obj)->gpu_addr), 66 + (void *)&((*mqd_mem_obj)->cpu_ptr)); 67 + } else 68 + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd), 69 + mqd_mem_obj); 70 + if (retval != 0) 71 + return -ENOMEM; 72 + 73 + m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr; 74 + addr = (*mqd_mem_obj)->gpu_addr; 75 + 76 + memset(m, 0, sizeof(struct v9_mqd)); 77 + 78 + m->header = 0xC0310800; 79 + m->compute_pipelinestat_enable = 1; 80 + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; 81 + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; 82 + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; 83 + m->compute_static_thread_mgmt_se3 
= 0xFFFFFFFF; 84 + 85 + m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 86 + 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; 87 + 88 + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; 89 + 90 + m->cp_mqd_base_addr_lo = lower_32_bits(addr); 91 + m->cp_mqd_base_addr_hi = upper_32_bits(addr); 92 + 93 + m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | 94 + 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | 95 + 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; 96 + 97 + m->cp_hqd_pipe_priority = 1; 98 + m->cp_hqd_queue_priority = 15; 99 + 100 + if (q->format == KFD_QUEUE_FORMAT_AQL) { 101 + m->cp_hqd_aql_control = 102 + 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; 103 + } 104 + 105 + if (q->tba_addr) { 106 + m->compute_pgm_rsrc2 |= 107 + (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); 108 + } 109 + 110 + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) { 111 + m->cp_hqd_persistent_state |= 112 + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); 113 + m->cp_hqd_ctx_save_base_addr_lo = 114 + lower_32_bits(q->ctx_save_restore_area_address); 115 + m->cp_hqd_ctx_save_base_addr_hi = 116 + upper_32_bits(q->ctx_save_restore_area_address); 117 + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; 118 + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; 119 + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; 120 + m->cp_hqd_wg_state_offset = q->ctl_stack_size; 121 + } 122 + 123 + *mqd = m; 124 + if (gart_addr) 125 + *gart_addr = addr; 126 + retval = mm->update_mqd(mm, m, q); 127 + 128 + return retval; 129 + } 130 + 131 + static int load_mqd(struct mqd_manager *mm, void *mqd, 132 + uint32_t pipe_id, uint32_t queue_id, 133 + struct queue_properties *p, struct mm_struct *mms) 134 + { 135 + /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ 136 + uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); 137 + 138 + return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, 139 + (uint32_t __user *)p->write_ptr, 140 + wptr_shift, 0, mms); 141 + } 142 + 143 + static int update_mqd(struct mqd_manager *mm, void *mqd, 144 + struct queue_properties *q) 145 + { 146 + struct v9_mqd *m; 147 + 148 + m = get_mqd(mqd); 149 + 150 + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; 151 + m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; 152 + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); 153 + 154 + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); 155 + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); 156 + 157 + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); 158 + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); 159 + m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); 160 + m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); 161 + 162 + m->cp_hqd_pq_doorbell_control = 163 + q->doorbell_off << 164 + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; 165 + pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", 166 + m->cp_hqd_pq_doorbell_control); 167 + 168 + m->cp_hqd_ib_control = 169 + 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT | 170 + 1 << CP_HQD_IB_CONTROL__IB_EXE_DISABLE__SHIFT; 171 + 172 + /* 173 + * HW does not clamp this field correctly. Maximum EOP queue size 174 + * is constrained by per-SE EOP done signal count, which is 8-bit. 175 + * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit 176 + * more than (EOP entry count - 1) so a queue size of 0x800 dwords 177 + * is safe, giving a maximum field value of 0xA. 
178 + */ 179 + m->cp_hqd_eop_control = min(0xA, 180 + order_base_2(q->eop_ring_buffer_size / 4) - 1); 181 + m->cp_hqd_eop_base_addr_lo = 182 + lower_32_bits(q->eop_ring_buffer_address >> 8); 183 + m->cp_hqd_eop_base_addr_hi = 184 + upper_32_bits(q->eop_ring_buffer_address >> 8); 185 + 186 + m->cp_hqd_iq_timer = 0; 187 + 188 + m->cp_hqd_vmid = q->vmid; 189 + 190 + if (q->format == KFD_QUEUE_FORMAT_AQL) { 191 + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | 192 + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | 193 + 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT | 194 + 1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT; 195 + m->cp_hqd_pq_doorbell_control |= 1 << 196 + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; 197 + } 198 + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) 199 + m->cp_hqd_ctx_save_control = 0; 200 + 201 + q->is_active = (q->queue_size > 0 && 202 + q->queue_address != 0 && 203 + q->queue_percent > 0 && 204 + !q->is_evicted); 205 + 206 + return 0; 207 + } 208 + 209 + 210 + static int destroy_mqd(struct mqd_manager *mm, void *mqd, 211 + enum kfd_preempt_type type, 212 + unsigned int timeout, uint32_t pipe_id, 213 + uint32_t queue_id) 214 + { 215 + return mm->dev->kfd2kgd->hqd_destroy 216 + (mm->dev->kgd, mqd, type, timeout, 217 + pipe_id, queue_id); 218 + } 219 + 220 + static void uninit_mqd(struct mqd_manager *mm, void *mqd, 221 + struct kfd_mem_obj *mqd_mem_obj) 222 + { 223 + struct kfd_dev *kfd = mm->dev; 224 + 225 + if (mqd_mem_obj->gtt_mem) { 226 + kfd->kfd2kgd->free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem); 227 + kfree(mqd_mem_obj); 228 + } else { 229 + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); 230 + } 231 + } 232 + 233 + static bool is_occupied(struct mqd_manager *mm, void *mqd, 234 + uint64_t queue_address, uint32_t pipe_id, 235 + uint32_t queue_id) 236 + { 237 + return mm->dev->kfd2kgd->hqd_is_occupied( 238 + mm->dev->kgd, queue_address, 239 + pipe_id, queue_id); 240 + } 241 + 242 + static int init_mqd_hiq(struct 
mqd_manager *mm, void **mqd, 243 + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, 244 + struct queue_properties *q) 245 + { 246 + struct v9_mqd *m; 247 + int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); 248 + 249 + if (retval != 0) 250 + return retval; 251 + 252 + m = get_mqd(*mqd); 253 + 254 + m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | 255 + 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; 256 + 257 + return retval; 258 + } 259 + 260 + static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, 261 + struct queue_properties *q) 262 + { 263 + struct v9_mqd *m; 264 + int retval = update_mqd(mm, mqd, q); 265 + 266 + if (retval != 0) 267 + return retval; 268 + 269 + /* TODO: what's the point? update_mqd already does this. */ 270 + m = get_mqd(mqd); 271 + m->cp_hqd_vmid = q->vmid; 272 + return retval; 273 + } 274 + 275 + static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, 276 + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, 277 + struct queue_properties *q) 278 + { 279 + int retval; 280 + struct v9_sdma_mqd *m; 281 + 282 + 283 + retval = kfd_gtt_sa_allocate(mm->dev, 284 + sizeof(struct v9_sdma_mqd), 285 + mqd_mem_obj); 286 + 287 + if (retval != 0) 288 + return -ENOMEM; 289 + 290 + m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; 291 + 292 + memset(m, 0, sizeof(struct v9_sdma_mqd)); 293 + 294 + *mqd = m; 295 + if (gart_addr) 296 + *gart_addr = (*mqd_mem_obj)->gpu_addr; 297 + 298 + retval = mm->update_mqd(mm, m, q); 299 + 300 + return retval; 301 + } 302 + 303 + static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, 304 + struct kfd_mem_obj *mqd_mem_obj) 305 + { 306 + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); 307 + } 308 + 309 + static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, 310 + uint32_t pipe_id, uint32_t queue_id, 311 + struct queue_properties *p, struct mm_struct *mms) 312 + { 313 + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, 314 + (uint32_t __user *)p->write_ptr, 315 + mms); 316 
+ } 317 + 318 + #define SDMA_RLC_DUMMY_DEFAULT 0xf 319 + 320 + static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, 321 + struct queue_properties *q) 322 + { 323 + struct v9_sdma_mqd *m; 324 + 325 + m = get_sdma_mqd(mqd); 326 + m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4) 327 + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | 328 + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | 329 + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | 330 + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; 331 + 332 + m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); 333 + m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); 334 + m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); 335 + m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); 336 + m->sdmax_rlcx_doorbell_offset = 337 + q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT; 338 + 339 + m->sdma_engine_id = q->sdma_engine_id; 340 + m->sdma_queue_id = q->sdma_queue_id; 341 + m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; 342 + 343 + q->is_active = (q->queue_size > 0 && 344 + q->queue_address != 0 && 345 + q->queue_percent > 0 && 346 + !q->is_evicted); 347 + 348 + return 0; 349 + } 350 + 351 + /* 352 + * * preempt type here is ignored because there is only one way 353 + * * to preempt sdma queue 354 + */ 355 + static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, 356 + enum kfd_preempt_type type, 357 + unsigned int timeout, uint32_t pipe_id, 358 + uint32_t queue_id) 359 + { 360 + return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); 361 + } 362 + 363 + static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, 364 + uint64_t queue_address, uint32_t pipe_id, 365 + uint32_t queue_id) 366 + { 367 + return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); 368 + } 369 + 370 + #if defined(CONFIG_DEBUG_FS) 371 + 372 + static int debugfs_show_mqd(struct seq_file *m, void *data) 373 + { 374 + 
seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, 375 + data, sizeof(struct v9_mqd), false); 376 + return 0; 377 + } 378 + 379 + static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) 380 + { 381 + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, 382 + data, sizeof(struct v9_sdma_mqd), false); 383 + return 0; 384 + } 385 + 386 + #endif 387 + 388 + struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, 389 + struct kfd_dev *dev) 390 + { 391 + struct mqd_manager *mqd; 392 + 393 + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) 394 + return NULL; 395 + 396 + mqd = kzalloc(sizeof(*mqd), GFP_NOIO); 397 + if (!mqd) 398 + return NULL; 399 + 400 + mqd->dev = dev; 401 + 402 + switch (type) { 403 + case KFD_MQD_TYPE_CP: 404 + case KFD_MQD_TYPE_COMPUTE: 405 + mqd->init_mqd = init_mqd; 406 + mqd->uninit_mqd = uninit_mqd; 407 + mqd->load_mqd = load_mqd; 408 + mqd->update_mqd = update_mqd; 409 + mqd->destroy_mqd = destroy_mqd; 410 + mqd->is_occupied = is_occupied; 411 + #if defined(CONFIG_DEBUG_FS) 412 + mqd->debugfs_show_mqd = debugfs_show_mqd; 413 + #endif 414 + break; 415 + case KFD_MQD_TYPE_HIQ: 416 + mqd->init_mqd = init_mqd_hiq; 417 + mqd->uninit_mqd = uninit_mqd; 418 + mqd->load_mqd = load_mqd; 419 + mqd->update_mqd = update_mqd_hiq; 420 + mqd->destroy_mqd = destroy_mqd; 421 + mqd->is_occupied = is_occupied; 422 + #if defined(CONFIG_DEBUG_FS) 423 + mqd->debugfs_show_mqd = debugfs_show_mqd; 424 + #endif 425 + break; 426 + case KFD_MQD_TYPE_SDMA: 427 + mqd->init_mqd = init_mqd_sdma; 428 + mqd->uninit_mqd = uninit_mqd_sdma; 429 + mqd->load_mqd = load_mqd_sdma; 430 + mqd->update_mqd = update_mqd_sdma; 431 + mqd->destroy_mqd = destroy_mqd_sdma; 432 + mqd->is_occupied = is_occupied_sdma; 433 + #if defined(CONFIG_DEBUG_FS) 434 + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; 435 + #endif 436 + break; 437 + default: 438 + kfree(mqd); 439 + return NULL; 440 + } 441 + 442 + return mqd; 443 + }
+1 -1
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
··· 394 394 if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) 395 395 return NULL; 396 396 397 - mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); 397 + mqd = kzalloc(sizeof(*mqd), GFP_NOIO); 398 398 if (!mqd) 399 399 return NULL; 400 400
+80 -312
drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
··· 26 26 #include "kfd_device_queue_manager.h" 27 27 #include "kfd_kernel_queue.h" 28 28 #include "kfd_priv.h" 29 - #include "kfd_pm4_headers_vi.h" 30 - #include "kfd_pm4_opcodes.h" 31 29 32 30 static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, 33 31 unsigned int buffer_size_bytes) ··· 35 37 WARN((temp * sizeof(uint32_t)) > buffer_size_bytes, 36 38 "Runlist IB overflow"); 37 39 *wptr = temp; 38 - } 39 - 40 - static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size) 41 - { 42 - union PM4_MES_TYPE_3_HEADER header; 43 - 44 - header.u32All = 0; 45 - header.opcode = opcode; 46 - header.count = packet_size / 4 - 2; 47 - header.type = PM4_TYPE_3; 48 - 49 - return header.u32All; 50 40 } 51 41 52 42 static void pm_calc_rlib_size(struct packet_manager *pm, ··· 66 80 pr_debug("Over subscribed runlist\n"); 67 81 } 68 82 69 - map_queue_size = sizeof(struct pm4_mes_map_queues); 83 + map_queue_size = pm->pmf->map_queues_size; 70 84 /* calculate run list ib allocation size */ 71 - *rlib_size = process_count * sizeof(struct pm4_mes_map_process) + 85 + *rlib_size = process_count * pm->pmf->map_process_size + 72 86 queue_count * map_queue_size; 73 87 74 88 /* ··· 76 90 * when over subscription 77 91 */ 78 92 if (*over_subscription) 79 - *rlib_size += sizeof(struct pm4_mes_runlist); 93 + *rlib_size += pm->pmf->runlist_size; 80 94 81 95 pr_debug("runlist ib size %d\n", *rlib_size); 82 96 } ··· 94 108 95 109 pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); 96 110 111 + mutex_lock(&pm->lock); 112 + 97 113 retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, 98 114 &pm->ib_buffer_obj); 99 115 100 116 if (retval) { 101 117 pr_err("Failed to allocate runlist IB\n"); 102 - return retval; 118 + goto out; 103 119 } 104 120 105 121 *(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr; ··· 109 121 110 122 memset(*rl_buffer, 0, *rl_buffer_size); 111 123 pm->allocated = true; 124 + 125 + out: 126 + mutex_unlock(&pm->lock); 112 127 
return retval; 113 - } 114 - 115 - static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, 116 - uint64_t ib, size_t ib_size_in_dwords, bool chain) 117 - { 118 - struct pm4_mes_runlist *packet; 119 - int concurrent_proc_cnt = 0; 120 - struct kfd_dev *kfd = pm->dqm->dev; 121 - 122 - if (WARN_ON(!ib)) 123 - return -EFAULT; 124 - 125 - /* Determine the number of processes to map together to HW: 126 - * it can not exceed the number of VMIDs available to the 127 - * scheduler, and it is determined by the smaller of the number 128 - * of processes in the runlist and kfd module parameter 129 - * hws_max_conc_proc. 130 - * Note: the arbitration between the number of VMIDs and 131 - * hws_max_conc_proc has been done in 132 - * kgd2kfd_device_init(). 133 - */ 134 - concurrent_proc_cnt = min(pm->dqm->processes_count, 135 - kfd->max_proc_per_quantum); 136 - 137 - packet = (struct pm4_mes_runlist *)buffer; 138 - 139 - memset(buffer, 0, sizeof(struct pm4_mes_runlist)); 140 - packet->header.u32All = build_pm4_header(IT_RUN_LIST, 141 - sizeof(struct pm4_mes_runlist)); 142 - 143 - packet->bitfields4.ib_size = ib_size_in_dwords; 144 - packet->bitfields4.chain = chain ? 1 : 0; 145 - packet->bitfields4.offload_polling = 0; 146 - packet->bitfields4.valid = 1; 147 - packet->bitfields4.process_cnt = concurrent_proc_cnt; 148 - packet->ordinal2 = lower_32_bits(ib); 149 - packet->bitfields3.ib_base_hi = upper_32_bits(ib); 150 - 151 - return 0; 152 - } 153 - 154 - static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer, 155 - struct qcm_process_device *qpd) 156 - { 157 - struct pm4_mes_map_process *packet; 158 - 159 - packet = (struct pm4_mes_map_process *)buffer; 160 - 161 - memset(buffer, 0, sizeof(struct pm4_mes_map_process)); 162 - 163 - packet->header.u32All = build_pm4_header(IT_MAP_PROCESS, 164 - sizeof(struct pm4_mes_map_process)); 165 - packet->bitfields2.diq_enable = (qpd->is_debug) ? 
1 : 0; 166 - packet->bitfields2.process_quantum = 1; 167 - packet->bitfields2.pasid = qpd->pqm->process->pasid; 168 - packet->bitfields3.page_table_base = qpd->page_table_base; 169 - packet->bitfields10.gds_size = qpd->gds_size; 170 - packet->bitfields10.num_gws = qpd->num_gws; 171 - packet->bitfields10.num_oac = qpd->num_oac; 172 - packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; 173 - 174 - packet->sh_mem_config = qpd->sh_mem_config; 175 - packet->sh_mem_bases = qpd->sh_mem_bases; 176 - packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; 177 - packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; 178 - 179 - packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; 180 - 181 - packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); 182 - packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); 183 - 184 - return 0; 185 - } 186 - 187 - static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer, 188 - struct queue *q, bool is_static) 189 - { 190 - struct pm4_mes_map_queues *packet; 191 - bool use_static = is_static; 192 - 193 - packet = (struct pm4_mes_map_queues *)buffer; 194 - memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); 195 - 196 - packet->header.u32All = build_pm4_header(IT_MAP_QUEUES, 197 - sizeof(struct pm4_mes_map_queues)); 198 - packet->bitfields2.alloc_format = 199 - alloc_format__mes_map_queues__one_per_pipe_vi; 200 - packet->bitfields2.num_queues = 1; 201 - packet->bitfields2.queue_sel = 202 - queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; 203 - 204 - packet->bitfields2.engine_sel = 205 - engine_sel__mes_map_queues__compute_vi; 206 - packet->bitfields2.queue_type = 207 - queue_type__mes_map_queues__normal_compute_vi; 208 - 209 - switch (q->properties.type) { 210 - case KFD_QUEUE_TYPE_COMPUTE: 211 - if (use_static) 212 - packet->bitfields2.queue_type = 213 - queue_type__mes_map_queues__normal_latency_static_queue_vi; 214 - break; 215 - case KFD_QUEUE_TYPE_DIQ: 216 - 
packet->bitfields2.queue_type = 217 - queue_type__mes_map_queues__debug_interface_queue_vi; 218 - break; 219 - case KFD_QUEUE_TYPE_SDMA: 220 - packet->bitfields2.engine_sel = q->properties.sdma_engine_id + 221 - engine_sel__mes_map_queues__sdma0_vi; 222 - use_static = false; /* no static queues under SDMA */ 223 - break; 224 - default: 225 - WARN(1, "queue type %d", q->properties.type); 226 - return -EINVAL; 227 - } 228 - packet->bitfields3.doorbell_offset = 229 - q->properties.doorbell_off; 230 - 231 - packet->mqd_addr_lo = 232 - lower_32_bits(q->gart_mqd_addr); 233 - 234 - packet->mqd_addr_hi = 235 - upper_32_bits(q->gart_mqd_addr); 236 - 237 - packet->wptr_addr_lo = 238 - lower_32_bits((uint64_t)q->properties.write_ptr); 239 - 240 - packet->wptr_addr_hi = 241 - upper_32_bits((uint64_t)q->properties.write_ptr); 242 - 243 - return 0; 244 128 } 245 129 246 130 static int pm_create_runlist_ib(struct packet_manager *pm, ··· 152 292 return -ENOMEM; 153 293 } 154 294 155 - retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd); 295 + retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd); 156 296 if (retval) 157 297 return retval; 158 298 159 299 proccesses_mapped++; 160 - inc_wptr(&rl_wptr, sizeof(struct pm4_mes_map_process), 300 + inc_wptr(&rl_wptr, pm->pmf->map_process_size, 161 301 alloc_size_bytes); 162 302 163 303 list_for_each_entry(kq, &qpd->priv_queue_list, list) { ··· 167 307 pr_debug("static_queue, mapping kernel q %d, is debug status %d\n", 168 308 kq->queue->queue, qpd->is_debug); 169 309 170 - retval = pm_create_map_queue(pm, 310 + retval = pm->pmf->map_queues(pm, 171 311 &rl_buffer[rl_wptr], 172 312 kq->queue, 173 313 qpd->is_debug); ··· 175 315 return retval; 176 316 177 317 inc_wptr(&rl_wptr, 178 - sizeof(struct pm4_mes_map_queues), 318 + pm->pmf->map_queues_size, 179 319 alloc_size_bytes); 180 320 } 181 321 ··· 186 326 pr_debug("static_queue, mapping user queue %d, is debug status %d\n", 187 327 q->queue, qpd->is_debug); 188 328 189 - 
retval = pm_create_map_queue(pm, 329 + retval = pm->pmf->map_queues(pm, 190 330 &rl_buffer[rl_wptr], 191 331 q, 192 332 qpd->is_debug); ··· 195 335 return retval; 196 336 197 337 inc_wptr(&rl_wptr, 198 - sizeof(struct pm4_mes_map_queues), 338 + pm->pmf->map_queues_size, 199 339 alloc_size_bytes); 200 340 } 201 341 } ··· 203 343 pr_debug("Finished map process and queues to runlist\n"); 204 344 205 345 if (is_over_subscription) 206 - retval = pm_create_runlist(pm, &rl_buffer[rl_wptr], 346 + retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr], 207 347 *rl_gpu_addr, 208 348 alloc_size_bytes / sizeof(uint32_t), 209 349 true); ··· 215 355 return retval; 216 356 } 217 357 218 - /* pm_create_release_mem - Create a RELEASE_MEM packet and return the size 219 - * of this packet 220 - * @gpu_addr - GPU address of the packet. It's a virtual address. 221 - * @buffer - buffer to fill up with the packet. It's a CPU kernel pointer 222 - * Return - length of the packet 223 - */ 224 - uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer) 225 - { 226 - struct pm4_mec_release_mem *packet; 227 - 228 - WARN_ON(!buffer); 229 - 230 - packet = (struct pm4_mec_release_mem *)buffer; 231 - memset(buffer, 0, sizeof(*packet)); 232 - 233 - packet->header.u32All = build_pm4_header(IT_RELEASE_MEM, 234 - sizeof(*packet)); 235 - 236 - packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; 237 - packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; 238 - packet->bitfields2.tcl1_action_ena = 1; 239 - packet->bitfields2.tc_action_ena = 1; 240 - packet->bitfields2.cache_policy = cache_policy___release_mem__lru; 241 - packet->bitfields2.atc = 0; 242 - 243 - packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low; 244 - packet->bitfields3.int_sel = 245 - int_sel___release_mem__send_interrupt_after_write_confirm; 246 - 247 - packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; 248 - packet->address_hi = upper_32_bits(gpu_addr); 249 - 250 - 
packet->data_lo = 0; 251 - 252 - return sizeof(*packet) / sizeof(unsigned int); 253 - } 254 - 255 358 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) 256 359 { 360 + switch (dqm->dev->device_info->asic_family) { 361 + case CHIP_KAVERI: 362 + case CHIP_HAWAII: 363 + /* PM4 packet structures on CIK are the same as on VI */ 364 + case CHIP_CARRIZO: 365 + case CHIP_TONGA: 366 + case CHIP_FIJI: 367 + case CHIP_POLARIS10: 368 + case CHIP_POLARIS11: 369 + pm->pmf = &kfd_vi_pm_funcs; 370 + break; 371 + case CHIP_VEGA10: 372 + case CHIP_RAVEN: 373 + pm->pmf = &kfd_v9_pm_funcs; 374 + break; 375 + default: 376 + WARN(1, "Unexpected ASIC family %u", 377 + dqm->dev->device_info->asic_family); 378 + return -EINVAL; 379 + } 380 + 257 381 pm->dqm = dqm; 258 382 mutex_init(&pm->lock); 259 383 pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ); ··· 259 415 int pm_send_set_resources(struct packet_manager *pm, 260 416 struct scheduling_resources *res) 261 417 { 262 - struct pm4_mes_set_resources *packet; 418 + uint32_t *buffer, size; 263 419 int retval = 0; 264 420 421 + size = pm->pmf->set_resources_size; 265 422 mutex_lock(&pm->lock); 266 423 pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, 267 - sizeof(*packet) / sizeof(uint32_t), 268 - (unsigned int **)&packet); 269 - if (!packet) { 424 + size / sizeof(uint32_t), 425 + (unsigned int **)&buffer); 426 + if (!buffer) { 270 427 pr_err("Failed to allocate buffer on kernel queue\n"); 271 428 retval = -ENOMEM; 272 429 goto out; 273 430 } 274 431 275 - memset(packet, 0, sizeof(struct pm4_mes_set_resources)); 276 - packet->header.u32All = build_pm4_header(IT_SET_RESOURCES, 277 - sizeof(struct pm4_mes_set_resources)); 278 - 279 - packet->bitfields2.queue_type = 280 - queue_type__mes_set_resources__hsa_interface_queue_hiq; 281 - packet->bitfields2.vmid_mask = res->vmid_mask; 282 - packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; 283 - packet->bitfields7.oac_mask = res->oac_mask; 
284 - packet->bitfields8.gds_heap_base = res->gds_heap_base; 285 - packet->bitfields8.gds_heap_size = res->gds_heap_size; 286 - 287 - packet->gws_mask_lo = lower_32_bits(res->gws_mask); 288 - packet->gws_mask_hi = upper_32_bits(res->gws_mask); 289 - 290 - packet->queue_mask_lo = lower_32_bits(res->queue_mask); 291 - packet->queue_mask_hi = upper_32_bits(res->queue_mask); 292 - 293 - pm->priv_queue->ops.submit_packet(pm->priv_queue); 432 + retval = pm->pmf->set_resources(pm, buffer, res); 433 + if (!retval) 434 + pm->priv_queue->ops.submit_packet(pm->priv_queue); 435 + else 436 + pm->priv_queue->ops.rollback_packet(pm->priv_queue); 294 437 295 438 out: 296 439 mutex_unlock(&pm->lock); ··· 299 468 300 469 pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr); 301 470 302 - packet_size_dwords = sizeof(struct pm4_mes_runlist) / sizeof(uint32_t); 471 + packet_size_dwords = pm->pmf->runlist_size / sizeof(uint32_t); 303 472 mutex_lock(&pm->lock); 304 473 305 474 retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, ··· 307 476 if (retval) 308 477 goto fail_acquire_packet_buffer; 309 478 310 - retval = pm_create_runlist(pm, rl_buffer, rl_gpu_ib_addr, 479 + retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr, 311 480 rl_ib_size / sizeof(uint32_t), false); 312 481 if (retval) 313 482 goto fail_create_runlist; ··· 330 499 int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, 331 500 uint32_t fence_value) 332 501 { 333 - int retval; 334 - struct pm4_mes_query_status *packet; 502 + uint32_t *buffer, size; 503 + int retval = 0; 335 504 336 505 if (WARN_ON(!fence_address)) 337 506 return -EFAULT; 338 507 508 + size = pm->pmf->query_status_size; 339 509 mutex_lock(&pm->lock); 340 - retval = pm->priv_queue->ops.acquire_packet_buffer( 341 - pm->priv_queue, 342 - sizeof(struct pm4_mes_query_status) / sizeof(uint32_t), 343 - (unsigned int **)&packet); 344 - if (retval) 345 - goto fail_acquire_packet_buffer; 510 + 
pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, 511 + size / sizeof(uint32_t), (unsigned int **)&buffer); 512 + if (!buffer) { 513 + pr_err("Failed to allocate buffer on kernel queue\n"); 514 + retval = -ENOMEM; 515 + goto out; 516 + } 346 517 347 - packet->header.u32All = build_pm4_header(IT_QUERY_STATUS, 348 - sizeof(struct pm4_mes_query_status)); 518 + retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value); 519 + if (!retval) 520 + pm->priv_queue->ops.submit_packet(pm->priv_queue); 521 + else 522 + pm->priv_queue->ops.rollback_packet(pm->priv_queue); 349 523 350 - packet->bitfields2.context_id = 0; 351 - packet->bitfields2.interrupt_sel = 352 - interrupt_sel__mes_query_status__completion_status; 353 - packet->bitfields2.command = 354 - command__mes_query_status__fence_only_after_write_ack; 355 - 356 - packet->addr_hi = upper_32_bits((uint64_t)fence_address); 357 - packet->addr_lo = lower_32_bits((uint64_t)fence_address); 358 - packet->data_hi = upper_32_bits((uint64_t)fence_value); 359 - packet->data_lo = lower_32_bits((uint64_t)fence_value); 360 - 361 - pm->priv_queue->ops.submit_packet(pm->priv_queue); 362 - 363 - fail_acquire_packet_buffer: 524 + out: 364 525 mutex_unlock(&pm->lock); 365 526 return retval; 366 527 } ··· 362 539 uint32_t filter_param, bool reset, 363 540 unsigned int sdma_engine) 364 541 { 365 - int retval; 366 - uint32_t *buffer; 367 - struct pm4_mes_unmap_queues *packet; 542 + uint32_t *buffer, size; 543 + int retval = 0; 368 544 545 + size = pm->pmf->unmap_queues_size; 369 546 mutex_lock(&pm->lock); 370 - retval = pm->priv_queue->ops.acquire_packet_buffer( 371 - pm->priv_queue, 372 - sizeof(struct pm4_mes_unmap_queues) / sizeof(uint32_t), 373 - &buffer); 374 - if (retval) 375 - goto err_acquire_packet_buffer; 376 - 377 - packet = (struct pm4_mes_unmap_queues *)buffer; 378 - memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); 379 - pr_debug("static_queue: unmapping queues: filter is %d , reset is %d , type is 
%d\n", 380 - filter, reset, type); 381 - packet->header.u32All = build_pm4_header(IT_UNMAP_QUEUES, 382 - sizeof(struct pm4_mes_unmap_queues)); 383 - switch (type) { 384 - case KFD_QUEUE_TYPE_COMPUTE: 385 - case KFD_QUEUE_TYPE_DIQ: 386 - packet->bitfields2.engine_sel = 387 - engine_sel__mes_unmap_queues__compute; 388 - break; 389 - case KFD_QUEUE_TYPE_SDMA: 390 - packet->bitfields2.engine_sel = 391 - engine_sel__mes_unmap_queues__sdma0 + sdma_engine; 392 - break; 393 - default: 394 - WARN(1, "queue type %d", type); 395 - retval = -EINVAL; 396 - goto err_invalid; 547 + pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, 548 + size / sizeof(uint32_t), (unsigned int **)&buffer); 549 + if (!buffer) { 550 + pr_err("Failed to allocate buffer on kernel queue\n"); 551 + retval = -ENOMEM; 552 + goto out; 397 553 } 398 554 399 - if (reset) 400 - packet->bitfields2.action = 401 - action__mes_unmap_queues__reset_queues; 555 + retval = pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param, 556 + reset, sdma_engine); 557 + if (!retval) 558 + pm->priv_queue->ops.submit_packet(pm->priv_queue); 402 559 else 403 - packet->bitfields2.action = 404 - action__mes_unmap_queues__preempt_queues; 560 + pm->priv_queue->ops.rollback_packet(pm->priv_queue); 405 561 406 - switch (filter) { 407 - case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: 408 - packet->bitfields2.queue_sel = 409 - queue_sel__mes_unmap_queues__perform_request_on_specified_queues; 410 - packet->bitfields2.num_queues = 1; 411 - packet->bitfields3b.doorbell_offset0 = filter_param; 412 - break; 413 - case KFD_UNMAP_QUEUES_FILTER_BY_PASID: 414 - packet->bitfields2.queue_sel = 415 - queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; 416 - packet->bitfields3a.pasid = filter_param; 417 - break; 418 - case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: 419 - packet->bitfields2.queue_sel = 420 - queue_sel__mes_unmap_queues__unmap_all_queues; 421 - break; 422 - case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: 423 - /* in this case, 
we do not preempt static queues */ 424 - packet->bitfields2.queue_sel = 425 - queue_sel__mes_unmap_queues__unmap_all_non_static_queues; 426 - break; 427 - default: 428 - WARN(1, "filter %d", filter); 429 - retval = -EINVAL; 430 - goto err_invalid; 431 - } 432 - 433 - pm->priv_queue->ops.submit_packet(pm->priv_queue); 434 - 435 - mutex_unlock(&pm->lock); 436 - return 0; 437 - 438 - err_invalid: 439 - pm->priv_queue->ops.rollback_packet(pm->priv_queue); 440 - err_acquire_packet_buffer: 562 + out: 441 563 mutex_unlock(&pm->lock); 442 564 return retval; 443 565 }
+583
drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
··· 1 + /* 2 + * Copyright 2016 Advanced Micro Devices, Inc. 3 + * 4 + * Permission is hereby granted, free of charge, to any person obtaining a 5 + * copy of this software and associated documentation files (the "Software"), 6 + * to deal in the Software without restriction, including without limitation 7 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 + * and/or sell copies of the Software, and to permit persons to whom the 9 + * Software is furnished to do so, subject to the following conditions: 10 + * 11 + * The above copyright notice and this permission notice shall be included in 12 + * all copies or substantial portions of the Software. 13 + * 14 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 + * OTHER DEALINGS IN THE SOFTWARE. 21 + * 22 + */ 23 + 24 + #ifndef F32_MES_PM4_PACKETS_H 25 + #define F32_MES_PM4_PACKETS_H 26 + 27 + #ifndef PM4_MES_HEADER_DEFINED 28 + #define PM4_MES_HEADER_DEFINED 29 + union PM4_MES_TYPE_3_HEADER { 30 + struct { 31 + uint32_t reserved1 : 8; /* < reserved */ 32 + uint32_t opcode : 8; /* < IT opcode */ 33 + uint32_t count : 14;/* < number of DWORDs - 1 in the 34 + * information body. 35 + */ 36 + uint32_t type : 2; /* < packet identifier. 
37 + * It should be 3 for type 3 packets 38 + */ 39 + }; 40 + uint32_t u32All; 41 + }; 42 + #endif /* PM4_MES_HEADER_DEFINED */ 43 + 44 + /*--------------------MES_SET_RESOURCES--------------------*/ 45 + 46 + #ifndef PM4_MES_SET_RESOURCES_DEFINED 47 + #define PM4_MES_SET_RESOURCES_DEFINED 48 + enum mes_set_resources_queue_type_enum { 49 + queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, 50 + queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, 51 + queue_type__mes_set_resources__hsa_debug_interface_queue = 4 52 + }; 53 + 54 + 55 + struct pm4_mes_set_resources { 56 + union { 57 + union PM4_MES_TYPE_3_HEADER header; /* header */ 58 + uint32_t ordinal1; 59 + }; 60 + 61 + union { 62 + struct { 63 + uint32_t vmid_mask:16; 64 + uint32_t unmap_latency:8; 65 + uint32_t reserved1:5; 66 + enum mes_set_resources_queue_type_enum queue_type:3; 67 + } bitfields2; 68 + uint32_t ordinal2; 69 + }; 70 + 71 + uint32_t queue_mask_lo; 72 + uint32_t queue_mask_hi; 73 + uint32_t gws_mask_lo; 74 + uint32_t gws_mask_hi; 75 + 76 + union { 77 + struct { 78 + uint32_t oac_mask:16; 79 + uint32_t reserved2:16; 80 + } bitfields7; 81 + uint32_t ordinal7; 82 + }; 83 + 84 + union { 85 + struct { 86 + uint32_t gds_heap_base:6; 87 + uint32_t reserved3:5; 88 + uint32_t gds_heap_size:6; 89 + uint32_t reserved4:15; 90 + } bitfields8; 91 + uint32_t ordinal8; 92 + }; 93 + 94 + }; 95 + #endif 96 + 97 + /*--------------------MES_RUN_LIST--------------------*/ 98 + 99 + #ifndef PM4_MES_RUN_LIST_DEFINED 100 + #define PM4_MES_RUN_LIST_DEFINED 101 + 102 + struct pm4_mes_runlist { 103 + union { 104 + union PM4_MES_TYPE_3_HEADER header; /* header */ 105 + uint32_t ordinal1; 106 + }; 107 + 108 + union { 109 + struct { 110 + uint32_t reserved1:2; 111 + uint32_t ib_base_lo:30; 112 + } bitfields2; 113 + uint32_t ordinal2; 114 + }; 115 + 116 + uint32_t ib_base_hi; 117 + 118 + union { 119 + struct { 120 + uint32_t ib_size:20; 121 + uint32_t chain:1; 122 + uint32_t offload_polling:1; 123 + uint32_t 
reserved2:1; 124 + uint32_t valid:1; 125 + uint32_t process_cnt:4; 126 + uint32_t reserved3:4; 127 + } bitfields4; 128 + uint32_t ordinal4; 129 + }; 130 + 131 + }; 132 + #endif 133 + 134 + /*--------------------MES_MAP_PROCESS--------------------*/ 135 + 136 + #ifndef PM4_MES_MAP_PROCESS_DEFINED 137 + #define PM4_MES_MAP_PROCESS_DEFINED 138 + 139 + struct pm4_mes_map_process { 140 + union { 141 + union PM4_MES_TYPE_3_HEADER header; /* header */ 142 + uint32_t ordinal1; 143 + }; 144 + 145 + union { 146 + struct { 147 + uint32_t pasid:16; 148 + uint32_t reserved1:8; 149 + uint32_t diq_enable:1; 150 + uint32_t process_quantum:7; 151 + } bitfields2; 152 + uint32_t ordinal2; 153 + }; 154 + 155 + uint32_t vm_context_page_table_base_addr_lo32; 156 + 157 + uint32_t vm_context_page_table_base_addr_hi32; 158 + 159 + uint32_t sh_mem_bases; 160 + 161 + uint32_t sh_mem_config; 162 + 163 + uint32_t sq_shader_tba_lo; 164 + 165 + uint32_t sq_shader_tba_hi; 166 + 167 + uint32_t sq_shader_tma_lo; 168 + 169 + uint32_t sq_shader_tma_hi; 170 + 171 + uint32_t reserved6; 172 + 173 + uint32_t gds_addr_lo; 174 + 175 + uint32_t gds_addr_hi; 176 + 177 + union { 178 + struct { 179 + uint32_t num_gws:6; 180 + uint32_t reserved7:1; 181 + uint32_t sdma_enable:1; 182 + uint32_t num_oac:4; 183 + uint32_t reserved8:4; 184 + uint32_t gds_size:6; 185 + uint32_t num_queues:10; 186 + } bitfields14; 187 + uint32_t ordinal14; 188 + }; 189 + 190 + uint32_t completion_signal_lo; 191 + 192 + uint32_t completion_signal_hi; 193 + 194 + }; 195 + 196 + #endif 197 + 198 + /*--------------------MES_MAP_PROCESS_VM--------------------*/ 199 + 200 + #ifndef PM4_MES_MAP_PROCESS_VM_DEFINED 201 + #define PM4_MES_MAP_PROCESS_VM_DEFINED 202 + 203 + struct PM4_MES_MAP_PROCESS_VM { 204 + union { 205 + union PM4_MES_TYPE_3_HEADER header; /* header */ 206 + uint32_t ordinal1; 207 + }; 208 + 209 + uint32_t reserved1; 210 + 211 + uint32_t vm_context_cntl; 212 + 213 + uint32_t reserved2; 214 + 215 + uint32_t 
vm_context_page_table_end_addr_lo32; 216 + 217 + uint32_t vm_context_page_table_end_addr_hi32; 218 + 219 + uint32_t vm_context_page_table_start_addr_lo32; 220 + 221 + uint32_t vm_context_page_table_start_addr_hi32; 222 + 223 + uint32_t reserved3; 224 + 225 + uint32_t reserved4; 226 + 227 + uint32_t reserved5; 228 + 229 + uint32_t reserved6; 230 + 231 + uint32_t reserved7; 232 + 233 + uint32_t reserved8; 234 + 235 + uint32_t completion_signal_lo32; 236 + 237 + uint32_t completion_signal_hi32; 238 + 239 + }; 240 + #endif 241 + 242 + /*--------------------MES_MAP_QUEUES--------------------*/ 243 + 244 + #ifndef PM4_MES_MAP_QUEUES_VI_DEFINED 245 + #define PM4_MES_MAP_QUEUES_VI_DEFINED 246 + enum mes_map_queues_queue_sel_enum { 247 + queue_sel__mes_map_queues__map_to_specified_queue_slots_vi = 0, 248 + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi = 1 249 + }; 250 + 251 + enum mes_map_queues_queue_type_enum { 252 + queue_type__mes_map_queues__normal_compute_vi = 0, 253 + queue_type__mes_map_queues__debug_interface_queue_vi = 1, 254 + queue_type__mes_map_queues__normal_latency_static_queue_vi = 2, 255 + queue_type__mes_map_queues__low_latency_static_queue_vi = 3 256 + }; 257 + 258 + enum mes_map_queues_alloc_format_enum { 259 + alloc_format__mes_map_queues__one_per_pipe_vi = 0, 260 + alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 261 + }; 262 + 263 + enum mes_map_queues_engine_sel_enum { 264 + engine_sel__mes_map_queues__compute_vi = 0, 265 + engine_sel__mes_map_queues__sdma0_vi = 2, 266 + engine_sel__mes_map_queues__sdma1_vi = 3 267 + }; 268 + 269 + 270 + struct pm4_mes_map_queues { 271 + union { 272 + union PM4_MES_TYPE_3_HEADER header; /* header */ 273 + uint32_t ordinal1; 274 + }; 275 + 276 + union { 277 + struct { 278 + uint32_t reserved1:4; 279 + enum mes_map_queues_queue_sel_enum queue_sel:2; 280 + uint32_t reserved2:15; 281 + enum mes_map_queues_queue_type_enum queue_type:3; 282 + enum mes_map_queues_alloc_format_enum alloc_format:2; 283 + 
enum mes_map_queues_engine_sel_enum engine_sel:3; 284 + uint32_t num_queues:3; 285 + } bitfields2; 286 + uint32_t ordinal2; 287 + }; 288 + 289 + union { 290 + struct { 291 + uint32_t reserved3:1; 292 + uint32_t check_disable:1; 293 + uint32_t doorbell_offset:26; 294 + uint32_t reserved4:4; 295 + } bitfields3; 296 + uint32_t ordinal3; 297 + }; 298 + 299 + uint32_t mqd_addr_lo; 300 + uint32_t mqd_addr_hi; 301 + uint32_t wptr_addr_lo; 302 + uint32_t wptr_addr_hi; 303 + }; 304 + #endif 305 + 306 + /*--------------------MES_QUERY_STATUS--------------------*/ 307 + 308 + #ifndef PM4_MES_QUERY_STATUS_DEFINED 309 + #define PM4_MES_QUERY_STATUS_DEFINED 310 + enum mes_query_status_interrupt_sel_enum { 311 + interrupt_sel__mes_query_status__completion_status = 0, 312 + interrupt_sel__mes_query_status__process_status = 1, 313 + interrupt_sel__mes_query_status__queue_status = 2 314 + }; 315 + 316 + enum mes_query_status_command_enum { 317 + command__mes_query_status__interrupt_only = 0, 318 + command__mes_query_status__fence_only_immediate = 1, 319 + command__mes_query_status__fence_only_after_write_ack = 2, 320 + command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 321 + }; 322 + 323 + enum mes_query_status_engine_sel_enum { 324 + engine_sel__mes_query_status__compute = 0, 325 + engine_sel__mes_query_status__sdma0_queue = 2, 326 + engine_sel__mes_query_status__sdma1_queue = 3 327 + }; 328 + 329 + struct pm4_mes_query_status { 330 + union { 331 + union PM4_MES_TYPE_3_HEADER header; /* header */ 332 + uint32_t ordinal1; 333 + }; 334 + 335 + union { 336 + struct { 337 + uint32_t context_id:28; 338 + enum mes_query_status_interrupt_sel_enum interrupt_sel:2; 339 + enum mes_query_status_command_enum command:2; 340 + } bitfields2; 341 + uint32_t ordinal2; 342 + }; 343 + 344 + union { 345 + struct { 346 + uint32_t pasid:16; 347 + uint32_t reserved1:16; 348 + } bitfields3a; 349 + struct { 350 + uint32_t reserved2:2; 351 + uint32_t doorbell_offset:26; 352 + enum 
mes_query_status_engine_sel_enum engine_sel:3; 353 + uint32_t reserved3:1; 354 + } bitfields3b; 355 + uint32_t ordinal3; 356 + }; 357 + 358 + uint32_t addr_lo; 359 + uint32_t addr_hi; 360 + uint32_t data_lo; 361 + uint32_t data_hi; 362 + }; 363 + #endif 364 + 365 + /*--------------------MES_UNMAP_QUEUES--------------------*/ 366 + 367 + #ifndef PM4_MES_UNMAP_QUEUES_DEFINED 368 + #define PM4_MES_UNMAP_QUEUES_DEFINED 369 + enum mes_unmap_queues_action_enum { 370 + action__mes_unmap_queues__preempt_queues = 0, 371 + action__mes_unmap_queues__reset_queues = 1, 372 + action__mes_unmap_queues__disable_process_queues = 2, 373 + action__mes_unmap_queues__reserved = 3 374 + }; 375 + 376 + enum mes_unmap_queues_queue_sel_enum { 377 + queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, 378 + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, 379 + queue_sel__mes_unmap_queues__unmap_all_queues = 2, 380 + queue_sel__mes_unmap_queues__unmap_all_non_static_queues = 3 381 + }; 382 + 383 + enum mes_unmap_queues_engine_sel_enum { 384 + engine_sel__mes_unmap_queues__compute = 0, 385 + engine_sel__mes_unmap_queues__sdma0 = 2, 386 + engine_sel__mes_unmap_queues__sdmal = 3 387 + }; 388 + 389 + struct pm4_mes_unmap_queues { 390 + union { 391 + union PM4_MES_TYPE_3_HEADER header; /* header */ 392 + uint32_t ordinal1; 393 + }; 394 + 395 + union { 396 + struct { 397 + enum mes_unmap_queues_action_enum action:2; 398 + uint32_t reserved1:2; 399 + enum mes_unmap_queues_queue_sel_enum queue_sel:2; 400 + uint32_t reserved2:20; 401 + enum mes_unmap_queues_engine_sel_enum engine_sel:3; 402 + uint32_t num_queues:3; 403 + } bitfields2; 404 + uint32_t ordinal2; 405 + }; 406 + 407 + union { 408 + struct { 409 + uint32_t pasid:16; 410 + uint32_t reserved3:16; 411 + } bitfields3a; 412 + struct { 413 + uint32_t reserved4:2; 414 + uint32_t doorbell_offset0:26; 415 + int32_t reserved5:4; 416 + } bitfields3b; 417 + uint32_t ordinal3; 418 + }; 419 + 420 + union { 421 + struct 
{ 422 + uint32_t reserved6:2; 423 + uint32_t doorbell_offset1:26; 424 + uint32_t reserved7:4; 425 + } bitfields4; 426 + uint32_t ordinal4; 427 + }; 428 + 429 + union { 430 + struct { 431 + uint32_t reserved8:2; 432 + uint32_t doorbell_offset2:26; 433 + uint32_t reserved9:4; 434 + } bitfields5; 435 + uint32_t ordinal5; 436 + }; 437 + 438 + union { 439 + struct { 440 + uint32_t reserved10:2; 441 + uint32_t doorbell_offset3:26; 442 + uint32_t reserved11:4; 443 + } bitfields6; 444 + uint32_t ordinal6; 445 + }; 446 + }; 447 + #endif 448 + 449 + #ifndef PM4_MEC_RELEASE_MEM_DEFINED 450 + #define PM4_MEC_RELEASE_MEM_DEFINED 451 + 452 + enum mec_release_mem_event_index_enum { 453 + event_index__mec_release_mem__end_of_pipe = 5, 454 + event_index__mec_release_mem__shader_done = 6 455 + }; 456 + 457 + enum mec_release_mem_cache_policy_enum { 458 + cache_policy__mec_release_mem__lru = 0, 459 + cache_policy__mec_release_mem__stream = 1 460 + }; 461 + 462 + enum mec_release_mem_pq_exe_status_enum { 463 + pq_exe_status__mec_release_mem__default = 0, 464 + pq_exe_status__mec_release_mem__phase_update = 1 465 + }; 466 + 467 + enum mec_release_mem_dst_sel_enum { 468 + dst_sel__mec_release_mem__memory_controller = 0, 469 + dst_sel__mec_release_mem__tc_l2 = 1, 470 + dst_sel__mec_release_mem__queue_write_pointer_register = 2, 471 + dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3 472 + }; 473 + 474 + enum mec_release_mem_int_sel_enum { 475 + int_sel__mec_release_mem__none = 0, 476 + int_sel__mec_release_mem__send_interrupt_only = 1, 477 + int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2, 478 + int_sel__mec_release_mem__send_data_after_write_confirm = 3, 479 + int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4, 480 + int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5, 481 + int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6 482 + }; 483 + 484 + enum mec_release_mem_data_sel_enum { 
485 + data_sel__mec_release_mem__none = 0, 486 + data_sel__mec_release_mem__send_32_bit_low = 1, 487 + data_sel__mec_release_mem__send_64_bit_data = 2, 488 + data_sel__mec_release_mem__send_gpu_clock_counter = 3, 489 + data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4, 490 + data_sel__mec_release_mem__store_gds_data_to_memory = 5 491 + }; 492 + 493 + struct pm4_mec_release_mem { 494 + union { 495 + union PM4_MES_TYPE_3_HEADER header; /*header */ 496 + unsigned int ordinal1; 497 + }; 498 + 499 + union { 500 + struct { 501 + unsigned int event_type:6; 502 + unsigned int reserved1:2; 503 + enum mec_release_mem_event_index_enum event_index:4; 504 + unsigned int tcl1_vol_action_ena:1; 505 + unsigned int tc_vol_action_ena:1; 506 + unsigned int reserved2:1; 507 + unsigned int tc_wb_action_ena:1; 508 + unsigned int tcl1_action_ena:1; 509 + unsigned int tc_action_ena:1; 510 + uint32_t reserved3:1; 511 + uint32_t tc_nc_action_ena:1; 512 + uint32_t tc_wc_action_ena:1; 513 + uint32_t tc_md_action_ena:1; 514 + uint32_t reserved4:3; 515 + enum mec_release_mem_cache_policy_enum cache_policy:2; 516 + uint32_t reserved5:2; 517 + enum mec_release_mem_pq_exe_status_enum pq_exe_status:1; 518 + uint32_t reserved6:2; 519 + } bitfields2; 520 + unsigned int ordinal2; 521 + }; 522 + 523 + union { 524 + struct { 525 + uint32_t reserved7:16; 526 + enum mec_release_mem_dst_sel_enum dst_sel:2; 527 + uint32_t reserved8:6; 528 + enum mec_release_mem_int_sel_enum int_sel:3; 529 + uint32_t reserved9:2; 530 + enum mec_release_mem_data_sel_enum data_sel:3; 531 + } bitfields3; 532 + unsigned int ordinal3; 533 + }; 534 + 535 + union { 536 + struct { 537 + uint32_t reserved10:2; 538 + unsigned int address_lo_32b:30; 539 + } bitfields4; 540 + struct { 541 + uint32_t reserved11:3; 542 + uint32_t address_lo_64b:29; 543 + } bitfields4b; 544 + uint32_t reserved12; 545 + unsigned int ordinal4; 546 + }; 547 + 548 + union { 549 + uint32_t address_hi; 550 + uint32_t reserved13; 551 + uint32_t ordinal5; 
552 + }; 553 + 554 + union { 555 + uint32_t data_lo; 556 + uint32_t cmp_data_lo; 557 + struct { 558 + uint32_t dw_offset:16; 559 + uint32_t num_dwords:16; 560 + } bitfields6c; 561 + uint32_t reserved14; 562 + uint32_t ordinal6; 563 + }; 564 + 565 + union { 566 + uint32_t data_hi; 567 + uint32_t cmp_data_hi; 568 + uint32_t reserved15; 569 + uint32_t reserved16; 570 + uint32_t ordinal7; 571 + }; 572 + 573 + uint32_t int_ctxid; 574 + 575 + }; 576 + 577 + #endif 578 + 579 + enum { 580 + CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 581 + }; 582 + #endif 583 +
+98 -14
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
··· 39 39 40 40 #include "amd_shared.h" 41 41 42 + #define KFD_MAX_RING_ENTRY_SIZE 8 43 + 42 44 #define KFD_SYSFS_FILE_MODE 0444 43 45 44 - #define KFD_MMAP_DOORBELL_MASK 0x8000000000000ull 45 - #define KFD_MMAP_EVENTS_MASK 0x4000000000000ull 46 - #define KFD_MMAP_RESERVED_MEM_MASK 0x2000000000000ull 46 + /* GPU ID hash width in bits */ 47 + #define KFD_GPU_ID_HASH_WIDTH 16 48 + 49 + /* Use upper bits of mmap offset to store KFD driver specific information. 50 + * BITS[63:62] - Encode MMAP type 51 + * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to 52 + * BITS[45:0] - MMAP offset value 53 + * 54 + * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these 55 + * defines are w.r.t to PAGE_SIZE 56 + */ 57 + #define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT) 58 + #define KFD_MMAP_TYPE_MASK (0x3ULL << KFD_MMAP_TYPE_SHIFT) 59 + #define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT) 60 + #define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT) 61 + #define KFD_MMAP_TYPE_RESERVED_MEM (0x1ULL << KFD_MMAP_TYPE_SHIFT) 62 + 63 + #define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT) 64 + #define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \ 65 + << KFD_MMAP_GPU_ID_SHIFT) 66 + #define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\ 67 + & KFD_MMAP_GPU_ID_MASK) 68 + #define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \ 69 + >> KFD_MMAP_GPU_ID_SHIFT) 70 + 71 + #define KFD_MMAP_OFFSET_VALUE_MASK (0x3FFFFFFFFFFFULL >> PAGE_SHIFT) 72 + #define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK) 47 73 48 74 /* 49 75 * When working with cp scheduler we should assign the HIQ manually or via ··· 80 54 */ 81 55 #define KFD_CIK_HIQ_PIPE 4 82 56 #define KFD_CIK_HIQ_QUEUE 0 83 - 84 - /* GPU ID hash width in bits */ 85 - #define KFD_GPU_ID_HASH_WIDTH 16 86 57 87 58 /* Macro for allocating structures */ 88 59 #define kfd_alloc_struct(ptr_to_struct) \ ··· 139 116 */ 140 117 
extern int ignore_crat; 141 118 119 + /* 120 + * Set sh_mem_config.retry_disable on Vega10 121 + */ 122 + extern int vega10_noretry; 123 + 142 124 /** 143 125 * enum kfd_sched_policy 144 126 * ··· 176 148 cache_policy_noncoherent 177 149 }; 178 150 151 + #define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10) 152 + 179 153 struct kfd_event_interrupt_class { 180 154 bool (*interrupt_isr)(struct kfd_dev *dev, 181 155 const uint32_t *ih_ring_entry); ··· 190 160 const struct kfd_event_interrupt_class *event_interrupt_class; 191 161 unsigned int max_pasid_bits; 192 162 unsigned int max_no_of_hqd; 163 + unsigned int doorbell_size; 193 164 size_t ih_ring_entry_size; 194 165 uint8_t num_of_watch_points; 195 166 uint16_t mqd_size_aligned; ··· 204 173 uint32_t range_end; 205 174 uint64_t gpu_addr; 206 175 uint32_t *cpu_ptr; 176 + void *gtt_mem; 207 177 }; 208 178 209 179 struct kfd_vmid_info { ··· 396 364 uint32_t queue_percent; 397 365 uint32_t *read_ptr; 398 366 uint32_t *write_ptr; 399 - uint32_t __iomem *doorbell_ptr; 367 + void __iomem *doorbell_ptr; 400 368 uint32_t doorbell_off; 401 369 bool is_interop; 402 370 bool is_evicted; ··· 459 427 uint32_t queue; 460 428 461 429 unsigned int sdma_id; 430 + unsigned int doorbell_id; 462 431 463 432 struct kfd_process *process; 464 433 struct kfd_dev *device; ··· 534 501 /* IB memory */ 535 502 uint64_t ib_base; 536 503 void *ib_kaddr; 504 + 505 + /* doorbell resources per process per device */ 506 + unsigned long *doorbell_bitmap; 537 507 }; 538 508 539 509 /* KFD Memory Eviction */ ··· 548 512 /* Approx. 
time before evicting the process again */ 549 513 #define PROCESS_ACTIVE_TIME_MS 10 550 514 515 + int kgd2kfd_quiesce_mm(struct mm_struct *mm); 516 + int kgd2kfd_resume_mm(struct mm_struct *mm); 551 517 int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, 552 518 struct dma_fence *fence); 553 519 ··· 719 681 struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); 720 682 struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); 721 683 void kfd_unref_process(struct kfd_process *p); 684 + int kfd_process_evict_queues(struct kfd_process *p); 685 + int kfd_process_restore_queues(struct kfd_process *p); 722 686 void kfd_suspend_all_processes(void); 723 687 int kfd_resume_all_processes(void); 724 688 ··· 733 693 struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, 734 694 struct kfd_process *p); 735 695 736 - int kfd_reserved_mem_mmap(struct kfd_process *process, 696 + int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, 737 697 struct vm_area_struct *vma); 738 698 739 699 /* KFD process API for creating and translating handles */ ··· 761 721 void kfd_pasid_free(unsigned int pasid); 762 722 763 723 /* Doorbells */ 724 + size_t kfd_doorbell_process_slice(struct kfd_dev *kfd); 764 725 int kfd_doorbell_init(struct kfd_dev *kfd); 765 726 void kfd_doorbell_fini(struct kfd_dev *kfd); 766 - int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma); 767 - u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, 727 + int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, 728 + struct vm_area_struct *vma); 729 + void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, 768 730 unsigned int *doorbell_off); 769 731 void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); 770 732 u32 read_kernel_doorbell(u32 __iomem *db); 771 - void write_kernel_doorbell(u32 __iomem *db, u32 value); 772 - unsigned int 
kfd_queue_id_to_doorbell(struct kfd_dev *kfd, 733 + void write_kernel_doorbell(void __iomem *db, u32 value); 734 + void write_kernel_doorbell64(void __iomem *db, u64 value); 735 + unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, 773 736 struct kfd_process *process, 774 - unsigned int queue_id); 737 + unsigned int doorbell_id); 775 738 phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, 776 739 struct kfd_process *process); 777 740 int kfd_alloc_process_doorbells(struct kfd_process *process); ··· 831 788 struct kfd_dev *dev); 832 789 struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, 833 790 struct kfd_dev *dev); 791 + struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, 792 + struct kfd_dev *dev); 834 793 struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); 835 794 void device_queue_manager_uninit(struct device_queue_manager *dqm); 836 795 struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, ··· 877 832 bool allocated; 878 833 struct kfd_mem_obj *ib_buffer_obj; 879 834 unsigned int ib_size_bytes; 835 + 836 + const struct packet_manager_funcs *pmf; 880 837 }; 838 + 839 + struct packet_manager_funcs { 840 + /* Support ASIC-specific packet formats for PM4 packets */ 841 + int (*map_process)(struct packet_manager *pm, uint32_t *buffer, 842 + struct qcm_process_device *qpd); 843 + int (*runlist)(struct packet_manager *pm, uint32_t *buffer, 844 + uint64_t ib, size_t ib_size_in_dwords, bool chain); 845 + int (*set_resources)(struct packet_manager *pm, uint32_t *buffer, 846 + struct scheduling_resources *res); 847 + int (*map_queues)(struct packet_manager *pm, uint32_t *buffer, 848 + struct queue *q, bool is_static); 849 + int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer, 850 + enum kfd_queue_type type, 851 + enum kfd_unmap_queues_filter mode, 852 + uint32_t filter_param, bool reset, 853 + unsigned int sdma_engine); 854 + int (*query_status)(struct packet_manager *pm, uint32_t 
*buffer, 855 + uint64_t fence_address, uint32_t fence_value); 856 + int (*release_mem)(uint64_t gpu_addr, uint32_t *buffer); 857 + 858 + /* Packet sizes */ 859 + int map_process_size; 860 + int runlist_size; 861 + int set_resources_size; 862 + int map_queues_size; 863 + int unmap_queues_size; 864 + int query_status_size; 865 + int release_mem_size; 866 + }; 867 + 868 + extern const struct packet_manager_funcs kfd_vi_pm_funcs; 869 + extern const struct packet_manager_funcs kfd_v9_pm_funcs; 881 870 882 871 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); 883 872 void pm_uninit(struct packet_manager *pm); ··· 928 849 929 850 void pm_release_ib(struct packet_manager *pm); 930 851 931 - uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer); 852 + /* Following PM funcs can be shared among VI and AI */ 853 + unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); 854 + int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, 855 + struct scheduling_resources *res); 932 856 933 857 uint64_t kfd_get_number_elems(struct kfd_dev *kfd); 934 858 935 859 /* Events */ 936 860 extern const struct kfd_event_interrupt_class event_interrupt_class_cik; 861 + extern const struct kfd_event_interrupt_class event_interrupt_class_v9; 862 + 937 863 extern const struct kfd_device_global_init_class device_global_init_class_cik; 938 864 939 865 void kfd_event_init_process(struct kfd_process *p);
+40 -10
drivers/gpu/drm/amd/amdkfd/kfd_process.c
··· 332 332 free_pages((unsigned long)pdd->qpd.cwsr_kaddr, 333 333 get_order(KFD_CWSR_TBA_TMA_SIZE)); 334 334 335 + kfree(pdd->qpd.doorbell_bitmap); 335 336 idr_destroy(&pdd->alloc_idr); 336 337 337 338 kfree(pdd); ··· 452 451 if (!dev->cwsr_enabled || qpd->cwsr_kaddr || qpd->cwsr_base) 453 452 continue; 454 453 455 - offset = (dev->id | KFD_MMAP_RESERVED_MEM_MASK) << PAGE_SHIFT; 454 + offset = (KFD_MMAP_TYPE_RESERVED_MEM | KFD_MMAP_GPU_ID(dev->id)) 455 + << PAGE_SHIFT; 456 456 qpd->tba_addr = (int64_t)vm_mmap(filep, 0, 457 457 KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC, 458 458 MAP_SHARED, offset); ··· 587 585 return ERR_PTR(err); 588 586 } 589 587 588 + static int init_doorbell_bitmap(struct qcm_process_device *qpd, 589 + struct kfd_dev *dev) 590 + { 591 + unsigned int i; 592 + 593 + if (!KFD_IS_SOC15(dev->device_info->asic_family)) 594 + return 0; 595 + 596 + qpd->doorbell_bitmap = 597 + kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, 598 + BITS_PER_BYTE), GFP_KERNEL); 599 + if (!qpd->doorbell_bitmap) 600 + return -ENOMEM; 601 + 602 + /* Mask out any reserved doorbells */ 603 + for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS; i++) 604 + if ((dev->shared_resources.reserved_doorbell_mask & i) == 605 + dev->shared_resources.reserved_doorbell_val) { 606 + set_bit(i, qpd->doorbell_bitmap); 607 + pr_debug("reserved doorbell 0x%03x\n", i); 608 + } 609 + 610 + return 0; 611 + } 612 + 590 613 struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, 591 614 struct kfd_process *p) 592 615 { ··· 632 605 pdd = kzalloc(sizeof(*pdd), GFP_KERNEL); 633 606 if (!pdd) 634 607 return NULL; 608 + 609 + if (init_doorbell_bitmap(&pdd->qpd, dev)) { 610 + pr_err("Failed to init doorbell for process\n"); 611 + kfree(pdd); 612 + return NULL; 613 + } 635 614 636 615 pdd->dev = dev; 637 616 INIT_LIST_HEAD(&pdd->qpd.queues_list); ··· 841 808 * Eviction is reference-counted per process-device. 
This means multiple 842 809 * evictions from different sources can be nested safely. 843 810 */ 844 - static int process_evict_queues(struct kfd_process *p) 811 + int kfd_process_evict_queues(struct kfd_process *p) 845 812 { 846 813 struct kfd_process_device *pdd; 847 814 int r = 0; ··· 877 844 } 878 845 879 846 /* process_restore_queues - Restore all user queues of a process */ 880 - static int process_restore_queues(struct kfd_process *p) 847 + int kfd_process_restore_queues(struct kfd_process *p) 881 848 { 882 849 struct kfd_process_device *pdd; 883 850 int r, ret = 0; ··· 919 886 flush_delayed_work(&p->restore_work); 920 887 921 888 pr_debug("Started evicting pasid %d\n", p->pasid); 922 - ret = process_evict_queues(p); 889 + ret = kfd_process_evict_queues(p); 923 890 if (!ret) { 924 891 dma_fence_signal(p->ef); 925 892 dma_fence_put(p->ef); ··· 979 946 return; 980 947 } 981 948 982 - ret = process_restore_queues(p); 949 + ret = kfd_process_restore_queues(p); 983 950 if (!ret) 984 951 pr_debug("Finished restoring pasid %d\n", p->pasid); 985 952 else ··· 996 963 cancel_delayed_work_sync(&p->eviction_work); 997 964 cancel_delayed_work_sync(&p->restore_work); 998 965 999 - if (process_evict_queues(p)) 966 + if (kfd_process_evict_queues(p)) 1000 967 pr_err("Failed to suspend process %d\n", p->pasid); 1001 968 dma_fence_signal(p->ef); 1002 969 dma_fence_put(p->ef); ··· 1022 989 return ret; 1023 990 } 1024 991 1025 - int kfd_reserved_mem_mmap(struct kfd_process *process, 992 + int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, 1026 993 struct vm_area_struct *vma) 1027 994 { 1028 - struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff); 1029 995 struct kfd_process_device *pdd; 1030 996 struct qcm_process_device *qpd; 1031 997 1032 - if (!dev) 1033 - return -EINVAL; 1034 998 if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) { 1035 999 pr_err("Incorrect CWSR mapping size.\n"); 1036 1000 return -EINVAL;
+16 -6
drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
··· 119 119 /* Doorbell initialized in user space*/ 120 120 q_properties->doorbell_ptr = NULL; 121 121 122 - q_properties->doorbell_off = 123 - kfd_queue_id_to_doorbell(dev, pqm->process, qid); 124 - 125 122 /* let DQM handle it*/ 126 123 q_properties->vmid = 0; 127 124 q_properties->queue_id = qid; ··· 241 244 } 242 245 243 246 if (retval != 0) { 244 - pr_err("DQM create queue failed\n"); 247 + pr_err("Pasid %d DQM create queue %d failed. ret %d\n", 248 + pqm->process->pasid, type, retval); 245 249 goto err_create_queue; 246 250 } 251 + 252 + if (q) 253 + /* Return the doorbell offset within the doorbell page 254 + * to the caller so it can be passed up to user mode 255 + * (in bytes). 256 + */ 257 + properties->doorbell_off = 258 + (q->properties.doorbell_off * sizeof(uint32_t)) & 259 + (kfd_doorbell_process_slice(dev) - 1); 247 260 248 261 pr_debug("PQM After DQM create queue\n"); 249 262 ··· 320 313 dqm = pqn->q->device->dqm; 321 314 retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); 322 315 if (retval) { 323 - pr_debug("Destroy queue failed, returned %d\n", retval); 324 - goto err_destroy_queue; 316 + pr_err("Pasid %d destroy queue %d failed, ret %d\n", 317 + pqm->process->pasid, 318 + pqn->q->properties.queue_id, retval); 319 + if (retval != -ETIME) 320 + goto err_destroy_queue; 325 321 } 326 322 uninit_queue(pqn->q); 327 323 }
+4 -4
drivers/gpu/drm/amd/amdkfd/kfd_queue.c
··· 36 36 pr_debug("Queue Address: 0x%llX\n", q->queue_address); 37 37 pr_debug("Queue Id: %u\n", q->queue_id); 38 38 pr_debug("Queue Process Vmid: %u\n", q->vmid); 39 - pr_debug("Queue Read Pointer: 0x%p\n", q->read_ptr); 40 - pr_debug("Queue Write Pointer: 0x%p\n", q->write_ptr); 39 + pr_debug("Queue Read Pointer: 0x%px\n", q->read_ptr); 40 + pr_debug("Queue Write Pointer: 0x%px\n", q->write_ptr); 41 41 pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr); 42 42 pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off); 43 43 } ··· 53 53 pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address); 54 54 pr_debug("Queue Id: %u\n", q->properties.queue_id); 55 55 pr_debug("Queue Process Vmid: %u\n", q->properties.vmid); 56 - pr_debug("Queue Read Pointer: 0x%p\n", q->properties.read_ptr); 57 - pr_debug("Queue Write Pointer: 0x%p\n", q->properties.write_ptr); 56 + pr_debug("Queue Read Pointer: 0x%px\n", q->properties.read_ptr); 57 + pr_debug("Queue Write Pointer: 0x%px\n", q->properties.write_ptr); 58 58 pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr); 59 59 pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off); 60 60 pr_debug("Queue MQD Address: 0x%p\n", q->mqd);
+6
drivers/gpu/drm/amd/amdkfd/kfd_topology.c
··· 1239 1239 HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & 1240 1240 HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); 1241 1241 break; 1242 + case CHIP_VEGA10: 1243 + case CHIP_RAVEN: 1244 + dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << 1245 + HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & 1246 + HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); 1247 + break; 1242 1248 default: 1243 1249 WARN(1, "Unexpected ASIC family %u", 1244 1250 dev->gpu->device_info->asic_family);
+1
drivers/gpu/drm/amd/amdkfd/kfd_topology.h
··· 45 45 46 46 #define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 47 47 #define HSA_CAP_DOORBELL_TYPE_1_0 0x1 48 + #define HSA_CAP_DOORBELL_TYPE_2_0 0x2 48 49 #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 49 50 50 51 struct kfd_node_properties {
+47
drivers/gpu/drm/amd/amdkfd/soc15_int.h
··· 1 + /* 2 + * Copyright 2016-2018 Advanced Micro Devices, Inc. 3 + * 4 + * Permission is hereby granted, free of charge, to any person obtaining a 5 + * copy of this software and associated documentation files (the "Software"), 6 + * to deal in the Software without restriction, including without limitation 7 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 + * and/or sell copies of the Software, and to permit persons to whom the 9 + * Software is furnished to do so, subject to the following conditions: 10 + * 11 + * The above copyright notice and this permission notice shall be included in 12 + * all copies or substantial portions of the Software. 13 + * 14 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 + * OTHER DEALINGS IN THE SOFTWARE. 
21 + */ 22 + 23 + #ifndef HSA_SOC15_INT_H_INCLUDED 24 + #define HSA_SOC15_INT_H_INCLUDED 25 + 26 + #include "soc15_ih_clientid.h" 27 + 28 + #define SOC15_INTSRC_CP_END_OF_PIPE 181 29 + #define SOC15_INTSRC_CP_BAD_OPCODE 183 30 + #define SOC15_INTSRC_SQ_INTERRUPT_MSG 239 31 + #define SOC15_INTSRC_VMC_FAULT 0 32 + #define SOC15_INTSRC_SDMA_TRAP 224 33 + 34 + 35 + #define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff) 36 + #define SOC15_SOURCE_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 8 & 0xff) 37 + #define SOC15_RING_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 16 & 0xff) 38 + #define SOC15_VMID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 24 & 0xf) 39 + #define SOC15_VMID_TYPE_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 31 & 0x1) 40 + #define SOC15_PASID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[3]) & 0xffff) 41 + #define SOC15_CONTEXT_ID0_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[4])) 42 + #define SOC15_CONTEXT_ID1_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[5])) 43 + #define SOC15_CONTEXT_ID2_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[6])) 44 + #define SOC15_CONTEXT_ID3_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[7])) 45 + 46 + #endif 47 +
+21 -5
drivers/gpu/drm/amd/include/kgd_kfd_interface.h
··· 100 100 /* Bit n == 1 means Queue n is available for KFD */ 101 101 DECLARE_BITMAP(queue_bitmap, KGD_MAX_QUEUES); 102 102 103 + /* Doorbell assignments (SOC15 and later chips only). Only 104 + * specific doorbells are routed to each SDMA engine. Others 105 + * are routed to IH and VCN. They are not usable by the CP. 106 + * 107 + * Any doorbell number D that satisfies the following condition 108 + * is reserved: (D & reserved_doorbell_mask) == reserved_doorbell_val 109 + * 110 + * KFD currently uses 1024 (= 0x3ff) doorbells per process. If 111 + * doorbells 0x0f0-0x0f7 and 0x2f0-0x2f7 are reserved, that means 112 + * mask would be set to 0x1f8 and val set to 0x0f0. 113 + */ 114 + unsigned int sdma_doorbell[2][2]; 115 + unsigned int reserved_doorbell_mask; 116 + unsigned int reserved_doorbell_val; 117 + 103 118 /* Base address of doorbell aperture. */ 104 119 phys_addr_t doorbell_physical_address; 105 120 ··· 187 172 * 188 173 * @set_pasid_vmid_mapping: Exposes pasid/vmid pair to the H/W for no cp 189 174 * scheduling mode. Only used for no cp scheduling mode. 190 175 * 191 - * @init_pipeline: Initialized the compute pipelines. 192 176 * 193 177 * @hqd_load: Loads the mqd structure to a H/W hqd slot. used only for no cp 194 178 * sceduling mode. 
··· 286 273 287 274 int (*set_pasid_vmid_mapping)(struct kgd_dev *kgd, unsigned int pasid, 288 275 unsigned int vmid); 289 - 290 - int (*init_pipeline)(struct kgd_dev *kgd, uint32_t pipe_id, 291 - uint32_t hpd_size, uint64_t hpd_gpu_addr); 292 276 293 277 int (*init_interrupts)(struct kgd_dev *kgd, uint32_t pipe_id); 294 278 ··· 392 382 * 393 383 * @resume: Notifies amdkfd about a resume action done to a kgd device 394 384 * 385 + * @quiesce_mm: Quiesce all user queue access to specified MM address space 386 + * 387 + * @resume_mm: Resume user queue access to specified MM address space 388 + * 395 389 * @schedule_evict_and_restore_process: Schedules work queue that will prepare 396 390 * for safe eviction of KFD BOs that belong to the specified process. 397 391 * ··· 413 399 void (*interrupt)(struct kfd_dev *kfd, const void *ih_ring_entry); 414 400 void (*suspend)(struct kfd_dev *kfd); 415 401 int (*resume)(struct kfd_dev *kfd); 402 + int (*quiesce_mm)(struct mm_struct *mm); 403 + int (*resume_mm)(struct mm_struct *mm); 416 404 int (*schedule_evict_and_restore_process)(struct mm_struct *mm, 417 405 struct dma_fence *fence); 418 406 };
+24 -24
drivers/gpu/drm/amd/include/v9_structs.h
··· 29 29 uint32_t sdmax_rlcx_rb_base; 30 30 uint32_t sdmax_rlcx_rb_base_hi; 31 31 uint32_t sdmax_rlcx_rb_rptr; 32 + uint32_t sdmax_rlcx_rb_rptr_hi; 32 33 uint32_t sdmax_rlcx_rb_wptr; 34 + uint32_t sdmax_rlcx_rb_wptr_hi; 33 35 uint32_t sdmax_rlcx_rb_wptr_poll_cntl; 34 - uint32_t sdmax_rlcx_rb_wptr_poll_addr_hi; 35 - uint32_t sdmax_rlcx_rb_wptr_poll_addr_lo; 36 36 uint32_t sdmax_rlcx_rb_rptr_addr_hi; 37 37 uint32_t sdmax_rlcx_rb_rptr_addr_lo; 38 38 uint32_t sdmax_rlcx_ib_cntl; ··· 44 44 uint32_t sdmax_rlcx_skip_cntl; 45 45 uint32_t sdmax_rlcx_context_status; 46 46 uint32_t sdmax_rlcx_doorbell; 47 - uint32_t sdmax_rlcx_virtual_addr; 48 - uint32_t sdmax_rlcx_ape1_cntl; 47 + uint32_t sdmax_rlcx_status; 49 48 uint32_t sdmax_rlcx_doorbell_log; 50 - uint32_t reserved_22; 51 - uint32_t reserved_23; 52 - uint32_t reserved_24; 53 - uint32_t reserved_25; 54 - uint32_t reserved_26; 55 - uint32_t reserved_27; 56 - uint32_t reserved_28; 57 - uint32_t reserved_29; 58 - uint32_t reserved_30; 59 - uint32_t reserved_31; 60 - uint32_t reserved_32; 61 - uint32_t reserved_33; 62 - uint32_t reserved_34; 63 - uint32_t reserved_35; 64 - uint32_t reserved_36; 65 - uint32_t reserved_37; 66 - uint32_t reserved_38; 67 - uint32_t reserved_39; 68 - uint32_t reserved_40; 69 - uint32_t reserved_41; 49 + uint32_t sdmax_rlcx_watermark; 50 + uint32_t sdmax_rlcx_doorbell_offset; 51 + uint32_t sdmax_rlcx_csa_addr_lo; 52 + uint32_t sdmax_rlcx_csa_addr_hi; 53 + uint32_t sdmax_rlcx_ib_sub_remain; 54 + uint32_t sdmax_rlcx_preempt; 55 + uint32_t sdmax_rlcx_dummy_reg; 56 + uint32_t sdmax_rlcx_rb_wptr_poll_addr_hi; 57 + uint32_t sdmax_rlcx_rb_wptr_poll_addr_lo; 58 + uint32_t sdmax_rlcx_rb_aql_cntl; 59 + uint32_t sdmax_rlcx_minor_ptr_update; 60 + uint32_t sdmax_rlcx_midcmd_data0; 61 + uint32_t sdmax_rlcx_midcmd_data1; 62 + uint32_t sdmax_rlcx_midcmd_data2; 63 + uint32_t sdmax_rlcx_midcmd_data3; 64 + uint32_t sdmax_rlcx_midcmd_data4; 65 + uint32_t sdmax_rlcx_midcmd_data5; 66 + uint32_t sdmax_rlcx_midcmd_data6; 
67 + uint32_t sdmax_rlcx_midcmd_data7; 68 + uint32_t sdmax_rlcx_midcmd_data8; 69 + uint32_t sdmax_rlcx_midcmd_cntl; 70 70 uint32_t reserved_42; 71 71 uint32_t reserved_43; 72 72 uint32_t reserved_44;