Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdkfd: Add wave control operation to debugger

The wave control operation supports several command types executed upon
existing wave fronts that belong to the currently debugged process.

The available commands are:

HALT - Freeze wave front(s) execution
RESUME - Resume frozen wave front(s) execution
KILL - Kill existing wave front(s)

Signed-off-by: Yair Shachar <yair.shachar@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>

authored by

Yair Shachar and committed by
Oded Gabbay
788bf83d fbeb661b

+430 -2
+405
drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
··· 47 47 dev->kfd2kgd->address_watch_disable(dev->kgd); 48 48 } 49 49 50 + static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, 51 + unsigned int pasid, uint64_t vmid0_address, 52 + uint32_t *packet_buff, size_t size_in_bytes) 53 + { 54 + struct pm4__release_mem *rm_packet; 55 + struct pm4__indirect_buffer_pasid *ib_packet; 56 + struct kfd_mem_obj *mem_obj; 57 + size_t pq_packets_size_in_bytes; 58 + union ULARGE_INTEGER *largep; 59 + union ULARGE_INTEGER addr; 60 + struct kernel_queue *kq; 61 + uint64_t *rm_state; 62 + unsigned int *ib_packet_buff; 63 + int status; 64 + 65 + BUG_ON(!dbgdev || !dbgdev->kq || !packet_buff || !size_in_bytes); 66 + 67 + kq = dbgdev->kq; 68 + 69 + pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + 70 + sizeof(struct pm4__indirect_buffer_pasid); 71 + 72 + /* 73 + * We acquire a buffer from DIQ 74 + * The receive packet buff will be sitting on the Indirect Buffer 75 + * and in the PQ we put the IB packet + sync packet(s). 76 + */ 77 + status = kq->ops.acquire_packet_buffer(kq, 78 + pq_packets_size_in_bytes / sizeof(uint32_t), 79 + &ib_packet_buff); 80 + if (status != 0) { 81 + pr_err("amdkfd: acquire_packet_buffer failed\n"); 82 + return status; 83 + } 84 + 85 + memset(ib_packet_buff, 0, pq_packets_size_in_bytes); 86 + 87 + ib_packet = (struct pm4__indirect_buffer_pasid *) (ib_packet_buff); 88 + 89 + ib_packet->header.count = 3; 90 + ib_packet->header.opcode = IT_INDIRECT_BUFFER_PASID; 91 + ib_packet->header.type = PM4_TYPE_3; 92 + 93 + largep = (union ULARGE_INTEGER *) &vmid0_address; 94 + 95 + ib_packet->bitfields2.ib_base_lo = largep->u.low_part >> 2; 96 + ib_packet->bitfields3.ib_base_hi = largep->u.high_part; 97 + 98 + ib_packet->control = (1 << 23) | (1 << 31) | 99 + ((size_in_bytes / sizeof(uint32_t)) & 0xfffff); 100 + 101 + ib_packet->bitfields5.pasid = pasid; 102 + 103 + /* 104 + * for now we use release mem for GPU-CPU synchronization 105 + * Consider WaitRegMem + WriteData as a better alternative 106 + * we get 
a GART allocations ( gpu/cpu mapping), 107 + * for the sync variable, and wait until: 108 + * (a) Sync with HW 109 + * (b) Sync var is written by CP to mem. 110 + */ 111 + rm_packet = (struct pm4__release_mem *) (ib_packet_buff + 112 + (sizeof(struct pm4__indirect_buffer_pasid) / 113 + sizeof(unsigned int))); 114 + 115 + status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t), 116 + &mem_obj); 117 + 118 + if (status != 0) { 119 + pr_err("amdkfd: Failed to allocate GART memory\n"); 120 + kq->ops.rollback_packet(kq); 121 + return status; 122 + } 123 + 124 + rm_state = (uint64_t *) mem_obj->cpu_ptr; 125 + 126 + *rm_state = QUEUESTATE__ACTIVE_COMPLETION_PENDING; 127 + 128 + rm_packet->header.opcode = IT_RELEASE_MEM; 129 + rm_packet->header.type = PM4_TYPE_3; 130 + rm_packet->header.count = sizeof(struct pm4__release_mem) / 131 + sizeof(unsigned int) - 2; 132 + 133 + rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; 134 + rm_packet->bitfields2.event_index = 135 + event_index___release_mem__end_of_pipe; 136 + 137 + rm_packet->bitfields2.cache_policy = cache_policy___release_mem__lru; 138 + rm_packet->bitfields2.atc = 0; 139 + rm_packet->bitfields2.tc_wb_action_ena = 1; 140 + 141 + addr.quad_part = mem_obj->gpu_addr; 142 + 143 + rm_packet->bitfields4.address_lo_32b = addr.u.low_part >> 2; 144 + rm_packet->address_hi = addr.u.high_part; 145 + 146 + rm_packet->bitfields3.data_sel = 147 + data_sel___release_mem__send_64_bit_data; 148 + 149 + rm_packet->bitfields3.int_sel = 150 + int_sel___release_mem__send_data_after_write_confirm; 151 + 152 + rm_packet->bitfields3.dst_sel = 153 + dst_sel___release_mem__memory_controller; 154 + 155 + rm_packet->data_lo = QUEUESTATE__ACTIVE; 156 + 157 + kq->ops.submit_packet(kq); 158 + 159 + /* Wait till CP writes sync code: */ 160 + status = amdkfd_fence_wait_timeout( 161 + (unsigned int *) rm_state, 162 + QUEUESTATE__ACTIVE, 1500); 163 + 164 + kfd_gtt_sa_free(dbgdev->dev, mem_obj); 165 + 166 + return status; 167 + } 168 + 
50 169 static int dbgdev_register_nodiq(struct kfd_dbgdev *dbgdev) 51 170 { 52 171 BUG_ON(!dbgdev); ··· 236 117 return status; 237 118 } 238 119 120 + static int dbgdev_wave_control_set_registers( 121 + struct dbg_wave_control_info *wac_info, 122 + union SQ_CMD_BITS *in_reg_sq_cmd, 123 + union GRBM_GFX_INDEX_BITS *in_reg_gfx_index) 124 + { 125 + int status; 126 + union SQ_CMD_BITS reg_sq_cmd; 127 + union GRBM_GFX_INDEX_BITS reg_gfx_index; 128 + struct HsaDbgWaveMsgAMDGen2 *pMsg; 129 + 130 + BUG_ON(!wac_info || !in_reg_sq_cmd || !in_reg_gfx_index); 131 + 132 + reg_sq_cmd.u32All = 0; 133 + reg_gfx_index.u32All = 0; 134 + pMsg = &wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2; 135 + 136 + switch (wac_info->mode) { 137 + /* Send command to single wave */ 138 + case HSA_DBG_WAVEMODE_SINGLE: 139 + /* 140 + * Limit access to the process waves only, 141 + * by setting vmid check 142 + */ 143 + reg_sq_cmd.bits.check_vmid = 1; 144 + reg_sq_cmd.bits.simd_id = pMsg->ui32.SIMD; 145 + reg_sq_cmd.bits.wave_id = pMsg->ui32.WaveId; 146 + reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_SINGLE; 147 + 148 + reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray; 149 + reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine; 150 + reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU; 151 + 152 + break; 153 + 154 + /* Send command to all waves with matching VMID */ 155 + case HSA_DBG_WAVEMODE_BROADCAST_PROCESS: 156 + 157 + reg_gfx_index.bits.sh_broadcast_writes = 1; 158 + reg_gfx_index.bits.se_broadcast_writes = 1; 159 + reg_gfx_index.bits.instance_broadcast_writes = 1; 160 + 161 + reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; 162 + 163 + break; 164 + 165 + /* Send command to all CU waves with matching VMID */ 166 + case HSA_DBG_WAVEMODE_BROADCAST_PROCESS_CU: 167 + 168 + reg_sq_cmd.bits.check_vmid = 1; 169 + reg_sq_cmd.bits.mode = SQ_IND_CMD_MODE_BROADCAST; 170 + 171 + reg_gfx_index.bits.sh_index = pMsg->ui32.ShaderArray; 172 + reg_gfx_index.bits.se_index = pMsg->ui32.ShaderEngine; 173 + 
reg_gfx_index.bits.instance_index = pMsg->ui32.HSACU; 174 + 175 + break; 176 + 177 + default: 178 + return -EINVAL; 179 + } 180 + 181 + switch (wac_info->operand) { 182 + case HSA_DBG_WAVEOP_HALT: 183 + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; 184 + break; 185 + 186 + case HSA_DBG_WAVEOP_RESUME: 187 + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; 188 + break; 189 + 190 + case HSA_DBG_WAVEOP_KILL: 191 + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_KILL; 192 + break; 193 + 194 + case HSA_DBG_WAVEOP_DEBUG: 195 + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_DEBUG; 196 + break; 197 + 198 + case HSA_DBG_WAVEOP_TRAP: 199 + if (wac_info->trapId < MAX_TRAPID) { 200 + reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_TRAP; 201 + reg_sq_cmd.bits.trap_id = wac_info->trapId; 202 + } else { 203 + status = -EINVAL; 204 + } 205 + break; 206 + 207 + default: 208 + status = -EINVAL; 209 + break; 210 + } 211 + 212 + if (status == 0) { 213 + *in_reg_sq_cmd = reg_sq_cmd; 214 + *in_reg_gfx_index = reg_gfx_index; 215 + } 216 + 217 + return status; 218 + } 219 + 220 + static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, 221 + struct dbg_wave_control_info *wac_info) 222 + { 223 + 224 + int status; 225 + union SQ_CMD_BITS reg_sq_cmd; 226 + union GRBM_GFX_INDEX_BITS reg_gfx_index; 227 + struct kfd_mem_obj *mem_obj; 228 + uint32_t *packet_buff_uint; 229 + struct pm4__set_config_reg *packets_vec; 230 + size_t ib_size = sizeof(struct pm4__set_config_reg) * 3; 231 + 232 + BUG_ON(!dbgdev || !wac_info); 233 + 234 + reg_sq_cmd.u32All = 0; 235 + 236 + status = dbgdev_wave_control_set_registers(wac_info, &reg_sq_cmd, 237 + &reg_gfx_index); 238 + if (status) { 239 + pr_err("amdkfd: Failed to set wave control registers\n"); 240 + return status; 241 + } 242 + 243 + /* we do not control the VMID in DIQ,so reset it to a known value */ 244 + reg_sq_cmd.bits.vm_id = 0; 245 + 246 + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); 247 + 248 + pr_debug("\t\t mode is: %u\n", wac_info->mode); 249 + pr_debug("\t\t 
operand is: %u\n", wac_info->operand); 250 + pr_debug("\t\t trap id is: %u\n", wac_info->trapId); 251 + pr_debug("\t\t msg value is: %u\n", 252 + wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); 253 + pr_debug("\t\t vmid is: N/A\n"); 254 + 255 + pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); 256 + pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); 257 + pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); 258 + pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); 259 + pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); 260 + pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); 261 + pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); 262 + 263 + pr_debug("\t\t ibw is : %u\n", 264 + reg_gfx_index.bitfields.instance_broadcast_writes); 265 + pr_debug("\t\t ii is : %u\n", 266 + reg_gfx_index.bitfields.instance_index); 267 + pr_debug("\t\t sebw is : %u\n", 268 + reg_gfx_index.bitfields.se_broadcast_writes); 269 + pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); 270 + pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); 271 + pr_debug("\t\t sbw is : %u\n", 272 + reg_gfx_index.bitfields.sh_broadcast_writes); 273 + 274 + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); 275 + 276 + status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); 277 + 278 + if (status != 0) { 279 + pr_err("amdkfd: Failed to allocate GART memory\n"); 280 + return status; 281 + } 282 + 283 + packet_buff_uint = mem_obj->cpu_ptr; 284 + 285 + memset(packet_buff_uint, 0, ib_size); 286 + 287 + packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; 288 + packets_vec[0].header.count = 1; 289 + packets_vec[0].header.opcode = IT_SET_UCONFIG_REG; 290 + packets_vec[0].header.type = PM4_TYPE_3; 291 + packets_vec[0].bitfields2.reg_offset = 292 + GRBM_GFX_INDEX / (sizeof(uint32_t)) - 293 + USERCONFIG_REG_BASE; 294 + 295 + 
packets_vec[0].bitfields2.insert_vmid = 0; 296 + packets_vec[0].reg_data[0] = reg_gfx_index.u32All; 297 + 298 + packets_vec[1].header.count = 1; 299 + packets_vec[1].header.opcode = IT_SET_CONFIG_REG; 300 + packets_vec[1].header.type = PM4_TYPE_3; 301 + packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) - 302 + CONFIG_REG_BASE; 303 + 304 + packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET; 305 + packets_vec[1].bitfields2.insert_vmid = 1; 306 + packets_vec[1].reg_data[0] = reg_sq_cmd.u32All; 307 + 308 + /* Restore the GRBM_GFX_INDEX register */ 309 + 310 + reg_gfx_index.u32All = 0; 311 + reg_gfx_index.bits.sh_broadcast_writes = 1; 312 + reg_gfx_index.bits.instance_broadcast_writes = 1; 313 + reg_gfx_index.bits.se_broadcast_writes = 1; 314 + 315 + 316 + packets_vec[2].ordinal1 = packets_vec[0].ordinal1; 317 + packets_vec[2].bitfields2.reg_offset = 318 + GRBM_GFX_INDEX / (sizeof(uint32_t)) - 319 + USERCONFIG_REG_BASE; 320 + 321 + packets_vec[2].bitfields2.insert_vmid = 0; 322 + packets_vec[2].reg_data[0] = reg_gfx_index.u32All; 323 + 324 + status = dbgdev_diq_submit_ib( 325 + dbgdev, 326 + wac_info->process->pasid, 327 + mem_obj->gpu_addr, 328 + packet_buff_uint, 329 + ib_size); 330 + 331 + if (status != 0) 332 + pr_err("amdkfd: Failed to submit IB to DIQ\n"); 333 + 334 + kfd_gtt_sa_free(dbgdev->dev, mem_obj); 335 + 336 + return status; 337 + } 338 + 339 + static int dbgdev_wave_control_nodiq(struct kfd_dbgdev *dbgdev, 340 + struct dbg_wave_control_info *wac_info) 341 + { 342 + int status; 343 + union SQ_CMD_BITS reg_sq_cmd; 344 + union GRBM_GFX_INDEX_BITS reg_gfx_index; 345 + struct kfd_process_device *pdd; 346 + 347 + BUG_ON(!dbgdev || !dbgdev->dev || !wac_info); 348 + 349 + reg_sq_cmd.u32All = 0; 350 + 351 + /* taking the VMID for that process on the safe way using PDD */ 352 + pdd = kfd_get_process_device_data(dbgdev->dev, wac_info->process); 353 + 354 + if (!pdd) { 355 + pr_err("amdkfd: Failed to get pdd for wave control no DIQ\n"); 356 + 
return -EFAULT; 357 + } 358 + status = dbgdev_wave_control_set_registers(wac_info, &reg_sq_cmd, 359 + &reg_gfx_index); 360 + if (status) { 361 + pr_err("amdkfd: Failed to set wave control registers\n"); 362 + return status; 363 + } 364 + 365 + /* for non DIQ we need to patch the VMID: */ 366 + 367 + reg_sq_cmd.bits.vm_id = pdd->qpd.vmid; 368 + 369 + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); 370 + 371 + pr_debug("\t\t mode is: %u\n", wac_info->mode); 372 + pr_debug("\t\t operand is: %u\n", wac_info->operand); 373 + pr_debug("\t\t trap id is: %u\n", wac_info->trapId); 374 + pr_debug("\t\t msg value is: %u\n", 375 + wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); 376 + pr_debug("\t\t vmid is: %u\n", pdd->qpd.vmid); 377 + 378 + pr_debug("\t\t chk_vmid is : %u\n", reg_sq_cmd.bitfields.check_vmid); 379 + pr_debug("\t\t command is : %u\n", reg_sq_cmd.bitfields.cmd); 380 + pr_debug("\t\t queue id is : %u\n", reg_sq_cmd.bitfields.queue_id); 381 + pr_debug("\t\t simd id is : %u\n", reg_sq_cmd.bitfields.simd_id); 382 + pr_debug("\t\t mode is : %u\n", reg_sq_cmd.bitfields.mode); 383 + pr_debug("\t\t vm_id is : %u\n", reg_sq_cmd.bitfields.vm_id); 384 + pr_debug("\t\t wave_id is : %u\n", reg_sq_cmd.bitfields.wave_id); 385 + 386 + pr_debug("\t\t ibw is : %u\n", 387 + reg_gfx_index.bitfields.instance_broadcast_writes); 388 + pr_debug("\t\t ii is : %u\n", 389 + reg_gfx_index.bitfields.instance_index); 390 + pr_debug("\t\t sebw is : %u\n", 391 + reg_gfx_index.bitfields.se_broadcast_writes); 392 + pr_debug("\t\t se_ind is : %u\n", reg_gfx_index.bitfields.se_index); 393 + pr_debug("\t\t sh_ind is : %u\n", reg_gfx_index.bitfields.sh_index); 394 + pr_debug("\t\t sbw is : %u\n", 395 + reg_gfx_index.bitfields.sh_broadcast_writes); 396 + 397 + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); 398 + 399 + return dbgdev->dev->kfd2kgd->wave_control_execute(dbgdev->dev->kgd, 400 + reg_gfx_index.u32All, 401 + reg_sq_cmd.u32All); 402 + } 403 + 239 404 
void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev, 240 405 enum DBGDEV_TYPE type) 241 406 { ··· 534 131 case DBGDEV_TYPE_NODIQ: 535 132 pdbgdev->dbgdev_register = dbgdev_register_nodiq; 536 133 pdbgdev->dbgdev_unregister = dbgdev_unregister_nodiq; 134 + pdbgdev->dbgdev_wave_control = dbgdev_wave_control_nodiq; 537 135 break; 538 136 case DBGDEV_TYPE_DIQ: 539 137 default: 540 138 pdbgdev->dbgdev_register = dbgdev_register_diq; 541 139 pdbgdev->dbgdev_unregister = dbgdev_unregister_diq; 140 + pdbgdev->dbgdev_wave_control = dbgdev_wave_control_diq; 542 141 break; 543 142 } 544 143
+16
drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
··· 133 133 134 134 return 0; 135 135 } 136 + 137 + long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, 138 + struct dbg_wave_control_info *wac_info) 139 + { 140 + BUG_ON(!pmgr || !pmgr->dbgdev || !wac_info); 141 + 142 + /* Is the requests coming from the already registered process? */ 143 + if (pmgr->pasid != wac_info->process->pasid) { 144 + pr_debug("H/W debugger support was not registered for requester pasid %d\n", 145 + wac_info->process->pasid); 146 + return -EINVAL; 147 + } 148 + 149 + return (long) pmgr->dbgdev->dbgdev_wave_control(pmgr->dbgdev, wac_info); 150 + } 151 +
+4 -1
drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.h
··· 268 268 /* virtualized function pointers to device dbg */ 269 269 int (*dbgdev_register)(struct kfd_dbgdev *dbgdev); 270 270 int (*dbgdev_unregister)(struct kfd_dbgdev *dbgdev); 271 + int (*dbgdev_wave_control)(struct kfd_dbgdev *dbgdev, 272 + struct dbg_wave_control_info *wac_info); 271 273 272 274 }; 273 275 ··· 285 283 bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev); 286 284 long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p); 287 285 long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p); 288 - 286 + long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, 287 + struct dbg_wave_control_info *wac_info); 289 288 #endif /* KFD_DBGMGR_H_ */
+1 -1
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
··· 915 915 return retval; 916 916 } 917 917 918 - static int amdkfd_fence_wait_timeout(unsigned int *fence_addr, 918 + int amdkfd_fence_wait_timeout(unsigned int *fence_addr, 919 919 unsigned int fence_value, 920 920 unsigned long timeout) 921 921 {
+4
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
··· 656 656 struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, 657 657 unsigned int qid); 658 658 659 + int amdkfd_fence_wait_timeout(unsigned int *fence_addr, 660 + unsigned int fence_value, 661 + unsigned long timeout); 662 + 659 663 /* Packet Manager */ 660 664 661 665 #define KFD_HIQ_TIMEOUT (500)