Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

drm/amdkfd: allow users to target recommended SDMA engines

Certain GPUs have better copy performance over xGMI on specific
SDMA engines depending on the source and destination GPU.
Allow users to create SDMA queues on these recommended engines.
Close to 2x overall performance has been observed with this
optimization.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by Jonathan Kim, committed by Alex Deucher
e06b71b2 60c30ba7

+116 -3
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c (+16)
···
 			args->ctx_save_restore_address;
 	q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size;
 	q_properties->ctl_stack_size = args->ctl_stack_size;
+	q_properties->sdma_engine_id = args->sdma_engine_id;
 	if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
 	    args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
 		q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
···
 		q_properties->type = KFD_QUEUE_TYPE_SDMA;
 	else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI)
 		q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI;
+	else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID)
+		q_properties->type = KFD_QUEUE_TYPE_SDMA_BY_ENG_ID;
 	else
 		return -ENOTSUPP;
···
 		goto err_bind_process;
 	}
 
+	if (q_properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
+		int max_sdma_eng_id = kfd_get_num_sdma_engines(dev) +
+				      kfd_get_num_xgmi_sdma_engines(dev) - 1;
+
+		if (q_properties.sdma_engine_id > max_sdma_eng_id) {
+			err = -EINVAL;
+			pr_err("sdma_engine_id %i exceeds maximum id of %i\n",
+			       q_properties.sdma_engine_id, max_sdma_eng_id);
+			goto err_sdma_engine_id;
+		}
+	}
+
 	if (!pdd->qpd.proc_doorbells) {
 		err = kfd_alloc_process_doorbells(dev->kfd, pdd);
 		if (err) {
···
 err_create_queue:
 	kfd_queue_release_buffers(pdd, &q_properties);
 err_acquire_queue_buf:
+err_sdma_engine_id:
 err_bind_process:
 err_pdd:
 	mutex_unlock(&p->mutex);
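The check above defines the valid range for the new field: regular SDMA engines occupy ids 0 through kfd_get_num_sdma_engines() - 1, and the XGMI SDMA engines follow immediately after. Below is a hedged userspace sketch of the same bound, assuming the node's topology properties file exposes num_sdma_engines and num_sdma_xgmi_engines (property names as found in the KFD topology sysfs; verify on the target kernel):

/*
 * Hedged sketch: derive the valid sdma_engine_id range for a node the same
 * way the kernel check above does.  The sysfs path and property names are
 * assumptions based on the KFD topology tree.
 */
#include <stdio.h>
#include <string.h>

static int kfd_node_max_sdma_eng_id(int node)
{
	char path[128], name[64];
	unsigned long long val;
	int num_sdma = 0, num_xgmi_sdma = 0;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/class/kfd/kfd/topology/nodes/%d/properties", node);
	f = fopen(path, "r");
	if (!f)
		return -1;

	/* Properties are "name value" pairs, one per line. */
	while (fscanf(f, "%63s %llu", name, &val) == 2) {
		if (!strcmp(name, "num_sdma_engines"))
			num_sdma = val;
		else if (!strcmp(name, "num_sdma_xgmi_engines"))
			num_xgmi_sdma = val;
	}
	fclose(f);

	/* Mirrors max_sdma_eng_id in kfd_ioctl_create_queue(). */
	return num_sdma + num_xgmi_sdma - 1;
}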
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c (+37 -1)
···
 			q->sdma_id % kfd_get_num_xgmi_sdma_engines(dqm->dev);
 		q->properties.sdma_queue_id = q->sdma_id /
 			kfd_get_num_xgmi_sdma_engines(dqm->dev);
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
+		int i, num_queues, num_engines, eng_offset = 0, start_engine;
+		bool free_bit_found = false, is_xgmi = false;
+
+		if (q->properties.sdma_engine_id < kfd_get_num_sdma_engines(dqm->dev)) {
+			num_queues = get_num_sdma_queues(dqm);
+			num_engines = kfd_get_num_sdma_engines(dqm->dev);
+			q->properties.type = KFD_QUEUE_TYPE_SDMA;
+		} else {
+			num_queues = get_num_xgmi_sdma_queues(dqm);
+			num_engines = kfd_get_num_xgmi_sdma_engines(dqm->dev);
+			eng_offset = kfd_get_num_sdma_engines(dqm->dev);
+			q->properties.type = KFD_QUEUE_TYPE_SDMA_XGMI;
+			is_xgmi = true;
+		}
+
+		/* Scan available bit based on target engine ID. */
+		start_engine = q->properties.sdma_engine_id - eng_offset;
+		for (i = start_engine; i < num_queues; i += num_engines) {
+
+			if (!test_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap))
+				continue;
+
+			clear_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap);
+			q->sdma_id = i;
+			q->properties.sdma_queue_id = q->sdma_id / num_engines;
+			free_bit_found = true;
+			break;
+		}
+
+		if (!free_bit_found) {
+			dev_err(dev, "No more SDMA queue to allocate for target ID %i\n",
+				q->properties.sdma_engine_id);
+			return -ENOMEM;
+		}
 	}
 
 	pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
···
 	}
 
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
-	    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+	    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI ||
+	    q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
 		dqm_lock(dqm);
 		retval = allocate_sdma_queue(dqm, q, qd ? &qd->sdma_id : NULL);
 		dqm_unlock(dqm);
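The allocator above relies on the existing slot layout: bit i in the SDMA bitmap corresponds to engine (i % num_engines) and per-engine queue (i / num_engines), so finding a slot for one target engine means stepping through the bitmap in strides of num_engines. A standalone toy illustration of that scan (plain C, not kernel code; engine and queue counts are made up):

#include <stdio.h>

#define NUM_ENGINES	8
#define NUM_QUEUES	64	/* 8 queues per engine */

static unsigned long long bitmap = ~0ULL;	/* all slots free */

static int alloc_slot(int engine, int *queue_id)
{
	int i;

	/* Same stride pattern as allocate_sdma_queue() above. */
	for (i = engine; i < NUM_QUEUES; i += NUM_ENGINES) {
		if (!(bitmap & (1ULL << i)))
			continue;		/* slot busy, try next stride */
		bitmap &= ~(1ULL << i);		/* claim it */
		*queue_id = i / NUM_ENGINES;
		return i;
	}
	return -1;				/* engine fully subscribed */
}

int main(void)
{
	int qid, slot = alloc_slot(2, &qid);

	printf("engine 2 -> slot %d, queue id %d\n", slot, qid);	/* slot 2, qid 0 */
	slot = alloc_slot(2, &qid);
	printf("engine 2 -> slot %d, queue id %d\n", slot, qid);	/* slot 10, qid 1 */
	return 0;
}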
drivers/gpu/drm/amd/amdkfd/kfd_priv.h (+4 -1)
···
  * @KFD_QUEUE_TYPE_DIQ: DIQ queue type.
  *
  * @KFD_QUEUE_TYPE_SDMA_XGMI: Special SDMA queue for XGMI interface.
+ *
+ * @KFD_QUEUE_TYPE_SDMA_BY_ENG_ID: SDMA user mode queue with target SDMA engine ID.
  */
 enum kfd_queue_type  {
 	KFD_QUEUE_TYPE_COMPUTE,
 	KFD_QUEUE_TYPE_SDMA,
 	KFD_QUEUE_TYPE_HIQ,
 	KFD_QUEUE_TYPE_DIQ,
-	KFD_QUEUE_TYPE_SDMA_XGMI
+	KFD_QUEUE_TYPE_SDMA_XGMI,
+	KFD_QUEUE_TYPE_SDMA_BY_ENG_ID
 };
 
 enum kfd_queue_format {
drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c (+1)
···
 	switch (type) {
 	case KFD_QUEUE_TYPE_SDMA:
 	case KFD_QUEUE_TYPE_SDMA_XGMI:
+	case KFD_QUEUE_TYPE_SDMA_BY_ENG_ID:
 		/* SDMA queues are always allocated statically no matter
 		 * which scheduler mode is used. We also do not need to
 		 * check whether a SDMA queue can be allocated here, because
drivers/gpu/drm/amd/amdkfd/kfd_topology.c (+52)
···
 			      iolink->max_bandwidth);
 	sysfs_show_32bit_prop(buffer, offs, "recommended_transfer_size",
 			      iolink->rec_transfer_size);
+	sysfs_show_32bit_prop(buffer, offs, "recommended_sdma_engine_id_mask",
+			      iolink->rec_sdma_eng_id_mask);
 	sysfs_show_32bit_prop(buffer, offs, "flags", iolink->flags);
 
 	return offs;
···
 		}
 	}
 
+#define REC_SDMA_NUM_GPU	8
+static const int rec_sdma_eng_map[REC_SDMA_NUM_GPU][REC_SDMA_NUM_GPU] = {
+	{ -1, 14, 12,  2,  4,  8, 10,  6 },
+	{ 14, -1,  2, 10,  8,  4,  6, 12 },
+	{ 10,  2, -1, 12, 14,  6,  4,  8 },
+	{  2, 12, 10, -1,  6, 14,  8,  4 },
+	{  4,  8, 14,  6, -1, 10, 12,  2 },
+	{  8,  4,  6, 14, 12, -1,  2, 10 },
+	{ 10,  6,  4,  8, 12,  2, -1, 14 },
+	{  6, 12,  8,  4,  2, 10, 14, -1 }};
+
+static void kfd_set_recommended_sdma_engines(struct kfd_topology_device *to_dev,
+					     struct kfd_iolink_properties *outbound_link,
+					     struct kfd_iolink_properties *inbound_link)
+{
+	struct kfd_node *gpu = outbound_link->gpu;
+	struct amdgpu_device *adev = gpu->adev;
+	int num_xgmi_nodes = adev->gmc.xgmi.num_physical_nodes;
+	bool support_rec_eng = !amdgpu_sriov_vf(adev) && to_dev->gpu &&
+		adev->aid_mask && num_xgmi_nodes &&
+		(amdgpu_xcp_query_partition_mode(adev->xcp_mgr, AMDGPU_XCP_FL_NONE) ==
+			AMDGPU_SPX_PARTITION_MODE) &&
+		(!(adev->flags & AMD_IS_APU) && num_xgmi_nodes == 8);
+
+	if (support_rec_eng) {
+		int src_socket_id = adev->gmc.xgmi.physical_node_id;
+		int dst_socket_id = to_dev->gpu->adev->gmc.xgmi.physical_node_id;
+
+		outbound_link->rec_sdma_eng_id_mask =
+			1 << rec_sdma_eng_map[src_socket_id][dst_socket_id];
+		inbound_link->rec_sdma_eng_id_mask =
+			1 << rec_sdma_eng_map[dst_socket_id][src_socket_id];
+	} else {
+		int num_sdma_eng = kfd_get_num_sdma_engines(gpu);
+		int i, eng_offset = 0;
+
+		if (outbound_link->iolink_type == CRAT_IOLINK_TYPE_XGMI &&
+		    kfd_get_num_xgmi_sdma_engines(gpu) && to_dev->gpu) {
+			eng_offset = num_sdma_eng;
+			num_sdma_eng = kfd_get_num_xgmi_sdma_engines(gpu);
+		}
+
+		for (i = 0; i < num_sdma_eng; i++) {
+			outbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset));
+			inbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset));
+		}
+	}
+}
+
 static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
 {
 	struct kfd_iolink_properties *link, *inbound_link;
···
 			inbound_link->flags = CRAT_IOLINK_FLAGS_ENABLED;
 			kfd_set_iolink_no_atomics(peer_dev, dev, inbound_link);
 			kfd_set_iolink_non_coherent(peer_dev, link, inbound_link);
+			kfd_set_recommended_sdma_engines(peer_dev, link, inbound_link);
 		}
 	}
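With this change each io_link directory gains a recommended_sdma_engine_id_mask property: on supported 8-GPU xGMI systems exactly one bit is set per direction (taken from the hardcoded socket-pair table), otherwise the mask simply covers every engine of the matching class. A hedged sketch of consuming it from userspace, assuming the usual KFD topology sysfs layout (verify the path on the target kernel):

/*
 * Hedged sketch: read recommended_sdma_engine_id_mask from an io_link's
 * properties file and turn it into a target engine id for queue creation.
 */
#include <stdio.h>
#include <string.h>
#include <strings.h>	/* ffs() */

static int rec_sdma_engine(int node, int link)
{
	char path[160], name[64];
	unsigned long long val;
	unsigned int mask = 0;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/class/kfd/kfd/topology/nodes/%d/io_links/%d/properties",
		 node, link);
	f = fopen(path, "r");
	if (!f)
		return -1;

	while (fscanf(f, "%63s %llu", name, &val) == 2)
		if (!strcmp(name, "recommended_sdma_engine_id_mask"))
			mask = val;
	fclose(f);

	/* Bit n set means SDMA engine n is recommended for this link. */
	return mask ? ffs(mask) - 1 : -1;
}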
drivers/gpu/drm/amd/amdkfd/kfd_topology.h (+1)
···
 	uint32_t min_bandwidth;
 	uint32_t max_bandwidth;
 	uint32_t rec_transfer_size;
+	uint32_t rec_sdma_eng_id_mask;
 	uint32_t flags;
 	struct kfd_node *gpu;
 	struct kobject *kobj;
include/uapi/linux/kfd_ioctl.h (+5 -1)
···
  * - 1.14 - Update kfd_event_data
  * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl
  * - 1.16 - Add contiguous VRAM allocation flag
+ * - 1.17 - Add SDMA queue creation with target SDMA engine ID
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 16
+#define KFD_IOCTL_MINOR_VERSION 17
 
 struct kfd_ioctl_get_version_args {
 	__u32 major_version;	/* from KFD */
···
 #define KFD_IOC_QUEUE_TYPE_SDMA			0x1
 #define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL		0x2
 #define KFD_IOC_QUEUE_TYPE_SDMA_XGMI		0x3
+#define KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID	0x4
 
 #define KFD_MAX_QUEUE_PERCENTAGE	100
 #define KFD_MAX_QUEUE_PRIORITY		15
···
 	__u64 ctx_save_restore_address; /* to KFD */
 	__u32 ctx_save_restore_size;	/* to KFD */
 	__u32 ctl_stack_size;		/* to KFD */
+	__u32 sdma_engine_id;		/* to KFD */
+	__u32 pad;
 };
 
 struct kfd_ioctl_destroy_queue_args {
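Putting the uAPI pieces together, a process requests a specific engine by creating its queue with KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID and filling the new sdma_engine_id field. A minimal hedged sketch follows; gpu_id, the engine id, and the ring/pointer setup are placeholders (a real client, normally the ROCm Thunk, sizes and maps these properly, and out-of-range engine ids fail with -EINVAL per the kfd_chardev.c check above):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kfd_ioctl.h>

int main(void)
{
	struct kfd_ioctl_create_queue_args args;
	static __u64 rptr, wptr;	/* queue read/write pointers */
	void *ring;
	int kfd = open("/dev/kfd", O_RDWR | O_CLOEXEC);

	if (kfd < 0 || posix_memalign(&ring, 4096, 4096))
		return 1;
	memset(ring, 0, 4096);

	memset(&args, 0, sizeof(args));
	args.gpu_id = 0x1234;				/* placeholder: from topology */
	args.queue_type = KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID;
	args.sdma_engine_id = 2;			/* e.g. ffs(rec mask) - 1 */
	args.ring_base_address = (uintptr_t)ring;
	args.ring_size = 4096;
	args.read_pointer_address = (uintptr_t)&rptr;
	args.write_pointer_address = (uintptr_t)&wptr;
	args.queue_percentage = KFD_MAX_QUEUE_PERCENTAGE;
	args.queue_priority = 7;

	if (ioctl(kfd, AMDKFD_IOC_CREATE_QUEUE, &args))
		perror("AMDKFD_IOC_CREATE_QUEUE");
	else
		printf("queue id %u on SDMA engine %u\n",
		       args.queue_id, args.sdma_engine_id);

	close(kfd);
	return 0;
}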