Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdkfd: Check preemption status on all XCDs

This patch adds the following functionality:
- Check the queue preemption status on all XCDs in a partition
for GFX 9.4.3.
- Update the queue preemption debug message to print the queue
doorbell id for which preemption failed.
- Change the signature of the check_preemption_failed function to
return a bool instead of uint32_t and to take the MQD manager
as an argument.

Suggested-by: Jay Cornwall <jay.cornwall@amd.com>
Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Mukul Joshi and committed by
Alex Deucher
0991a4c1 26d97182

+52 -14
+1 -2
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
··· 1997 1997 * check those fields 1998 1998 */ 1999 1999 mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; 2000 - if (mqd_mgr->check_preemption_failed(dqm->packet_mgr.priv_queue->queue->mqd)) { 2001 - dev_err(dev, "HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out\n"); 2000 + if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) { 2002 2001 while (halt_if_hws_hang) 2003 2002 schedule(); 2004 2003 return -ETIME;
+18
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
··· 290 290 { 291 291 return mm->mqd_size; 292 292 } 293 + 294 + bool kfd_check_hiq_mqd_doorbell_id(struct kfd_node *node, uint32_t doorbell_id, 295 + uint32_t inst) 296 + { 297 + if (doorbell_id) { 298 + struct device *dev = node->adev->dev; 299 + 300 + if (node->adev->xcp_mgr && node->adev->xcp_mgr->num_xcps > 0) 301 + dev_err(dev, "XCC %d: Queue preemption failed for queue with doorbell_id: %x\n", 302 + inst, doorbell_id); 303 + else 304 + dev_err(dev, "Queue preemption failed for queue with doorbell_id: %x\n", 305 + doorbell_id); 306 + return true; 307 + } 308 + 309 + return false; 310 + }
+3 -1
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
··· 119 119 #if defined(CONFIG_DEBUG_FS) 120 120 int (*debugfs_show_mqd)(struct seq_file *m, void *data); 121 121 #endif 122 - uint32_t (*check_preemption_failed)(void *mqd); 122 + bool (*check_preemption_failed)(struct mqd_manager *mm, void *mqd); 123 123 uint64_t (*mqd_stride)(struct mqd_manager *mm, 124 124 struct queue_properties *p); 125 125 ··· 198 198 uint64_t kfd_hiq_mqd_stride(struct kfd_node *dev); 199 199 uint64_t kfd_mqd_stride(struct mqd_manager *mm, 200 200 struct queue_properties *q); 201 + bool kfd_check_hiq_mqd_doorbell_id(struct kfd_node *node, uint32_t doorbell_id, 202 + uint32_t inst); 201 203 #endif /* KFD_MQD_MANAGER_H_ */
+2 -2
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
··· 206 206 q->is_active = QUEUE_IS_ACTIVE(*q); 207 207 } 208 208 209 - static uint32_t check_preemption_failed(void *mqd) 209 + static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) 210 210 { 211 211 struct cik_mqd *m = (struct cik_mqd *)mqd; 212 212 213 - return m->queue_doorbell_id0; 213 + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); 214 214 } 215 215 216 216 static void update_mqd(struct mqd_manager *mm, void *mqd,
+2 -2
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
··· 224 224 q->is_active = QUEUE_IS_ACTIVE(*q); 225 225 } 226 226 227 - static uint32_t check_preemption_failed(void *mqd) 227 + static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) 228 228 { 229 229 struct v10_compute_mqd *m = (struct v10_compute_mqd *)mqd; 230 230 231 - return m->queue_doorbell_id0; 231 + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); 232 232 } 233 233 234 234 static int get_wave_state(struct mqd_manager *mm, void *mqd,
+2 -2
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
··· 278 278 q->is_active = QUEUE_IS_ACTIVE(*q); 279 279 } 280 280 281 - static uint32_t check_preemption_failed(void *mqd) 281 + static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) 282 282 { 283 283 struct v11_compute_mqd *m = (struct v11_compute_mqd *)mqd; 284 284 285 - return m->queue_doorbell_id0; 285 + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); 286 286 } 287 287 288 288 static int get_wave_state(struct mqd_manager *mm, void *mqd,
+22 -3
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
··· 316 316 } 317 317 318 318 319 - static uint32_t check_preemption_failed(void *mqd) 319 + static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) 320 320 { 321 321 struct v9_mqd *m = (struct v9_mqd *)mqd; 322 322 323 - return m->queue_doorbell_id0; 323 + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); 324 324 } 325 325 326 326 static int get_wave_state(struct mqd_manager *mm, void *mqd, ··· 607 607 return err; 608 608 } 609 609 610 + static bool check_preemption_failed_v9_4_3(struct mqd_manager *mm, void *mqd) 611 + { 612 + uint64_t hiq_mqd_size = kfd_hiq_mqd_stride(mm->dev); 613 + uint32_t xcc_mask = mm->dev->xcc_mask; 614 + int inst = 0, xcc_id; 615 + struct v9_mqd *m; 616 + bool ret = false; 617 + 618 + for_each_inst(xcc_id, xcc_mask) { 619 + m = get_mqd(mqd + hiq_mqd_size * inst); 620 + ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev, 621 + m->queue_doorbell_id0, inst); 622 + ++inst; 623 + } 624 + 625 + return ret; 626 + } 627 + 610 628 static void get_xcc_mqd(struct kfd_mem_obj *mqd_mem_obj, 611 629 struct kfd_mem_obj *xcc_mqd_mem_obj, 612 630 uint64_t offset) ··· 899 881 #if defined(CONFIG_DEBUG_FS) 900 882 mqd->debugfs_show_mqd = debugfs_show_mqd; 901 883 #endif 902 - mqd->check_preemption_failed = check_preemption_failed; 903 884 if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 3)) { 904 885 mqd->init_mqd = init_mqd_hiq_v9_4_3; 905 886 mqd->load_mqd = hiq_load_mqd_kiq_v9_4_3; 906 887 mqd->destroy_mqd = destroy_hiq_mqd_v9_4_3; 888 + mqd->check_preemption_failed = check_preemption_failed_v9_4_3; 907 889 } else { 908 890 mqd->init_mqd = init_mqd_hiq; 909 891 mqd->load_mqd = kfd_hiq_load_mqd_kiq; 910 892 mqd->destroy_mqd = destroy_hiq_mqd; 893 + mqd->check_preemption_failed = check_preemption_failed; 911 894 } 912 895 break; 913 896 case KFD_MQD_TYPE_DIQ:
+2 -2
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
··· 237 237 q->is_active = QUEUE_IS_ACTIVE(*q); 238 238 } 239 239 240 - static uint32_t check_preemption_failed(void *mqd) 240 + static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) 241 241 { 242 242 struct vi_mqd *m = (struct vi_mqd *)mqd; 243 243 244 - return m->queue_doorbell_id0; 244 + return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); 245 245 } 246 246 247 247 static void update_mqd(struct mqd_manager *mm, void *mqd,