Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdkfd: Check HIQ's MQD for queue preemption status

MEC firmware can silently fail a queue preemption request
without reporting a timeout. In this case, the queue_doorbell_id
field of the HIQ's MQD will be set. Check this field to see whether
the last queue preemption was successful.

Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
Suggested-by: Jay Cornwall <Jay.Cornwall@amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Oak Zeng and committed by
Alex Deucher
51a0f459 6d909c5d

+66 -16
+17
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
··· 1393 1393 uint32_t filter_param) 1394 1394 { 1395 1395 int retval = 0; 1396 + struct mqd_manager *mqd_mgr; 1396 1397 1397 1398 if (!dqm->sched_running) 1398 1399 return 0; ··· 1423 1422 if (!dqm->is_resetting) 1424 1423 schedule_work(&dqm->hw_exception_work); 1425 1424 return retval; 1425 + } 1426 + 1427 + /* In the current MEC firmware implementation, if compute queue 1428 + * doesn't response to the preemption request in time, HIQ will 1429 + * abandon the unmap request without returning any timeout error 1430 + * to driver. Instead, MEC firmware will log the doorbell of the 1431 + * unresponding compute queue to HIQ.MQD.queue_doorbell_id fields. 1432 + * To make sure the queue unmap was successful, driver need to 1433 + * check those fields 1434 + */ 1435 + mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; 1436 + if (mqd_mgr->read_doorbell_id(dqm->packets.priv_queue->queue->mqd)) { 1437 + pr_err("HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out\n"); 1438 + while (halt_if_hws_hang) 1439 + schedule(); 1440 + return -ETIME; 1426 1441 } 1427 1442 1428 1443 pm_release_ib(&dqm->packets);
+1
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
··· 101 101 #if defined(CONFIG_DEBUG_FS) 102 102 int (*debugfs_show_mqd)(struct seq_file *m, void *data); 103 103 #endif 104 + uint32_t (*read_doorbell_id)(void *mqd); 104 105 105 106 struct mutex mqd_mutex; 106 107 struct kfd_dev *dev;
+8
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
··· 226 226 __update_mqd(mm, mqd, q, 1); 227 227 } 228 228 229 + static uint32_t read_doorbell_id(void *mqd) 230 + { 231 + struct cik_mqd *m = (struct cik_mqd *)mqd; 232 + 233 + return m->queue_doorbell_id0; 234 + } 235 + 229 236 static void update_mqd_hawaii(struct mqd_manager *mm, void *mqd, 230 237 struct queue_properties *q) 231 238 { ··· 405 398 #if defined(CONFIG_DEBUG_FS) 406 399 mqd->debugfs_show_mqd = debugfs_show_mqd; 407 400 #endif 401 + mqd->read_doorbell_id = read_doorbell_id; 408 402 break; 409 403 case KFD_MQD_TYPE_DIQ: 410 404 mqd->allocate_mqd = allocate_mqd;
+8
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
··· 224 224 q->is_active = QUEUE_IS_ACTIVE(*q); 225 225 } 226 226 227 + static uint32_t read_doorbell_id(void *mqd) 228 + { 229 + struct v10_compute_mqd *m = (struct v10_compute_mqd *)mqd; 230 + 231 + return m->queue_doorbell_id0; 232 + } 233 + 227 234 static int destroy_mqd(struct mqd_manager *mm, void *mqd, 228 235 enum kfd_preempt_type type, 229 236 unsigned int timeout, uint32_t pipe_id, ··· 432 425 #if defined(CONFIG_DEBUG_FS) 433 426 mqd->debugfs_show_mqd = debugfs_show_mqd; 434 427 #endif 428 + mqd->read_doorbell_id = read_doorbell_id; 435 429 pr_debug("%s@%i\n", __func__, __LINE__); 436 430 break; 437 431 case KFD_MQD_TYPE_DIQ:
+8
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
··· 276 276 } 277 277 278 278 279 + static uint32_t read_doorbell_id(void *mqd) 280 + { 281 + struct v9_mqd *m = (struct v9_mqd *)mqd; 282 + 283 + return m->queue_doorbell_id0; 284 + } 285 + 279 286 static int destroy_mqd(struct mqd_manager *mm, void *mqd, 280 287 enum kfd_preempt_type type, 281 288 unsigned int timeout, uint32_t pipe_id, ··· 484 477 #if defined(CONFIG_DEBUG_FS) 485 478 mqd->debugfs_show_mqd = debugfs_show_mqd; 486 479 #endif 480 + mqd->read_doorbell_id = read_doorbell_id; 487 481 break; 488 482 case KFD_MQD_TYPE_DIQ: 489 483 mqd->allocate_mqd = allocate_mqd;
+8
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
··· 243 243 __update_mqd(mm, mqd, q, MTYPE_CC, 1); 244 244 } 245 245 246 + static uint32_t read_doorbell_id(void *mqd) 247 + { 248 + struct vi_mqd *m = (struct vi_mqd *)mqd; 249 + 250 + return m->queue_doorbell_id0; 251 + } 252 + 246 253 static void update_mqd_tonga(struct mqd_manager *mm, void *mqd, 247 254 struct queue_properties *q) 248 255 { ··· 453 446 #if defined(CONFIG_DEBUG_FS) 454 447 mqd->debugfs_show_mqd = debugfs_show_mqd; 455 448 #endif 449 + mqd->read_doorbell_id = read_doorbell_id; 456 450 break; 457 451 case KFD_MQD_TYPE_DIQ: 458 452 mqd->allocate_mqd = allocate_mqd;
+16 -16
drivers/gpu/drm/amd/include/vi_structs.h
··· 397 397 uint32_t reserved60; 398 398 uint32_t reserved61; 399 399 uint32_t reserved62; 400 - uint32_t reserved63; 401 - uint32_t reserved64; 402 - uint32_t reserved65; 403 - uint32_t reserved66; 404 - uint32_t reserved67; 405 - uint32_t reserved68; 406 - uint32_t reserved69; 407 - uint32_t reserved70; 408 - uint32_t reserved71; 409 - uint32_t reserved72; 410 - uint32_t reserved73; 411 - uint32_t reserved74; 412 - uint32_t reserved75; 413 - uint32_t reserved76; 414 - uint32_t reserved77; 415 - uint32_t reserved78; 400 + uint32_t queue_doorbell_id0; 401 + uint32_t queue_doorbell_id1; 402 + uint32_t queue_doorbell_id2; 403 + uint32_t queue_doorbell_id3; 404 + uint32_t queue_doorbell_id4; 405 + uint32_t queue_doorbell_id5; 406 + uint32_t queue_doorbell_id6; 407 + uint32_t queue_doorbell_id7; 408 + uint32_t queue_doorbell_id8; 409 + uint32_t queue_doorbell_id9; 410 + uint32_t queue_doorbell_id10; 411 + uint32_t queue_doorbell_id11; 412 + uint32_t queue_doorbell_id12; 413 + uint32_t queue_doorbell_id13; 414 + uint32_t queue_doorbell_id14; 415 + uint32_t queue_doorbell_id15; 416 416 uint32_t reserved_t[256]; 417 417 }; 418 418