Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdkfd: Update BadOpcode Interrupt handling with MES

Based on the recommendation of MEC FW, update BadOpcode interrupt
handling when using the MES scheduler: unmap all queues, remove the
queue that got the interrupt from scheduling, and remap the rest of
the queues. This is done to prevent the case where unmapping of the
bad queue fails, thereby causing a GPU reset.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Acked-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Mukul Joshi and committed by
Alex Deucher
eb067d65 9a16042f

+58 -3
+51
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
··· 2931 2931 kfree(dqm); 2932 2932 } 2933 2933 2934 + int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id) 2935 + { 2936 + struct kfd_process_device *pdd; 2937 + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); 2938 + struct device_queue_manager *dqm = knode->dqm; 2939 + struct device *dev = dqm->dev->adev->dev; 2940 + struct qcm_process_device *qpd; 2941 + struct queue *q = NULL; 2942 + int ret = 0; 2943 + 2944 + if (!p) 2945 + return -EINVAL; 2946 + 2947 + dqm_lock(dqm); 2948 + 2949 + pdd = kfd_get_process_device_data(dqm->dev, p); 2950 + if (pdd) { 2951 + qpd = &pdd->qpd; 2952 + 2953 + list_for_each_entry(q, &qpd->queues_list, list) { 2954 + if (q->doorbell_id == doorbell_id && q->properties.is_active) { 2955 + ret = suspend_all_queues_mes(dqm); 2956 + if (ret) { 2957 + dev_err(dev, "Suspending all queues failed"); 2958 + goto out; 2959 + } 2960 + 2961 + q->properties.is_evicted = true; 2962 + q->properties.is_active = false; 2963 + decrement_queue_count(dqm, qpd, q); 2964 + 2965 + ret = remove_queue_mes(dqm, q, qpd); 2966 + if (ret) { 2967 + dev_err(dev, "Removing bad queue failed"); 2968 + goto out; 2969 + } 2970 + 2971 + ret = resume_all_queues_mes(dqm); 2972 + if (ret) 2973 + dev_err(dev, "Resuming all queues failed"); 2974 + 2975 + break; 2976 + } 2977 + } 2978 + } 2979 + 2980 + out: 2981 + dqm_unlock(dqm); 2982 + return ret; 2983 + } 2984 + 2934 2985 static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm, 2935 2986 struct qcm_process_device *qpd) 2936 2987 {
+6 -3
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
··· 330 330 if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) 331 331 kfd_signal_event_interrupt(pasid, context_id0, 32); 332 332 else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && 333 - KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) 334 - kfd_set_dbg_ev_from_interrupt(dev, pasid, 335 - KFD_CTXID0_DOORBELL_ID(context_id0), 333 + KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) { 334 + u32 doorbell_id = KFD_CTXID0_DOORBELL_ID(context_id0); 335 + 336 + kfd_set_dbg_ev_from_interrupt(dev, pasid, doorbell_id, 336 337 KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)), 337 338 NULL, 0); 339 + kfd_dqm_suspend_bad_queue_mes(dev, pasid, doorbell_id); 340 + } 338 341 339 342 /* SDMA */ 340 343 else if (source_id == SOC21_INTSRC_SDMA_TRAP)
+1
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
··· 1324 1324 enum kfd_queue_type type); 1325 1325 void kernel_queue_uninit(struct kernel_queue *kq); 1326 1326 int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid); 1327 + int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id); 1327 1328 1328 1329 /* Process Queue Manager */ 1329 1330 struct process_queue_node {