Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdkfd: add EDC error interrupt handling for poison propagation mode

In poison propagation mode, when the driver receives the EDC error interrupt
from SQ, the driver should kill the process (identified by pasid) that is
using the poisoned data, and then trigger a GPU reset.

Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Dennis Li and committed by
Alex Deucher
20161e51 611ed9a5

+122 -7
+122 -7
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
··· 25 25 #include "soc15_int.h" 26 26 #include "kfd_device_queue_manager.h" 27 27 #include "kfd_smi_events.h" 28 + #include "amdgpu.h" 29 + 30 + enum SQ_INTERRUPT_WORD_ENCODING { 31 + SQ_INTERRUPT_WORD_ENCODING_AUTO = 0x0, 32 + SQ_INTERRUPT_WORD_ENCODING_INST, 33 + SQ_INTERRUPT_WORD_ENCODING_ERROR, 34 + }; 35 + 36 + enum SQ_INTERRUPT_ERROR_TYPE { 37 + SQ_INTERRUPT_ERROR_TYPE_EDC_FUE = 0x0, 38 + SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST, 39 + SQ_INTERRUPT_ERROR_TYPE_MEMVIOL, 40 + SQ_INTERRUPT_ERROR_TYPE_EDC_FED, 41 + }; 42 + 43 + /* SQ_INTERRUPT_WORD_AUTO_CTXID */ 44 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE__SHIFT 0 45 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__WLT__SHIFT 1 46 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_BUF_FULL__SHIFT 2 47 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__REG_TIMESTAMP__SHIFT 3 48 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__CMD_TIMESTAMP__SHIFT 4 49 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_CMD_OVERFLOW__SHIFT 5 50 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_REG_OVERFLOW__SHIFT 6 51 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__IMMED_OVERFLOW__SHIFT 7 52 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_UTC_ERROR__SHIFT 8 53 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__SE_ID__SHIFT 24 54 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__ENCODING__SHIFT 26 55 + 56 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_MASK 0x00000001 57 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__WLT_MASK 0x00000002 58 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_BUF_FULL_MASK 0x00000004 59 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__REG_TIMESTAMP_MASK 0x00000008 60 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__CMD_TIMESTAMP_MASK 0x00000010 61 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_CMD_OVERFLOW_MASK 0x00000020 62 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__HOST_REG_OVERFLOW_MASK 0x00000040 63 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__IMMED_OVERFLOW_MASK 0x00000080 64 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__THREAD_TRACE_UTC_ERROR_MASK 0x00000100 65 + #define 
SQ_INTERRUPT_WORD_AUTO_CTXID__SE_ID_MASK 0x03000000 66 + #define SQ_INTERRUPT_WORD_AUTO_CTXID__ENCODING_MASK 0x0c000000 67 + 68 + /* SQ_INTERRUPT_WORD_WAVE_CTXID */ 69 + #define SQ_INTERRUPT_WORD_WAVE_CTXID__DATA__SHIFT 0 70 + #define SQ_INTERRUPT_WORD_WAVE_CTXID__SH_ID__SHIFT 12 71 + #define SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV__SHIFT 13 72 + #define SQ_INTERRUPT_WORD_WAVE_CTXID__WAVE_ID__SHIFT 14 73 + #define SQ_INTERRUPT_WORD_WAVE_CTXID__SIMD_ID__SHIFT 18 74 + #define SQ_INTERRUPT_WORD_WAVE_CTXID__CU_ID__SHIFT 20 75 + #define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID__SHIFT 24 76 + #define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING__SHIFT 26 77 + 78 + #define SQ_INTERRUPT_WORD_WAVE_CTXID__DATA_MASK 0x00000fff 79 + #define SQ_INTERRUPT_WORD_WAVE_CTXID__SH_ID_MASK 0x00001000 80 + #define SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV_MASK 0x00002000 81 + #define SQ_INTERRUPT_WORD_WAVE_CTXID__WAVE_ID_MASK 0x0003c000 82 + #define SQ_INTERRUPT_WORD_WAVE_CTXID__SIMD_ID_MASK 0x000c0000 83 + #define SQ_INTERRUPT_WORD_WAVE_CTXID__CU_ID_MASK 0x00f00000 84 + #define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID_MASK 0x03000000 85 + #define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING_MASK 0x0c000000 86 + 87 + #define KFD_CONTEXT_ID_GET_SQ_INT_DATA(ctx0, ctx1) \ 88 + ((ctx0 & 0xfff) | ((ctx0 >> 16) & 0xf000) | ((ctx1 << 16) & 0xff0000)) 89 + 90 + #define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000 91 + #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20 28 92 29 93 static bool event_interrupt_isr_v9(struct kfd_dev *dev, 30 94 const uint32_t *ih_ring_entry, ··· 172 108 const uint32_t *ih_ring_entry) 173 109 { 174 110 uint16_t source_id, client_id, pasid, vmid; 175 - uint32_t context_id; 111 + uint32_t context_id0, context_id1; 112 + uint32_t sq_intr_err, sq_int_data, encoding; 176 113 177 114 source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); 178 115 client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); 179 116 pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); 180 117 vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); 
181 - context_id = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry); 118 + context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry); 119 + context_id1 = SOC15_CONTEXT_ID1_FROM_IH_ENTRY(ih_ring_entry); 182 120 183 121 if (client_id == SOC15_IH_CLIENTID_GRBM_CP || 184 122 client_id == SOC15_IH_CLIENTID_SE0SH || ··· 188 122 client_id == SOC15_IH_CLIENTID_SE2SH || 189 123 client_id == SOC15_IH_CLIENTID_SE3SH) { 190 124 if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) 191 - kfd_signal_event_interrupt(pasid, context_id, 32); 192 - else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) 193 - kfd_signal_event_interrupt(pasid, context_id & 0xffffff, 24); 194 - else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) 125 + kfd_signal_event_interrupt(pasid, context_id0, 32); 126 + else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) { 127 + sq_int_data = KFD_CONTEXT_ID_GET_SQ_INT_DATA(context_id0, context_id1); 128 + encoding = REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, ENCODING); 129 + switch (encoding) { 130 + case SQ_INTERRUPT_WORD_ENCODING_AUTO: 131 + pr_debug( 132 + "sq_intr: auto, se %d, ttrace %d, wlt %d, ttrac_buf_full %d, reg_tms %d, cmd_tms %d, host_cmd_ovf %d, host_reg_ovf %d, immed_ovf %d, ttrace_utc_err %d\n", 133 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, SE_ID), 134 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE), 135 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, WLT), 136 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE_BUF_FULL), 137 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, REG_TIMESTAMP), 138 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, CMD_TIMESTAMP), 139 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, HOST_CMD_OVERFLOW), 140 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, HOST_REG_OVERFLOW), 141 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, IMMED_OVERFLOW), 142 + REG_GET_FIELD(context_id0, 
SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE_UTC_ERROR)); 143 + break; 144 + case SQ_INTERRUPT_WORD_ENCODING_INST: 145 + pr_debug("sq_intr: inst, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, intr_data 0x%x\n", 146 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SE_ID), 147 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, DATA), 148 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SH_ID), 149 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, PRIV), 150 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, WAVE_ID), 151 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SIMD_ID), 152 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, CU_ID), 153 + sq_int_data); 154 + break; 155 + case SQ_INTERRUPT_WORD_ENCODING_ERROR: 156 + sq_intr_err = REG_GET_FIELD(sq_int_data, KFD_SQ_INT_DATA, ERR_TYPE); 157 + pr_warn("sq_intr: error, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, err_type %d\n", 158 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SE_ID), 159 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, DATA), 160 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SH_ID), 161 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, PRIV), 162 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, WAVE_ID), 163 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SIMD_ID), 164 + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, CU_ID), 165 + sq_intr_err); 166 + if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && 167 + sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) { 168 + kfd_signal_hw_exception_event(pasid); 169 + amdgpu_amdkfd_gpu_reset(dev->kgd); 170 + return; 171 + } 172 + break; 173 + default: 174 + break; 175 + } 176 + kfd_signal_event_interrupt(pasid, context_id0 & 0xffffff, 24); 177 + } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) 195 178 kfd_signal_hw_exception_event(pasid); 196 179 } else if (client_id == 
SOC15_IH_CLIENTID_SDMA0 || 197 180 client_id == SOC15_IH_CLIENTID_SDMA1 || ··· 251 136 client_id == SOC15_IH_CLIENTID_SDMA6 || 252 137 client_id == SOC15_IH_CLIENTID_SDMA7) { 253 138 if (source_id == SOC15_INTSRC_SDMA_TRAP) 254 - kfd_signal_event_interrupt(pasid, context_id & 0xfffffff, 28); 139 + kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28); 255 140 } else if (client_id == SOC15_IH_CLIENTID_VMC || 256 141 client_id == SOC15_IH_CLIENTID_VMC1 || 257 142 client_id == SOC15_IH_CLIENTID_UTCL2) {