/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/list.h>
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "soc15.h"
#include "df/df_3_6_offset.h"
#include "xgmi/xgmi_4_0_0_smn.h"
#include "xgmi/xgmi_4_0_0_sh_mask.h"
#include "xgmi/xgmi_6_1_0_sh_mask.h"
#include "wafl/wafl2_4_0_0_smn.h"
#include "wafl/wafl2_4_0_0_sh_mask.h"

#include "amdgpu_reset.h"

#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
#define smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK 0x11a00218
#define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210
#define smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK 0x12200218

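/*
 * PCS link-state encodings reported in the low byte of the PCS state
 * history register read by xgmi_v6_4_get_link_status() below.
 */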
#define XGMI_STATE_DISABLE 0xD1
#define XGMI_STATE_LS0 0x81

static DEFINE_MUTEX(xgmi_mutex);

#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4

static LIST_HEAD(xgmi_hive_list);

static const int xgmi_pcs_err_status_reg_vg20[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
};

static const int wafl_pcs_err_status_reg_vg20[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

static const int xgmi_pcs_err_status_reg_arct[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000,
};

/* same as vg20 */
static const int wafl_pcs_err_status_reg_arct[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = {
	smnPCS_XGMI3X16_PCS_ERROR_STATUS,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000
};

static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[] = {
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK,
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000,
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x200000,
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x300000,
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x400000,
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x500000,
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x600000,
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x700000
};

static const int walf_pcs_err_status_reg_aldebaran[] = {
	smnPCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000
};

static const int walf_pcs_err_noncorrectable_mask_reg_aldebaran[] = {
	smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK,
	smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
};

static const int xgmi3x16_pcs_err_status_reg_v6_4[] = {
	smnPCS_XGMI3X16_PCS_ERROR_STATUS,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000
};

static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[] = {
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK,
	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
};

static const u64 xgmi_v6_4_0_mca_base_array[] = {
	0x11a09200,
	0x11b09200,
};

static const char *xgmi_v6_4_0_ras_error_code_ext[32] = {
	[0x00] = "XGMI PCS DataLossErr",
	[0x01] = "XGMI PCS TrainingErr",
	[0x02] = "XGMI PCS FlowCtrlAckErr",
	[0x03] = "XGMI PCS RxFifoUnderflowErr",
	[0x04] = "XGMI PCS RxFifoOverflowErr",
	[0x05] = "XGMI PCS CRCErr",
	[0x06] = "XGMI PCS BERExceededErr",
	[0x07] = "XGMI PCS TxMetaDataErr",
	[0x08] = "XGMI PCS ReplayBufParityErr",
	[0x09] = "XGMI PCS DataParityErr",
	[0x0a] = "XGMI PCS ReplayFifoOverflowErr",
	[0x0b] = "XGMI PCS ReplayFifoUnderflowErr",
	[0x0c] = "XGMI PCS ElasticFifoOverflowErr",
	[0x0d] = "XGMI PCS DeskewErr",
	[0x0e] = "XGMI PCS FlowCtrlCRCErr",
	[0x0f] = "XGMI PCS DataStartupLimitErr",
	[0x10] = "XGMI PCS FCInitTimeoutErr",
	[0x11] = "XGMI PCS RecoveryTimeoutErr",
	[0x12] = "XGMI PCS ReadySerialTimeoutErr",
	[0x13] = "XGMI PCS ReadySerialAttemptErr",
	[0x14] = "XGMI PCS RecoveryAttemptErr",
	[0x15] = "XGMI PCS RecoveryRelockAttemptErr",
	[0x16] = "XGMI PCS ReplayAttemptErr",
	[0x17] = "XGMI PCS SyncHdrErr",
	[0x18] = "XGMI PCS TxReplayTimeoutErr",
	[0x19] = "XGMI PCS RxReplayTimeoutErr",
	[0x1a] = "XGMI PCS LinkSubTxTimeoutErr",
	[0x1b] = "XGMI PCS LinkSubRxTimeoutErr",
	[0x1c] = "XGMI PCS RxCMDPktErr",
};

static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
	{"XGMI PCS DataLossErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
	{"XGMI PCS TrainingErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
	{"XGMI PCS CRCErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
	{"XGMI PCS BERExceededErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
	{"XGMI PCS TxMetaDataErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"XGMI PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"XGMI PCS DataParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
	{"XGMI PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"XGMI PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"XGMI PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"XGMI PCS DeskewErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
	{"XGMI PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"XGMI PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"XGMI PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"XGMI PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"XGMI PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"XGMI PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"XGMI PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};

static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
	{"WAFL PCS DataLossErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
	{"WAFL PCS TrainingErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
	{"WAFL PCS CRCErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
	{"WAFL PCS BERExceededErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
	{"WAFL PCS TxMetaDataErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"WAFL PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"WAFL PCS DataParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
	{"WAFL PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"WAFL PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"WAFL PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"WAFL PCS DeskewErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
	{"WAFL PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"WAFL PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"WAFL PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"WAFL PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"WAFL PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"WAFL PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"WAFL PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};

static const struct amdgpu_pcs_ras_field xgmi3x16_pcs_ras_fields[] = {
	{"XGMI3X16 PCS DataLossErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataLossErr)},
	{"XGMI3X16 PCS TrainingErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TrainingErr)},
	{"XGMI3X16 PCS FlowCtrlAckErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FlowCtrlAckErr)},
	{"XGMI3X16 PCS RxFifoUnderflowErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxFifoUnderflowErr)},
	{"XGMI3X16 PCS RxFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxFifoOverflowErr)},
	{"XGMI3X16 PCS CRCErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, CRCErr)},
	{"XGMI3X16 PCS BERExceededErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, BERExceededErr)},
	{"XGMI3X16 PCS TxVcidDataErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TxVcidDataErr)},
	{"XGMI3X16 PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"XGMI3X16 PCS DataParityErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataParityErr)},
	{"XGMI3X16 PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"XGMI3X16 PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"XGMI3X16 PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"XGMI3X16 PCS DeskewErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DeskewErr)},
	{"XGMI3X16 PCS FlowCtrlCRCErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FlowCtrlCRCErr)},
	{"XGMI3X16 PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"XGMI3X16 PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"XGMI3X16 PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"XGMI3X16 PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"XGMI3X16 PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"XGMI3X16 PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"XGMI3X16 PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
	{"XGMI3X16 PCS ReplayAttemptErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayAttemptErr)},
	{"XGMI3X16 PCS SyncHdrErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, SyncHdrErr)},
	{"XGMI3X16 PCS TxReplayTimeoutErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TxReplayTimeoutErr)},
	{"XGMI3X16 PCS RxReplayTimeoutErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxReplayTimeoutErr)},
	{"XGMI3X16 PCS LinkSubTxTimeoutErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, LinkSubTxTimeoutErr)},
	{"XGMI3X16 PCS LinkSubRxTimeoutErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, LinkSubRxTimeoutErr)},
	{"XGMI3X16 PCS RxCMDPktErr",
	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxCMDPktErr)},
};

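/*
 * Map a logical link number to its extended-link index on XGMI 6.4.x
 * parts via a fixed remap table. Returns -EINVAL for single-node
 * configurations and unsupported IP versions.
 */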
int amdgpu_xgmi_get_ext_link(struct amdgpu_device *adev, int link_num)
{
	int link_map_6_4_x[8] = { 0, 3, 1, 2, 7, 6, 4, 5 };

	if (adev->gmc.xgmi.num_physical_nodes <= 1)
		return -EINVAL;

	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
	case IP_VERSION(6, 4, 0):
	case IP_VERSION(6, 4, 1):
		if (link_num < ARRAY_SIZE(link_map_6_4_x))
			return link_map_6_4_x[link_num];
		break;
	default:
		return -EINVAL;
	}

	return -EINVAL;
}

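/*
 * Read the PCS state history register for a global link number. Links are
 * spread across AIDs: global_link_num % n selects the per-AID register and
 * global_link_num / n selects the AID instance.
 */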
static u32 xgmi_v6_4_get_link_status(struct amdgpu_device *adev, int global_link_num)
{
	const u32 smn_xgmi_6_4_pcs_state_hist1[2] = { 0x11a00070, 0x11b00070 };
	const u32 smn_xgmi_6_4_1_pcs_state_hist1[2] = { 0x12100070,
							0x11b00070 };
	u32 i, n;
	u64 addr;

	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
	case IP_VERSION(6, 4, 0):
		n = ARRAY_SIZE(smn_xgmi_6_4_pcs_state_hist1);
		addr = smn_xgmi_6_4_pcs_state_hist1[global_link_num % n];
		break;
	case IP_VERSION(6, 4, 1):
		n = ARRAY_SIZE(smn_xgmi_6_4_1_pcs_state_hist1);
		addr = smn_xgmi_6_4_1_pcs_state_hist1[global_link_num % n];
		break;
	default:
		return U32_MAX;
	}

	i = global_link_num / n;

	if (!(adev->aid_mask & BIT(i)))
		return U32_MAX;

	addr += amdgpu_reg_get_smn_base64(adev, XGMI_HWIP, i);

	return RREG32_PCIE_EXT(addr);
}

int amdgpu_get_xgmi_link_status(struct amdgpu_device *adev, int global_link_num)
{
	u32 xgmi_state_reg_val;

	if (amdgpu_sriov_vf(adev))
		return AMDGPU_XGMI_LINK_NA;

	if (adev->gmc.xgmi.num_physical_nodes <= 1)
		return -EINVAL;

	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
	case IP_VERSION(6, 4, 0):
	case IP_VERSION(6, 4, 1):
		xgmi_state_reg_val = xgmi_v6_4_get_link_status(adev, global_link_num);
		break;
	default:
		return -EOPNOTSUPP;
	}

	if ((xgmi_state_reg_val & 0xFF) == XGMI_STATE_DISABLE)
		return -ENOLINK;

	if ((xgmi_state_reg_val & 0xFF) == XGMI_STATE_LS0)
		return AMDGPU_XGMI_LINK_ACTIVE;

	return AMDGPU_XGMI_LINK_INACTIVE;
}

/**
 * DOC: AMDGPU XGMI Support
 *
 * XGMI is a high speed interconnect that joins multiple GPU cards
 * into a homogeneous memory space that is organized by a collective
 * hive ID and individual node IDs, both of which are 64-bit numbers.
 *
 * The file xgmi_device_id contains the unique per GPU device ID and
 * is stored in the /sys/class/drm/card${cardno}/device/ directory.
 *
 * Inside the device directory a sub-directory 'xgmi_hive_info' is
 * created which contains the hive ID and the list of nodes.
 *
 * The hive ID is stored in:
 * /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
 *
 * The node information is stored in numbered directories:
 * /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
 *
 * Each device has its own xgmi_hive_info directory with a mirror
 * set of node sub-directories.
 *
 * The XGMI memory space is built by contiguously appending the
 * power-of-two-padded VRAM space of each node.
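 *
 * For example, the hive topology can be read from user space (the card
 * number, node number and IDs below are illustrative, not real values):
 *
 *   $ cat /sys/class/drm/card0/device/xgmi_hive_info/xgmi_hive_id
 *   5467239882837054193
 *   $ cat /sys/class/drm/card0/device/xgmi_hive_info/node1/xgmi_device_id
 *   3254022259103198288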
 *
 */

static struct attribute amdgpu_xgmi_hive_id = {
	.name = "xgmi_hive_id",
	.mode = S_IRUGO
};

static struct attribute *amdgpu_xgmi_hive_attrs[] = {
	&amdgpu_xgmi_hive_id,
	NULL
};
ATTRIBUTE_GROUPS(amdgpu_xgmi_hive);

static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj,
				      struct attribute *attr, char *buf)
{
	struct amdgpu_hive_info *hive = container_of(
		kobj, struct amdgpu_hive_info, kobj);

	if (attr == &amdgpu_xgmi_hive_id)
		return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);

	return 0;
}

static void amdgpu_xgmi_hive_release(struct kobject *kobj)
{
	struct amdgpu_hive_info *hive = container_of(
		kobj, struct amdgpu_hive_info, kobj);

	amdgpu_reset_put_reset_domain(hive->reset_domain);
	hive->reset_domain = NULL;

	mutex_destroy(&hive->hive_lock);
	kfree(hive);
}

static const struct sysfs_ops amdgpu_xgmi_hive_ops = {
	.show = amdgpu_xgmi_show_attrs,
};

static const struct kobj_type amdgpu_xgmi_hive_type = {
	.release = amdgpu_xgmi_hive_release,
	.sysfs_ops = &amdgpu_xgmi_hive_ops,
	.default_groups = amdgpu_xgmi_hive_groups,
};

static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.node_id);
}

static ssize_t amdgpu_xgmi_show_physical_id(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%u\n", adev->gmc.xgmi.physical_node_id);
}

static ssize_t amdgpu_xgmi_show_num_hops(struct device *dev,
					 struct device_attribute *attr,
					 char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	int i, offset = 0;

	for (i = 0; i < top->num_nodes; i++)
		offset += sysfs_emit_at(buf, offset, "%02x ", top->nodes[i].num_hops);

	return offset + sysfs_emit_at(buf, offset, "\n");
}

static ssize_t amdgpu_xgmi_show_num_links(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	int i, offset = 0;

	for (i = 0; i < top->num_nodes; i++)
		offset += sysfs_emit_at(buf, offset, "%02x ", top->nodes[i].num_links);

	return offset + sysfs_emit_at(buf, offset, "\n");
}

static ssize_t amdgpu_xgmi_show_connected_port_num(struct device *dev,
						   struct device_attribute *attr,
						   char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	int i, j, size = 0;
	int current_node;
	/*
	 * Get the node id shown in sysfs for the current socket and use it
	 * in the port num info output for easy reading. Note it is NOT the
	 * node id retrieved from the XGMI TA.
	 */
	for (i = 0; i < top->num_nodes; i++) {
		if (top->nodes[i].node_id == adev->gmc.xgmi.node_id) {
			current_node = i;
			break;
		}
	}

	if (i == top->num_nodes)
		return -EINVAL;

	for (i = 0; i < top->num_nodes; i++) {
		for (j = 0; j < top->nodes[i].num_links; j++)
			/* node id in sysfs starts from 1 rather than 0 so +1 here */
			size += sysfs_emit_at(buf, size, "%02x:%02x -> %02x:%02x\n", current_node + 1,
					      top->nodes[i].port_num[j].src_xgmi_port_num, i + 1,
					      top->nodes[i].port_num[j].dst_xgmi_port_num);
	}

	return size;
}

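/*
 * Build the DF FICAA (fabric indirect config access address) value consumed
 * by the get_fica()/set_fica() DF callbacks below; (o) is the register
 * offset being targeted and the constant supplies the access-enable bits.
 */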
#define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801)
static ssize_t amdgpu_xgmi_show_error(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
	uint64_t fica_out;
	unsigned int error_count = 0;

	ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
	ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);

	if ((!adev->df.funcs) ||
	    (!adev->df.funcs->get_fica) ||
	    (!adev->df.funcs->set_fica))
		return -EINVAL;

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
	if (fica_out != 0x1f)
		pr_err("xGMI error counters not enabled!\n");

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);

	if ((fica_out & 0xffff) == 2)
		error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);

	adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);

	return sysfs_emit(buf, "%u\n", error_count);
}


static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
static DEVICE_ATTR(xgmi_physical_id, 0444, amdgpu_xgmi_show_physical_id, NULL);
static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);
static DEVICE_ATTR(xgmi_num_hops, S_IRUGO, amdgpu_xgmi_show_num_hops, NULL);
static DEVICE_ATTR(xgmi_num_links, S_IRUGO, amdgpu_xgmi_show_num_links, NULL);
static DEVICE_ATTR(xgmi_port_num, S_IRUGO, amdgpu_xgmi_show_connected_port_num, NULL);

static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
					  struct amdgpu_hive_info *hive)
{
	int ret = 0;
	char node[10] = { 0 };

	/* Create xgmi device id file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
		return ret;
	}

	ret = device_create_file(adev->dev, &dev_attr_xgmi_physical_id);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_physical_id\n");
		return ret;
	}

	/* Create xgmi error file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
	if (ret)
		pr_err("failed to create xgmi_error\n");

	/* Create xgmi num hops file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_num_hops);
	if (ret)
		pr_err("failed to create xgmi_num_hops\n");

	/* Create xgmi num links file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_num_links);
	if (ret)
		pr_err("failed to create xgmi_num_links\n");

	/* Create xgmi port num file if supported */
	if (adev->psp.xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG) {
		ret = device_create_file(adev->dev, &dev_attr_xgmi_port_num);
		if (ret)
			dev_err(adev->dev, "failed to create xgmi_port_num\n");
	}

	/* Create sysfs link to hive info folder on the first device */
	if (hive->kobj.parent != (&adev->dev->kobj)) {
		ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj,
					"xgmi_hive_info");
		if (ret) {
			dev_err(adev->dev, "XGMI: Failed to create link to hive info");
			goto remove_file;
		}
	}

	sprintf(node, "node%d", atomic_read(&hive->number_devices));
	/* Create sysfs link from the hive folder to yourself */
	ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create link from hive info");
		goto remove_link;
	}

	goto success;


remove_link:
	sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique);

remove_file:
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
	device_remove_file(adev->dev, &dev_attr_xgmi_physical_id);
	device_remove_file(adev->dev, &dev_attr_xgmi_error);
	device_remove_file(adev->dev, &dev_attr_xgmi_num_hops);
	device_remove_file(adev->dev, &dev_attr_xgmi_num_links);
	if (adev->psp.xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG)
		device_remove_file(adev->dev, &dev_attr_xgmi_port_num);

success:
	return ret;
}

static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
					   struct amdgpu_hive_info *hive)
{
	char node[10];
	memset(node, 0, sizeof(node));

	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
	device_remove_file(adev->dev, &dev_attr_xgmi_physical_id);
	device_remove_file(adev->dev, &dev_attr_xgmi_error);
	device_remove_file(adev->dev, &dev_attr_xgmi_num_hops);
	device_remove_file(adev->dev, &dev_attr_xgmi_num_links);
	if (adev->psp.xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG)
		device_remove_file(adev->dev, &dev_attr_xgmi_port_num);

	if (hive->kobj.parent != (&adev->dev->kobj))
		sysfs_remove_link(&adev->dev->kobj, "xgmi_hive_info");

	sprintf(node, "node%d", atomic_read(&hive->number_devices));
	sysfs_remove_link(&hive->kobj, node);
}

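/*
 * Look up the hive this device belongs to, creating it on first use. The
 * returned hive carries an extra kobject reference; callers must drop it
 * with amdgpu_put_xgmi_hive() when done.
 */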
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive = NULL;
	int ret;

	if (!adev->gmc.xgmi.hive_id)
		return NULL;

	if (adev->hive) {
		kobject_get(&adev->hive->kobj);
		return adev->hive;
	}

	mutex_lock(&xgmi_mutex);

	list_for_each_entry(hive, &xgmi_hive_list, node) {
		if (hive->hive_id == adev->gmc.xgmi.hive_id)
			goto pro_end;
	}

	hive = kzalloc(sizeof(*hive), GFP_KERNEL);
	if (!hive) {
		dev_err(adev->dev, "XGMI: allocation failed\n");
		ret = -ENOMEM;
		hive = NULL;
		goto pro_end;
	}

	/* initialize a new hive if one doesn't exist yet */
	ret = kobject_init_and_add(&hive->kobj,
				   &amdgpu_xgmi_hive_type,
				   &adev->dev->kobj,
				   "%s", "xgmi_hive_info");
	if (ret) {
		dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n");
		kobject_put(&hive->kobj);
		hive = NULL;
		goto pro_end;
	}

	/*
	 * Only init hive->reset_domain for non-SRIOV configurations. For SRIOV,
	 * the host driver decides how to reset the GPU, either through FLR or
	 * chain reset. The guest side will get individual notifications from
	 * the host for the FLR if necessary.
	 */
	if (!amdgpu_sriov_vf(adev)) {
		/*
		 * Avoid recreating the reset domain when the hive is reconstructed
		 * for the case of resetting the devices in the XGMI hive during
		 * probe for passthrough GPUs.
		 * See https://www.spinics.net/lists/amd-gfx/msg58836.html
		 */
		if (adev->reset_domain->type != XGMI_HIVE) {
			hive->reset_domain =
				amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
			if (!hive->reset_domain) {
				dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
				ret = -ENOMEM;
				kobject_put(&hive->kobj);
				hive = NULL;
				goto pro_end;
			}
		} else {
			amdgpu_reset_get_reset_domain(adev->reset_domain);
			hive->reset_domain = adev->reset_domain;
		}
	}

	hive->hive_id = adev->gmc.xgmi.hive_id;
	INIT_LIST_HEAD(&hive->device_list);
	INIT_LIST_HEAD(&hive->node);
	mutex_init(&hive->hive_lock);
	atomic_set(&hive->number_devices, 0);
	task_barrier_init(&hive->tb);
	hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
	hive->hi_req_gpu = NULL;
	atomic_set(&hive->requested_nps_mode, UNKNOWN_MEMORY_PARTITION_MODE);

	/*
	 * The hive pstate on boot is high in vega20, so we have to go to low
	 * pstate after boot.
	 */
	hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
	list_add_tail(&hive->node, &xgmi_hive_list);

pro_end:
	if (hive)
		kobject_get(&hive->kobj);
	mutex_unlock(&xgmi_mutex);
	return hive;
}

void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive)
{
	if (hive)
		kobject_put(&hive->kobj);
}

int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{
	int ret = 0;
	struct amdgpu_hive_info *hive;
	struct amdgpu_device *request_adev;
	bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
	bool init_low;

	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive)
		return 0;

	request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev;
	init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
	amdgpu_put_xgmi_hive(hive);
	/* fw bug so temporarily disable pstate switching */
	return 0;

	if (!hive || adev->asic_type != CHIP_VEGA20)
		return 0;

	mutex_lock(&hive->hive_lock);

	if (is_hi_req)
		hive->hi_req_count++;
	else
		hive->hi_req_count--;

	/*
	 * Vega20 only needs a single peer to request pstate high for the hive
	 * to go high, but all peers must request pstate low for the hive to go
	 * low.
	 */
	if (hive->pstate == pstate ||
	    (!is_hi_req && hive->hi_req_count && !init_low))
		goto out;

	dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);

	ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
	if (ret) {
		dev_err(request_adev->dev,
			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
			request_adev->gmc.xgmi.node_id,
			request_adev->gmc.xgmi.hive_id, ret);
		goto out;
	}

	if (init_low)
		hive->pstate = hive->hi_req_count ?
					hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
	else {
		hive->pstate = pstate;
		hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
							adev : NULL;
	}
out:
	mutex_unlock(&hive->hive_lock);
	return ret;
}

int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
{
	int ret;

	if (amdgpu_sriov_vf(adev))
		return 0;

	/* Each PSP needs to set the latest topology */
	ret = psp_xgmi_set_topology_info(&adev->psp,
					 atomic_read(&hive->number_devices),
					 &adev->psp.xgmi_context.top_info);
	if (ret)
		dev_err(adev->dev,
			"XGMI: Set topology failure on device %llx, hive %llx, ret %d",
			adev->gmc.xgmi.node_id,
			adev->gmc.xgmi.hive_id, ret);

	return ret;
}


/*
 * NOTE psp_xgmi_node_info.num_hops layout is as follows:
 * num_hops[7:6] = link type (0 = xGMI2, 1 = xGMI3, 2/3 = reserved)
 * num_hops[5:3] = reserved
 * num_hops[2:0] = number of hops
 */
int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
			       struct amdgpu_device *peer_adev)
{
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	uint8_t num_hops_mask = 0x7;
	int i;

	if (!adev->gmc.xgmi.supported)
		return 0;

	for (i = 0; i < top->num_nodes; ++i)
		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
			return top->nodes[i].num_hops & num_hops_mask;

	dev_err(adev->dev, "Failed to get xgmi hops count for peer %d.\n",
		peer_adev->gmc.xgmi.physical_node_id);

	return 0;
}

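/*
 * Per-link bandwidth is derived from the link speed (GT/s) and lane count:
 * speed * lanes / BITS_PER_BYTE gives GB/s, scaled by 1000 when MB/s is
 * requested. In per-peer mode, max_bw additionally scales with the number
 * of links to that peer.
 */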
int amdgpu_xgmi_get_bandwidth(struct amdgpu_device *adev, struct amdgpu_device *peer_adev,
			      enum amdgpu_xgmi_bw_mode bw_mode, enum amdgpu_xgmi_bw_unit bw_unit,
			      uint32_t *min_bw, uint32_t *max_bw)
{
	bool peer_mode = bw_mode == AMDGPU_XGMI_BW_MODE_PER_PEER;
	int unit_scale = bw_unit == AMDGPU_XGMI_BW_UNIT_MBYTES ? 1000 : 1;
	int num_lanes = adev->gmc.xgmi.max_width;
	int speed = adev->gmc.xgmi.max_speed;
	int num_links = !peer_mode ? 1 : -1;

	if (!(min_bw && max_bw))
		return -EINVAL;

	*min_bw = 0;
	*max_bw = 0;

	if (!adev->gmc.xgmi.supported)
		return -ENODATA;

	if (peer_mode && !peer_adev)
		return -EINVAL;

	if (peer_mode) {
		struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
		int i;

		for (i = 0; i < top->num_nodes; ++i) {
			if (top->nodes[i].node_id != peer_adev->gmc.xgmi.node_id)
				continue;

			num_links = top->nodes[i].num_links;
			break;
		}
	}

	if (num_links == -1) {
		dev_err(adev->dev, "Failed to get number of xgmi links for peer %d.\n",
			peer_adev->gmc.xgmi.physical_node_id);
	} else if (num_links) {
		int per_link_bw = (speed * num_lanes * unit_scale) / BITS_PER_BYTE;

		*min_bw = per_link_bw;
		*max_bw = num_links * per_link_bw;
	}

	return 0;
}

bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
					struct amdgpu_device *peer_adev)
{
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	int i;

	/* Sharing should always be enabled for non-SRIOV. */
	if (!amdgpu_sriov_vf(adev))
		return true;

	for (i = 0; i < top->num_nodes; ++i)
		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
			return !!top->nodes[i].is_sharing_enabled;

	return false;
}

/*
 * Devices that support extended data require the entire hive to initialize with
 * the shared memory buffer flag set.
 *
 * Hive locks and conditions apply - see amdgpu_xgmi_add_device
 */
static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive,
							  bool set_extended_data)
{
	struct amdgpu_device *tmp_adev;
	int ret;

	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		ret = psp_xgmi_initialize(&tmp_adev->psp, set_extended_data, false);
		if (ret) {
			dev_err(tmp_adev->dev,
				"XGMI: Failed to initialize xgmi session for data partition %i\n",
				set_extended_data);
			return ret;
		}
	}

	return 0;
}

int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
	struct psp_xgmi_topology_info *top_info;
	struct amdgpu_hive_info *hive;
	struct amdgpu_xgmi *entry;
	struct amdgpu_device *tmp_adev = NULL;

	int count = 0, ret = 0;

	if (!adev->gmc.xgmi.supported)
		return 0;

	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		ret = psp_xgmi_initialize(&adev->psp, false, true);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to initialize xgmi session\n");
			return ret;
		}

		ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get hive id\n");
			return ret;
		}

		ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get node id\n");
			return ret;
		}
	} else {
		adev->gmc.xgmi.hive_id = 16;
		adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
	}

	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive) {
		ret = -EINVAL;
		dev_err(adev->dev,
			"XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
			adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
		goto exit;
	}
	mutex_lock(&hive->hive_lock);

	top_info = &adev->psp.xgmi_context.top_info;

	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
	list_for_each_entry(entry, &hive->device_list, head)
		top_info->nodes[count++].node_id = entry->node_id;
	top_info->num_nodes = count;
	atomic_set(&hive->number_devices, count);

	task_barrier_add_task(&hive->tb);

	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			/* update the node list for the other devices in the hive */
			if (tmp_adev != adev) {
				top_info = &tmp_adev->psp.xgmi_context.top_info;
				top_info->nodes[count - 1].node_id =
					adev->gmc.xgmi.node_id;
				top_info->num_nodes = count;
			}
			ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
			if (ret)
				goto exit_unlock;
		}

		if (amdgpu_sriov_vf(adev) &&
		    adev->psp.xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG) {
			/* only get topology for the VF being initialized if it supports full duplex */
			ret = psp_xgmi_get_topology_info(&adev->psp, count,
							 &adev->psp.xgmi_context.top_info, false);
			if (ret) {
				dev_err(adev->dev,
					"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
					adev->gmc.xgmi.node_id,
					adev->gmc.xgmi.hive_id, ret);
				/* TODO: continue with some nodes failed, or disable the whole hive */
				goto exit_unlock;
			}
		} else {
			/* get the latest topology info for each device from psp */
			list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
				ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
								 &tmp_adev->psp.xgmi_context.top_info, false);
				if (ret) {
					dev_err(tmp_adev->dev,
						"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
						tmp_adev->gmc.xgmi.node_id,
						tmp_adev->gmc.xgmi.hive_id, ret);
					/* TODO: continue with some nodes failed, or disable the whole hive */
					goto exit_unlock;
				}
			}
		}

		/* get topology again for hives that support extended data */
		if (adev->psp.xgmi_context.supports_extended_data) {

			/* initialize the hive to get extended data. */
			ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true);
			if (ret)
				goto exit_unlock;

			/* get the extended data. */
			list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
				ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
								 &tmp_adev->psp.xgmi_context.top_info, true);
				if (ret) {
					dev_err(tmp_adev->dev,
						"XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d",
						tmp_adev->gmc.xgmi.node_id,
						tmp_adev->gmc.xgmi.hive_id, ret);
					goto exit_unlock;
				}
			}

			/* initialize the hive to get non-extended data for the next round. */
			ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false);
			if (ret)
				goto exit_unlock;
		}
	}

	if (!ret)
		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);

exit_unlock:
	mutex_unlock(&hive->hive_lock);
exit:
	if (!ret) {
		adev->hive = hive;
		dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
			 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
	} else {
		amdgpu_put_xgmi_hive(hive);
		dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
			adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
			ret);
	}

	return ret;
}

int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive = adev->hive;

	if (!adev->gmc.xgmi.supported)
		return -EINVAL;

	if (!hive)
		return -EINVAL;

	mutex_lock(&hive->hive_lock);
	task_barrier_rem_task(&hive->tb);
	amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
	if (hive->hi_req_gpu == adev)
		hive->hi_req_gpu = NULL;
	list_del(&adev->gmc.xgmi.head);
	mutex_unlock(&hive->hive_lock);

	amdgpu_put_xgmi_hive(hive);
	adev->hive = NULL;

	if (atomic_dec_return(&hive->number_devices) == 0) {
		/* Remove the hive from global hive list */
		mutex_lock(&xgmi_mutex);
		list_del(&hive->node);
		mutex_unlock(&xgmi_mutex);

		amdgpu_put_xgmi_hive(hive);
	}

	return 0;
}

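/*
 * ACA bank parser for XGMI v6.4: decode the bank, log the extended error
 * code string when one is known, and account the MISC0 error count as UE
 * or CE depending on the extended error code.
 */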
static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
				       enum aca_smu_type type, void *data)
{
	struct amdgpu_device *adev = handle->adev;
	struct aca_bank_info info;
	const char *error_str;
	u64 status, count;
	int ret, ext_error_code;

	ret = aca_bank_info_decode(bank, &info);
	if (ret)
		return ret;

	status = bank->regs[ACA_REG_IDX_STATUS];
	ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status);

	error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ?
		xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL;
	if (error_str)
		dev_info(adev->dev, "%s detected\n", error_str);

	count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);

	switch (type) {
	case ACA_SMU_TYPE_UE:
		if (ext_error_code != 0 && ext_error_code != 1 && ext_error_code != 9)
			count = 0ULL;

		bank->aca_err_type = ACA_ERROR_TYPE_UE;
		ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, count);
		break;
	case ACA_SMU_TYPE_CE:
		count = ext_error_code == 6 ? count : 0ULL;
		bank->aca_err_type = ACA_ERROR_TYPE_CE;
		ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, count);
		break;
	default:
		return -EINVAL;
	}

	return ret;
}

static const struct aca_bank_ops xgmi_v6_4_0_aca_bank_ops = {
	.aca_bank_parser = xgmi_v6_4_0_aca_bank_parser,
};

static const struct aca_info xgmi_v6_4_0_aca_info = {
	.hwip = ACA_HWIP_TYPE_PCS_XGMI,
	.mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK,
	.bank_ops = &xgmi_v6_4_0_aca_bank_ops,
};

static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
	int r;

	if (!adev->gmc.xgmi.supported ||
	    adev->gmc.xgmi.num_physical_nodes == 0)
		return 0;

	amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);

	r = amdgpu_ras_block_late_init(adev, ras_block);
	if (r)
		return r;

	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
	case IP_VERSION(6, 4, 0):
	case IP_VERSION(6, 4, 1):
		r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL,
					&xgmi_v6_4_0_aca_info, NULL);
		if (r)
			goto late_fini;
		break;
	default:
		break;
	}

	return 0;

late_fini:
	amdgpu_ras_block_late_fini(adev, ras_block);

	return r;
}

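/*
 * Each node's VRAM occupies one node_segment_size-sized window of the
 * hive's shared address space, so the hive-relative address is the local
 * address plus this node's segment offset.
 */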
uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
					   uint64_t addr)
{
	struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;
	return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
}

static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
{
	WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
	WREG32_PCIE(pcs_status_reg, 0);
}

static void amdgpu_xgmi_legacy_reset_ras_error_count(struct amdgpu_device *adev)
{
	uint32_t i;

	switch (adev->asic_type) {
	case CHIP_ARCTURUS:
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
			pcs_clear_status(adev,
					 xgmi_pcs_err_status_reg_arct[i]);
		break;
	case CHIP_VEGA20:
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
			pcs_clear_status(adev,
					 xgmi_pcs_err_status_reg_vg20[i]);
		break;
	case CHIP_ALDEBARAN:
		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++)
			pcs_clear_status(adev,
					 xgmi3x16_pcs_err_status_reg_aldebaran[i]);
		for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++)
			pcs_clear_status(adev,
					 walf_pcs_err_status_reg_aldebaran[i]);
		break;
	default:
		break;
	}

	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
	case IP_VERSION(6, 4, 0):
	case IP_VERSION(6, 4, 1):
		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++)
			pcs_clear_status(adev,
					 xgmi3x16_pcs_err_status_reg_v6_4[i]);
		break;
	default:
		break;
	}
}

static void __xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst, u64 mca_base)
{
	uint64_t smn_base =
		amdgpu_reg_get_smn_base64(adev, XGMI_HWIP, xgmi_inst);

	WREG64_MCA(smn_base, mca_base, ACA_REG_IDX_STATUS, 0ULL);
}

static void xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++)
		__xgmi_v6_4_0_reset_error_count(adev, xgmi_inst, xgmi_v6_4_0_mca_base_array[i]);
}

static void xgmi_v6_4_0_reset_ras_error_count(struct amdgpu_device *adev)
{
	int i;

	for_each_inst(i, adev->aid_mask)
		xgmi_v6_4_0_reset_error_count(adev, i);
}

static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
{
	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
	case IP_VERSION(6, 4, 0):
	case IP_VERSION(6, 4, 1):
		xgmi_v6_4_0_reset_ras_error_count(adev);
		break;
	default:
		amdgpu_xgmi_legacy_reset_ras_error_count(adev);
		break;
	}
}

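/*
 * Decode one PCS error status word against the per-ASIC RAS field table,
 * optionally clearing the bits flagged in the non-correctable mask register
 * first, and accumulate the uncorrectable error counts.
 */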
static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
					      uint32_t value,
					      uint32_t mask_value,
					      uint32_t *ue_count,
					      uint32_t *ce_count,
					      bool is_xgmi_pcs,
					      bool check_mask)
{
	int i;
	int ue_cnt = 0;
	const struct amdgpu_pcs_ras_field *pcs_ras_fields = NULL;
	uint32_t field_array_size = 0;

	if (is_xgmi_pcs) {
		if (amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
		    IP_VERSION(6, 1, 0) ||
		    amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
		    IP_VERSION(6, 4, 0) ||
		    amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
		    IP_VERSION(6, 4, 1)) {
			pcs_ras_fields = &xgmi3x16_pcs_ras_fields[0];
			field_array_size = ARRAY_SIZE(xgmi3x16_pcs_ras_fields);
		} else {
			pcs_ras_fields = &xgmi_pcs_ras_fields[0];
			field_array_size = ARRAY_SIZE(xgmi_pcs_ras_fields);
		}
	} else {
		pcs_ras_fields = &wafl_pcs_ras_fields[0];
		field_array_size = ARRAY_SIZE(wafl_pcs_ras_fields);
	}

	if (check_mask)
		value = value & ~mask_value;

	/* query xgmi/wafl pcs error status,
	 * only ue is supported */
	for (i = 0; value && i < field_array_size; i++) {
		ue_cnt = (value &
			  pcs_ras_fields[i].pcs_err_mask) >>
			 pcs_ras_fields[i].pcs_err_shift;
		if (ue_cnt) {
			dev_info(adev->dev, "%s detected\n",
				 pcs_ras_fields[i].err_name);
			*ue_count += ue_cnt;
		}

		/* reset bit value if the bit is checked */
		value &= ~(pcs_ras_fields[i].pcs_err_mask);
	}

	return 0;
}

static void amdgpu_xgmi_legacy_query_ras_error_count(struct amdgpu_device *adev,
						     void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	int i, supported = 1;
	uint32_t data, mask_data = 0;
	uint32_t ue_cnt = 0, ce_cnt = 0;

	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
		return;

	err_data->ue_count = 0;
	err_data->ce_count = 0;

	switch (adev->asic_type) {
	case CHIP_ARCTURUS:
		/* check xgmi pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) {
			data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev, data,
						mask_data, &ue_cnt, &ce_cnt, true, false);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
			data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev, data,
						mask_data, &ue_cnt, &ce_cnt, false, false);
		}
		break;
	case CHIP_VEGA20:
		/* check xgmi pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
			data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev, data,
						mask_data, &ue_cnt, &ce_cnt, true, false);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
			data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev, data,
						mask_data, &ue_cnt, &ce_cnt, false, false);
		}
		break;
	case CHIP_ALDEBARAN:
		/* check xgmi3x16 pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) {
			data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]);
			mask_data =
				RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev, data,
						mask_data, &ue_cnt, &ce_cnt, true, true);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) {
			data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]);
			mask_data =
				RREG32_PCIE(walf_pcs_err_noncorrectable_mask_reg_aldebaran[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev, data,
						mask_data, &ue_cnt, &ce_cnt, false, true);
		}
		break;
	default:
		supported = 0;
		break;
	}

	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
	case IP_VERSION(6, 4, 0):
	case IP_VERSION(6, 4, 1):
		/* check xgmi3x16 pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++) {
			data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_v6_4[i]);
			mask_data =
				RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev, data,
						mask_data, &ue_cnt, &ce_cnt, true, true);
		}
		break;
	default:
		if (!supported)
			dev_warn(adev->dev, "XGMI RAS error query not supported");
		break;
	}

	amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);

	err_data->ue_count += ue_cnt;
	err_data->ce_count += ce_cnt;
}

static enum aca_error_type xgmi_v6_4_0_pcs_mca_get_error_type(struct amdgpu_device *adev, u64 status)
{
	const char *error_str;
	int ext_error_code;

	ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status);

	error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ?
		xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL;
	if (error_str)
		dev_info(adev->dev, "%s detected\n", error_str);

	switch (ext_error_code) {
	case 0:
		return ACA_ERROR_TYPE_UE;
	case 6:
		return ACA_ERROR_TYPE_CE;
	default:
		return -EINVAL;
	}
}

static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct amdgpu_smuio_mcm_config_info *mcm_info,
					    u64 mca_base, struct ras_err_data *err_data)
{
	int xgmi_inst = mcm_info->die_id;
	uint64_t smn_base;
	u64 status = 0;

	status = RREG64_MCA(xgmi_inst, mca_base, ACA_REG_IDX_STATUS);
	if (!ACA_REG__STATUS__VAL(status))
		return;

	switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) {
	case ACA_ERROR_TYPE_UE:
		amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, 1ULL);
		break;
	case ACA_ERROR_TYPE_CE:
		amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, 1ULL);
		break;
	default:
		break;
	}
	smn_base = amdgpu_reg_get_smn_base64(adev, XGMI_HWIP, xgmi_inst);
	WREG64_MCA(smn_base, mca_base, ACA_REG_IDX_STATUS, 0ULL);
}

static void xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, int xgmi_inst, struct ras_err_data *err_data)
{
	struct amdgpu_smuio_mcm_config_info mcm_info = {
		.socket_id = adev->smuio.funcs->get_socket_id(adev),
		.die_id = xgmi_inst,
	};
	int i;

	for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++)
		__xgmi_v6_4_0_query_error_count(adev, &mcm_info, xgmi_v6_4_0_mca_base_array[i], err_data);
}

static void xgmi_v6_4_0_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	int i;

	for_each_inst(i, adev->aid_mask)
		xgmi_v6_4_0_query_error_count(adev, i, err_data);
}

static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
					      void *ras_error_status)
{
	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
	case IP_VERSION(6, 4, 0):
	case IP_VERSION(6, 4, 1):
		xgmi_v6_4_0_query_ras_error_count(adev, ras_error_status);
		break;
	default:
		amdgpu_xgmi_legacy_query_ras_error_count(adev, ras_error_status);
		break;
	}
}

/* Trigger XGMI/WAFL error */
static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
					void *inject_if, uint32_t instance_mask)
{
	int ret1, ret2;
	struct ta_ras_trigger_error_input *block_info =
		(struct ta_ras_trigger_error_input *)inject_if;

	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
		dev_warn(adev->dev, "Failed to disallow df cstate");

	ret1 = amdgpu_dpm_set_pm_policy(adev, PP_PM_POLICY_XGMI_PLPD, XGMI_PLPD_DISALLOW);
	if (ret1 && ret1 != -EOPNOTSUPP)
		dev_warn(adev->dev, "Failed to disallow XGMI power down");

	ret2 = psp_ras_trigger_error(&adev->psp, block_info, instance_mask);

	if (amdgpu_ras_intr_triggered())
		return ret2;

	ret1 = amdgpu_dpm_set_pm_policy(adev, PP_PM_POLICY_XGMI_PLPD, XGMI_PLPD_DEFAULT);
	if (ret1 && ret1 != -EOPNOTSUPP)
		dev_warn(adev->dev, "Failed to allow XGMI power down");

	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
		dev_warn(adev->dev, "Failed to allow df cstate");

	return ret2;
}

struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = {
	.query_ras_error_count = amdgpu_xgmi_query_ras_error_count,
	.reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count,
	.ras_error_inject = amdgpu_ras_error_inject_xgmi,
};

struct amdgpu_xgmi_ras xgmi_ras = {
	.ras_block = {
		.hw_ops = &xgmi_ras_hw_ops,
		.ras_late_init = amdgpu_xgmi_ras_late_init,
	},
};

int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev)
{
	int err;
	struct amdgpu_xgmi_ras *ras;

	if (!adev->gmc.xgmi.ras)
		return 0;

	ras = adev->gmc.xgmi.ras;
	err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
	if (err) {
		dev_err(adev->dev, "Failed to register xgmi_wafl_pcs ras block!\n");
		return err;
	}

	strcpy(ras->ras_block.ras_comm.name, "xgmi_wafl");
	ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
	ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
	adev->gmc.xgmi.ras_if = &ras->ras_block.ras_comm;

	return 0;
}

static void amdgpu_xgmi_reset_on_init_work(struct work_struct *work)
{
	struct amdgpu_hive_info *hive =
		container_of(work, struct amdgpu_hive_info, reset_on_init_work);
	struct amdgpu_reset_context reset_context;
	struct amdgpu_device *tmp_adev;
	struct list_head device_list;
	int r;

	mutex_lock(&hive->hive_lock);

	INIT_LIST_HEAD(&device_list);
	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
		list_add_tail(&tmp_adev->reset_list, &device_list);

	tmp_adev = list_first_entry(&device_list, struct amdgpu_device,
				    reset_list);
	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);

	reset_context.method = AMD_RESET_METHOD_ON_INIT;
	reset_context.reset_req_dev = tmp_adev;
	reset_context.hive = hive;
	reset_context.reset_device_list = &device_list;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);

	amdgpu_reset_do_xgmi_reset_on_init(&reset_context);
	mutex_unlock(&hive->hive_lock);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		r = amdgpu_ras_init_badpage_info(tmp_adev);
		if (r && r != -EHWPOISON)
			dev_err(tmp_adev->dev,
				"error during bad page data initialization");
	}
}

static void amdgpu_xgmi_schedule_reset_on_init(struct amdgpu_hive_info *hive)
{
	INIT_WORK(&hive->reset_on_init_work, amdgpu_xgmi_reset_on_init_work);
	amdgpu_reset_domain_schedule(hive->reset_domain,
				     &hive->reset_on_init_work);
}

int amdgpu_xgmi_reset_on_init(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive;
	bool reset_scheduled;
	int num_devs;

	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive)
		return -EINVAL;

	mutex_lock(&hive->hive_lock);
	num_devs = atomic_read(&hive->number_devices);
	reset_scheduled = false;
	if (num_devs == adev->gmc.xgmi.num_physical_nodes) {
		amdgpu_xgmi_schedule_reset_on_init(hive);
		reset_scheduled = true;
	}

	mutex_unlock(&hive->hive_lock);
	amdgpu_put_xgmi_hive(hive);

	if (reset_scheduled)
		flush_work(&hive->reset_on_init_work);

	return 0;
}

int amdgpu_xgmi_request_nps_change(struct amdgpu_device *adev,
				   struct amdgpu_hive_info *hive,
				   int req_nps_mode)
{
	struct amdgpu_device *tmp_adev;
	int cur_nps_mode, r;

	/* This is expected to be called only during unload of the driver. The
	 * request needs to be placed only once for all devices in the hive. If
	 * one of them fails, revert the request for the previously successful
	 * devices. After placing the request, make the hive mode UNKNOWN so
	 * that other devices don't request anymore.
	 */
	mutex_lock(&hive->hive_lock);
	if (atomic_read(&hive->requested_nps_mode) ==
	    UNKNOWN_MEMORY_PARTITION_MODE) {
		dev_dbg(adev->dev, "Unexpected entry for hive NPS change");
		mutex_unlock(&hive->hive_lock);
		return 0;
	}
	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		r = adev->gmc.gmc_funcs->request_mem_partition_mode(
			tmp_adev, req_nps_mode);
		if (r)
			break;
	}
	if (r) {
		/* Request back the current mode if one of the requests failed */
		cur_nps_mode =
			adev->gmc.gmc_funcs->query_mem_partition_mode(tmp_adev);
		list_for_each_entry_continue_reverse(
			tmp_adev, &hive->device_list, gmc.xgmi.head)
			adev->gmc.gmc_funcs->request_mem_partition_mode(
				tmp_adev, cur_nps_mode);
	}
	/* Set to UNKNOWN so that other devices don't request anymore */
	atomic_set(&hive->requested_nps_mode, UNKNOWN_MEMORY_PARTITION_MODE);
	mutex_unlock(&hive->hive_lock);

	return r;
}

bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
			   struct amdgpu_device *bo_adev)
{
	return (amdgpu_use_xgmi_p2p && adev != bo_adev &&
		adev->gmc.xgmi.hive_id &&
		adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id);
}

void amdgpu_xgmi_early_init(struct amdgpu_device *adev)
{
	if (!adev->gmc.xgmi.supported)
		return;

	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
	case IP_VERSION(9, 4, 0):
	case IP_VERSION(9, 4, 1):
	case IP_VERSION(9, 4, 2):
		/* 25 GT/s */
		adev->gmc.xgmi.max_speed = 25;
		adev->gmc.xgmi.max_width = 16;
		break;
	case IP_VERSION(9, 4, 3):
	case IP_VERSION(9, 4, 4):
	case IP_VERSION(9, 5, 0):
		/* 32 GT/s */
		adev->gmc.xgmi.max_speed = 32;
		adev->gmc.xgmi.max_width = 16;
		break;
	default:
		break;
	}
}

void amgpu_xgmi_set_max_speed_width(struct amdgpu_device *adev,
				    uint16_t max_speed, uint8_t max_width)
{
	adev->gmc.xgmi.max_speed = max_speed;
	adev->gmc.xgmi.max_width = max_width;
}