Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1/*
2 * Copyright 2019 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 *
22 */
23
24#include "amdgpu_ras_eeprom.h"
25#include "amdgpu.h"
26#include "amdgpu_ras.h"
27#include <linux/bits.h>
28#include "atom.h"
29#include "amdgpu_eeprom.h"
30#include "amdgpu_atomfirmware.h"
31#include <linux/debugfs.h>
32#include <linux/uaccess.h>
33
34#include "amdgpu_reset.h"
35#include "amdgpu_ras_mgr.h"
36
37/* These are memory addresses as would be seen by one or more EEPROM
38 * chips strung on the I2C bus, usually by manipulating pins 1-3 of a
39 * set of EEPROM devices. They form a continuous memory space.
40 *
41 * The I2C device address includes the device type identifier, 1010b,
42 * which is a reserved value and indicates that this is an I2C EEPROM
43 * device. It also includes the top 3 bits of the 19 bit EEPROM memory
44 * address, namely bits 18, 17, and 16. This makes up the 7 bit
45 * address sent on the I2C bus with bit 0 being the direction bit,
46 * which is not represented here, and sent by the hardware directly.
47 *
48 * For instance,
49 * 50h = 1010000b => device type identifier 1010b, bits 18:16 = 000b, address 0.
50 * 54h = 1010100b => --"--, bits 18:16 = 100b, address 40000h.
51 * 56h = 1010110b => --"--, bits 18:16 = 110b, address 60000h.
52 * Depending on the size of the I2C EEPROM device(s), bits 18:16 may
53 * address memory in a device or a device on the I2C bus, depending on
54 * the status of pins 1-3. See top of amdgpu_eeprom.c.
55 *
56 * The RAS table lives either at address 0 or address 40000h of EEPROM.
57 */
58#define EEPROM_I2C_MADDR_0 0x0
59#define EEPROM_I2C_MADDR_4 0x40000
60
61/*
62 * The 2 macros below represent the actual size in bytes that
63 * those entities occupy in the EEPROM memory.
64 * RAS_TABLE_RECORD_SIZE is different than sizeof(eeprom_table_record) which
65 * uses uint64 to store 6b fields such as retired_page.
66 */
67#define RAS_TABLE_HEADER_SIZE 20
68#define RAS_TABLE_RECORD_SIZE 24
69
70/* Table hdr is 'AMDR' */
71#define RAS_TABLE_HDR_VAL 0x414d4452
72
/* Bad GPU tag 'BADG' */
74#define RAS_TABLE_HDR_BAD 0x42414447
75
76/*
77 * EEPROM Table structure v1
78 * ---------------------------------
79 * | |
80 * | EEPROM TABLE HEADER |
81 * | ( size 20 Bytes ) |
82 * | |
83 * ---------------------------------
84 * | |
85 * | BAD PAGE RECORD AREA |
86 * | |
87 * ---------------------------------
88 */
89
90/* Assume 2-Mbit size EEPROM and take up the whole space. */
91#define RAS_TBL_SIZE_BYTES (256 * 1024)
92#define RAS_TABLE_START 0
93#define RAS_HDR_START RAS_TABLE_START
94#define RAS_RECORD_START (RAS_HDR_START + RAS_TABLE_HEADER_SIZE)
95#define RAS_MAX_RECORD_COUNT ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
96 / RAS_TABLE_RECORD_SIZE)
97
98/*
 * EEPROM Table structure v2.1
100 * ---------------------------------
101 * | |
102 * | EEPROM TABLE HEADER |
103 * | ( size 20 Bytes ) |
104 * | |
105 * ---------------------------------
106 * | |
107 * | EEPROM TABLE RAS INFO |
108 * | (available info size 4 Bytes) |
109 * | ( reserved size 252 Bytes ) |
110 * | |
111 * ---------------------------------
112 * | |
113 * | BAD PAGE RECORD AREA |
114 * | |
115 * ---------------------------------
116 */
117
118/* EEPROM Table V2_1 */
119#define RAS_TABLE_V2_1_INFO_SIZE 256
120#define RAS_TABLE_V2_1_INFO_START RAS_TABLE_HEADER_SIZE
121#define RAS_RECORD_START_V2_1 (RAS_HDR_START + RAS_TABLE_HEADER_SIZE + \
122 RAS_TABLE_V2_1_INFO_SIZE)
123#define RAS_MAX_RECORD_COUNT_V2_1 ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE - \
124 RAS_TABLE_V2_1_INFO_SIZE) \
125 / RAS_TABLE_RECORD_SIZE)
126
127#define RAS_SMU_MESSAGE_TIMEOUT_MS 1000 /* 1s */
128
129/* Given a zero-based index of an EEPROM RAS record, yields the EEPROM
130 * offset off of RAS_TABLE_START. That is, this is something you can
131 * add to control->i2c_address, and then tell I2C layer to read
132 * from/write to there. _N is the so called absolute index,
133 * because it starts right after the table header.
134 */
135#define RAS_INDEX_TO_OFFSET(_C, _N) ((_C)->ras_record_offset + \
136 (_N) * RAS_TABLE_RECORD_SIZE)
137
138#define RAS_OFFSET_TO_INDEX(_C, _O) (((_O) - \
139 (_C)->ras_record_offset) / RAS_TABLE_RECORD_SIZE)
140
141/* Given a 0-based relative record index, 0, 1, 2, ..., etc., off
142 * of "fri", return the absolute record index off of the end of
143 * the table header.
144 */
145#define RAS_RI_TO_AI(_C, _I) (((_I) + (_C)->ras_fri) % \
146 (_C)->ras_max_record_count)
147
148#define RAS_NUM_RECS(_tbl_hdr) (((_tbl_hdr)->tbl_size - \
149 RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE)
150
151#define RAS_NUM_RECS_V2_1(_tbl_hdr) (((_tbl_hdr)->tbl_size - \
152 RAS_TABLE_HEADER_SIZE - \
153 RAS_TABLE_V2_1_INFO_SIZE) / RAS_TABLE_RECORD_SIZE)
154
155#define to_amdgpu_device(x) ((container_of(x, struct amdgpu_ras, eeprom_control))->adev)
156
157static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
158{
159 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
160 case IP_VERSION(11, 0, 2): /* VEGA20 and ARCTURUS */
161 case IP_VERSION(11, 0, 7): /* Sienna cichlid */
162 case IP_VERSION(13, 0, 0):
163 case IP_VERSION(13, 0, 2): /* Aldebaran */
164 case IP_VERSION(13, 0, 10):
165 return true;
166 case IP_VERSION(13, 0, 6):
167 case IP_VERSION(13, 0, 12):
168 case IP_VERSION(13, 0, 14):
169 return (adev->gmc.is_app_apu) ? false : true;
170 default:
171 return false;
172 }
173}
174
175static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev,
176 struct amdgpu_ras_eeprom_control *control)
177{
178 struct atom_context *atom_ctx = adev->mode_info.atom_context;
179 u8 i2c_addr;
180
181 if (!control)
182 return false;
183
184 if (adev->bios && amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) {
185 /* The address given by VBIOS is an 8-bit, wire-format
186 * address, i.e. the most significant byte.
187 *
188 * Normalize it to a 19-bit EEPROM address. Remove the
189 * device type identifier and make it a 7-bit address;
190 * then make it a 19-bit EEPROM address. See top of
191 * amdgpu_eeprom.c.
192 */
193 i2c_addr = (i2c_addr & 0x0F) >> 1;
194 control->i2c_address = ((u32) i2c_addr) << 16;
195
196 return true;
197 }
198
199 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
200 case IP_VERSION(11, 0, 2):
201 /* VEGA20 and ARCTURUS */
202 if (adev->asic_type == CHIP_VEGA20)
203 control->i2c_address = EEPROM_I2C_MADDR_0;
204 else if (strnstr(atom_ctx->vbios_pn,
205 "D342",
206 sizeof(atom_ctx->vbios_pn)))
207 control->i2c_address = EEPROM_I2C_MADDR_0;
208 else
209 control->i2c_address = EEPROM_I2C_MADDR_4;
210 return true;
211 case IP_VERSION(11, 0, 7):
212 control->i2c_address = EEPROM_I2C_MADDR_0;
213 return true;
214 case IP_VERSION(13, 0, 2):
215 if (strnstr(atom_ctx->vbios_pn, "D673",
216 sizeof(atom_ctx->vbios_pn)))
217 control->i2c_address = EEPROM_I2C_MADDR_4;
218 else
219 control->i2c_address = EEPROM_I2C_MADDR_0;
220 return true;
221 case IP_VERSION(13, 0, 0):
222 if (strnstr(atom_ctx->vbios_pn, "D707",
223 sizeof(atom_ctx->vbios_pn)))
224 control->i2c_address = EEPROM_I2C_MADDR_0;
225 else
226 control->i2c_address = EEPROM_I2C_MADDR_4;
227 return true;
228 case IP_VERSION(13, 0, 6):
229 case IP_VERSION(13, 0, 10):
230 case IP_VERSION(13, 0, 12):
231 case IP_VERSION(13, 0, 14):
232 control->i2c_address = EEPROM_I2C_MADDR_4;
233 return true;
234 default:
235 return false;
236 }
237}
238
239static void
240__encode_table_header_to_buf(struct amdgpu_ras_eeprom_table_header *hdr,
241 unsigned char *buf)
242{
243 u32 *pp = (uint32_t *)buf;
244
245 pp[0] = cpu_to_le32(hdr->header);
246 pp[1] = cpu_to_le32(hdr->version);
247 pp[2] = cpu_to_le32(hdr->first_rec_offset);
248 pp[3] = cpu_to_le32(hdr->tbl_size);
249 pp[4] = cpu_to_le32(hdr->checksum);
250}
251
252static void
253__decode_table_header_from_buf(struct amdgpu_ras_eeprom_table_header *hdr,
254 unsigned char *buf)
255{
256 u32 *pp = (uint32_t *)buf;
257
258 hdr->header = le32_to_cpu(pp[0]);
259 hdr->version = le32_to_cpu(pp[1]);
260 hdr->first_rec_offset = le32_to_cpu(pp[2]);
261 hdr->tbl_size = le32_to_cpu(pp[3]);
262 hdr->checksum = le32_to_cpu(pp[4]);
263}
264
265static int __write_table_header(struct amdgpu_ras_eeprom_control *control)
266{
267 u8 buf[RAS_TABLE_HEADER_SIZE];
268 struct amdgpu_device *adev = to_amdgpu_device(control);
269 int res;
270
271 memset(buf, 0, sizeof(buf));
272 __encode_table_header_to_buf(&control->tbl_hdr, buf);
273
274 /* i2c may be unstable in gpu reset */
275 down_read(&adev->reset_domain->sem);
276 res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus,
277 control->i2c_address +
278 control->ras_header_offset,
279 buf, RAS_TABLE_HEADER_SIZE);
280 up_read(&adev->reset_domain->sem);
281
282 if (res < 0) {
283 dev_err(adev->dev, "Failed to write EEPROM table header:%d",
284 res);
285 } else if (res < RAS_TABLE_HEADER_SIZE) {
286 dev_err(adev->dev, "Short write:%d out of %d\n", res,
287 RAS_TABLE_HEADER_SIZE);
288 res = -EIO;
289 } else {
290 res = 0;
291 }
292
293 return res;
294}
295
296static void
297__encode_table_ras_info_to_buf(struct amdgpu_ras_eeprom_table_ras_info *rai,
298 unsigned char *buf)
299{
300 u32 *pp = (uint32_t *)buf;
301 u32 tmp;
302
303 tmp = ((uint32_t)(rai->rma_status) & 0xFF) |
304 (((uint32_t)(rai->health_percent) << 8) & 0xFF00) |
305 (((uint32_t)(rai->ecc_page_threshold) << 16) & 0xFFFF0000);
306 pp[0] = cpu_to_le32(tmp);
307}
308
309static void
310__decode_table_ras_info_from_buf(struct amdgpu_ras_eeprom_table_ras_info *rai,
311 unsigned char *buf)
312{
313 u32 *pp = (uint32_t *)buf;
314 u32 tmp;
315
316 tmp = le32_to_cpu(pp[0]);
317 rai->rma_status = tmp & 0xFF;
318 rai->health_percent = (tmp >> 8) & 0xFF;
319 rai->ecc_page_threshold = (tmp >> 16) & 0xFFFF;
320}
321
322static int __write_table_ras_info(struct amdgpu_ras_eeprom_control *control)
323{
324 struct amdgpu_device *adev = to_amdgpu_device(control);
325 u8 *buf;
326 int res;
327
328 buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL);
329 if (!buf) {
330 dev_err(adev->dev,
331 "Failed to alloc buf to write table ras info\n");
332 return -ENOMEM;
333 }
334
335 __encode_table_ras_info_to_buf(&control->tbl_rai, buf);
336
337 /* i2c may be unstable in gpu reset */
338 down_read(&adev->reset_domain->sem);
339 res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus,
340 control->i2c_address +
341 control->ras_info_offset,
342 buf, RAS_TABLE_V2_1_INFO_SIZE);
343 up_read(&adev->reset_domain->sem);
344
345 if (res < 0) {
346 dev_err(adev->dev, "Failed to write EEPROM table ras info:%d",
347 res);
348 } else if (res < RAS_TABLE_V2_1_INFO_SIZE) {
349 dev_err(adev->dev, "Short write:%d out of %d\n", res,
350 RAS_TABLE_V2_1_INFO_SIZE);
351 res = -EIO;
352 } else {
353 res = 0;
354 }
355
356 kfree(buf);
357
358 return res;
359}
360
361static u8 __calc_hdr_byte_sum(const struct amdgpu_ras_eeprom_control *control)
362{
363 int ii;
364 u8 *pp, csum;
365 size_t sz;
366
367 /* Header checksum, skip checksum field in the calculation */
368 sz = sizeof(control->tbl_hdr) - sizeof(control->tbl_hdr.checksum);
369 pp = (u8 *) &control->tbl_hdr;
370 csum = 0;
371 for (ii = 0; ii < sz; ii++, pp++)
372 csum += *pp;
373
374 return csum;
375}
376
377static u8 __calc_ras_info_byte_sum(const struct amdgpu_ras_eeprom_control *control)
378{
379 int ii;
380 u8 *pp, csum;
381 size_t sz;
382
383 sz = sizeof(control->tbl_rai);
384 pp = (u8 *) &control->tbl_rai;
385 csum = 0;
386 for (ii = 0; ii < sz; ii++, pp++)
387 csum += *pp;
388
389 return csum;
390}
391
/* Replace the table header tag (e.g. 'AMDR' -> 'BADG') and
 * incrementally repair the checksum: subtract the old header-tag
 * bytes from the byte sum, add the new ones, then write the header
 * back to EEPROM under the table mutex.
 * Return 0 on success, -errno on error from __write_table_header().
 */
static int amdgpu_ras_eeprom_correct_header_tag(
	struct amdgpu_ras_eeprom_control *control,
	uint32_t header)
{
	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
	u8 *hh;
	int res;
	u8 csum;

	/* Stored checksum makes all table bytes sum to zero; negate it
	 * to recover the sum of everything except the checksum field.
	 */
	csum = -hdr->checksum;

	/* Remove the contribution of the old tag bytes ... */
	hh = (void *) &hdr->header;
	csum -= (hh[0] + hh[1] + hh[2] + hh[3]);
	/* ... and add the contribution of the new tag bytes. */
	hh = (void *) &header;
	csum += hh[0] + hh[1] + hh[2] + hh[3];
	csum = -csum;
	mutex_lock(&control->ras_tbl_mutex);
	hdr->header = header;
	hdr->checksum = csum;
	res = __write_table_header(control);
	mutex_unlock(&control->ras_tbl_mutex);

	return res;
}
416
417static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control *control)
418{
419 struct amdgpu_device *adev = to_amdgpu_device(control);
420 struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
421
422 switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
423 case IP_VERSION(8, 10, 0):
424 hdr->version = RAS_TABLE_VER_V2_1;
425 return;
426 case IP_VERSION(12, 0, 0):
427 case IP_VERSION(12, 5, 0):
428 hdr->version = RAS_TABLE_VER_V3;
429 return;
430 default:
431 hdr->version = RAS_TABLE_VER_V1;
432 return;
433 }
434}
435
/**
 * amdgpu_ras_eeprom_reset_table -- Reset the RAS EEPROM table
 * @control: pointer to control structure
 *
 * Reset the contents of the header of the RAS EEPROM table, and for
 * v2.1+ layouts also the RAS-info block. When the EEPROM is managed
 * by the SMU instead, ask the SMU to erase the table. In both cases
 * all cached record counters in @control are zeroed and the DPM layer
 * is told that there are no bad pages/channels.
 * Return 0 on success, -errno on error.
 */
int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
	struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	u32 erase_res = 0;
	u8 csum;
	int res;

	mutex_lock(&control->ras_tbl_mutex);

	/* Driver-managed EEPROM: rebuild a fresh header (+ info block). */
	if (!amdgpu_ras_smu_eeprom_supported(adev)) {
		hdr->header = RAS_TABLE_HDR_VAL;
		amdgpu_ras_set_eeprom_table_version(control);

		if (hdr->version >= RAS_TABLE_VER_V2_1) {
			hdr->first_rec_offset = RAS_RECORD_START_V2_1;
			hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
					RAS_TABLE_V2_1_INFO_SIZE;
			rai->rma_status = GPU_HEALTH_USABLE;

			control->ras_record_offset = RAS_RECORD_START_V2_1;
			control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
			/**
			 * GPU health represented as a percentage.
			 * 0 means worst health, 100 means fully health.
			 */
			rai->health_percent = 100;
			/* ecc_page_threshold = 0 means disable bad page retirement */
			rai->ecc_page_threshold = con->bad_page_cnt_threshold;
		} else {
			hdr->first_rec_offset = RAS_RECORD_START;
			hdr->tbl_size = RAS_TABLE_HEADER_SIZE;

			control->ras_record_offset = RAS_RECORD_START;
			control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
		}

		/* Checksum makes all written table bytes sum to zero. */
		csum = __calc_hdr_byte_sum(control);
		if (hdr->version >= RAS_TABLE_VER_V2_1)
			csum += __calc_ras_info_byte_sum(control);
		csum = -csum;
		hdr->checksum = csum;
		res = __write_table_header(control);
		if (!res && hdr->version > RAS_TABLE_VER_V1)
			res = __write_table_ras_info(control);
	} else {
		/* SMU-managed EEPROM: firmware performs the erase. */
		res = amdgpu_ras_smu_erase_ras_table(adev, &erase_res);
		if (res || erase_res) {
			dev_warn(adev->dev, "RAS EEPROM reset failed, res:%d result:%d",
				 res, erase_res);
			if (!res)
				res = -EIO;
		}
	}

	/* Drop all cached record state regardless of which path ran. */
	control->ras_num_recs = 0;
	control->ras_num_bad_pages = 0;
	control->ras_num_mca_recs = 0;
	control->ras_num_pa_recs = 0;
	control->ras_fri = 0;

	amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_bad_pages);

	control->bad_channel_bitmap = 0;
	amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
	con->update_channel_flag = false;

	amdgpu_ras_debugfs_set_ret_size(control);

	mutex_unlock(&control->ras_tbl_mutex);

	return res;
}
518
/* Serialize one RAS record into @buf using the EEPROM wire layout
 * (RAS_TABLE_RECORD_SIZE = 24 bytes total):
 * err_type(1) | bank(1) | ts(8) | offset(6) | mem_channel(1) |
 * mcumc_id(1) | retired_page(6).
 * offset and retired_page are stored as 48-bit little-endian values.
 */
static void
__encode_table_record_to_buf(struct amdgpu_ras_eeprom_control *control,
			     struct eeprom_table_record *record,
			     unsigned char *buf)
{
	__le64 tmp = 0;
	int i = 0;

	/* Next are all record fields according to EEPROM page spec in LE format */
	buf[i++] = record->err_type;

	buf[i++] = record->bank;

	tmp = cpu_to_le64(record->ts);
	memcpy(buf + i, &tmp, 8);
	i += 8;

	/* Only the low 48 bits of offset are stored. */
	tmp = cpu_to_le64((record->offset & 0xffffffffffff));
	memcpy(buf + i, &tmp, 6);
	i += 6;

	buf[i++] = record->mem_channel;
	buf[i++] = record->mcumc_id;

	/* Only the low 48 bits of retired_page are stored. */
	tmp = cpu_to_le64((record->retired_page & 0xffffffffffff));
	memcpy(buf + i, &tmp, 6);
}
546
/* Deserialize one RAS record from @buf; exact inverse of
 * __encode_table_record_to_buf() (24-byte wire layout, 48-bit
 * little-endian offset and retired_page fields).
 */
static void
__decode_table_record_from_buf(struct amdgpu_ras_eeprom_control *control,
			       struct eeprom_table_record *record,
			       unsigned char *buf)
{
	__le64 tmp = 0;
	int i = 0;

	/* Next are all record fields according to EEPROM page spec in LE format */
	record->err_type = buf[i++];

	record->bank = buf[i++];

	memcpy(&tmp, buf + i, 8);
	record->ts = le64_to_cpu(tmp);
	i += 8;

	/* offset occupies only 48 bits on the wire. */
	memcpy(&tmp, buf + i, 6);
	record->offset = (le64_to_cpu(tmp) & 0xffffffffffff);
	i += 6;

	record->mem_channel = buf[i++];
	record->mcumc_id = buf[i++];

	/* retired_page occupies only 48 bits on the wire. */
	memcpy(&tmp, buf + i, 6);
	record->retired_page = (le64_to_cpu(tmp) & 0xffffffffffff);
}
574
575bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
576{
577 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
578
579 if (amdgpu_uniras_enabled(adev))
580 return amdgpu_ras_mgr_check_eeprom_safety_watermark(adev);
581
582 if (!__is_ras_eeprom_supported(adev) ||
583 !amdgpu_bad_page_threshold)
584 return false;
585
586 /* skip check eeprom table for VEGA20 Gaming */
587 if (!con)
588 return false;
589 else
590 if (!(con->features & BIT(AMDGPU_RAS_BLOCK__UMC)))
591 return false;
592
593 if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
594 if (con->eeprom_control.ras_num_bad_pages > con->bad_page_cnt_threshold)
595 dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
596 con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold);
597 if ((amdgpu_bad_page_threshold == -1) ||
598 (amdgpu_bad_page_threshold == -2)) {
599 dev_warn(adev->dev,
600 "Please consult AMD Service Action Guide (SAG) for appropriate service procedures.\n");
601 return false;
602 } else {
603 dev_warn(adev->dev,
604 "Please consider adjusting the customized threshold.\n");
605 return true;
606 }
607 }
608
609 return false;
610}
611
612/**
613 * __amdgpu_ras_eeprom_write -- write indexed from buffer to EEPROM
614 * @control: pointer to control structure
615 * @buf: pointer to buffer containing data to write
616 * @fri: start writing at this index
617 * @num: number of records to write
618 *
619 * The caller must hold the table mutex in @control.
620 * Return 0 on success, -errno otherwise.
621 */
622static int __amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
623 u8 *buf, const u32 fri, const u32 num)
624{
625 struct amdgpu_device *adev = to_amdgpu_device(control);
626 u32 buf_size;
627 int res;
628
629 /* i2c may be unstable in gpu reset */
630 down_read(&adev->reset_domain->sem);
631 buf_size = num * RAS_TABLE_RECORD_SIZE;
632 res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus,
633 control->i2c_address +
634 RAS_INDEX_TO_OFFSET(control, fri),
635 buf, buf_size);
636 up_read(&adev->reset_domain->sem);
637 if (res < 0) {
638 dev_err(adev->dev, "Writing %d EEPROM table records error:%d",
639 num, res);
640 } else if (res < buf_size) {
641 /* Short write, return error.
642 */
643 dev_err(adev->dev, "Wrote %d records out of %d",
644 res / RAS_TABLE_RECORD_SIZE, num);
645 res = -EIO;
646 } else {
647 res = 0;
648 }
649
650 return res;
651}
652
/* Append @num records to the circular record area of the EEPROM
 * table: encode them, write them (possibly in two segments when the
 * write wraps past the end of the table), then update the cached
 * first-record index (fri) and record counters in @control.
 * The caller must hold the table mutex in @control.
 * Return 0 on success, -errno on error.
 */
static int
amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
			       struct eeprom_table_record *record,
			       const u32 num)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
	struct amdgpu_device *adev = to_amdgpu_device(control);
	u32 a, b, i;
	u8 *buf, *pp;
	int res;

	buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/* Encode all of them in one go.
	 */
	pp = buf;
	for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
		__encode_table_record_to_buf(control, &record[i], pp);

		/* update bad channel bitmap */
		if ((record[i].mem_channel < BITS_PER_TYPE(control->bad_channel_bitmap)) &&
		    !(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
			control->bad_channel_bitmap |= 1 << record[i].mem_channel;
			con->update_channel_flag = true;
		}
	}

	/* a, first record index to write into.
	 * b, last record index to write into.
	 * a = first index to read (fri) + number of records in the table,
	 * b = a + @num - 1.
	 * Let N = control->ras_max_num_record_count, then we have,
	 * case 0: 0 <= a <= b < N,
	 *     just append @num records starting at a;
	 * case 1: 0 <= a < N <= b,
	 *     append (N - a) records starting at a, and
	 *     append the remainder, b % N + 1, starting at 0.
	 * case 2: 0 <= fri < N <= a <= b, then modulo N we get two subcases,
	 * case 2a: 0 <= a <= b < N
	 *     append num records starting at a; and fix fri if b overwrote it,
	 *     and since a <= b, if b overwrote it then a must've also,
	 *     and if b didn't overwrite it, then a didn't also.
	 * case 2b: 0 <= b < a < N
	 *     write num records starting at a, which wraps around 0=N
	 *     and overwrite fri unconditionally. Now from case 2a,
	 *     this means that b eclipsed fri to overwrite it and wrap
	 *     around 0 again, i.e. b = 2N+r pre modulo N, so we unconditionally
	 *     set fri = b + 1 (mod N).
	 * Now, since fri is updated in every case, except the trivial case 0,
	 * the number of records present in the table after writing, is,
	 * num_recs - 1 = b - fri (mod N), and we take the positive value,
	 * by adding an arbitrary multiple of N before taking the modulo N
	 * as shown below.
	 */
	a = control->ras_fri + control->ras_num_recs;
	b = a + num - 1;
	if (b < control->ras_max_record_count) {
		res = __amdgpu_ras_eeprom_write(control, buf, a, num);
	} else if (a < control->ras_max_record_count) {
		u32 g0, g1;

		/* Case 1: split the write at the end of the table. */
		g0 = control->ras_max_record_count - a;
		g1 = b % control->ras_max_record_count + 1;
		res = __amdgpu_ras_eeprom_write(control, buf, a, g0);
		if (res)
			goto Out;
		res = __amdgpu_ras_eeprom_write(control,
						buf + g0 * RAS_TABLE_RECORD_SIZE,
						0, g1);
		if (res)
			goto Out;
		if (g1 > control->ras_fri)
			control->ras_fri = g1 % control->ras_max_record_count;
	} else {
		/* Case 2: both endpoints wrapped; reduce modulo N first. */
		a %= control->ras_max_record_count;
		b %= control->ras_max_record_count;

		if (a <= b) {
			/* Note that, b - a + 1 = num. */
			res = __amdgpu_ras_eeprom_write(control, buf, a, num);
			if (res)
				goto Out;
			if (b >= control->ras_fri)
				control->ras_fri = (b + 1) % control->ras_max_record_count;
		} else {
			u32 g0, g1;

			/* b < a, which means, we write from
			 * a to the end of the table, and from
			 * the start of the table to b.
			 */
			g0 = control->ras_max_record_count - a;
			g1 = b + 1;
			res = __amdgpu_ras_eeprom_write(control, buf, a, g0);
			if (res)
				goto Out;
			res = __amdgpu_ras_eeprom_write(control,
							buf + g0 * RAS_TABLE_RECORD_SIZE,
							0, g1);
			if (res)
				goto Out;
			control->ras_fri = g1 % control->ras_max_record_count;
		}
	}
	/* Records now in the table: (b - fri) mod N, plus one,
	 * computed with a +N bias to keep the subtraction non-negative.
	 */
	control->ras_num_recs = 1 + (control->ras_max_record_count + b
				     - control->ras_fri)
				% control->ras_max_record_count;

	/*old asics only save pa to eeprom like before*/
	if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12)
		control->ras_num_pa_recs += num;
	else
		control->ras_num_mca_recs += num;

	control->ras_num_bad_pages = con->bad_page_num;
Out:
	kfree(buf);
	return res;
}
774
/* Recompute and persist the table header (and, for v2.1+, the
 * RAS-info block) after the record area changed:
 * - tag the table 'BADG' and mark the GPU RMA when the bad-page
 *   threshold is exceeded (unless threshold is the -1/-2 policy),
 * - refresh tbl_size from the current record count,
 * - re-read all records and recompute the zero-sum checksum over
 *   header (+ ras info) + records.
 * The caller must hold the table mutex in @control.
 * Return 0 on success, -errno on error.
 */
static int
amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	u8 *buf, *pp, csum;
	u32 buf_size;
	int res;

	/* Mark the table bad if the saved bad-page count exceeds the
	 * configured threshold.
	 */
	if (amdgpu_bad_page_threshold != 0 &&
	    control->ras_num_bad_pages > ras->bad_page_cnt_threshold) {
		dev_warn(adev->dev,
			 "Saved bad pages %d reaches threshold value %d\n",
			 control->ras_num_bad_pages, ras->bad_page_cnt_threshold);

		if (adev->cper.enabled && !amdgpu_uniras_enabled(adev) &&
		    amdgpu_cper_generate_bp_threshold_record(adev))
			dev_warn(adev->dev, "fail to generate bad page threshold cper records\n");

		/* -1/-2 are special policy values that never RMA the GPU. */
		if ((amdgpu_bad_page_threshold != -1) &&
		    (amdgpu_bad_page_threshold != -2)) {
			control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
			if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
				control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
				control->tbl_rai.health_percent = 0;
			}
			ras->is_rma = true;
		}

		/* ignore the -ENOTSUPP return value */
		amdgpu_dpm_send_rma_reason(adev);
	}

	if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
		control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
					    RAS_TABLE_V2_1_INFO_SIZE +
					    control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
	else
		control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
					    control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
	control->tbl_hdr.checksum = 0;

	/* Read back every stored record so the checksum can cover them. */
	buf_size = control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
	buf = kcalloc(control->ras_num_recs, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
	if (!buf) {
		dev_err(adev->dev,
			"allocating memory for table of size %d bytes failed\n",
			control->tbl_hdr.tbl_size);
		res = -ENOMEM;
		goto Out;
	}

	/* i2c may be unstable in gpu reset */
	down_read(&adev->reset_domain->sem);
	res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
				 control->i2c_address +
				 control->ras_record_offset,
				 buf, buf_size);
	up_read(&adev->reset_domain->sem);
	if (res < 0) {
		dev_err(adev->dev, "EEPROM failed reading records:%d\n", res);
		goto Out;
	} else if (res < buf_size) {
		dev_err(adev->dev, "EEPROM read %d out of %d bytes\n", res,
			buf_size);
		res = -EIO;
		goto Out;
	}

	/**
	 * bad page records have been stored in eeprom,
	 * now calculate gpu health percent
	 */
	if (amdgpu_bad_page_threshold != 0 &&
	    control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 &&
	    control->ras_num_bad_pages <= ras->bad_page_cnt_threshold)
		control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
						   control->ras_num_bad_pages) * 100) /
						   ras->bad_page_cnt_threshold;

	/* Recalc the checksum.
	 */
	csum = 0;
	for (pp = buf; pp < buf + buf_size; pp++)
		csum += *pp;

	csum += __calc_hdr_byte_sum(control);
	if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
		csum += __calc_ras_info_byte_sum(control);
	/* avoid sign extension when assigning to "checksum" */
	csum = -csum;
	control->tbl_hdr.checksum = csum;
	res = __write_table_header(control);
	if (!res && control->tbl_hdr.version > RAS_TABLE_VER_V1)
		res = __write_table_ras_info(control);
Out:
	kfree(buf);
	return res;
}
875
876int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control)
877{
878 struct amdgpu_device *adev = to_amdgpu_device(control);
879 int ret, retry = 20;
880
881 if (!amdgpu_ras_smu_eeprom_supported(adev))
882 return 0;
883
884 control->ras_num_recs_old = control->ras_num_recs;
885
886 do {
887 /* 1000ms timeout is long enough, smu_get_badpage_count won't
888 * return -EBUSY before timeout.
889 */
890 ret = amdgpu_ras_smu_get_badpage_count(adev,
891 &(control->ras_num_recs), RAS_SMU_MESSAGE_TIMEOUT_MS);
892 if (!ret &&
893 (control->ras_num_recs_old == control->ras_num_recs)) {
894 /* record number update in PMFW needs some time,
895 * smu_get_badpage_count may return immediately without
896 * count update, sleep for a while and retry again.
897 */
898 msleep(50);
899 retry--;
900 } else {
901 break;
902 }
903 } while (retry);
904
905 /* no update of record number is not a real failure,
906 * don't print warning here
907 */
908 if (!ret && (control->ras_num_recs_old == control->ras_num_recs))
909 ret = -EINVAL;
910
911 return ret;
912}
913
914static int amdgpu_ras_smu_eeprom_append(struct amdgpu_ras_eeprom_control *control)
915{
916 struct amdgpu_device *adev = to_amdgpu_device(control);
917 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
918
919 if (!amdgpu_ras_smu_eeprom_supported(adev) || !con)
920 return 0;
921
922 control->ras_num_bad_pages = con->bad_page_num;
923
924 if (amdgpu_bad_page_threshold != 0 &&
925 control->ras_num_bad_pages > con->bad_page_cnt_threshold) {
926 dev_warn(adev->dev,
927 "Saved bad pages %d reaches threshold value %d\n",
928 control->ras_num_bad_pages, con->bad_page_cnt_threshold);
929
930 if (adev->cper.enabled && amdgpu_cper_generate_bp_threshold_record(adev))
931 dev_warn(adev->dev, "fail to generate bad page threshold cper records\n");
932
933 if ((amdgpu_bad_page_threshold != -1) &&
934 (amdgpu_bad_page_threshold != -2))
935 con->is_rma = true;
936 }
937
938 return 0;
939}
940
941/**
942 * amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table
943 * @control: pointer to control structure
944 * @record: array of records to append
945 * @num: number of records in @record array
946 *
947 * Append @num records to the table, calculate the checksum and write
948 * the table back to EEPROM. The maximum number of records that
949 * can be appended is between 1 and control->ras_max_record_count,
950 * regardless of how many records are already stored in the table.
951 *
952 * Return 0 on success or if EEPROM is not supported, -errno on error.
953 */
954int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
955 struct eeprom_table_record *record,
956 const u32 num)
957{
958 struct amdgpu_device *adev = to_amdgpu_device(control);
959 int res, i;
960 uint64_t nps = AMDGPU_NPS1_PARTITION_MODE;
961
962 if (!__is_ras_eeprom_supported(adev))
963 return 0;
964
965 if (amdgpu_ras_smu_eeprom_supported(adev))
966 return amdgpu_ras_smu_eeprom_append(control);
967
968 if (num == 0) {
969 dev_err(adev->dev, "will not append 0 records\n");
970 return -EINVAL;
971 } else if (num > control->ras_max_record_count) {
972 dev_err(adev->dev,
973 "cannot append %d records than the size of table %d\n",
974 num, control->ras_max_record_count);
975 return -EINVAL;
976 }
977
978 if (adev->gmc.gmc_funcs->query_mem_partition_mode)
979 nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
980
981 /* set the new channel index flag */
982 for (i = 0; i < num; i++)
983 record[i].retired_page |= (nps << UMC_NPS_SHIFT);
984
985 mutex_lock(&control->ras_tbl_mutex);
986
987 res = amdgpu_ras_eeprom_append_table(control, record, num);
988 if (!res)
989 res = amdgpu_ras_eeprom_update_header(control);
990 if (!res)
991 amdgpu_ras_debugfs_set_ret_size(control);
992
993 mutex_unlock(&control->ras_tbl_mutex);
994
995 /* clear channel index flag, the flag is only saved on eeprom */
996 for (i = 0; i < num; i++)
997 record[i].retired_page &= ~(nps << UMC_NPS_SHIFT);
998
999 return res;
1000}
1001
1002/**
1003 * __amdgpu_ras_eeprom_read -- read indexed from EEPROM into buffer
1004 * @control: pointer to control structure
1005 * @buf: pointer to buffer to read into
1006 * @fri: first record index, start reading at this index, absolute index
1007 * @num: number of records to read
1008 *
1009 * The caller must hold the table mutex in @control.
1010 * Return 0 on success, -errno otherwise.
1011 */
1012static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
1013 u8 *buf, const u32 fri, const u32 num)
1014{
1015 struct amdgpu_device *adev = to_amdgpu_device(control);
1016 u32 buf_size;
1017 int res;
1018
1019 /* i2c may be unstable in gpu reset */
1020 down_read(&adev->reset_domain->sem);
1021 buf_size = num * RAS_TABLE_RECORD_SIZE;
1022 res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
1023 control->i2c_address +
1024 RAS_INDEX_TO_OFFSET(control, fri),
1025 buf, buf_size);
1026 up_read(&adev->reset_domain->sem);
1027 if (res < 0) {
1028 dev_err(adev->dev, "Reading %d EEPROM table records error:%d",
1029 num, res);
1030 } else if (res < buf_size) {
1031 /* Short read, return error.
1032 */
1033 dev_err(adev->dev, "Read %d records out of %d",
1034 res / RAS_TABLE_RECORD_SIZE, num);
1035 res = -EIO;
1036 } else {
1037 res = 0;
1038 }
1039
1040 return res;
1041}
1042
/**
 * amdgpu_ras_eeprom_read_idx -- read bad-page records through the SMU
 * @control: pointer to control structure
 * @record: array to fill; must hold at least @num entries
 * @rec_idx: index of the first record to read, as kept by the PMFW
 * @num: number of records to read
 *
 * Used when the PMFW manages the bad-page store itself: each record's
 * MCA address, IPID and timestamp are queried individually from the SMU
 * and decoded into @record.
 *
 * Return 0 on success (or when the SMU EEPROM path is not active),
 * -errno otherwise.
 */
int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control,
			       struct eeprom_table_record *record, u32 rec_idx,
			       const u32 num)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	uint64_t ts, end_idx;
	int i, ret;
	u64 mca, ipid;

	/* Nothing to do unless the PMFW owns the bad-page store. */
	if (!amdgpu_ras_smu_eeprom_supported(adev))
		return 0;

	/* The IPID parser is needed below to split the raw IPID. */
	if (!adev->umc.ras || !adev->umc.ras->mca_ipid_parse)
		return -EOPNOTSUPP;

	end_idx = rec_idx + num;
	for (i = rec_idx; i < end_idx; i++) {
		ret = amdgpu_ras_smu_get_badpage_mca_addr(adev, i, &mca);
		if (ret)
			return ret;

		ret = amdgpu_ras_smu_get_badpage_ipid(adev, i, &ipid);
		if (ret)
			return ret;

		ret = amdgpu_ras_smu_get_timestamp(adev, i, &ts);
		if (ret)
			return ret;

		record[i - rec_idx].address = mca;
		/* retired_page (pa) is unused now */
		record[i - rec_idx].retired_page = 0x1ULL;
		record[i - rec_idx].ts = ts;
		record[i - rec_idx].err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;

		/* Decode the IPID into CU, memory channel and MCUMC id.
		 * NOTE(review): casts assume these record fields are at
		 * least 32 bits wide and suitably aligned -- confirm
		 * against the eeprom_table_record definition.
		 */
		adev->umc.ras->mca_ipid_parse(adev, ipid,
				(uint32_t *)&(record[i - rec_idx].cu),
				(uint32_t *)&(record[i - rec_idx].mem_channel),
				(uint32_t *)&(record[i - rec_idx].mcumc_id), NULL);
	}

	return 0;
}
1086
/**
 * amdgpu_ras_eeprom_read -- read EEPROM
 * @control: pointer to control structure
 * @record: array of records to read into
 * @num: number of records in @record
 *
 * Reads num records from the RAS table in EEPROM and
 * writes the data into @record array.
 *
 * Records live in a circular buffer starting at control->ras_fri, so a
 * read may be split into two chunks: one up to the end of the table
 * (g0 records) and one wrapping to the start (g1 records).
 *
 * Returns 0 on success, -errno on error.
 */
int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
			   struct eeprom_table_record *record,
			   const u32 num)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int i, res;
	u8 *buf, *pp;
	u32 g0, g1;

	/* PMFW-managed store: query the SMU record by record instead. */
	if (amdgpu_ras_smu_eeprom_supported(adev))
		return amdgpu_ras_eeprom_read_idx(control, record, 0, num);

	if (!__is_ras_eeprom_supported(adev))
		return 0;

	if (num == 0) {
		dev_err(adev->dev, "will not read 0 records\n");
		return -EINVAL;
	} else if (num > control->ras_num_recs) {
		dev_err(adev->dev, "too many records to read:%d available:%d\n",
			num, control->ras_num_recs);
		return -EINVAL;
	}

	buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/* Determine how many records to read, from the first record
	 * index, fri, to the end of the table, and from the beginning
	 * of the table, such that the total number of records is
	 * @num, and we handle wrap around when fri > 0 and
	 * fri + num > RAS_MAX_RECORD_COUNT.
	 *
	 * First we compute the index of the last element
	 * which would be fetched from each region,
	 * g0 is in [fri, fri + num - 1], and
	 * g1 is in [0, RAS_MAX_RECORD_COUNT - 1].
	 * Then, if g0 < RAS_MAX_RECORD_COUNT, the index of
	 * the last element to fetch, we set g0 to _the number_
	 * of elements to fetch, @num, since we know that the last
	 * indexed to be fetched does not exceed the table.
	 *
	 * If, however, g0 >= RAS_MAX_RECORD_COUNT, then
	 * we set g0 to the number of elements to read
	 * until the end of the table, and g1 to the number of
	 * elements to read from the beginning of the table.
	 */
	g0 = control->ras_fri + num - 1;
	g1 = g0 % control->ras_max_record_count;
	if (g0 < control->ras_max_record_count) {
		g0 = num;
		g1 = 0;
	} else {
		g0 = control->ras_max_record_count - control->ras_fri;
		g1 += 1;
	}

	mutex_lock(&control->ras_tbl_mutex);
	res = __amdgpu_ras_eeprom_read(control, buf, control->ras_fri, g0);
	if (res)
		goto Out;
	if (g1) {
		/* Wrapped: read the remainder from the table start. */
		res = __amdgpu_ras_eeprom_read(control,
					       buf + g0 * RAS_TABLE_RECORD_SIZE,
					       0, g1);
		if (res)
			goto Out;
	}

	res = 0;

	/* Read up everything? Then transform.
	 */
	pp = buf;
	for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
		__decode_table_record_from_buf(control, &record[i], pp);

		/* update bad channel bitmap */
		if ((record[i].mem_channel < BITS_PER_TYPE(control->bad_channel_bitmap)) &&
		    !(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
			control->bad_channel_bitmap |= 1 << record[i].mem_channel;
			con->update_channel_flag = true;
		}
	}
Out:
	kfree(buf);
	mutex_unlock(&control->ras_tbl_mutex);

	return res;
}
1190
1191uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *control)
1192{
1193 /* get available eeprom table version first before eeprom table init */
1194 amdgpu_ras_set_eeprom_table_version(control);
1195
1196 if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
1197 return RAS_MAX_RECORD_COUNT_V2_1;
1198 else
1199 return RAS_MAX_RECORD_COUNT;
1200}
1201
/* debugfs read handler reporting the RAS table capacity in bytes and
 * records, honoring partial reads at arbitrary offsets.
 */
static ssize_t
amdgpu_ras_debugfs_eeprom_size_read(struct file *f, char __user *buf,
				    size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
	u8 data[50];
	int res;

	if (!size)
		return size;

	if (!ras || !control) {
		res = snprintf(data, sizeof(data), "Not supported\n");
	} else {
		res = snprintf(data, sizeof(data), "%d bytes or %d records\n",
			       RAS_TBL_SIZE_BYTES, control->ras_max_record_count);
	}

	/* Offset past the generated text: EOF. */
	if (*pos >= res)
		return 0;

	/* Copy out only the part at and after *pos that fits in @size. */
	res -= *pos;
	res = min_t(size_t, res, size);

	if (copy_to_user(buf, &data[*pos], res))
		return -EFAULT;

	*pos += res;

	return res;
}
1235
/* File operations for the read-only eeprom "size" debugfs entry. */
const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_eeprom_size_read,
	.write = NULL,
	.llseek = default_llseek,
};
1242
/* Column headers and row formats for the debugfs table dump below.
 * The *_fmt_size macros give the fixed byte length of one formatted
 * line (field widths plus separators plus the trailing newline); the
 * dump code maps file offsets to record indices by dividing by these.
 * NOTE(review): rec_hdr_fmt_size counts separators wider than the
 * single spaces seen in rec_hdr_fmt here -- verify the format string
 * and the size macro agree.
 */
static const char *tbl_hdr_str = " Signature Version FirstOffs Size Checksum\n";
static const char *tbl_hdr_fmt = "0x%08X 0x%08X 0x%08X 0x%08X 0x%08X\n";
#define tbl_hdr_fmt_size (5 * (2+8) + 4 + 1)
static const char *rec_hdr_str = "Index Offset ErrType Bank/CU TimeStamp Offs/Addr MemChl MCUMCID RetiredPage\n";
static const char *rec_hdr_fmt = "%5d 0x%05X %7s 0x%02X 0x%016llX 0x%012llX 0x%02X 0x%02X 0x%012llX\n";
#define rec_hdr_fmt_size (5 + 1 + 7 + 1 + 7 + 1 + 7 + 1 + 18 + 1 + 14 + 1 + 6 + 1 + 7 + 1 + 14 + 1)

/* Human-readable names for eeprom_table_record.err_type values. */
static const char *record_err_type_str[AMDGPU_RAS_EEPROM_ERR_COUNT] = {
	"ignore",
	"re",
	"ue",
};
1255
1256static loff_t amdgpu_ras_debugfs_table_size(struct amdgpu_ras_eeprom_control *control)
1257{
1258 return strlen(tbl_hdr_str) + tbl_hdr_fmt_size +
1259 strlen(rec_hdr_str) + rec_hdr_fmt_size * control->ras_num_recs;
1260}
1261
/* Keep the debugfs table file's reported inode size in sync with the
 * number of records currently stored, so userspace sees the correct
 * file length after records are appended or the table is reset.
 */
void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control)
{
	struct amdgpu_ras *ras = container_of(control, struct amdgpu_ras,
					      eeprom_control);
	struct dentry *de = ras->de_ras_eeprom_table;

	if (de)
		d_inode(de)->i_size = amdgpu_ras_debugfs_table_size(control);
}
1271
/* debugfs read handler that renders the RAS table as text: a header
 * line, the decoded table header, a record column-header line, then one
 * fixed-width line per record read directly from EEPROM.  Partial reads
 * at arbitrary offsets are supported by mapping *pos onto these four
 * regions of the generated text in order.
 */
static ssize_t amdgpu_ras_debugfs_table_read(struct file *f, char __user *buf,
					     size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_eeprom_control *control = &ras->eeprom_control;
	const size_t orig_size = size;
	int res = -EFAULT;
	size_t data_len;

	/* pmfw manages eeprom data by itself */
	if (amdgpu_ras_smu_eeprom_supported(adev))
		return 0;

	mutex_lock(&control->ras_tbl_mutex);

	/* We want *pos - data_len > 0, which means there's
	 * bytes to be printed from data.
	 */
	data_len = strlen(tbl_hdr_str);
	if (*pos < data_len) {
		data_len -= *pos;
		data_len = min_t(size_t, data_len, size);
		if (copy_to_user(buf, &tbl_hdr_str[*pos], data_len))
			goto Out;
		buf += data_len;
		size -= data_len;
		*pos += data_len;
	}

	/* Region 2: the formatted table header values. */
	data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size;
	if (*pos < data_len && size > 0) {
		u8 data[tbl_hdr_fmt_size + 1];
		loff_t lpos;

		snprintf(data, sizeof(data), tbl_hdr_fmt,
			 control->tbl_hdr.header,
			 control->tbl_hdr.version,
			 control->tbl_hdr.first_rec_offset,
			 control->tbl_hdr.tbl_size,
			 control->tbl_hdr.checksum);

		data_len -= *pos;
		data_len = min_t(size_t, data_len, size);
		/* lpos is the offset within this region's text. */
		lpos = *pos - strlen(tbl_hdr_str);
		if (copy_to_user(buf, &data[lpos], data_len))
			goto Out;
		buf += data_len;
		size -= data_len;
		*pos += data_len;
	}

	/* Region 3: the record column headers. */
	data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size + strlen(rec_hdr_str);
	if (*pos < data_len && size > 0) {
		loff_t lpos;

		data_len -= *pos;
		data_len = min_t(size_t, data_len, size);
		lpos = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size;
		if (copy_to_user(buf, &rec_hdr_str[lpos], data_len))
			goto Out;
		buf += data_len;
		size -= data_len;
		*pos += data_len;
	}

	/* Region 4: one fixed-width formatted line per stored record. */
	data_len = amdgpu_ras_debugfs_table_size(control);
	if (*pos < data_len && size > 0) {
		u8 dare[RAS_TABLE_RECORD_SIZE];
		u8 data[rec_hdr_fmt_size + 1];
		struct eeprom_table_record record;
		int s, r;

		/* Find the starting record index
		 */
		s = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
			strlen(rec_hdr_str);
		s = s / rec_hdr_fmt_size;
		/* r is the byte offset within the first (partial) line. */
		r = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
			strlen(rec_hdr_str);
		r = r % rec_hdr_fmt_size;

		for ( ; size > 0 && s < control->ras_num_recs; s++) {
			u32 ai = RAS_RI_TO_AI(control, s);
			/* Read a single record
			 */
			res = __amdgpu_ras_eeprom_read(control, dare, ai, 1);
			if (res)
				goto Out;
			__decode_table_record_from_buf(control, &record, dare);
			snprintf(data, sizeof(data), rec_hdr_fmt,
				 s,
				 RAS_INDEX_TO_OFFSET(control, ai),
				 record_err_type_str[record.err_type],
				 record.bank,
				 record.ts,
				 record.offset,
				 record.mem_channel,
				 record.mcumc_id,
				 record.retired_page);

			data_len = min_t(size_t, rec_hdr_fmt_size - r, size);
			if (copy_to_user(buf, &data[r], data_len)) {
				res = -EFAULT;
				goto Out;
			}
			buf += data_len;
			size -= data_len;
			*pos += data_len;
			/* Only the first line of a read can be partial. */
			r = 0;
		}
	}
	res = 0;
Out:
	mutex_unlock(&control->ras_tbl_mutex);
	return res < 0 ? res : orig_size - size;
}
1389
1390static ssize_t
1391amdgpu_ras_debugfs_eeprom_table_read(struct file *f, char __user *buf,
1392 size_t size, loff_t *pos)
1393{
1394 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
1395 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1396 struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
1397 u8 data[81];
1398 int res;
1399
1400 if (!size)
1401 return size;
1402
1403 if (!ras || !control) {
1404 res = snprintf(data, sizeof(data), "Not supported\n");
1405 if (*pos >= res)
1406 return 0;
1407
1408 res -= *pos;
1409 res = min_t(size_t, res, size);
1410
1411 if (copy_to_user(buf, &data[*pos], res))
1412 return -EFAULT;
1413
1414 *pos += res;
1415
1416 return res;
1417 } else {
1418 return amdgpu_ras_debugfs_table_read(f, buf, size, pos);
1419 }
1420}
1421
/* File operations for the read-only eeprom "table" debugfs entry. */
const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_eeprom_table_read,
	.write = NULL,
	.llseek = default_llseek,
};
1428
/**
 * __verify_ras_table_checksum -- verify the RAS EEPROM table checksum
 * @control: pointer to control structure
 *
 * Check the checksum of the stored in EEPROM RAS table.
 *
 * Return 0 if the checksum is correct,
 * positive if it is not correct, and
 * -errno on I/O error.
 */
static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	int buf_size, res;
	u8 csum, *buf, *pp;

	/* V2.1+ tables carry an extra "ras info" section which is also
	 * covered by the checksum.
	 */
	if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
		buf_size = RAS_TABLE_HEADER_SIZE +
			RAS_TABLE_V2_1_INFO_SIZE +
			control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
	else
		buf_size = RAS_TABLE_HEADER_SIZE +
			control->ras_num_recs * RAS_TABLE_RECORD_SIZE;

	buf = kzalloc(buf_size, GFP_KERNEL);
	if (!buf) {
		dev_err(adev->dev,
			"Out of memory checking RAS table checksum.\n");
		return -ENOMEM;
	}

	res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
				 control->i2c_address +
				 control->ras_header_offset,
				 buf, buf_size);
	if (res < buf_size) {
		dev_err(adev->dev, "Partial read for checksum, res:%d\n", res);
		/* On partial reads, return -EIO.
		 */
		if (res >= 0)
			res = -EIO;
		goto Out;
	}

	/* A valid table has all bytes (including the stored checksum
	 * byte) summing to 0 modulo 256, so a non-zero csum means the
	 * table is corrupt.
	 */
	csum = 0;
	for (pp = buf; pp < buf + buf_size; pp++)
		csum += *pp;
Out:
	kfree(buf);
	return res < 0 ? res : csum;
}
1480
/* Read and decode the V2.1 "ras info" section that follows the table
 * header on EEPROM into control->tbl_rai.
 *
 * Return 0 on success, -errno on error (short reads become -EIO).
 */
static int __read_table_ras_info(struct amdgpu_ras_eeprom_control *control)
{
	struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai;
	struct amdgpu_device *adev = to_amdgpu_device(control);
	unsigned char *buf;
	int res;

	buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL);
	if (!buf) {
		dev_err(adev->dev,
			"Failed to alloc buf to read EEPROM table ras info\n");
		return -ENOMEM;
	}

	/**
	 * EEPROM table V2_1 supports ras info,
	 * read EEPROM table ras info
	 */
	res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
				 control->i2c_address + control->ras_info_offset,
				 buf, RAS_TABLE_V2_1_INFO_SIZE);
	if (res < RAS_TABLE_V2_1_INFO_SIZE) {
		dev_err(adev->dev,
			"Failed to read EEPROM table ras info, res:%d", res);
		res = res >= 0 ? -EIO : res;
		goto Out;
	}

	__decode_table_ras_info_from_buf(rai, buf);

Out:
	kfree(buf);
	/* res holds the byte count on a full read; map it back to 0. */
	return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
}
1515
1516static int amdgpu_ras_smu_eeprom_init(struct amdgpu_ras_eeprom_control *control)
1517{
1518 struct amdgpu_device *adev = to_amdgpu_device(control);
1519 struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
1520 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1521 uint64_t local_time;
1522 int res;
1523
1524 ras->is_rma = false;
1525
1526 if (!__is_ras_eeprom_supported(adev))
1527 return 0;
1528 mutex_init(&control->ras_tbl_mutex);
1529
1530 res = amdgpu_ras_smu_get_table_version(adev, &(hdr->version));
1531 if (res)
1532 return res;
1533
1534 res = amdgpu_ras_smu_get_badpage_count(adev,
1535 &(control->ras_num_recs), 100);
1536 if (res)
1537 return res;
1538
1539 local_time = (uint64_t)ktime_get_real_seconds();
1540 res = amdgpu_ras_smu_set_timestamp(adev, local_time);
1541 if (res)
1542 return res;
1543
1544 control->ras_max_record_count = 4000;
1545
1546 control->ras_num_mca_recs = 0;
1547 control->ras_num_pa_recs = 0;
1548
1549 return 0;
1550}
1551
/**
 * amdgpu_ras_eeprom_init -- initialize the RAS EEPROM control structure
 * @control: pointer to control structure
 *
 * Read and validate the RAS table header from EEPROM, derive the record
 * layout (record offset, capacity, first-record index) for the detected
 * table version, and initialize the table mutex.  If no recognizable
 * table exists, a fresh one is created.
 *
 * Return 0 on success or if EEPROM is not supported, -errno on error.
 */
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int res;

	/* When the PMFW owns the bad-page store, use the SMU init path. */
	if (amdgpu_ras_smu_eeprom_supported(adev))
		return amdgpu_ras_smu_eeprom_init(control);

	ras->is_rma = false;

	if (!__is_ras_eeprom_supported(adev))
		return 0;

	/* Verify i2c adapter is initialized */
	if (!adev->pm.ras_eeprom_i2c_bus || !adev->pm.ras_eeprom_i2c_bus->algo)
		return -ENOENT;

	if (!__get_eeprom_i2c_addr(adev, control))
		return -EINVAL;

	control->ras_header_offset = RAS_HDR_START;
	control->ras_info_offset = RAS_TABLE_V2_1_INFO_START;
	mutex_init(&control->ras_tbl_mutex);

	/* Read the table header from EEPROM address */
	res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
				 control->i2c_address + control->ras_header_offset,
				 buf, RAS_TABLE_HEADER_SIZE);
	if (res < RAS_TABLE_HEADER_SIZE) {
		dev_err(adev->dev, "Failed to read EEPROM table header, res:%d",
			res);
		return res >= 0 ? -EIO : res;
	}

	__decode_table_header_from_buf(hdr, buf);

	/* An unrecognized signature means no table exists yet: create one. */
	if (hdr->header != RAS_TABLE_HDR_VAL &&
	    hdr->header != RAS_TABLE_HDR_BAD) {
		dev_info(adev->dev, "Creating a new EEPROM table");
		return amdgpu_ras_eeprom_reset_table(control);
	}

	/* Record layout and table capacity depend on the table version. */
	switch (hdr->version) {
	case RAS_TABLE_VER_V2_1:
	case RAS_TABLE_VER_V3:
		control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
		control->ras_record_offset = RAS_RECORD_START_V2_1;
		control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
		break;
	case RAS_TABLE_VER_V1:
		control->ras_num_recs = RAS_NUM_RECS(hdr);
		control->ras_record_offset = RAS_RECORD_START;
		control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
		break;
	default:
		dev_err(adev->dev,
			"RAS header invalid, unsupported version: %u",
			hdr->version);
		return -EINVAL;
	}

	if (control->ras_num_recs > control->ras_max_record_count) {
		dev_err(adev->dev,
			"RAS header invalid, records in header: %u max allowed :%u",
			control->ras_num_recs, control->ras_max_record_count);
		return -EINVAL;
	}

	control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
	control->ras_num_mca_recs = 0;
	control->ras_num_pa_recs = 0;
	return 0;
}
1628
/* SMU-path variant of amdgpu_ras_eeprom_check(): the PMFW owns the
 * bad-page store, so only the bad-page count is compared against the
 * configured threshold; no on-EEPROM checksum is verified here.
 *
 * Always returns 0: threshold violations are logged, and may set
 * ras->is_rma, but are not treated as errors.
 */
static int amdgpu_ras_smu_eeprom_check(struct amdgpu_ras_eeprom_control *control)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!__is_ras_eeprom_supported(adev))
		return 0;

	control->ras_num_bad_pages = ras->bad_page_num;

	if ((ras->bad_page_cnt_threshold < control->ras_num_bad_pages) &&
	    amdgpu_bad_page_threshold != 0) {
		dev_warn(adev->dev,
			 "RAS records:%d exceed threshold:%d\n",
			 control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
		/* NOTE(review): -1/-2 appear to be "auto/ignore" sentinel
		 * values of the bad-page-threshold module parameter; only
		 * an explicit user threshold marks the device RMA --
		 * confirm against the parameter documentation.
		 */
		if ((amdgpu_bad_page_threshold == -1) ||
		    (amdgpu_bad_page_threshold == -2)) {
			dev_warn(adev->dev,
				 "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
		} else {
			ras->is_rma = true;
			dev_warn(adev->dev,
				 "User defined threshold is set, runtime service will be halt when threshold is reached\n");
		}

		return 0;
	}

	dev_dbg(adev->dev,
		"Found existing EEPROM table with %d records",
		control->ras_num_bad_pages);

	/* Warn if we are at 90% of the threshold or above
	 */
	if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold)
		dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
			 control->ras_num_bad_pages,
			 ras->bad_page_cnt_threshold);
	return 0;
}
1669
/**
 * amdgpu_ras_eeprom_check -- sanity-check the RAS table against thresholds
 * @control: pointer to control structure
 *
 * Verify the stored RAS table (checksum, plus the ras-info section for
 * V2.1+ tables) and compare the recorded bad-page count against the
 * configured threshold.  A table previously marked "bad" can have its
 * header signature restored when the threshold has since been raised.
 *
 * Return 0 on success or if EEPROM is not supported, -errno on error.
 */
int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int res = 0;

	/* When the PMFW owns the bad-page store, use the SMU check path. */
	if (amdgpu_ras_smu_eeprom_supported(adev))
		return amdgpu_ras_smu_eeprom_check(control);

	if (!__is_ras_eeprom_supported(adev))
		return 0;

	/* Verify i2c adapter is initialized */
	if (!adev->pm.ras_eeprom_i2c_bus || !adev->pm.ras_eeprom_i2c_bus->algo)
		return -ENOENT;

	if (!__get_eeprom_i2c_addr(adev, control))
		return -EINVAL;

	control->ras_num_bad_pages = ras->bad_page_num;

	if (hdr->header == RAS_TABLE_HDR_VAL) {
		/* Healthy table: verify it and warn when near the limit. */
		dev_dbg(adev->dev,
			"Found existing EEPROM table with %d records",
			control->ras_num_bad_pages);

		if (hdr->version >= RAS_TABLE_VER_V2_1) {
			res = __read_table_ras_info(control);
			if (res)
				return res;
		}

		res = __verify_ras_table_checksum(control);
		if (res)
			dev_err(adev->dev,
				"RAS table incorrect checksum or error:%d\n",
				res);

		/* Warn if we are at 90% of the threshold or above
		 */
		if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold)
			dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
				 control->ras_num_bad_pages,
				 ras->bad_page_cnt_threshold);
	} else if (hdr->header == RAS_TABLE_HDR_BAD &&
		   amdgpu_bad_page_threshold != 0) {
		/* Table was marked bad on a previous boot. */
		if (hdr->version >= RAS_TABLE_VER_V2_1) {
			res = __read_table_ras_info(control);
			if (res)
				return res;
		}

		res = __verify_ras_table_checksum(control);
		if (res) {
			dev_err(adev->dev,
				"RAS Table incorrect checksum or error:%d\n",
				res);
			return -EINVAL;
		}
		if (ras->bad_page_cnt_threshold >= control->ras_num_bad_pages) {
			/* This means that, the threshold was increased since
			 * the last time the system was booted, and now,
			 * ras->bad_page_cnt_threshold - control->num_recs > 0,
			 * so that at least one more record can be saved,
			 * before the page count threshold is reached.
			 */
			dev_info(adev->dev,
				 "records:%d threshold:%d, resetting "
				 "RAS table header signature",
				 control->ras_num_bad_pages,
				 ras->bad_page_cnt_threshold);
			res = amdgpu_ras_eeprom_correct_header_tag(control,
								   RAS_TABLE_HDR_VAL);
		} else {
			dev_warn(adev->dev,
				 "RAS records:%d exceed threshold:%d\n",
				 control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
			/* NOTE(review): -1/-2 appear to be "auto/ignore"
			 * sentinel values of the bad-page-threshold module
			 * parameter -- confirm against its documentation.
			 */
			if ((amdgpu_bad_page_threshold == -1) ||
			    (amdgpu_bad_page_threshold == -2)) {
				res = 0;
				dev_warn(adev->dev,
					 "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
			} else {
				ras->is_rma = true;
				dev_warn(adev->dev,
					 "User defined threshold is set, runtime service will be halt when threshold is reached\n");
			}
		}
	}

	return res < 0 ? res : 0;
}
1763
1764void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev)
1765{
1766 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1767 struct amdgpu_ras_eeprom_control *control;
1768 int res;
1769
1770 if (!__is_ras_eeprom_supported(adev) || !ras ||
1771 amdgpu_ras_smu_eeprom_supported(adev))
1772 return;
1773 control = &ras->eeprom_control;
1774 if (!control->is_eeprom_valid)
1775 return;
1776 res = __verify_ras_table_checksum(control);
1777 if (res) {
1778 dev_warn(adev->dev,
1779 "RAS table incorrect checksum or error:%d, try to recover\n",
1780 res);
1781 if (!amdgpu_ras_eeprom_reset_table(control))
1782 if (!amdgpu_ras_save_bad_pages(adev, NULL))
1783 if (!__verify_ras_table_checksum(control)) {
1784 dev_info(adev->dev, "RAS table recovery succeed\n");
1785 return;
1786 }
1787 dev_err(adev->dev, "RAS table recovery failed\n");
1788 control->is_eeprom_valid = false;
1789 }
1790 return;
1791}
1792
1793static const struct ras_smu_drv *amdgpu_ras_get_smu_ras_drv(struct amdgpu_device *adev)
1794{
1795 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1796
1797 if (!ras)
1798 return NULL;
1799
1800 return ras->ras_smu_drv;
1801}
1802
1803static uint64_t amdgpu_ras_smu_get_feature_flags(struct amdgpu_device *adev)
1804{
1805 const struct ras_smu_drv *ras_smu_drv = amdgpu_ras_get_smu_ras_drv(adev);
1806 uint64_t flags = 0ULL;
1807
1808 if (!ras_smu_drv)
1809 goto out;
1810
1811 if (ras_smu_drv->ras_smu_feature_flags)
1812 ras_smu_drv->ras_smu_feature_flags(adev, &flags);
1813
1814out:
1815 return flags;
1816}
1817
1818bool amdgpu_ras_smu_eeprom_supported(struct amdgpu_device *adev)
1819{
1820 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
1821 uint64_t flags = 0ULL;
1822
1823 if (!__is_ras_eeprom_supported(adev) || !smu_ras_drv)
1824 return false;
1825
1826 if (!smu_ras_drv->smu_eeprom_funcs)
1827 return false;
1828
1829 flags = amdgpu_ras_smu_get_feature_flags(adev);
1830
1831 return !!(flags & RAS_SMU_FEATURE_BIT__RAS_EEPROM);
1832}
1833
1834int amdgpu_ras_smu_get_table_version(struct amdgpu_device *adev,
1835 uint32_t *table_version)
1836{
1837 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
1838
1839 if (!amdgpu_ras_smu_eeprom_supported(adev))
1840 return -EOPNOTSUPP;
1841
1842 if (smu_ras_drv->smu_eeprom_funcs->get_ras_table_version)
1843 return smu_ras_drv->smu_eeprom_funcs->get_ras_table_version(adev,
1844 table_version);
1845 return -EOPNOTSUPP;
1846}
1847
1848int amdgpu_ras_smu_get_badpage_count(struct amdgpu_device *adev,
1849 uint32_t *count, uint32_t timeout)
1850{
1851 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
1852
1853 if (!amdgpu_ras_smu_eeprom_supported(adev))
1854 return -EOPNOTSUPP;
1855
1856 if (smu_ras_drv->smu_eeprom_funcs->get_badpage_count)
1857 return smu_ras_drv->smu_eeprom_funcs->get_badpage_count(adev,
1858 count, timeout);
1859 return -EOPNOTSUPP;
1860}
1861
1862int amdgpu_ras_smu_get_badpage_mca_addr(struct amdgpu_device *adev,
1863 uint16_t index, uint64_t *mca_addr)
1864{
1865 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
1866
1867 if (!amdgpu_ras_smu_eeprom_supported(adev))
1868 return -EOPNOTSUPP;
1869
1870 if (smu_ras_drv->smu_eeprom_funcs->get_badpage_mca_addr)
1871 return smu_ras_drv->smu_eeprom_funcs->get_badpage_mca_addr(adev,
1872 index, mca_addr);
1873 return -EOPNOTSUPP;
1874}
1875
1876int amdgpu_ras_smu_set_timestamp(struct amdgpu_device *adev,
1877 uint64_t timestamp)
1878{
1879 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
1880
1881 if (!amdgpu_ras_smu_eeprom_supported(adev))
1882 return -EOPNOTSUPP;
1883
1884 if (smu_ras_drv->smu_eeprom_funcs->set_timestamp)
1885 return smu_ras_drv->smu_eeprom_funcs->set_timestamp(adev,
1886 timestamp);
1887 return -EOPNOTSUPP;
1888}
1889
1890int amdgpu_ras_smu_get_timestamp(struct amdgpu_device *adev,
1891 uint16_t index, uint64_t *timestamp)
1892{
1893 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
1894
1895 if (!amdgpu_ras_smu_eeprom_supported(adev))
1896 return -EOPNOTSUPP;
1897
1898 if (smu_ras_drv->smu_eeprom_funcs->get_timestamp)
1899 return smu_ras_drv->smu_eeprom_funcs->get_timestamp(adev,
1900 index, timestamp);
1901 return -EOPNOTSUPP;
1902}
1903
1904int amdgpu_ras_smu_get_badpage_ipid(struct amdgpu_device *adev,
1905 uint16_t index, uint64_t *ipid)
1906{
1907 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
1908
1909 if (!amdgpu_ras_smu_eeprom_supported(adev))
1910 return -EOPNOTSUPP;
1911
1912 if (smu_ras_drv->smu_eeprom_funcs->get_badpage_ipid)
1913 return smu_ras_drv->smu_eeprom_funcs->get_badpage_ipid(adev,
1914 index, ipid);
1915 return -EOPNOTSUPP;
1916}
1917
1918int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev,
1919 uint32_t *result)
1920{
1921 const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
1922
1923 if (!amdgpu_ras_smu_eeprom_supported(adev))
1924 return -EOPNOTSUPP;
1925
1926 if (smu_ras_drv->smu_eeprom_funcs->erase_ras_table)
1927 return smu_ras_drv->smu_eeprom_funcs->erase_ras_table(adev,
1928 result);
1929 return -EOPNOTSUPP;
1930}