drm/amdgpu: correct the calculation of RAS bad page

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

After the introduction of NPS RAS, one bad page record on the EEPROM may
correspond to either 1 or 16 bad pages, so a bad page record and a bad page
are two different concepts. Define a new variable to store the number of bad pages.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Tao Zhou and committed by

Alex Deucher 1 year ago ae756cd8 1f06e7f3

+36 -22

4 changed files

expand all

drivers

gpu

drm

amd

amdgpu

amdgpu_ras.c

amdgpu_ras_eeprom.c

amdgpu_ras_eeprom.h

amdgpu_umc.c

+2 -8

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

··· 2943 2943 mutex_lock(&con->recovery_lock); 2944 2944 control = &con->eeprom_control; 2945 2945 data = con->eh_data; 2946 - bad_page_num = control->ras_num_recs; 2947 - /* one record on eeprom stands for all pages in one memory row 2948 - * in this mode 2949 - */ 2950 - if (control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) 2951 - bad_page_num = control->ras_num_recs * adev->umc.retire_unit; 2952 - 2946 + bad_page_num = control->ras_num_bad_pages; 2953 2947 save_count = data->count - bad_page_num; 2954 2948 mutex_unlock(&con->recovery_lock); 2955 2949 ··· 3427 3433 return ret; 3428 3434 3429 3435 amdgpu_dpm_send_hbm_bad_pages_num( 3430 - adev, control->ras_num_recs); 3436 + adev, control->ras_num_bad_pages); 3431 3437 3432 3438 if (con->update_channel_flag == true) { 3433 3439 amdgpu_dpm_send_hbm_bad_channel_flag(

+27 -13

drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

··· 470 470 res = __write_table_ras_info(control); 471 471 472 472 control->ras_num_recs = 0; 473 + control->ras_num_bad_pages = 0; 473 474 control->ras_fri = 0; 474 475 475 - amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs); 476 + amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_bad_pages); 476 477 477 478 control->bad_channel_bitmap = 0; 478 479 amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap); ··· 560 559 if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) { 561 560 if (amdgpu_bad_page_threshold == -1) { 562 561 dev_warn(adev->dev, "RAS records:%d exceed threshold:%d", 563 - con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold); 562 + con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold); 564 563 dev_warn(adev->dev, 565 564 "But GPU can be operated due to bad_page_threshold = -1.\n"); 566 565 return false; ··· 622 621 const u32 num) 623 622 { 624 623 struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control)); 624 + struct amdgpu_device *adev = to_amdgpu_device(control); 625 625 u32 a, b, i; 626 626 u8 *buf, *pp; 627 627 int res; ··· 725 723 control->ras_num_recs = 1 + (control->ras_max_record_count + b 726 724 - control->ras_fri) 727 725 % control->ras_max_record_count; 726 + 727 + if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA) 728 + control->ras_num_bad_pages = control->ras_num_recs; 729 + else 730 + control->ras_num_bad_pages = 731 + control->ras_num_recs * adev->umc.retire_unit; 728 732 Out: 729 733 kfree(buf); 730 734 return res; ··· 748 740 /* Modify the header if it exceeds. 
749 741 */ 750 742 if (amdgpu_bad_page_threshold != 0 && 751 - control->ras_num_recs >= ras->bad_page_cnt_threshold) { 743 + control->ras_num_bad_pages >= ras->bad_page_cnt_threshold) { 752 744 dev_warn(adev->dev, 753 745 "Saved bad pages %d reaches threshold value %d\n", 754 - control->ras_num_recs, ras->bad_page_cnt_threshold); 746 + control->ras_num_bad_pages, ras->bad_page_cnt_threshold); 755 747 control->tbl_hdr.header = RAS_TABLE_HDR_BAD; 756 748 if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) { 757 749 control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD; ··· 806 798 */ 807 799 if (amdgpu_bad_page_threshold != 0 && 808 800 control->tbl_hdr.version == RAS_TABLE_VER_V2_1 && 809 - control->ras_num_recs < ras->bad_page_cnt_threshold) 801 + control->ras_num_bad_pages < ras->bad_page_cnt_threshold) 810 802 control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold - 811 - control->ras_num_recs) * 100) / 803 + control->ras_num_bad_pages) * 100) / 812 804 ras->bad_page_cnt_threshold; 813 805 814 806 /* Recalc the checksum. 
··· 1410 1402 if (!__get_eeprom_i2c_addr(adev, control)) 1411 1403 return -EINVAL; 1412 1404 1405 + if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA) 1406 + control->ras_num_bad_pages = control->ras_num_recs; 1407 + else 1408 + control->ras_num_bad_pages = 1409 + control->ras_num_recs * adev->umc.retire_unit; 1410 + 1413 1411 if (hdr->header == RAS_TABLE_HDR_VAL) { 1414 1412 DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records", 1415 - control->ras_num_recs); 1413 + control->ras_num_bad_pages); 1416 1414 1417 1415 if (hdr->version == RAS_TABLE_VER_V2_1) { 1418 1416 res = __read_table_ras_info(control); ··· 1433 1419 1434 1420 /* Warn if we are at 90% of the threshold or above 1435 1421 */ 1436 - if (10 * control->ras_num_recs >= 9 * ras->bad_page_cnt_threshold) 1422 + if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold) 1437 1423 dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d", 1438 - control->ras_num_recs, 1424 + control->ras_num_bad_pages, 1439 1425 ras->bad_page_cnt_threshold); 1440 1426 } else if (hdr->header == RAS_TABLE_HDR_BAD && 1441 1427 amdgpu_bad_page_threshold != 0) { ··· 1451 1437 res); 1452 1438 return -EINVAL; 1453 1439 } 1454 - if (ras->bad_page_cnt_threshold > control->ras_num_recs) { 1440 + if (ras->bad_page_cnt_threshold > control->ras_num_bad_pages) { 1455 1441 /* This means that, the threshold was increased since 1456 1442 * the last time the system was booted, and now, 1457 1443 * ras->bad_page_cnt_threshold - control->num_recs > 0, ··· 1461 1447 dev_info(adev->dev, 1462 1448 "records:%d threshold:%d, resetting " 1463 1449 "RAS table header signature", 1464 - control->ras_num_recs, 1450 + control->ras_num_bad_pages, 1465 1451 ras->bad_page_cnt_threshold); 1466 1452 res = amdgpu_ras_eeprom_correct_header_tag(control, 1467 1453 RAS_TABLE_HDR_VAL); 1468 1454 } else { 1469 1455 dev_err(adev->dev, "RAS records:%d exceed threshold:%d", 1470 - control->ras_num_recs, ras->bad_page_cnt_threshold); 1456 + 
control->ras_num_bad_pages, ras->bad_page_cnt_threshold); 1471 1457 if (amdgpu_bad_page_threshold == -1) { 1472 1458 dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1."); 1473 1459 res = 0; ··· 1476 1462 dev_err(adev->dev, 1477 1463 "RAS records:%d exceed threshold:%d, " 1478 1464 "GPU will not be initialized. Replace this GPU or increase the threshold", 1479 - control->ras_num_recs, ras->bad_page_cnt_threshold); 1465 + control->ras_num_bad_pages, ras->bad_page_cnt_threshold); 1480 1466 } 1481 1467 } 1482 1468 } else {

drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h

··· 95 95 */ 96 96 u32 ras_num_recs; 97 97 98 + /* the bad page number is ras_num_recs or 99 + * ras_num_recs * umc.retire_unit 100 + */ 101 + u32 ras_num_bad_pages; 102 + 98 103 /* First record index to read, 0-based. 99 104 * Range is [0, num_recs-1]. This is 100 105 * an absolute index, starting right after

+2 -1

drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c

··· 169 169 err_data->err_addr_cnt, false); 170 170 amdgpu_ras_save_bad_pages(adev, &err_count); 171 171 172 - amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); 172 + amdgpu_dpm_send_hbm_bad_pages_num(adev, 173 + con->eeprom_control.ras_num_bad_pages); 173 174 174 175 if (con->update_channel_flag == true) { 175 176 amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);