Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: message smu to update bad channel info

The driver should notify the SMU to update the bad channel info when an
uncorrectable error is detected in the UMC block.

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Stanley.Yang and committed by
Alex Deucher
69691c82 d510eccf

+42 -2
+7
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
··· 2068 2068 mutex_init(&con->recovery_lock); 2069 2069 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); 2070 2070 atomic_set(&con->in_recovery, 0); 2071 + con->eeprom_control.bad_channel_bitmap = 0; 2071 2072 2072 2073 max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(); 2073 2074 amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); ··· 2093 2092 goto free; 2094 2093 2095 2094 amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); 2095 + 2096 + if (con->update_channel_flag == true) { 2097 + amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); 2098 + con->update_channel_flag = false; 2099 + } 2096 2100 } 2097 2101 2098 2102 #ifdef CONFIG_X86_MCE_AMD ··· 2291 2285 goto release_con; 2292 2286 } 2293 2287 2288 + con->update_channel_flag = false; 2294 2289 con->features = 0; 2295 2290 INIT_LIST_HEAD(&con->head); 2296 2291 /* Might need get this flag from vbios. */
+3
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
··· 374 374 375 375 /* record umc error info queried from smu */ 376 376 struct umc_ecc_info umc_ecc; 377 + 378 + /* Indicates smu whether need update bad channel info */ 379 + bool update_channel_flag; 377 380 }; 378 381 379 382 struct ras_fs_data {
+23 -2
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
··· 267 267 { 268 268 struct amdgpu_device *adev = to_amdgpu_device(control); 269 269 struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; 270 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 270 271 u8 csum; 271 272 int res; 272 273 ··· 287 286 control->ras_fri = 0; 288 287 289 288 amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs); 289 + 290 + control->bad_channel_bitmap = 0; 291 + amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap); 292 + con->update_channel_flag = false; 290 293 291 294 amdgpu_ras_debugfs_set_ret_size(control); 292 295 ··· 425 420 struct eeprom_table_record *record, 426 421 const u32 num) 427 422 { 423 + struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control)); 428 424 u32 a, b, i; 429 425 u8 *buf, *pp; 430 426 int res; ··· 437 431 /* Encode all of them in one go. 438 432 */ 439 433 pp = buf; 440 - for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) 434 + for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) { 441 435 __encode_table_record_to_buf(control, &record[i], pp); 436 + 437 + /* update bad channel bitmap */ 438 + if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { 439 + control->bad_channel_bitmap |= 1 << record[i].mem_channel; 440 + con->update_channel_flag = true; 441 + } 442 + } 442 443 443 444 /* a, first record index to write into. 444 445 * b, last record index to write into. ··· 699 686 const u32 num) 700 687 { 701 688 struct amdgpu_device *adev = to_amdgpu_device(control); 689 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 702 690 int i, res; 703 691 u8 *buf, *pp; 704 692 u32 g0, g1; ··· 767 753 /* Read up everything? Then transform. 
768 754 */ 769 755 pp = buf; 770 - for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) 756 + for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) { 771 757 __decode_table_record_from_buf(control, &record[i], pp); 758 + 759 + /* update bad channel bitmap */ 760 + if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { 761 + control->bad_channel_bitmap |= 1 << record[i].mem_channel; 762 + con->update_channel_flag = true; 763 + } 764 + } 772 765 Out: 773 766 kfree(buf); 774 767 mutex_unlock(&control->ras_tbl_mutex);
+4
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
··· 80 80 /* Protect table access via this mutex. 81 81 */ 82 82 struct mutex ras_tbl_mutex; 83 + 84 + /* Record channel info which occurred bad pages 85 + */ 86 + u32 bad_channel_bitmap; 83 87 }; 84 88 85 89 /*
+5
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
··· 97 97 amdgpu_ras_save_bad_pages(adev); 98 98 99 99 amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); 100 + 101 + if (con->update_channel_flag == true) { 102 + amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); 103 + con->update_channel_flag = false; 104 + } 100 105 } 101 106 102 107 if (reset)