Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: only harvest gcea/mmea error status in arcturus

An SDP RdRspStatus/WrRspStatus error, or the first parity error on
RdRsp data, can cause a system fatal error in arcturus.
The GPU will be frozen in such a case.

The driver needs to harvest this error information before
resetting the GPU. Check the error type to avoid harvesting normal
gcea/mmea information.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Stanley Yang <Stanley.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Hawking Zhang and committed by
Alex Deucher
53ee6609 9406d39b

+34 -6
+11 -5
drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
··· 994 994 return ret; 995 995 } 996 996 997 - static const struct soc15_reg_entry gfx_v9_4_rdrsp_status_regs = 997 + static const struct soc15_reg_entry gfx_v9_4_ea_err_status_regs = 998 998 { SOC15_REG_ENTRY(GC, 0, mmGCEA_ERR_STATUS), 0, 1, 32 }; 999 999 1000 1000 static void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev) ··· 1007 1007 1008 1008 mutex_lock(&adev->grbm_idx_mutex); 1009 1009 1010 - for (i = 0; i < gfx_v9_4_rdrsp_status_regs.se_num; i++) { 1011 - for (j = 0; j < gfx_v9_4_rdrsp_status_regs.instance; 1010 + for (i = 0; i < gfx_v9_4_ea_err_status_regs.se_num; i++) { 1011 + for (j = 0; j < gfx_v9_4_ea_err_status_regs.instance; 1012 1012 j++) { 1013 1013 gfx_v9_4_select_se_sh(adev, i, 0, j); 1014 1014 reg_value = RREG32(SOC15_REG_ENTRY_OFFSET( 1015 - gfx_v9_4_rdrsp_status_regs)); 1016 - if (reg_value) 1015 + gfx_v9_4_ea_err_status_regs)); 1016 + if (REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_STATUS) || 1017 + REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_WRRSP_STATUS) || 1018 + REG_GET_FIELD(reg_value, GCEA_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) { 1019 + /* SDP read/write error/parity error in FUE_IS_FATAL mode 1020 + * can cause system fatal error in arcturas. Harvest the error 1021 + * status before GPU reset */ 1017 1022 dev_warn(adev->dev, "GCEA err detected at instance: %d, status: 0x%x!\n", 1018 1023 j, reg_value); 1024 + } 1019 1025 } 1020 1026 } 1021 1027
+7 -1
drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
··· 1645 1645 for (i = 0; i < ARRAY_SIZE(mmhub_v9_4_err_status_regs); i++) { 1646 1646 reg_value = 1647 1647 RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v9_4_err_status_regs[i])); 1648 - if (reg_value) 1648 + if (REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_RDRSP_STATUS) || 1649 + REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_WRRSP_STATUS) || 1650 + REG_GET_FIELD(reg_value, MMEA0_ERR_STATUS, SDP_RDRSP_DATAPARITY_ERROR)) { 1651 + /* SDP read/write error/parity error in FUE_IS_FATAL mode 1652 + * can cause system fatal error in arcturas. Harvest the error 1653 + * status before GPU reset */ 1649 1654 dev_warn(adev->dev, "MMHUB EA err detected at instance: %d, status: 0x%x!\n", 1650 1655 i, reg_value); 1656 + } 1651 1657 } 1652 1658 } 1653 1659
+16
drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_sh_mask.h
··· 617 617 #define GCEA_EDC_CNT3__MAM_A3MEM_SEC_COUNT_MASK 0x30000000L 618 618 #define GCEA_EDC_CNT3__MAM_A3MEM_DED_COUNT_MASK 0xC0000000L 619 619 620 + //GCEA_ERR_STATUS 621 + #define GCEA_ERR_STATUS__SDP_RDRSP_STATUS__SHIFT 0x0 622 + #define GCEA_ERR_STATUS__SDP_WRRSP_STATUS__SHIFT 0x4 623 + #define GCEA_ERR_STATUS__SDP_RDRSP_DATASTATUS__SHIFT 0x8 624 + #define GCEA_ERR_STATUS__SDP_RDRSP_DATAPARITY_ERROR__SHIFT 0xa 625 + #define GCEA_ERR_STATUS__CLEAR_ERROR_STATUS__SHIFT 0xb 626 + #define GCEA_ERR_STATUS__BUSY_ON_ERROR__SHIFT 0xc 627 + #define GCEA_ERR_STATUS__FUE_FLAG__SHIFT 0xd 628 + #define GCEA_ERR_STATUS__SDP_RDRSP_STATUS_MASK 0x0000000FL 629 + #define GCEA_ERR_STATUS__SDP_WRRSP_STATUS_MASK 0x000000F0L 630 + #define GCEA_ERR_STATUS__SDP_RDRSP_DATASTATUS_MASK 0x00000300L 631 + #define GCEA_ERR_STATUS__SDP_RDRSP_DATAPARITY_ERROR_MASK 0x00000400L 632 + #define GCEA_ERR_STATUS__CLEAR_ERROR_STATUS_MASK 0x00000800L 633 + #define GCEA_ERR_STATUS__BUSY_ON_ERROR_MASK 0x00001000L 634 + #define GCEA_ERR_STATUS__FUE_FLAG_MASK 0x00002000L 635 + 620 636 // addressBlock: gc_gfxudec 621 637 //GRBM_GFX_INDEX 622 638 #define GRBM_GFX_INDEX__INSTANCE_INDEX__SHIFT 0x0