Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: Add support for RAS XGMI err query

Update XGMI RAS to support error query on aldebaran

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

John Clements and committed by
Alex Deucher
3c4ff2dc 1ec06c2d

+65
+65
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
··· 32 32 #include "wafl/wafl2_4_0_0_smn.h" 33 33 #include "wafl/wafl2_4_0_0_sh_mask.h" 34 34 35 + #define smnPCS_XGMI23_PCS_ERROR_STATUS 0x11a01210 36 + #define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c 37 + #define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210 38 + 35 39 static DEFINE_MUTEX(xgmi_mutex); 36 40 37 41 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4 ··· 65 61 static const int wafl_pcs_err_status_reg_arct[] = { 66 62 smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, 67 63 smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000, 64 + }; 65 + 66 + static const int xgmi23_pcs_err_status_reg_aldebaran[] = { 67 + smnPCS_XGMI23_PCS_ERROR_STATUS, 68 + smnPCS_XGMI23_PCS_ERROR_STATUS + 0x100000, 69 + smnPCS_XGMI23_PCS_ERROR_STATUS + 0x200000, 70 + smnPCS_XGMI23_PCS_ERROR_STATUS + 0x300000, 71 + smnPCS_XGMI23_PCS_ERROR_STATUS + 0x400000, 72 + smnPCS_XGMI23_PCS_ERROR_STATUS + 0x500000, 73 + smnPCS_XGMI23_PCS_ERROR_STATUS + 0x600000, 74 + smnPCS_XGMI23_PCS_ERROR_STATUS + 0x700000 75 + }; 76 + 77 + static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = { 78 + smnPCS_XGMI3X16_PCS_ERROR_STATUS, 79 + smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000, 80 + smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000, 81 + smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000, 82 + smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000, 83 + smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000, 84 + smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000, 85 + smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000 86 + }; 87 + 88 + static const int walf_pcs_err_status_reg_aldebaran[] = { 89 + smnPCS_GOPX1_PCS_ERROR_STATUS, 90 + smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000 68 91 }; 69 92 70 93 static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = { ··· 802 771 pcs_clear_status(adev, 803 772 xgmi_pcs_err_status_reg_vg20[i]); 804 773 break; 774 + case CHIP_ALDEBARAN: 775 + for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) 776 + pcs_clear_status(adev, 777 + xgmi23_pcs_err_status_reg_aldebaran[i]); 778 + for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) 779 + pcs_clear_status(adev, 780 + xgmi23_pcs_err_status_reg_aldebaran[i]); 781 + for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) 782 + pcs_clear_status(adev, 783 + walf_pcs_err_status_reg_aldebaran[i]); 784 + break; 805 785 default: 806 786 break; 807 787 } ··· 900 858 /* check wafl pcs error */ 901 859 for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) { 902 860 data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]); 861 + if (data) 862 + amdgpu_xgmi_query_pcs_error_status(adev, 863 + data, &ue_cnt, &ce_cnt, false); 864 + } 865 + break; 866 + case CHIP_ALDEBARAN: 867 + /* check xgmi23 pcs error */ 868 + for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) { 869 + data = RREG32_PCIE(xgmi23_pcs_err_status_reg_aldebaran[i]); 870 + if (data) 871 + amdgpu_xgmi_query_pcs_error_status(adev, 872 + data, &ue_cnt, &ce_cnt, true); 873 + } 874 + /* check xgmi3x16 pcs error */ 875 + for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) { 876 + data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]); 877 + if (data) 878 + amdgpu_xgmi_query_pcs_error_status(adev, 879 + data, &ue_cnt, &ce_cnt, true); 880 + } 881 + /* check wafl pcs error */ 882 + for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) { 883 + data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]); 903 884 if (data) 904 885 amdgpu_xgmi_query_pcs_error_status(adev, 905 886 data, &ue_cnt, &ce_cnt, false);