Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

drm/amdgpu: parse legacy RAS bad page mixed with new data in various NPS modes

All legacy RAS bad pages are generated in NPS1 mode, but new bad
pages can be generated in any NPS mode, so the retired_page stored
on eeprom can't be used directly in non-NPS1 mode, even for legacy
data. Different data needs different handling; new data can be
distinguished from old data by the UMC_CHANNEL_IDX_V2 flag.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
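
For orientation, a minimal, self-contained sketch of the record classification the message describes. The flag test and the two conversion paths follow the commit; the bit position of UMC_CHANNEL_IDX_V2 and every identifier named example_* or record_* are illustrative assumptions, not the kernel's definitions (the real flag lives in amdgpu_umc.h).

    /* Standalone sketch, not kernel code. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define UMC_CHANNEL_IDX_V2 (1ULL << 47) /* assumed bit position, illustration only */

    struct example_record {
            uint64_t retired_page; /* page address, possibly tagged with the V2 flag */
    };

    /* Records tagged with UMC_CHANNEL_IDX_V2 carry full channel-index data
     * and can be converted in any NPS mode; untagged records are legacy
     * NPS1-era data and need the MCA-address fallback path.
     */
    static bool record_is_new_format(const struct example_record *rec)
    {
            return (rec->retired_page & UMC_CHANNEL_IDX_V2) != 0;
    }

    int main(void)
    {
            const struct example_record recs[] = {
                    { .retired_page = 0x1000ULL | UMC_CHANNEL_IDX_V2 },
                    { .retired_page = 0x2000ULL },
            };

            for (size_t i = 0; i < sizeof(recs) / sizeof(recs[0]); i++)
                    printf("record %zu: %s\n", i,
                           record_is_new_format(&recs[i]) ?
                           "new data, convert per NPS mode" :
                           "legacy data, NPS1 origin, convert via MCA address");
            return 0;
    }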

Authored by Tao Zhou, committed by Alex Deucher
a8d133e6 0859eb54

+84 -18

+81 -15 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
···
 	if (amdgpu_bad_page_threshold != 0) {
 		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
-			err_data.err_addr_cnt);
+			err_data.err_addr_cnt, false);
 		amdgpu_ras_save_bad_pages(adev, NULL);
 	}
···
 	return 0;
 }
 
-static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
+static int amdgpu_ras_mca2pa_by_idx(struct amdgpu_device *adev,
 			struct eeprom_table_record *bps,
 			struct ras_err_data *err_data)
 {
···
 	return ret;
 }
 
+static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
+			struct eeprom_table_record *bps,
+			struct ras_err_data *err_data)
+{
+	struct ta_ras_query_address_input addr_in;
+	uint32_t die_id, socket = 0;
+
+	if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
+		socket = adev->smuio.funcs->get_socket_id(adev);
+
+	/* although die id is gotten from PA in nps1 mode, the id is
+	 * fitable for any nps mode
+	 */
+	if (adev->umc.ras && adev->umc.ras->get_die_id_from_pa)
+		die_id = adev->umc.ras->get_die_id_from_pa(adev, bps->address,
+				bps->retired_page << AMDGPU_GPU_PAGE_SHIFT);
+	else
+		return -EINVAL;
+
+	/* reinit err_data */
+	err_data->err_addr_cnt = 0;
+	err_data->err_addr_len = adev->umc.retire_unit;
+
+	memset(&addr_in, 0, sizeof(addr_in));
+	addr_in.ma.err_addr = bps->address;
+	addr_in.ma.ch_inst = bps->mem_channel;
+	addr_in.ma.umc_inst = bps->mcumc_id;
+	addr_in.ma.node_inst = die_id;
+	addr_in.ma.socket_id = socket;
+
+	if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
+		return adev->umc.ras->convert_ras_err_addr(adev, err_data,
+				&addr_in, NULL, false);
+	else
+		return -EINVAL;
+}
+
 /* it deal with vram only.
  */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-		struct eeprom_table_record *bps, int pages)
+		struct eeprom_table_record *bps, int pages, bool from_rom)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
···
 		is_mca_add = false;
 	}
 
-	mutex_lock(&con->recovery_lock);
-	data = con->eh_data;
-	if (!data)
-		goto out;
-
-	if (is_mca_add) {
+	if (from_rom) {
 		err_data.err_addr =
 			kcalloc(adev->umc.retire_unit,
 				sizeof(struct eeprom_table_record), GFP_KERNEL);
···
 			goto out;
 		}
 
+		err_rec = err_data.err_addr;
 		loop_cnt = adev->umc.retire_unit;
 		if (adev->gmc.gmc_funcs->query_mem_partition_mode)
 			nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
 	}
 
+	mutex_lock(&con->recovery_lock);
+	data = con->eh_data;
+	if (!data)
+		goto free;
+
 	for (i = 0; i < pages; i++) {
 		if (is_mca_add) {
 			if (!find_pages_per_pa) {
-				if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) {
+				if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
 					if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
 						/* may use old RAS TA, use PA to find pages in
 						 * one row
···
 					bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
 					goto free;
 			}
-
-			err_rec = err_data.err_addr;
 		} else {
-			err_rec = &bps[i];
+			if (from_rom && !find_pages_per_pa) {
+				if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
+					/* bad page in any NPS mode in eeprom */
+					if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data))
+						goto free;
+				} else {
+					/* legacy bad page in eeprom, generated only in
+					 * NPS1 mode
+					 */
+					if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) {
+						/* old RAS TA or ASICs which don't support to
+						 * convert addrss via mca address
+						 */
+						if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
+							find_pages_per_pa = true;
+							err_rec = &bps[i];
+							loop_cnt = 1;
+						} else {
+							/* non-nps1 mode, old RAS TA
+							 * can't support it
+							 */
+							goto free;
+						}
+					}
+				}
+
+				if (!find_pages_per_pa)
+					i += (adev->umc.retire_unit - 1);
+			} else {
+				err_rec = &bps[i];
+			}
 		}
 
 		for (j = 0; j < loop_cnt; j++) {
···
 	}
 
 free:
-	if (is_mca_add)
+	if (from_rom)
 		kfree(err_data.err_addr);
 out:
 	mutex_unlock(&con->recovery_lock);
···
 		control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA;
 	}
 
-	ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs);
+	ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true);
 }
 
 kfree(bps);
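
The `i += (adev->umc.retire_unit - 1)` step above is easy to misread, so here is a toy standalone sketch of that bookkeeping. The values of pages and retire_unit are invented for illustration; in the kernel the latter is adev->umc.retire_unit.

    #include <stdio.h>

    /* Toy model of the from_rom loop: one MCA->PA conversion covers a
     * whole retire unit of consecutive eeprom records, so after handling
     * record i the loop skips the retire_unit - 1 records it covered.
     */
    int main(void)
    {
            const int pages = 8;       /* invented record count */
            const int retire_unit = 4; /* invented retire-unit size */
            int conversions = 0;

            for (int i = 0; i < pages; i++) {
                    conversions++;        /* one conversion per retire unit */
                    i += retire_unit - 1; /* skip records already covered */
            }

            printf("%d conversions for %d records\n", conversions, pages); /* 2 for 8 */
            return 0;
    }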
+1 -1 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
···
 
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-		struct eeprom_table_record *bps, int pages);
+		struct eeprom_table_record *bps, int pages, bool from_rom);
 
 int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
 		unsigned long *new_cnt);
+2 -2 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
···
 
 	if (amdgpu_bad_page_threshold != 0) {
 		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
-			err_data.err_addr_cnt);
+			err_data.err_addr_cnt, false);
 		amdgpu_ras_save_bad_pages(adev, NULL);
 	}
···
 	if ((amdgpu_bad_page_threshold != 0) &&
 	    err_data->err_addr_cnt) {
 		amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
-			err_data->err_addr_cnt);
+			err_data->err_addr_cnt, false);
 		amdgpu_ras_save_bad_pages(adev, &err_count);
 
 		amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
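
Taken together, the call sites suggest a simple contract for the new parameter, restated below with invented names (this is not the kernel API): records replayed from eeprom may mix legacy and new layouts and need per-record classification, so that path passes true; records the driver just generated are already in the current NPS mode, so runtime paths pass false.

    #include <stdbool.h>
    #include <stdio.h>

    /* Invented stand-in restating the from_rom contract visible in the
     * call sites of amdgpu_ras_add_bad_pages().
     */
    static const char *add_bad_pages_mode(bool from_rom)
    {
            return from_rom ? "eeprom replay: classify and convert each record"
                            : "runtime retirement: records already match current NPS mode";
    }

    int main(void)
    {
            printf("%s\n", add_bad_pages_mode(true));  /* boot-time eeprom load */
            printf("%s\n", add_bad_pages_mode(false)); /* fresh error-query paths */
            return 0;
    }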