Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: do RAS MCA2PA conversion in device init phase

NPS mode is introduced; the value of the memory physical address (PA)
related to an MCA address varies per NPS mode. We need to rely on the
MCA address and convert it into a PA according to the NPS mode.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Tao Zhou and committed by
Alex Deucher
0eecff79 772df3df

+83 -13
+83 -13
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
··· 2717 2717 return 0; 2718 2718 } 2719 2719 2720 + static int amdgpu_ras_mca2pa(struct amdgpu_device *adev, 2721 + struct eeprom_table_record *bps, 2722 + struct ras_err_data *err_data) 2723 + { 2724 + struct ta_ras_query_address_input addr_in; 2725 + uint32_t socket = 0; 2726 + int ret = 0; 2727 + 2728 + if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) 2729 + socket = adev->smuio.funcs->get_socket_id(adev); 2730 + 2731 + /* reinit err_data */ 2732 + err_data->err_addr_cnt = 0; 2733 + err_data->err_addr_len = adev->umc.retire_unit; 2734 + 2735 + memset(&addr_in, 0, sizeof(addr_in)); 2736 + addr_in.ma.err_addr = bps->address; 2737 + addr_in.ma.socket_id = socket; 2738 + addr_in.ma.ch_inst = bps->mem_channel; 2739 + /* tell RAS TA the node instance is not used */ 2740 + addr_in.ma.node_inst = TA_RAS_INV_NODE; 2741 + 2742 + if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) 2743 + ret = adev->umc.ras->convert_ras_err_addr(adev, err_data, 2744 + &addr_in, NULL, false); 2745 + 2746 + return ret; 2747 + } 2748 + 2720 2749 /* it deal with vram only. 
*/ 2721 2750 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, 2722 2751 struct eeprom_table_record *bps, int pages) 2723 2752 { 2724 2753 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 2725 2754 struct ras_err_handler_data *data; 2755 + struct ras_err_data err_data; 2756 + struct eeprom_table_record *err_rec; 2726 2757 int ret = 0; 2727 - uint32_t i; 2758 + uint32_t i, j, loop_cnt = 1; 2759 + bool is_mca_add = true; 2728 2760 2729 2761 if (!con || !con->eh_data || !bps || pages <= 0) 2730 2762 return 0; 2763 + 2764 + if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) { 2765 + is_mca_add = false; 2766 + } else { 2767 + if ((pages > 1) && 2768 + (bps[0].address == bps[1].address) && 2769 + (bps[0].mem_channel == bps[1].mem_channel)) 2770 + is_mca_add = false; 2771 + } 2731 2772 2732 2773 mutex_lock(&con->recovery_lock); 2733 2774 data = con->eh_data; 2734 2775 if (!data) 2735 2776 goto out; 2736 2777 2737 - for (i = 0; i < pages; i++) { 2738 - if (amdgpu_ras_check_bad_page_unlock(con, 2739 - bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) 2740 - continue; 2741 - 2742 - if (!data->space_left && 2743 - amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { 2778 + if (is_mca_add) { 2779 + err_data.err_addr = 2780 + kcalloc(adev->umc.retire_unit, 2781 + sizeof(struct eeprom_table_record), GFP_KERNEL); 2782 + if (!err_data.err_addr) { 2783 + dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n"); 2744 2784 ret = -ENOMEM; 2745 2785 goto out; 2746 2786 } 2747 2787 2748 - amdgpu_ras_reserve_page(adev, bps[i].retired_page); 2749 - 2750 - memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps)); 2751 - data->count++; 2752 - data->space_left--; 2788 + loop_cnt = adev->umc.retire_unit; 2753 2789 } 2790 + 2791 + for (i = 0; i < pages; i++) { 2792 + if (is_mca_add) { 2793 + if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) 2794 + goto free; 2795 + 2796 + err_rec = err_data.err_addr; 2797 + } else { 2798 + err_rec = &bps[i]; 
2799 + } 2800 + 2801 + for (j = 0; j < loop_cnt; j++) { 2802 + if (amdgpu_ras_check_bad_page_unlock(con, 2803 + err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) 2804 + continue; 2805 + 2806 + if (!data->space_left && 2807 + amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { 2808 + ret = -ENOMEM; 2809 + goto free; 2810 + } 2811 + 2812 + amdgpu_ras_reserve_page(adev, err_rec[j].retired_page); 2813 + 2814 + memcpy(&data->bps[data->count], &(err_rec[j]), 2815 + sizeof(struct eeprom_table_record)); 2816 + data->count++; 2817 + data->space_left--; 2818 + } 2819 + } 2820 + 2821 + free: 2822 + if (is_mca_add) 2823 + kfree(err_data.err_addr); 2754 2824 out: 2755 2825 mutex_unlock(&con->recovery_lock); 2756 2826