Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: Revert commit ce316fa55ef0f1751276b846a54fb3b835bd5e64.

This revert is in preparation for synchronizing XGMI resets using a task barrier.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Le Ma <Le.Ma@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by Andrey Grodzovsky and committed by Alex Deucher
041a62bc f06a58db

+10 -65
-2
drivers/gpu/drm/amd/amdgpu/amdgpu.h
··· 994 994 995 995 bool pm_sysfs_en; 996 996 bool ucode_sysfs_en; 997 - 998 - bool in_baco; 999 997 }; 1000 998 1001 999 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
+10 -63
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 3797 3797 return r; 3798 3798 } 3799 3799 3800 - static int amdgpu_do_asic_reset(struct amdgpu_device *adev, 3801 - struct amdgpu_hive_info *hive, 3800 + static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 3802 3801 struct list_head *device_list_handle, 3803 3802 bool *need_full_reset_arg) 3804 3803 { 3805 3804 struct amdgpu_device *tmp_adev = NULL; 3806 3805 bool need_full_reset = *need_full_reset_arg, vram_lost = false; 3807 3806 int r = 0; 3808 - int cpu = smp_processor_id(); 3809 - bool use_baco = 3810 - (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ? 3811 - true : false; 3812 3807 3813 3808 /* 3814 3809 * ASIC reset has to be done on all HGMI hive nodes ASAP ··· 3811 3816 */ 3812 3817 if (need_full_reset) { 3813 3818 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 3814 - /* 3815 - * For XGMI run all resets in parallel to speed up the 3816 - * process by scheduling the highpri wq on different 3817 - * cpus. For XGMI with baco reset, all nodes must enter 3818 - * baco within close proximity before anyone exit. 
3819 - */ 3819 + /* For XGMI run all resets in parallel to speed up the process */ 3820 3820 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 3821 - if (!queue_work_on(cpu, system_highpri_wq, 3822 - &tmp_adev->xgmi_reset_work)) 3821 + if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) 3823 3822 r = -EALREADY; 3824 - cpu = cpumask_next(cpu, cpu_online_mask); 3825 3823 } else 3826 3824 r = amdgpu_asic_reset(tmp_adev); 3827 - if (r) 3825 + 3826 + if (r) { 3827 + DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", 3828 + r, tmp_adev->ddev->unique); 3828 3829 break; 3830 + } 3829 3831 } 3830 3832 3831 - /* For XGMI wait for all work to complete before proceed */ 3833 + /* For XGMI wait for all resets to complete before proceed */ 3832 3834 if (!r) { 3833 3835 list_for_each_entry(tmp_adev, device_list_handle, 3834 3836 gmc.xgmi.head) { ··· 3834 3842 r = tmp_adev->asic_reset_res; 3835 3843 if (r) 3836 3844 break; 3837 - if (use_baco) 3838 - tmp_adev->in_baco = true; 3839 3845 } 3840 3846 } 3841 - } 3842 - 3843 - /* 3844 - * For XGMI with baco reset, need exit baco phase by scheduling 3845 - * xgmi_reset_work one more time. PSP reset and sGPU skips this 3846 - * phase. Not assume the situation that PSP reset and baco reset 3847 - * coexist within an XGMI hive. 
3848 - */ 3849 - 3850 - if (!r && use_baco) { 3851 - cpu = smp_processor_id(); 3852 - list_for_each_entry(tmp_adev, device_list_handle, 3853 - gmc.xgmi.head) { 3854 - if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 3855 - if (!queue_work_on(cpu, 3856 - system_highpri_wq, 3857 - &tmp_adev->xgmi_reset_work)) 3858 - r = -EALREADY; 3859 - if (r) 3860 - break; 3861 - cpu = cpumask_next(cpu, cpu_online_mask); 3862 - } 3863 - } 3864 - } 3865 - 3866 - if (!r && use_baco) { 3867 - list_for_each_entry(tmp_adev, device_list_handle, 3868 - gmc.xgmi.head) { 3869 - if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 3870 - flush_work(&tmp_adev->xgmi_reset_work); 3871 - r = tmp_adev->asic_reset_res; 3872 - if (r) 3873 - break; 3874 - tmp_adev->in_baco = false; 3875 - } 3876 - } 3877 - } 3878 - 3879 - if (r) { 3880 - DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", 3881 - r, tmp_adev->ddev->unique); 3882 - goto end; 3883 3847 } 3884 3848 } 3885 3849 ··· 4130 4182 if (r) 4131 4183 adev->asic_reset_res = r; 4132 4184 } else { 4133 - r = amdgpu_do_asic_reset(adev, hive, device_list_handle, 4134 - &need_full_reset); 4185 + r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); 4135 4186 if (r && r == -EAGAIN) 4136 4187 goto retry; 4137 4188 }