Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: Enable mode-1 reset for RAS recovery in fatal error mode

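This patch enables mode-1 reset for RAS recovery in fatal error mode (i.e. when RAS poison mode is not supported). In amdgpu_device.c the soft-reset check is skipped in that mode, and in amdgpu_ras.c AMDGPU_NEED_FULL_RESET is set (instead of cleared) in the reset context before amdgpu_device_gpu_recover() is called.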

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

YiPeng Chai and committed by
Alex Deucher
1a11a65d 64a3dbb0

+10 -1
+4
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 4593 4593 if (amdgpu_gpu_recovery == 0) 4594 4594 goto disabled; 4595 4595 4596 + /* Skip soft reset check in fatal error mode */ 4597 + if (!amdgpu_ras_is_poison_mode_supported(adev)) 4598 + return true; 4599 + 4596 4600 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4597 4601 dev_info(adev->dev,"Timeout, but no hardware hang detected.\n"); 4598 4602 return false;
+6 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
··· 1948 1948 1949 1949 reset_context.method = AMD_RESET_METHOD_NONE; 1950 1950 reset_context.reset_req_dev = adev; 1951 - clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 1951 + 1952 + /* Perform full reset in fatal error mode */ 1953 + if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) 1954 + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 1955 + else 1956 + clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 1952 1957 1953 1958 amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); 1954 1959 }