Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: Prefer RAS recovery for scheduler hang

Before scheduling a recovery due to scheduler/job hang, check if a RAS
error is detected. If so, choose RAS recovery to handle the situation. A
scheduler/job hang could be the side effect of a RAS error. In such
cases, it is required to go through the RAS error recovery process. A
RAS error recovery process in certain cases could also avoid a full
device reset.

An error state is maintained in RAS context to detect the block
affected. Fatal error state uses an otherwise unused block id. Set the block id when
error is detected. If the interrupt handler detected a poison error,
it's not required to look for a fatal error. Skip fatal error checking
in such cases.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Lijo Lazar and committed by
Alex Deucher
e1ee2111 0eecff79

+78 -7
+2
drivers/gpu/drm/amd/amdgpu/aldebaran.c
··· 334 334 AMDGPU_INIT_LEVEL_RESET_RECOVERY); 335 335 dev_info(tmp_adev->dev, 336 336 "GPU reset succeeded, trying to resume\n"); 337 + /*TBD: Ideally should clear only GFX, SDMA blocks*/ 338 + amdgpu_ras_clear_err_state(tmp_adev); 337 339 r = aldebaran_mode2_restore_ip(tmp_adev); 338 340 if (r) 339 341 goto end;
+13 -2
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 5181 5181 if (r) 5182 5182 return r; 5183 5183 5184 - amdgpu_ras_set_fed(adev, false); 5184 + amdgpu_ras_clear_err_state(adev); 5185 5185 amdgpu_irq_gpu_reset_resume_helper(adev); 5186 5186 5187 5187 /* some sw clean up VF needs to do before recover */ ··· 5484 5484 amdgpu_set_init_level(tmp_adev, init_level); 5485 5485 if (full_reset) { 5486 5486 /* post card */ 5487 - amdgpu_ras_set_fed(tmp_adev, false); 5487 + amdgpu_ras_clear_err_state(tmp_adev); 5488 5488 r = amdgpu_device_asic_init(tmp_adev); 5489 5489 if (r) { 5490 5490 dev_warn(tmp_adev->dev, "asic atom init failed!"); ··· 5817 5817 bool audio_suspended = false; 5818 5818 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5819 5819 5820 + /* 5821 + * If it reaches here because of hang/timeout and a RAS error is 5822 + * detected at the same time, let RAS recovery take care of it. 5823 + */ 5824 + if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 5825 + reset_context->src != AMDGPU_RESET_SRC_RAS) { 5826 + dev_dbg(adev->dev, 5827 + "Gpu recovery from source: %d yielding to RAS error recovery handling", 5828 + reset_context->src); 5829 + return 0; 5830 + } 5820 5831 /* 5821 5832 * Special case: RAS triggered and full reset isn't supported 5822 5833 */
+53 -2
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
··· 2156 2156 /* Fatal error events are handled on host side */ 2157 2157 if (amdgpu_sriov_vf(adev)) 2158 2158 return; 2159 + /** 2160 + * If the current interrupt is caused by a non-fatal RAS error, skip 2161 + * check for fatal error. For fatal errors, FED status of all devices 2162 + * in XGMI hive gets set when the first device gets fatal error 2163 + * interrupt. The error gets propagated to other devices as well, so 2164 + * make sure to ack the interrupt regardless of FED status. 2165 + */ 2166 + if (!amdgpu_ras_get_fed_status(adev) && 2167 + amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY)) 2168 + return; 2159 2169 2160 2170 if (adev->nbio.ras && 2161 2171 adev->nbio.ras->handle_ras_controller_intr_no_bifring) ··· 2195 2185 if (ret) 2196 2186 return; 2197 2187 2188 + amdgpu_ras_set_err_poison(adev, block_obj->ras_comm.block); 2198 2189 /* both query_poison_status and handle_poison_consumption are optional, 2199 2190 * but at least one of them should be implemented if we need poison 2200 2191 * consumption handler ··· 4183 4172 if (!ras) 4184 4173 return false; 4185 4174 4186 - return atomic_read(&ras->fed); 4175 + return test_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); 4187 4176 } 4188 4177 4189 4178 void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status) ··· 4191 4180 struct amdgpu_ras *ras; 4192 4181 4193 4182 ras = amdgpu_ras_get_context(adev); 4183 + if (ras) { 4184 + if (status) 4185 + set_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); 4186 + else 4187 + clear_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); 4188 + } 4189 + } 4190 + 4191 + void amdgpu_ras_clear_err_state(struct amdgpu_device *adev) 4192 + { 4193 + struct amdgpu_ras *ras; 4194 + 4195 + ras = amdgpu_ras_get_context(adev); 4194 4196 if (ras) 4195 - atomic_set(&ras->fed, !!status); 4197 + ras->ras_err_state = 0; 4198 + } 4199 + 4200 + void amdgpu_ras_set_err_poison(struct amdgpu_device *adev, 4201 + enum amdgpu_ras_block block) 4202 + { 4203 + struct amdgpu_ras *ras; 
4204 + 4205 + ras = amdgpu_ras_get_context(adev); 4206 + if (ras) 4207 + set_bit(block, &ras->ras_err_state); 4208 + } 4209 + 4210 + bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block) 4211 + { 4212 + struct amdgpu_ras *ras; 4213 + 4214 + ras = amdgpu_ras_get_context(adev); 4215 + if (ras) { 4216 + if (block == AMDGPU_RAS_BLOCK__ANY) 4217 + return (ras->ras_err_state != 0); 4218 + else 4219 + return test_bit(block, &ras->ras_err_state) || 4220 + test_bit(AMDGPU_RAS_BLOCK__LAST, 4221 + &ras->ras_err_state); 4222 + } 4223 + 4224 + return false; 4196 4225 } 4197 4226 4198 4227 static struct ras_event_manager *__get_ras_event_mgr(struct amdgpu_device *adev)
+8 -3
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
··· 99 99 AMDGPU_RAS_BLOCK__IH, 100 100 AMDGPU_RAS_BLOCK__MPIO, 101 101 102 - AMDGPU_RAS_BLOCK__LAST 102 + AMDGPU_RAS_BLOCK__LAST, 103 + AMDGPU_RAS_BLOCK__ANY = -1 103 104 }; 104 105 105 106 enum amdgpu_ras_mca_block { ··· 559 558 struct ras_ecc_log_info umc_ecc_log; 560 559 struct delayed_work page_retirement_dwork; 561 560 562 - /* Fatal error detected flag */ 563 - atomic_t fed; 561 + /* ras errors detected */ 562 + unsigned long ras_err_state; 564 563 565 564 /* RAS event manager */ 566 565 struct ras_event_manager __event_mgr; ··· 953 952 954 953 void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status); 955 954 bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev); 955 + void amdgpu_ras_set_err_poison(struct amdgpu_device *adev, 956 + enum amdgpu_ras_block block); 957 + void amdgpu_ras_clear_err_state(struct amdgpu_device *adev); 958 + bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block); 956 959 957 960 u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type); 958 961 int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type,
+2
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
··· 184 184 } else { 185 185 reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; 186 186 } 187 + amdgpu_ras_set_err_poison(dev->adev, AMDGPU_RAS_BLOCK__GFX); 187 188 break; 188 189 case SOC15_IH_CLIENTID_VMC: 189 190 case SOC15_IH_CLIENTID_VMC1: ··· 214 213 } else { 215 214 reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; 216 215 } 216 + amdgpu_ras_set_err_poison(dev->adev, AMDGPU_RAS_BLOCK__SDMA); 217 217 break; 218 218 default: 219 219 dev_warn(dev->adev->dev,