Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: process RAS fatal error MB notification

For the RAS error scenario, the VF guest driver will check the mailbox
and set the FED (fatal error detected) flag to avoid unnecessary HW accesses.
Additionally, poll for the reset completion message first
to avoid accidentally spamming multiple reset requests to the host.

v2: add another mailbox check to handle the case where KFD detects
the timeout first

v3: set host_flr bit and use wait_for_reset

Signed-off-by: Vignesh Chander <Vignesh.Chander@amd.com>
Reviewed-by: Zhigang Luo <Zhigang.Luo@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by Vignesh Chander
Committed by Alex Deucher
cbda2758 78146c1d

+55 -8
+8 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 5069 5069 struct amdgpu_hive_info *hive = NULL; 5070 5070 5071 5071 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5072 - amdgpu_virt_ready_to_reset(adev); 5072 + if (!amdgpu_ras_get_fed_status(adev)) 5073 + amdgpu_virt_ready_to_reset(adev); 5073 5074 amdgpu_virt_wait_reset(adev); 5074 5075 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5075 5076 r = amdgpu_virt_request_full_gpu(adev, true); ··· 5838 5837 /* Actual ASIC resets if needed.*/ 5839 5838 /* Host driver will handle XGMI hive reset for SRIOV */ 5840 5839 if (amdgpu_sriov_vf(adev)) { 5840 + if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 5841 + dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 5842 + amdgpu_ras_set_fed(adev, true); 5843 + set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5844 + } 5845 + 5841 5846 r = amdgpu_device_reset_sriov(adev, reset_context); 5842 5847 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 5843 5848 amdgpu_virt_release_full_gpu(adev, true);
+22 -3
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
··· 229 229 adev->virt.mm_table.gpu_addr = 0; 230 230 } 231 231 232 + /** 233 + * amdgpu_virt_rcvd_ras_interrupt() - receive ras interrupt 234 + * @adev: amdgpu device. 235 + * Check whether host sent RAS error message 236 + * Return: true if found, otherwise false 237 + */ 238 + bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev) 239 + { 240 + struct amdgpu_virt *virt = &adev->virt; 241 + 242 + if (!virt->ops || !virt->ops->rcvd_ras_intr) 243 + return false; 244 + 245 + return virt->ops->rcvd_ras_intr(adev); 246 + } 247 + 232 248 233 249 unsigned int amd_sriov_msg_checksum(void *obj, 234 250 unsigned long obj_size, ··· 628 612 ret = amdgpu_virt_read_pf2vf_data(adev); 629 613 if (ret) { 630 614 adev->virt.vf2pf_update_retry_cnt++; 631 - if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) && 632 - amdgpu_sriov_runtime(adev)) { 615 + 616 + if ((amdgpu_virt_rcvd_ras_interrupt(adev) || 617 + adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) && 618 + amdgpu_sriov_runtime(adev)) { 619 + 633 620 amdgpu_ras_set_fed(adev, true); 634 621 if (amdgpu_reset_domain_schedule(adev->reset_domain, 635 - &adev->kfd.reset_work)) 622 + &adev->kfd.reset_work)) 636 623 return; 637 624 else 638 625 dev_err(adev->dev, "Failed to queue work! at %s", __func__);
+3 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
··· 52 52 /* tonga/fiji use this offset */ 53 53 #define mmBIF_IOV_FUNC_IDENTIFIER 0x1503 54 54 55 - #define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 5 55 + #define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 2 56 56 57 57 enum amdgpu_sriov_vf_mode { 58 58 SRIOV_VF_MODE_BARE_METAL = 0, ··· 94 94 u32 data1, u32 data2, u32 data3); 95 95 void (*ras_poison_handler)(struct amdgpu_device *adev, 96 96 enum amdgpu_ras_block block); 97 + bool (*rcvd_ras_intr)(struct amdgpu_device *adev); 97 98 }; 98 99 99 100 /* ··· 353 352 int amdgpu_virt_wait_reset(struct amdgpu_device *adev); 354 353 int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev); 355 354 void amdgpu_virt_free_mm_table(struct amdgpu_device *adev); 355 + bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev); 356 356 void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev); 357 357 void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev); 358 358 void amdgpu_virt_exchange_data(struct amdgpu_device *adev);
+8
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
··· 408 408 xgpu_ai_send_access_requests(adev, IDH_RAS_POISON); 409 409 } 410 410 411 + static bool xgpu_ai_rcvd_ras_intr(struct amdgpu_device *adev) 412 + { 413 + enum idh_event msg = xgpu_ai_mailbox_peek_msg(adev); 414 + 415 + return (msg == IDH_RAS_ERROR_DETECTED || msg == 0xFFFFFFFF); 416 + } 417 + 411 418 const struct amdgpu_virt_ops xgpu_ai_virt_ops = { 412 419 .req_full_gpu = xgpu_ai_request_full_gpu_access, 413 420 .rel_full_gpu = xgpu_ai_release_full_gpu_access, ··· 424 417 .trans_msg = xgpu_ai_mailbox_trans_msg, 425 418 .req_init_data = xgpu_ai_request_init_data, 426 419 .ras_poison_handler = xgpu_ai_ras_poison_handler, 420 + .rcvd_ras_intr = xgpu_ai_rcvd_ras_intr, 427 421 };
+3 -1
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
··· 51 51 IDH_FAIL, 52 52 IDH_QUERY_ALIVE, 53 53 IDH_REQ_GPU_INIT_DATA_READY, 54 - 54 + IDH_RAS_POISON_READY, 55 + IDH_PF_SOFT_FLR_NOTIFICATION, 56 + IDH_RAS_ERROR_DETECTED, 55 57 IDH_TEXT_MESSAGE = 255, 56 58 }; 57 59
+8
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
··· 449 449 } 450 450 } 451 451 452 + static bool xgpu_nv_rcvd_ras_intr(struct amdgpu_device *adev) 453 + { 454 + enum idh_event msg = xgpu_nv_mailbox_peek_msg(adev); 455 + 456 + return (msg == IDH_RAS_ERROR_DETECTED || msg == 0xFFFFFFFF); 457 + } 458 + 452 459 const struct amdgpu_virt_ops xgpu_nv_virt_ops = { 453 460 .req_full_gpu = xgpu_nv_request_full_gpu_access, 454 461 .rel_full_gpu = xgpu_nv_release_full_gpu_access, ··· 465 458 .wait_reset = xgpu_nv_wait_reset, 466 459 .trans_msg = xgpu_nv_mailbox_trans_msg, 467 460 .ras_poison_handler = xgpu_nv_ras_poison_handler, 461 + .rcvd_ras_intr = xgpu_nv_rcvd_ras_intr, 468 462 };
+3 -2
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
··· 26 26 27 27 #define NV_MAILBOX_POLL_ACK_TIMEDOUT 500 28 28 #define NV_MAILBOX_POLL_MSG_TIMEDOUT 6000 29 - #define NV_MAILBOX_POLL_FLR_TIMEDOUT 5000 29 + #define NV_MAILBOX_POLL_FLR_TIMEDOUT 10000 30 30 #define NV_MAILBOX_POLL_MSG_REP_MAX 11 31 31 32 32 enum idh_request { ··· 52 52 IDH_QUERY_ALIVE, 53 53 IDH_REQ_GPU_INIT_DATA_READY, 54 54 IDH_RAS_POISON_READY, 55 - 55 + IDH_PF_SOFT_FLR_NOTIFICATION, 56 + IDH_RAS_ERROR_DETECTED, 56 57 IDH_TEXT_MESSAGE = 255, 57 58 }; 58 59