drm/amdgpu: Implement unrecoverable error message handling for VFs

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

This notification may arrive in VF mailbox while polling for response from
another event.

This patch covers the following scenarios:

- If VF is already in RMA state, then do not attempt to contact the host.
Host will ignore the VF after sending the notification.

- If the notification is detected during polling, then set the RMA status
and return an error to the caller.

- If the notification arrives by interrupt, then set the RMA status and
queue a reset. This reset will fail and VF will stop runtime services.

Reviewed-by: Shravan Kumar Gande <Shravankumar.Gande@amd.com>
Signed-off-by: Victor Skvortsov <victor.skvortsov@amd.com>
Signed-off-by: Ellen Pan <yunru.pan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Ellen Pan and committed by

Alex Deucher 11 months ago 086809c8 6be34e1d

+52 -6

5 changed files

expand all

drivers

gpu

drm

amd

amdgpu

amdgpu_device.c

mxgpu_ai.c

mxgpu_ai.h

mxgpu_nv.c

mxgpu_nv.h

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

··· 6150 6150 /* Actual ASIC resets if needed.*/ 6151 6151 /* Host driver will handle XGMI hive reset for SRIOV */ 6152 6152 if (amdgpu_sriov_vf(adev)) { 6153 + 6154 + /* Bail out of reset early */ 6155 + if (amdgpu_ras_is_rma(adev)) 6156 + return -ENODEV; 6157 + 6153 6158 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6154 6159 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6155 6160 amdgpu_ras_set_fed(adev, true);

+14 -3

drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c

··· 324 324 struct amdgpu_iv_entry *entry) 325 325 { 326 326 enum idh_event event = xgpu_ai_mailbox_peek_msg(adev); 327 + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 327 328 328 329 switch (event) { 329 330 case IDH_RAS_BAD_PAGES_NOTIFICATION: ··· 332 331 if (amdgpu_sriov_runtime(adev)) 333 332 schedule_work(&adev->virt.bad_pages_work); 334 333 break; 334 + case IDH_UNRECOV_ERR_NOTIFICATION: 335 + xgpu_ai_mailbox_send_ack(adev); 336 + ras->is_rma = true; 337 + dev_err(adev->dev, "VF is in an unrecoverable state. Runtime Services are halted.\n"); 338 + if (amdgpu_sriov_runtime(adev)) 339 + WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, 340 + &adev->virt.flr_work), 341 + "Failed to queue work! at %s", 342 + __func__); 343 + break; 335 344 case IDH_FLR_NOTIFICATION: 336 345 if (amdgpu_sriov_runtime(adev)) 337 346 WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, 338 - &adev->virt.flr_work), 339 - "Failed to queue work! at %s", 340 - __func__); 347 + &adev->virt.flr_work), 348 + "Failed to queue work! at %s", 349 + __func__); 341 350 break; 342 351 case IDH_QUERY_ALIVE: 343 352 xgpu_ai_mailbox_send_ack(adev);

drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h

··· 57 57 IDH_RAS_ERROR_DETECTED, 58 58 IDH_RAS_BAD_PAGES_READY = 15, 59 59 IDH_RAS_BAD_PAGES_NOTIFICATION = 16, 60 + IDH_UNRECOV_ERR_NOTIFICATION = 17, 60 61 IDH_TEXT_MESSAGE = 255, 61 62 }; 62 63

+31 -3

drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c

··· 67 67 reg = RREG32_NO_KIQ(mmMAILBOX_MSGBUF_RCV_DW0); 68 68 if (reg == IDH_FAIL) 69 69 r = -EINVAL; 70 + if (reg == IDH_UNRECOV_ERR_NOTIFICATION) 71 + r = -ENODEV; 70 72 else if (reg != event) 71 73 return -ENOENT; 72 74 ··· 105 103 { 106 104 int r; 107 105 uint64_t timeout, now; 106 + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 108 107 109 108 now = (uint64_t)ktime_to_ms(ktime_get()); 110 109 timeout = now + NV_MAILBOX_POLL_MSG_TIMEDOUT; ··· 113 110 do { 114 111 r = xgpu_nv_mailbox_rcv_msg(adev, event); 115 112 if (!r) { 116 - dev_dbg(adev->dev, "rcv_msg 0x%x after %llu ms\n", event, NV_MAILBOX_POLL_MSG_TIMEDOUT - timeout + now); 113 + dev_dbg(adev->dev, "rcv_msg 0x%x after %llu ms\n", 114 + event, NV_MAILBOX_POLL_MSG_TIMEDOUT - timeout + now); 117 115 return 0; 116 + } else if (r == -ENODEV) { 117 + if (!amdgpu_ras_is_rma(adev)) { 118 + ras->is_rma = true; 119 + dev_err(adev->dev, "VF is in an unrecoverable state. " 120 + "Runtime Services are halted.\n"); 121 + } 122 + return r; 118 123 } 119 124 120 125 msleep(10); ··· 177 166 enum idh_event event = -1; 178 167 179 168 send_request: 169 + 170 + if (amdgpu_ras_is_rma(adev)) 171 + return -ENODEV; 172 + 180 173 xgpu_nv_mailbox_trans_msg(adev, req, data1, data2, data3); 181 174 182 175 switch (req) { ··· 338 323 { 339 324 struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); 340 325 struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); 326 + struct amdgpu_reset_context reset_context = { 0 }; 341 327 342 328 amdgpu_virt_fini_data_exchange(adev); 343 329 ··· 349 333 adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT || 350 334 adev->compute_timeout == MAX_SCHEDULE_TIMEOUT || 351 335 adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) { 352 - struct amdgpu_reset_context reset_context; 353 - memset(&reset_context, 0, sizeof(reset_context)); 354 336 355 337 reset_context.method = AMD_RESET_METHOD_NONE; 356 338 reset_context.reset_req_dev = adev; ··· 394 380 struct 
amdgpu_iv_entry *entry) 395 381 { 396 382 enum idh_event event = xgpu_nv_mailbox_peek_msg(adev); 383 + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 397 384 398 385 switch (event) { 399 386 case IDH_RAS_BAD_PAGES_NOTIFICATION: 400 387 xgpu_nv_mailbox_send_ack(adev); 401 388 if (amdgpu_sriov_runtime(adev)) 402 389 schedule_work(&adev->virt.bad_pages_work); 390 + break; 391 + case IDH_UNRECOV_ERR_NOTIFICATION: 392 + xgpu_nv_mailbox_send_ack(adev); 393 + if (!amdgpu_ras_is_rma(adev)) { 394 + ras->is_rma = true; 395 + dev_err(adev->dev, "VF is in an unrecoverable state. Runtime Services are halted.\n"); 396 + } 397 + 398 + if (amdgpu_sriov_runtime(adev)) 399 + WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, 400 + &adev->virt.flr_work), 401 + "Failed to queue work! at %s", 402 + __func__); 403 403 break; 404 404 case IDH_FLR_NOTIFICATION: 405 405 if (amdgpu_sriov_runtime(adev))

drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h

··· 61 61 IDH_RAS_CPER_DUMP_READY = 14, 62 62 IDH_RAS_BAD_PAGES_READY = 15, 63 63 IDH_RAS_BAD_PAGES_NOTIFICATION = 16, 64 + IDH_UNRECOV_ERR_NOTIFICATION = 17, 64 65 65 66 IDH_TEXT_MESSAGE = 255, 66 67 };