Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: Introduce VF critical region check for RAS poison injection

The SRIOV guest sends a request to the host to check whether the poison
injection address is in a VF critical region or not via mailbox.

Signed-off-by: Xiang Liu <xiang.liu@amd.com>
Reviewed-by: Shravan Kumar Gande <Shravankumar.Gande@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Xiang Liu and committed by
Alex Deucher
f1fdeb3d 18f769ff

+79
+55
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
··· 828 828 { 829 829 ratelimit_state_init(&adev->virt.ras.ras_error_cnt_rs, 5 * HZ, 1); 830 830 ratelimit_state_init(&adev->virt.ras.ras_cper_dump_rs, 5 * HZ, 1); 831 + ratelimit_state_init(&adev->virt.ras.ras_chk_criti_rs, 5 * HZ, 1); 831 832 832 833 ratelimit_set_flags(&adev->virt.ras.ras_error_cnt_rs, 833 834 RATELIMIT_MSG_ON_RELEASE); 834 835 ratelimit_set_flags(&adev->virt.ras.ras_cper_dump_rs, 836 + RATELIMIT_MSG_ON_RELEASE); 837 + ratelimit_set_flags(&adev->virt.ras.ras_chk_criti_rs, 835 838 RATELIMIT_MSG_ON_RELEASE); 836 839 837 840 mutex_init(&adev->virt.ras.ras_telemetry_mutex); ··· 1503 1500 1504 1501 if (virt->ops && virt->ops->req_bad_pages) 1505 1502 virt->ops->req_bad_pages(adev); 1503 + } 1504 + 1505 + static int amdgpu_virt_cache_chk_criti_hit(struct amdgpu_device *adev, 1506 + struct amdsriov_ras_telemetry *host_telemetry, 1507 + bool *hit) 1508 + { 1509 + struct amd_sriov_ras_chk_criti *tmp = NULL; 1510 + uint32_t checksum, used_size; 1511 + 1512 + checksum = host_telemetry->header.checksum; 1513 + used_size = host_telemetry->header.used_size; 1514 + 1515 + if (used_size > (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB << 10)) 1516 + return 0; 1517 + 1518 + tmp = kmemdup(&host_telemetry->body.chk_criti, used_size, GFP_KERNEL); 1519 + if (!tmp) 1520 + return -ENOMEM; 1521 + 1522 + if (checksum != amd_sriov_msg_checksum(tmp, used_size, 0, 0)) 1523 + goto out; 1524 + 1525 + if (hit) 1526 + *hit = tmp->hit ? true : false; 1527 + 1528 + out: 1529 + kfree(tmp); 1530 + 1531 + return 0; 1532 + } 1533 + 1534 + int amdgpu_virt_check_vf_critical_region(struct amdgpu_device *adev, u64 addr, bool *hit) 1535 + { 1536 + struct amdgpu_virt *virt = &adev->virt; 1537 + int r = -EPERM; 1538 + 1539 + if (!virt->ops || !virt->ops->req_ras_chk_criti) 1540 + return -EOPNOTSUPP; 1541 + 1542 + /* Host allows 15 ras telemetry requests per 60 seconds. Afterwhich, the Host 1543 + * will ignore incoming guest messages. Ratelimit the guest messages to 1544 + * prevent guest self DOS. 
1545 + */ 1546 + if (__ratelimit(&virt->ras.ras_chk_criti_rs)) { 1547 + mutex_lock(&virt->ras.ras_telemetry_mutex); 1548 + if (!virt->ops->req_ras_chk_criti(adev, addr)) 1549 + r = amdgpu_virt_cache_chk_criti_hit( 1550 + adev, virt->fw_reserve.ras_telemetry, hit); 1551 + mutex_unlock(&virt->ras.ras_telemetry_mutex); 1552 + } 1553 + 1554 + return r; 1506 1555 }
+3
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
··· 98 98 int (*req_ras_err_count)(struct amdgpu_device *adev); 99 99 int (*req_ras_cper_dump)(struct amdgpu_device *adev, u64 vf_rptr); 100 100 int (*req_bad_pages)(struct amdgpu_device *adev); 101 + int (*req_ras_chk_criti)(struct amdgpu_device *adev, u64 addr); 101 102 }; 102 103 103 104 /* ··· 253 252 struct amdgpu_virt_ras { 254 253 struct ratelimit_state ras_error_cnt_rs; 255 254 struct ratelimit_state ras_cper_dump_rs; 255 + struct ratelimit_state ras_chk_criti_rs; 256 256 struct mutex ras_telemetry_mutex; 257 257 uint64_t cper_rptr; 258 258 }; ··· 455 453 bool amdgpu_virt_ras_telemetry_block_en(struct amdgpu_device *adev, 456 454 enum amdgpu_ras_block block); 457 455 void amdgpu_virt_request_bad_pages(struct amdgpu_device *adev); 456 + int amdgpu_virt_check_vf_critical_region(struct amdgpu_device *adev, u64 addr, bool *hit); 458 457 #endif
+5
drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
··· 405 405 uint32_t buf[]; 406 406 }; 407 407 408 + struct amd_sriov_ras_chk_criti { 409 + uint32_t hit; 410 + }; 411 + 408 412 struct amdsriov_ras_telemetry { 409 413 struct amd_sriov_ras_telemetry_header header; 410 414 411 415 union { 412 416 struct amd_sriov_ras_telemetry_error_count error_count; 413 417 struct amd_sriov_ras_cper_dump cper_dump; 418 + struct amd_sriov_ras_chk_criti chk_criti; 414 419 } body; 415 420 }; 416 421
+14
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
··· 202 202 case IDH_REQ_RAS_CPER_DUMP: 203 203 event = IDH_RAS_CPER_DUMP_READY; 204 204 break; 205 + case IDH_REQ_RAS_CHK_CRITI: 206 + event = IDH_REQ_RAS_CHK_CRITI_READY; 207 + break; 205 208 default: 206 209 break; 207 210 } ··· 559 556 return xgpu_nv_send_access_requests(adev, IDH_REQ_RAS_BAD_PAGES); 560 557 } 561 558 559 + static int xgpu_nv_check_vf_critical_region(struct amdgpu_device *adev, u64 addr) 560 + { 561 + uint32_t addr_hi, addr_lo; 562 + 563 + addr_hi = (uint32_t)(addr >> 32); 564 + addr_lo = (uint32_t)(addr & 0xFFFFFFFF); 565 + return xgpu_nv_send_access_requests_with_param( 566 + adev, IDH_REQ_RAS_CHK_CRITI, addr_hi, addr_lo, 0); 567 + } 568 + 562 569 const struct amdgpu_virt_ops xgpu_nv_virt_ops = { 563 570 .req_full_gpu = xgpu_nv_request_full_gpu_access, 564 571 .rel_full_gpu = xgpu_nv_release_full_gpu_access, ··· 582 569 .req_ras_err_count = xgpu_nv_req_ras_err_count, 583 570 .req_ras_cper_dump = xgpu_nv_req_ras_cper_dump, 584 571 .req_bad_pages = xgpu_nv_req_ras_bad_pages, 572 + .req_ras_chk_criti = xgpu_nv_check_vf_critical_region 585 573 };
+2
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
··· 43 43 IDH_REQ_RAS_ERROR_COUNT = 203, 44 44 IDH_REQ_RAS_CPER_DUMP = 204, 45 45 IDH_REQ_RAS_BAD_PAGES = 205, 46 + IDH_REQ_RAS_CHK_CRITI = 206 46 47 }; 47 48 48 49 enum idh_event { ··· 63 62 IDH_RAS_BAD_PAGES_READY = 15, 64 63 IDH_RAS_BAD_PAGES_NOTIFICATION = 16, 65 64 IDH_UNRECOV_ERR_NOTIFICATION = 17, 65 + IDH_REQ_RAS_CHK_CRITI_READY = 18, 66 66 67 67 IDH_TEXT_MESSAGE = 255, 68 68 };