Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: Implement virt req_ras_err_count

Enable RAS late init if VF RAS Telemetry is supported.

When enabled, the VF can use this interface to query total
RAS error counts from the host.

The VF FB access may abruptly end due to a fatal error,
therefore the VF must cache and sanitize the input.

The Host allows 15 Telemetry messages every 60 seconds, after which
the host will ignore any more incoming telemetry messages. The VF will
rate limit its message calls to once every 5 seconds (12 times in 60 seconds).
While the VF is rate limited, it will continue to report the last
good cached data.

v2: Flip generate report & update statistics order for VF

Signed-off-by: Victor Skvortsov <victor.skvortsov@amd.com>
Acked-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Zhigang Luo <zhigang.luo@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Victor Skvortsov and committed by
Alex Deucher
84a2947e 907fec2d

+229 -7
+6
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 4236 4236 * for throttling interrupt) = 60 seconds. 4237 4237 */ 4238 4238 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4239 + ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1); 4240 + 4239 4241 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4242 + ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE); 4240 4243 4241 4244 /* Registers mapping */ 4242 4245 /* TODO: block userspace mapping of io register */ ··· 5189 5186 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5190 5187 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5191 5188 amdgpu_ras_resume(adev); 5189 + 5190 + amdgpu_virt_ras_telemetry_post_reset(adev); 5191 + 5192 5192 return 0; 5193 5193 } 5194 5194
+3
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
··· 904 904 if (r) 905 905 return r; 906 906 907 + if (amdgpu_sriov_vf(adev)) 908 + return r; 909 + 907 910 if (adev->gfx.cp_ecc_error_irq.funcs) { 908 911 r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0); 909 912 if (r)
+65 -7
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
··· 1214 1214 } 1215 1215 } 1216 1216 1217 + static void amdgpu_ras_virt_error_generate_report(struct amdgpu_device *adev, 1218 + struct ras_query_if *query_if, 1219 + struct ras_err_data *err_data, 1220 + struct ras_query_context *qctx) 1221 + { 1222 + unsigned long new_ue, new_ce, new_de; 1223 + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &query_if->head); 1224 + const char *blk_name = get_ras_block_str(&query_if->head); 1225 + u64 event_id = qctx->evid.event_id; 1226 + 1227 + new_ce = err_data->ce_count - obj->err_data.ce_count; 1228 + new_ue = err_data->ue_count - obj->err_data.ue_count; 1229 + new_de = err_data->de_count - obj->err_data.de_count; 1230 + 1231 + if (new_ce) { 1232 + RAS_EVENT_LOG(adev, event_id, "%lu correctable hardware errors " 1233 + "detected in %s block\n", 1234 + new_ce, 1235 + blk_name); 1236 + } 1237 + 1238 + if (new_ue) { 1239 + RAS_EVENT_LOG(adev, event_id, "%lu uncorrectable hardware errors " 1240 + "detected in %s block\n", 1241 + new_ue, 1242 + blk_name); 1243 + } 1244 + 1245 + if (new_de) { 1246 + RAS_EVENT_LOG(adev, event_id, "%lu deferred hardware errors " 1247 + "detected in %s block\n", 1248 + new_de, 1249 + blk_name); 1250 + } 1251 + } 1252 + 1217 1253 static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data) 1218 1254 { 1219 1255 struct ras_err_node *err_node; ··· 1271 1235 obj->err_data.ce_count += err_data->ce_count; 1272 1236 obj->err_data.de_count += err_data->de_count; 1273 1237 } 1238 + } 1239 + 1240 + static void amdgpu_ras_mgr_virt_error_data_statistics_update(struct ras_manager *obj, 1241 + struct ras_err_data *err_data) 1242 + { 1243 + /* Host reports absolute counts */ 1244 + obj->err_data.ue_count = err_data->ue_count; 1245 + obj->err_data.ce_count = err_data->ce_count; 1246 + obj->err_data.de_count = err_data->de_count; 1274 1247 } 1275 1248 1276 1249 static struct ras_manager *get_ras_manager(struct amdgpu_device *adev, enum amdgpu_ras_block blk) ··· 
1368 1323 if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY) 1369 1324 return -EINVAL; 1370 1325 1371 - if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) { 1326 + if (error_query_mode == AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY) { 1327 + return amdgpu_virt_req_ras_err_count(adev, blk, err_data); 1328 + } else if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) { 1372 1329 if (info->head.block == AMDGPU_RAS_BLOCK__UMC) { 1373 1330 amdgpu_ras_get_ecc_info(adev, err_data); 1374 1331 } else { ··· 1452 1405 if (ret) 1453 1406 goto out_fini_err_data; 1454 1407 1455 - amdgpu_rasmgr_error_data_statistic_update(obj, &err_data); 1408 + if (error_query_mode != AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY) { 1409 + amdgpu_rasmgr_error_data_statistic_update(obj, &err_data); 1410 + amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx); 1411 + } else { 1412 + /* Host provides absolute error counts. First generate the report 1413 + * using the previous VF internal count against new host count. 1414 + * Then Update VF internal count. 
1415 + */ 1416 + amdgpu_ras_virt_error_generate_report(adev, info, &err_data, &qctx); 1417 + amdgpu_ras_mgr_virt_error_data_statistics_update(obj, &err_data); 1418 + } 1456 1419 1457 1420 info->ue_count = obj->err_data.ue_count; 1458 1421 info->ce_count = obj->err_data.ce_count; 1459 1422 info->de_count = obj->err_data.de_count; 1460 - 1461 - amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx); 1462 1423 1463 1424 out_fini_err_data: 1464 1425 amdgpu_ras_error_data_fini(&err_data); ··· 3985 3930 } 3986 3931 3987 3932 /* Guest side doesn't need init ras feature */ 3988 - if (amdgpu_sriov_vf(adev)) 3933 + if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_telemetry_en(adev)) 3989 3934 return 0; 3990 3935 3991 3936 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { ··· 4452 4397 return false; 4453 4398 } 4454 4399 4455 - if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode)) 4400 + if (amdgpu_sriov_vf(adev)) { 4401 + *error_query_mode = AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY; 4402 + } else if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode)) { 4456 4403 *error_query_mode = 4457 4404 (con->is_aca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY; 4458 - else 4405 + } else { 4459 4406 *error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY; 4407 + } 4460 4408 4461 4409 return true; 4462 4410 }
+1
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
··· 365 365 AMDGPU_RAS_INVALID_ERROR_QUERY = 0, 366 366 AMDGPU_RAS_DIRECT_ERROR_QUERY = 1, 367 367 AMDGPU_RAS_FIRMWARE_ERROR_QUERY = 2, 368 + AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY = 3, 368 369 }; 369 370 370 371 /* ras error status reisger fields */
+3
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
··· 318 318 if (r) 319 319 return r; 320 320 321 + if (amdgpu_sriov_vf(adev)) 322 + return r; 323 + 321 324 if (amdgpu_ras_is_supported(adev, ras_block->block)) { 322 325 r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0); 323 326 if (r)
+136
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
··· 524 524 adev->unique_id = 525 525 ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid; 526 526 adev->virt.ras_en_caps.all = ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->ras_en_caps.all; 527 + adev->virt.ras_telemetry_en_caps.all = 528 + ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->ras_telemetry_en_caps.all; 527 529 break; 528 530 default: 529 531 dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", pf2vf_info->version); ··· 706 704 adev->virt.fw_reserve.p_vf2pf = 707 705 (struct amd_sriov_msg_vf2pf_info_header *) 708 706 (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB << 10)); 707 + adev->virt.fw_reserve.ras_telemetry = 708 + (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB << 10)); 709 709 } else if (adev->mman.drv_vram_usage_va) { 710 710 adev->virt.fw_reserve.p_pf2vf = 711 711 (struct amd_sriov_msg_pf2vf_info_header *) ··· 715 711 adev->virt.fw_reserve.p_vf2pf = 716 712 (struct amd_sriov_msg_vf2pf_info_header *) 717 713 (adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB << 10)); 714 + adev->virt.fw_reserve.ras_telemetry = 715 + (adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB << 10)); 718 716 } 719 717 720 718 amdgpu_virt_read_pf2vf_data(adev); ··· 1202 1196 con->poison_supported = true; /* Poison is handled by host */ 1203 1197 1204 1198 return true; 1199 + } 1200 + 1201 + static inline enum amd_sriov_ras_telemetry_gpu_block 1202 + amdgpu_ras_block_to_sriov(struct amdgpu_device *adev, enum amdgpu_ras_block block) { 1203 + switch (block) { 1204 + case AMDGPU_RAS_BLOCK__UMC: 1205 + return RAS_TELEMETRY_GPU_BLOCK_UMC; 1206 + case AMDGPU_RAS_BLOCK__SDMA: 1207 + return RAS_TELEMETRY_GPU_BLOCK_SDMA; 1208 + case AMDGPU_RAS_BLOCK__GFX: 1209 + return RAS_TELEMETRY_GPU_BLOCK_GFX; 1210 + case AMDGPU_RAS_BLOCK__MMHUB: 1211 + return RAS_TELEMETRY_GPU_BLOCK_MMHUB; 1212 + case AMDGPU_RAS_BLOCK__ATHUB: 1213 + return RAS_TELEMETRY_GPU_BLOCK_ATHUB; 1214 + case AMDGPU_RAS_BLOCK__PCIE_BIF: 1215 + 
return RAS_TELEMETRY_GPU_BLOCK_PCIE_BIF; 1216 + case AMDGPU_RAS_BLOCK__HDP: 1217 + return RAS_TELEMETRY_GPU_BLOCK_HDP; 1218 + case AMDGPU_RAS_BLOCK__XGMI_WAFL: 1219 + return RAS_TELEMETRY_GPU_BLOCK_XGMI_WAFL; 1220 + case AMDGPU_RAS_BLOCK__DF: 1221 + return RAS_TELEMETRY_GPU_BLOCK_DF; 1222 + case AMDGPU_RAS_BLOCK__SMN: 1223 + return RAS_TELEMETRY_GPU_BLOCK_SMN; 1224 + case AMDGPU_RAS_BLOCK__SEM: 1225 + return RAS_TELEMETRY_GPU_BLOCK_SEM; 1226 + case AMDGPU_RAS_BLOCK__MP0: 1227 + return RAS_TELEMETRY_GPU_BLOCK_MP0; 1228 + case AMDGPU_RAS_BLOCK__MP1: 1229 + return RAS_TELEMETRY_GPU_BLOCK_MP1; 1230 + case AMDGPU_RAS_BLOCK__FUSE: 1231 + return RAS_TELEMETRY_GPU_BLOCK_FUSE; 1232 + case AMDGPU_RAS_BLOCK__MCA: 1233 + return RAS_TELEMETRY_GPU_BLOCK_MCA; 1234 + case AMDGPU_RAS_BLOCK__VCN: 1235 + return RAS_TELEMETRY_GPU_BLOCK_VCN; 1236 + case AMDGPU_RAS_BLOCK__JPEG: 1237 + return RAS_TELEMETRY_GPU_BLOCK_JPEG; 1238 + case AMDGPU_RAS_BLOCK__IH: 1239 + return RAS_TELEMETRY_GPU_BLOCK_IH; 1240 + case AMDGPU_RAS_BLOCK__MPIO: 1241 + return RAS_TELEMETRY_GPU_BLOCK_MPIO; 1242 + default: 1243 + dev_err(adev->dev, "Unsupported SRIOV RAS telemetry block 0x%x\n", block); 1244 + return RAS_TELEMETRY_GPU_BLOCK_COUNT; 1245 + } 1246 + } 1247 + 1248 + static int amdgpu_virt_cache_host_error_counts(struct amdgpu_device *adev, 1249 + struct amdsriov_ras_telemetry *host_telemetry) 1250 + { 1251 + struct amd_sriov_ras_telemetry_error_count *tmp = NULL; 1252 + uint32_t checksum, used_size; 1253 + 1254 + checksum = host_telemetry->header.checksum; 1255 + used_size = host_telemetry->header.used_size; 1256 + 1257 + if (used_size > (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB << 10)) 1258 + return 0; 1259 + 1260 + tmp = kmalloc(used_size, GFP_KERNEL); 1261 + if (!tmp) 1262 + return -ENOMEM; 1263 + 1264 + memcpy(tmp, &host_telemetry->body.error_count, used_size); 1265 + 1266 + if (checksum != amd_sriov_msg_checksum(tmp, used_size, 0, 0)) 1267 + goto out; 1268 + 1269 + memcpy(&adev->virt.count_cache, tmp, 1270 + 
min(used_size, sizeof(adev->virt.count_cache))); 1271 + out: 1272 + kfree(tmp); 1273 + 1274 + return 0; 1275 + } 1276 + 1277 + static int amdgpu_virt_req_ras_err_count_internal(struct amdgpu_device *adev, bool force_update) 1278 + { 1279 + struct amdgpu_virt *virt = &adev->virt; 1280 + 1281 + /* Host allows 15 ras telemetry requests per 60 seconds. Afterwhich, the Host 1282 + * will ignore incoming guest messages. Ratelimit the guest messages to 1283 + * prevent guest self DOS. 1284 + */ 1285 + if (__ratelimit(&adev->virt.ras_telemetry_rs) || force_update) { 1286 + if (!virt->ops->req_ras_err_count(adev)) 1287 + amdgpu_virt_cache_host_error_counts(adev, 1288 + adev->virt.fw_reserve.ras_telemetry); 1289 + } 1290 + 1291 + return 0; 1292 + } 1293 + 1294 + /* Bypass ACA interface and query ECC counts directly from host */ 1295 + int amdgpu_virt_req_ras_err_count(struct amdgpu_device *adev, enum amdgpu_ras_block block, 1296 + struct ras_err_data *err_data) 1297 + { 1298 + enum amd_sriov_ras_telemetry_gpu_block sriov_block; 1299 + 1300 + sriov_block = amdgpu_ras_block_to_sriov(adev, block); 1301 + 1302 + if (sriov_block >= RAS_TELEMETRY_GPU_BLOCK_COUNT || 1303 + !amdgpu_sriov_ras_telemetry_block_en(adev, sriov_block)) 1304 + return -EOPNOTSUPP; 1305 + 1306 + /* Host Access may be lost during reset, just return last cached data. 
*/ 1307 + if (down_read_trylock(&adev->reset_domain->sem)) { 1308 + amdgpu_virt_req_ras_err_count_internal(adev, false); 1309 + up_read(&adev->reset_domain->sem); 1310 + } 1311 + 1312 + err_data->ue_count = adev->virt.count_cache.block[sriov_block].ue_count; 1313 + err_data->ce_count = adev->virt.count_cache.block[sriov_block].ce_count; 1314 + err_data->de_count = adev->virt.count_cache.block[sriov_block].de_count; 1315 + 1316 + return 0; 1317 + } 1318 + 1319 + int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev) 1320 + { 1321 + unsigned long ue_count, ce_count; 1322 + 1323 + if (amdgpu_sriov_ras_telemetry_en(adev)) { 1324 + amdgpu_virt_req_ras_err_count_internal(adev, true); 1325 + amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL); 1326 + } 1327 + 1328 + return 0; 1205 1329 }
+15
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
··· 104 104 struct amdgpu_virt_fw_reserve { 105 105 struct amd_sriov_msg_pf2vf_info_header *p_pf2vf; 106 106 struct amd_sriov_msg_vf2pf_info_header *p_vf2pf; 107 + void *ras_telemetry; 107 108 unsigned int checksum_key; 108 109 }; 109 110 ··· 139 138 /* MES info */ 140 139 AMDGIM_FEATURE_MES_INFO_ENABLE = (1 << 8), 141 140 AMDGIM_FEATURE_RAS_CAPS = (1 << 9), 141 + AMDGIM_FEATURE_RAS_TELEMETRY = (1 << 10), 142 142 }; 143 143 144 144 enum AMDGIM_REG_ACCESS_FLAG { ··· 282 280 struct mutex rlcg_reg_lock; 283 281 284 282 union amd_sriov_ras_caps ras_en_caps; 283 + union amd_sriov_ras_caps ras_telemetry_en_caps; 284 + 285 + struct ratelimit_state ras_telemetry_rs; 286 + struct amd_sriov_ras_telemetry_error_count count_cache; 285 287 }; 286 288 287 289 struct amdgpu_video_codec_info; ··· 332 326 333 327 #define amdgpu_sriov_ras_caps_en(adev) \ 334 328 ((adev)->virt.gim_feature & AMDGIM_FEATURE_RAS_CAPS) 329 + 330 + #define amdgpu_sriov_ras_telemetry_en(adev) \ 331 + (((adev)->virt.gim_feature & AMDGIM_FEATURE_RAS_TELEMETRY) && (adev)->virt.fw_reserve.ras_telemetry) 332 + 333 + #define amdgpu_sriov_ras_telemetry_block_en(adev, sriov_blk) \ 334 + (amdgpu_sriov_ras_telemetry_en((adev)) && (adev)->virt.ras_telemetry_en_caps.all & BIT(sriov_blk)) 335 335 336 336 static inline bool is_virtual_machine(void) 337 337 { ··· 403 391 bool write, u32 *rlcg_flag); 404 392 u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 flag, u32 xcc_id); 405 393 bool amdgpu_virt_get_ras_capability(struct amdgpu_device *adev); 394 + int amdgpu_virt_req_ras_err_count(struct amdgpu_device *adev, enum amdgpu_ras_block block, 395 + struct ras_err_data *err_data); 396 + int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev); 406 397 #endif