Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: add amdgpu_ras.c to support ras (v2)

add obj management.
add feature control.
add debugfs infrastructure.
add sysfs infrastructure.
add IH infrastructure.
add recovery infrastructure.

It is a framework. Other IPs need to call the amdgpu_ras_xxx functions instead of
the psp_ras_xxx functions.

v2: squash in warning fixes

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

xinhui pan and committed by
Alex Deucher
c030f2e4 ea114213

+1475 -1
+1 -1
drivers/gpu/drm/amd/amdgpu/Makefile
··· 53 53 amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \ 54 54 amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \ 55 55 amdgpu_vf_error.o amdgpu_sched.o amdgpu_debugfs.o amdgpu_ids.o \ 56 - amdgpu_gmc.o amdgpu_xgmi.o amdgpu_csa.o 56 + amdgpu_gmc.o amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o 57 57 58 58 # add asic specific block 59 59 amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \
+9
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 60 60 #include "amdgpu_pm.h" 61 61 62 62 #include "amdgpu_xgmi.h" 63 + #include "amdgpu_ras.h" 63 64 64 65 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 65 66 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); ··· 1639 1638 { 1640 1639 int i, r; 1641 1640 1641 + r = amdgpu_ras_init(adev); 1642 + if (r) 1643 + return r; 1644 + 1642 1645 for (i = 0; i < adev->num_ip_blocks; i++) { 1643 1646 if (!adev->ip_blocks[i].status.valid) 1644 1647 continue; ··· 1881 1876 { 1882 1877 int i, r; 1883 1878 1879 + amdgpu_ras_pre_fini(adev); 1880 + 1884 1881 if (adev->gmc.xgmi.num_physical_nodes > 1) 1885 1882 amdgpu_xgmi_remove_device(adev); 1886 1883 ··· 1951 1944 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 1952 1945 adev->ip_blocks[i].status.late_initialized = false; 1953 1946 } 1947 + 1948 + amdgpu_ras_fini(adev); 1954 1949 1955 1950 if (amdgpu_sriov_vf(adev)) 1956 1951 if (amdgpu_virt_release_full_gpu(adev, false))
+1
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
··· 110 110 struct amdgpu_bo *ras_shared_bo; 111 111 uint64_t ras_shared_mc_addr; 112 112 void *ras_shared_buf; 113 + struct amdgpu_ras *ras; 113 114 }; 114 115 115 116 struct psp_context
+1247
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
··· 1 + /* 2 + * Copyright 2018 Advanced Micro Devices, Inc. 3 + * 4 + * Permission is hereby granted, free of charge, to any person obtaining a 5 + * copy of this software and associated documentation files (the "Software"), 6 + * to deal in the Software without restriction, including without limitation 7 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 + * and/or sell copies of the Software, and to permit persons to whom the 9 + * Software is furnished to do so, subject to the following conditions: 10 + * 11 + * The above copyright notice and this permission notice shall be included in 12 + * all copies or substantial portions of the Software. 13 + * 14 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 + * OTHER DEALINGS IN THE SOFTWARE. 
21 + * 22 + * 23 + */ 24 + #include <linux/debugfs.h> 25 + #include <linux/list.h> 26 + #include <linux/module.h> 27 + #include "amdgpu.h" 28 + #include "amdgpu_ras.h" 29 + 30 + struct ras_ih_data { 31 + /* interrupt bottom half */ 32 + struct work_struct ih_work; 33 + int inuse; 34 + /* IP callback */ 35 + ras_ih_cb cb; 36 + /* full of entries */ 37 + unsigned char *ring; 38 + unsigned int ring_size; 39 + unsigned int element_size; 40 + unsigned int aligned_element_size; 41 + unsigned int rptr; 42 + unsigned int wptr; 43 + }; 44 + 45 + struct ras_fs_data { 46 + char sysfs_name[32]; 47 + char debugfs_name[32]; 48 + }; 49 + 50 + struct ras_err_data { 51 + unsigned long ue_count; 52 + unsigned long ce_count; 53 + }; 54 + 55 + struct ras_err_handler_data { 56 + /* point to bad pages array */ 57 + struct { 58 + unsigned long bp; 59 + struct amdgpu_bo *bo; 60 + } *bps; 61 + /* the count of entries */ 62 + int count; 63 + /* the space can place new entries */ 64 + int space_left; 65 + /* last reserved entry's index + 1 */ 66 + int last_reserved; 67 + }; 68 + 69 + struct ras_manager { 70 + struct ras_common_if head; 71 + /* reference count */ 72 + int use; 73 + /* ras block link */ 74 + struct list_head node; 75 + /* the device */ 76 + struct amdgpu_device *adev; 77 + /* debugfs */ 78 + struct dentry *ent; 79 + /* sysfs */ 80 + struct device_attribute sysfs_attr; 81 + int attr_inuse; 82 + 83 + /* fs node name */ 84 + struct ras_fs_data fs_data; 85 + 86 + /* IH data */ 87 + struct ras_ih_data ih_data; 88 + 89 + struct ras_err_data err_data; 90 + }; 91 + 92 + const char *ras_error_string[] = { 93 + "none", 94 + "parity", 95 + "single_correctable", 96 + "multi_uncorrectable", 97 + "poison", 98 + }; 99 + 100 + const char *ras_block_string[] = { 101 + "umc", 102 + "sdma", 103 + "gfx", 104 + "mmhub", 105 + "athub", 106 + "pcie_bif", 107 + "hdp", 108 + "xgmi_wafl", 109 + "df", 110 + "smn", 111 + "sem", 112 + "mp0", 113 + "mp1", 114 + "fuse", 115 + }; 116 + 117 + #define 
ras_err_str(i) (ras_error_string[ffs(i)]) 118 + #define ras_block_str(i) (ras_block_string[i]) 119 + 120 + static void amdgpu_ras_self_test(struct amdgpu_device *adev) 121 + { 122 + /* TODO */ 123 + } 124 + 125 + static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, 126 + size_t size, loff_t *pos) 127 + { 128 + struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private; 129 + struct ras_query_if info = { 130 + .head = obj->head, 131 + }; 132 + ssize_t s; 133 + char val[128]; 134 + 135 + if (amdgpu_ras_error_query(obj->adev, &info)) 136 + return -EINVAL; 137 + 138 + s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n", 139 + "ue", info.ue_count, 140 + "ce", info.ce_count); 141 + if (*pos >= s) 142 + return 0; 143 + 144 + s -= *pos; 145 + s = min_t(u64, s, size); 146 + 147 + 148 + if (copy_to_user(buf, &val[*pos], s)) 149 + return -EINVAL; 150 + 151 + *pos += s; 152 + 153 + return s; 154 + } 155 + 156 + static ssize_t amdgpu_ras_debugfs_write(struct file *f, const char __user *buf, 157 + size_t size, loff_t *pos) 158 + { 159 + struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private; 160 + struct ras_inject_if info = { 161 + .head = obj->head, 162 + }; 163 + ssize_t s = min_t(u64, 64, size); 164 + char val[64]; 165 + char *str = val; 166 + memset(val, 0, sizeof(val)); 167 + 168 + if (*pos) 169 + return -EINVAL; 170 + 171 + if (copy_from_user(str, buf, s)) 172 + return -EINVAL; 173 + 174 + /* only care ue/ce for now. 
*/ 175 + if (memcmp(str, "ue", 2) == 0) { 176 + info.head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; 177 + str += 2; 178 + } else if (memcmp(str, "ce", 2) == 0) { 179 + info.head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE; 180 + str += 2; 181 + } 182 + 183 + if (sscanf(str, "0x%llx 0x%llx", &info.address, &info.value) != 2) { 184 + if (sscanf(str, "%llu %llu", &info.address, &info.value) != 2) 185 + return -EINVAL; 186 + } 187 + 188 + *pos = s; 189 + 190 + if (amdgpu_ras_error_inject(obj->adev, &info)) 191 + return -EINVAL; 192 + 193 + return size; 194 + } 195 + 196 + static const struct file_operations amdgpu_ras_debugfs_ops = { 197 + .owner = THIS_MODULE, 198 + .read = amdgpu_ras_debugfs_read, 199 + .write = amdgpu_ras_debugfs_write, 200 + .llseek = default_llseek 201 + }; 202 + 203 + static ssize_t amdgpu_ras_sysfs_read(struct device *dev, 204 + struct device_attribute *attr, char *buf) 205 + { 206 + struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr); 207 + struct ras_query_if info = { 208 + .head = obj->head, 209 + }; 210 + 211 + if (amdgpu_ras_error_query(obj->adev, &info)) 212 + return -EINVAL; 213 + 214 + return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n", 215 + "ue", info.ue_count, 216 + "ce", info.ce_count); 217 + } 218 + 219 + /* obj begin */ 220 + 221 + #define get_obj(obj) do { (obj)->use++; } while (0) 222 + #define alive_obj(obj) ((obj)->use) 223 + 224 + static inline void put_obj(struct ras_manager *obj) 225 + { 226 + if (obj && --obj->use == 0) 227 + list_del(&obj->node); 228 + if (obj && obj->use < 0) { 229 + DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name); 230 + } 231 + } 232 + 233 + /* make one obj and return it. 
*/ 234 + static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, 235 + struct ras_common_if *head) 236 + { 237 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 238 + struct ras_manager *obj; 239 + 240 + if (!con) 241 + return NULL; 242 + 243 + if (head->block >= AMDGPU_RAS_BLOCK_COUNT) 244 + return NULL; 245 + 246 + obj = &con->objs[head->block]; 247 + /* already exist. return obj? */ 248 + if (alive_obj(obj)) 249 + return NULL; 250 + 251 + obj->head = *head; 252 + obj->adev = adev; 253 + list_add(&obj->node, &con->head); 254 + get_obj(obj); 255 + 256 + return obj; 257 + } 258 + 259 + /* return an obj equal to head, or the first when head is NULL */ 260 + static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, 261 + struct ras_common_if *head) 262 + { 263 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 264 + struct ras_manager *obj; 265 + int i; 266 + 267 + if (!con) 268 + return NULL; 269 + 270 + if (head) { 271 + if (head->block >= AMDGPU_RAS_BLOCK_COUNT) 272 + return NULL; 273 + 274 + obj = &con->objs[head->block]; 275 + 276 + if (alive_obj(obj)) { 277 + WARN_ON(head->block != obj->head.block); 278 + return obj; 279 + } 280 + } else { 281 + for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) { 282 + obj = &con->objs[i]; 283 + if (alive_obj(obj)) { 284 + WARN_ON(i != obj->head.block); 285 + return obj; 286 + } 287 + } 288 + } 289 + 290 + return NULL; 291 + } 292 + /* obj end */ 293 + 294 + /* feature ctl begin */ 295 + static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, 296 + struct ras_common_if *head) 297 + { 298 + return amdgpu_ras_enable && (amdgpu_ras_mask & BIT(head->block)); 299 + } 300 + 301 + static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev, 302 + struct ras_common_if *head) 303 + { 304 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 305 + 306 + return con->features & BIT(head->block); 307 + } 308 + 309 + /* 310 + * if obj is not created, then create one. 
311 + * set feature enable flag. 312 + */ 313 + static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, 314 + struct ras_common_if *head, int enable) 315 + { 316 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 317 + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 318 + 319 + if (!amdgpu_ras_is_feature_allowed(adev, head)) 320 + return 0; 321 + if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) 322 + return 0; 323 + 324 + if (enable) { 325 + if (!obj) { 326 + obj = amdgpu_ras_create_obj(adev, head); 327 + if (!obj) 328 + return -EINVAL; 329 + } else { 330 + /* In case we create obj somewhere else */ 331 + get_obj(obj); 332 + } 333 + con->features |= BIT(head->block); 334 + } else { 335 + if (obj && amdgpu_ras_is_feature_enabled(adev, head)) { 336 + con->features &= ~BIT(head->block); 337 + put_obj(obj); 338 + } 339 + } 340 + 341 + return 0; 342 + } 343 + 344 + /* wrapper of psp_ras_enable_features */ 345 + int amdgpu_ras_feature_enable(struct amdgpu_device *adev, 346 + struct ras_common_if *head, bool enable) 347 + { 348 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 349 + union ta_ras_cmd_input info; 350 + int ret; 351 + 352 + if (!con) 353 + return -EINVAL; 354 + 355 + if (!enable) { 356 + info.disable_features = (struct ta_ras_disable_features_input) { 357 + .block_id = head->block, 358 + .error_type = head->type, 359 + }; 360 + } else { 361 + info.enable_features = (struct ta_ras_enable_features_input) { 362 + .block_id = head->block, 363 + .error_type = head->type, 364 + }; 365 + } 366 + 367 + /* Do not enable if it is not allowed. */ 368 + WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head)); 369 + /* Are we alerady in that state we are going to set? */ 370 + if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) 371 + return 0; 372 + 373 + ret = psp_ras_enable_features(&adev->psp, &info, enable); 374 + if (ret) { 375 + DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n", 376 + enable ? 
"enable":"disable", 377 + ras_block_str(head->block), 378 + ret); 379 + return -EINVAL; 380 + } 381 + 382 + /* setup the obj */ 383 + __amdgpu_ras_feature_enable(adev, head, enable); 384 + 385 + return 0; 386 + } 387 + 388 + static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev, 389 + bool bypass) 390 + { 391 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 392 + struct ras_manager *obj, *tmp; 393 + 394 + list_for_each_entry_safe(obj, tmp, &con->head, node) { 395 + /* bypass psp. 396 + * aka just release the obj and corresponding flags 397 + */ 398 + if (bypass) { 399 + if (__amdgpu_ras_feature_enable(adev, &obj->head, 0)) 400 + break; 401 + } else { 402 + if (amdgpu_ras_feature_enable(adev, &obj->head, 0)) 403 + break; 404 + } 405 + }; 406 + 407 + return con->features; 408 + } 409 + 410 + static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, 411 + bool bypass) 412 + { 413 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 414 + int ras_block_count = AMDGPU_RAS_BLOCK_COUNT; 415 + int i; 416 + 417 + for (i = 0; i < ras_block_count; i++) { 418 + struct ras_common_if head = { 419 + .block = i, 420 + .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, 421 + .sub_block_index = 0, 422 + }; 423 + strcpy(head.name, ras_block_str(i)); 424 + if (bypass) { 425 + /* 426 + * bypass psp. vbios enable ras for us. 
427 + * so just create the obj 428 + */ 429 + if (__amdgpu_ras_feature_enable(adev, &head, 1)) 430 + break; 431 + } else { 432 + if (amdgpu_ras_feature_enable(adev, &head, 1)) 433 + break; 434 + } 435 + }; 436 + 437 + return con->features; 438 + } 439 + /* feature ctl end */ 440 + 441 + /* query/inject/cure begin */ 442 + int amdgpu_ras_error_query(struct amdgpu_device *adev, 443 + struct ras_query_if *info) 444 + { 445 + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); 446 + 447 + if (!obj) 448 + return -EINVAL; 449 + /* TODO might read the register to read the count */ 450 + 451 + info->ue_count = obj->err_data.ue_count; 452 + info->ce_count = obj->err_data.ce_count; 453 + 454 + return 0; 455 + } 456 + 457 + /* wrapper of psp_ras_trigger_error */ 458 + int amdgpu_ras_error_inject(struct amdgpu_device *adev, 459 + struct ras_inject_if *info) 460 + { 461 + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); 462 + struct ta_ras_trigger_error_input block_info = { 463 + .block_id = info->head.block, 464 + .inject_error_type = info->head.type, 465 + .sub_block_index = info->head.sub_block_index, 466 + .address = info->address, 467 + .value = info->value, 468 + }; 469 + int ret = 0; 470 + 471 + if (!obj) 472 + return -EINVAL; 473 + 474 + ret = psp_ras_trigger_error(&adev->psp, &block_info); 475 + if (ret) 476 + DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n", 477 + ras_block_str(info->head.block), 478 + ret); 479 + 480 + return ret; 481 + } 482 + 483 + int amdgpu_ras_error_cure(struct amdgpu_device *adev, 484 + struct ras_cure_if *info) 485 + { 486 + /* psp fw has no cure interface for now. 
*/ 487 + return 0; 488 + } 489 + 490 + /* get the total error counts on all IPs */ 491 + int amdgpu_ras_query_error_count(struct amdgpu_device *adev, 492 + bool is_ce) 493 + { 494 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 495 + struct ras_manager *obj; 496 + struct ras_err_data data = {0, 0}; 497 + 498 + if (!con) 499 + return -EINVAL; 500 + 501 + list_for_each_entry(obj, &con->head, node) { 502 + struct ras_query_if info = { 503 + .head = obj->head, 504 + }; 505 + 506 + if (amdgpu_ras_error_query(adev, &info)) 507 + return -EINVAL; 508 + 509 + data.ce_count += info.ce_count; 510 + data.ue_count += info.ue_count; 511 + } 512 + 513 + return is_ce ? data.ce_count : data.ue_count; 514 + } 515 + /* query/inject/cure end */ 516 + 517 + 518 + /* sysfs begin */ 519 + 520 + static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, 521 + struct device_attribute *attr, char *buf) 522 + { 523 + struct amdgpu_ras *con = 524 + container_of(attr, struct amdgpu_ras, features_attr); 525 + struct drm_device *ddev = dev_get_drvdata(dev); 526 + struct amdgpu_device *adev = ddev->dev_private; 527 + struct ras_common_if head; 528 + int ras_block_count = AMDGPU_RAS_BLOCK_COUNT; 529 + int i; 530 + ssize_t s; 531 + struct ras_manager *obj; 532 + 533 + s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features); 534 + 535 + for (i = 0; i < ras_block_count; i++) { 536 + head.block = i; 537 + 538 + if (amdgpu_ras_is_feature_enabled(adev, &head)) { 539 + obj = amdgpu_ras_find_obj(adev, &head); 540 + s += scnprintf(&buf[s], PAGE_SIZE - s, 541 + "%s: %s\n", 542 + ras_block_str(i), 543 + ras_err_str(obj->head.type)); 544 + } else 545 + s += scnprintf(&buf[s], PAGE_SIZE - s, 546 + "%s: disabled\n", 547 + ras_block_str(i)); 548 + } 549 + 550 + return s; 551 + } 552 + 553 + static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) 554 + { 555 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 556 + struct attribute *attrs[] = { 557 + 
&con->features_attr.attr, 558 + NULL 559 + }; 560 + struct attribute_group group = { 561 + .name = "ras", 562 + .attrs = attrs, 563 + }; 564 + 565 + con->features_attr = (struct device_attribute) { 566 + .attr = { 567 + .name = "features", 568 + .mode = S_IRUGO, 569 + }, 570 + .show = amdgpu_ras_sysfs_features_read, 571 + }; 572 + 573 + return sysfs_create_group(&adev->dev->kobj, &group); 574 + } 575 + 576 + static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) 577 + { 578 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 579 + struct attribute *attrs[] = { 580 + &con->features_attr.attr, 581 + NULL 582 + }; 583 + struct attribute_group group = { 584 + .name = "ras", 585 + .attrs = attrs, 586 + }; 587 + 588 + sysfs_remove_group(&adev->dev->kobj, &group); 589 + 590 + return 0; 591 + } 592 + 593 + int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, 594 + struct ras_fs_if *head) 595 + { 596 + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); 597 + 598 + if (!obj || obj->attr_inuse) 599 + return -EINVAL; 600 + 601 + get_obj(obj); 602 + 603 + memcpy(obj->fs_data.sysfs_name, 604 + head->sysfs_name, 605 + sizeof(obj->fs_data.sysfs_name)); 606 + 607 + obj->sysfs_attr = (struct device_attribute){ 608 + .attr = { 609 + .name = obj->fs_data.sysfs_name, 610 + .mode = S_IRUGO, 611 + }, 612 + .show = amdgpu_ras_sysfs_read, 613 + }; 614 + 615 + if (sysfs_add_file_to_group(&adev->dev->kobj, 616 + &obj->sysfs_attr.attr, 617 + "ras")) { 618 + put_obj(obj); 619 + return -EINVAL; 620 + } 621 + 622 + obj->attr_inuse = 1; 623 + 624 + return 0; 625 + } 626 + 627 + int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, 628 + struct ras_common_if *head) 629 + { 630 + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 631 + 632 + if (!obj || !obj->attr_inuse) 633 + return -EINVAL; 634 + 635 + sysfs_remove_file_from_group(&adev->dev->kobj, 636 + &obj->sysfs_attr.attr, 637 + "ras"); 638 + obj->attr_inuse = 0; 639 + put_obj(obj); 640 + 
641 + return 0; 642 + } 643 + 644 + static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) 645 + { 646 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 647 + struct ras_manager *obj, *tmp; 648 + 649 + list_for_each_entry_safe(obj, tmp, &con->head, node) { 650 + amdgpu_ras_sysfs_remove(adev, &obj->head); 651 + } 652 + 653 + amdgpu_ras_sysfs_remove_feature_node(adev); 654 + 655 + return 0; 656 + } 657 + /* sysfs end */ 658 + 659 + /* debugfs begin */ 660 + int amdgpu_ras_debugfs_create(struct amdgpu_device *adev, 661 + struct ras_fs_if *head) 662 + { 663 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 664 + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); 665 + struct dentry *ent; 666 + 667 + if (!obj || obj->ent) 668 + return -EINVAL; 669 + 670 + get_obj(obj); 671 + 672 + memcpy(obj->fs_data.debugfs_name, 673 + head->debugfs_name, 674 + sizeof(obj->fs_data.debugfs_name)); 675 + 676 + ent = debugfs_create_file(obj->fs_data.debugfs_name, 677 + S_IWUGO | S_IRUGO, con->dir, 678 + obj, &amdgpu_ras_debugfs_ops); 679 + 680 + if (IS_ERR(ent)) 681 + return -EINVAL; 682 + 683 + obj->ent = ent; 684 + 685 + return 0; 686 + } 687 + 688 + int amdgpu_ras_debugfs_remove(struct amdgpu_device *adev, 689 + struct ras_common_if *head) 690 + { 691 + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); 692 + 693 + if (!obj || !obj->ent) 694 + return 0; 695 + 696 + debugfs_remove(obj->ent); 697 + obj->ent = NULL; 698 + put_obj(obj); 699 + 700 + return 0; 701 + } 702 + 703 + static int amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev) 704 + { 705 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 706 + struct ras_manager *obj, *tmp; 707 + 708 + list_for_each_entry_safe(obj, tmp, &con->head, node) { 709 + amdgpu_ras_debugfs_remove(adev, &obj->head); 710 + } 711 + 712 + debugfs_remove(con->dir); 713 + con->dir = NULL; 714 + 715 + return 0; 716 + } 717 + /* debugfs end */ 718 + 719 + /* ras fs */ 720 + 721 + static int 
amdgpu_ras_fs_init(struct amdgpu_device *adev) 722 + { 723 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 724 + struct drm_minor *minor = adev->ddev->primary; 725 + struct dentry *root = minor->debugfs_root, *dir; 726 + 727 + dir = debugfs_create_dir("ras", root); 728 + if (IS_ERR(dir)) 729 + return -EINVAL; 730 + 731 + con->dir = dir; 732 + 733 + amdgpu_ras_sysfs_create_feature_node(adev); 734 + 735 + return 0; 736 + } 737 + 738 + static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) 739 + { 740 + amdgpu_ras_debugfs_remove_all(adev); 741 + amdgpu_ras_sysfs_remove_all(adev); 742 + return 0; 743 + } 744 + /* ras fs end */ 745 + 746 + /* ih begin */ 747 + static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) 748 + { 749 + struct ras_ih_data *data = &obj->ih_data; 750 + struct amdgpu_iv_entry entry; 751 + int ret; 752 + 753 + while (data->rptr != data->wptr) { 754 + rmb(); 755 + memcpy(&entry, &data->ring[data->rptr], 756 + data->element_size); 757 + 758 + wmb(); 759 + data->rptr = (data->aligned_element_size + 760 + data->rptr) % data->ring_size; 761 + 762 + /* Let IP handle its data, maybe we need get the output 763 + * from the callback to udpate the error type/count, etc 764 + */ 765 + if (data->cb) { 766 + ret = data->cb(obj->adev, &entry); 767 + /* ue will trigger an interrupt, and in that case 768 + * we need do a reset to recovery the whole system. 769 + * But leave IP do that recovery, here we just dispatch 770 + * the error. 771 + */ 772 + if (ret == AMDGPU_RAS_UE) { 773 + obj->err_data.ue_count++; 774 + } 775 + /* Might need get ce count by register, but not all IP 776 + * saves ce count, some IP just use one bit or two bits 777 + * to indicate ce happened. 
778 + */ 779 + } 780 + } 781 + } 782 + 783 + static void amdgpu_ras_interrupt_process_handler(struct work_struct *work) 784 + { 785 + struct ras_ih_data *data = 786 + container_of(work, struct ras_ih_data, ih_work); 787 + struct ras_manager *obj = 788 + container_of(data, struct ras_manager, ih_data); 789 + 790 + amdgpu_ras_interrupt_handler(obj); 791 + } 792 + 793 + int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, 794 + struct ras_dispatch_if *info) 795 + { 796 + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); 797 + struct ras_ih_data *data = &obj->ih_data; 798 + 799 + if (!obj) 800 + return -EINVAL; 801 + 802 + if (data->inuse == 0) 803 + return 0; 804 + 805 + /* Might be overflow... */ 806 + memcpy(&data->ring[data->wptr], info->entry, 807 + data->element_size); 808 + 809 + wmb(); 810 + data->wptr = (data->aligned_element_size + 811 + data->wptr) % data->ring_size; 812 + 813 + schedule_work(&data->ih_work); 814 + 815 + return 0; 816 + } 817 + 818 + int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, 819 + struct ras_ih_if *info) 820 + { 821 + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); 822 + struct ras_ih_data *data; 823 + 824 + if (!obj) 825 + return -EINVAL; 826 + 827 + data = &obj->ih_data; 828 + if (data->inuse == 0) 829 + return 0; 830 + 831 + cancel_work_sync(&data->ih_work); 832 + 833 + kfree(data->ring); 834 + memset(data, 0, sizeof(*data)); 835 + put_obj(obj); 836 + 837 + return 0; 838 + } 839 + 840 + int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, 841 + struct ras_ih_if *info) 842 + { 843 + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); 844 + struct ras_ih_data *data; 845 + 846 + if (!obj) { 847 + /* in case we registe the IH before enable ras feature */ 848 + obj = amdgpu_ras_create_obj(adev, &info->head); 849 + if (!obj) 850 + return -EINVAL; 851 + } else 852 + get_obj(obj); 853 + 854 + data = &obj->ih_data; 855 + /* add the callback.etc */ 856 + 
*data = (struct ras_ih_data) { 857 + .inuse = 0, 858 + .cb = info->cb, 859 + .element_size = sizeof(struct amdgpu_iv_entry), 860 + .rptr = 0, 861 + .wptr = 0, 862 + }; 863 + 864 + INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler); 865 + 866 + data->aligned_element_size = ALIGN(data->element_size, 8); 867 + /* the ring can store 64 iv entries. */ 868 + data->ring_size = 64 * data->aligned_element_size; 869 + data->ring = kmalloc(data->ring_size, GFP_KERNEL); 870 + if (!data->ring) { 871 + put_obj(obj); 872 + return -ENOMEM; 873 + } 874 + 875 + /* IH is ready */ 876 + data->inuse = 1; 877 + 878 + return 0; 879 + } 880 + 881 + static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) 882 + { 883 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 884 + struct ras_manager *obj, *tmp; 885 + 886 + list_for_each_entry_safe(obj, tmp, &con->head, node) { 887 + struct ras_ih_if info = { 888 + .head = obj->head, 889 + }; 890 + amdgpu_ras_interrupt_remove_handler(adev, &info); 891 + } 892 + 893 + return 0; 894 + } 895 + /* ih end */ 896 + 897 + /* recovery begin */ 898 + static void amdgpu_ras_do_recovery(struct work_struct *work) 899 + { 900 + struct amdgpu_ras *ras = 901 + container_of(work, struct amdgpu_ras, recovery_work); 902 + 903 + amdgpu_device_gpu_recover(ras->adev, 0); 904 + atomic_set(&ras->in_recovery, 0); 905 + } 906 + 907 + static int amdgpu_ras_release_vram(struct amdgpu_device *adev, 908 + struct amdgpu_bo **bo_ptr) 909 + { 910 + /* no need to free it actually. 
*/ 911 + amdgpu_bo_free_kernel(bo_ptr, NULL, NULL); 912 + return 0; 913 + } 914 + 915 + /* reserve vram with size@offset */ 916 + static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev, 917 + uint64_t offset, uint64_t size, 918 + struct amdgpu_bo **bo_ptr) 919 + { 920 + struct ttm_operation_ctx ctx = { false, false }; 921 + struct amdgpu_bo_param bp; 922 + int r = 0; 923 + int i; 924 + struct amdgpu_bo *bo; 925 + 926 + if (bo_ptr) 927 + *bo_ptr = NULL; 928 + memset(&bp, 0, sizeof(bp)); 929 + bp.size = size; 930 + bp.byte_align = PAGE_SIZE; 931 + bp.domain = AMDGPU_GEM_DOMAIN_VRAM; 932 + bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS | 933 + AMDGPU_GEM_CREATE_NO_CPU_ACCESS; 934 + bp.type = ttm_bo_type_kernel; 935 + bp.resv = NULL; 936 + 937 + r = amdgpu_bo_create(adev, &bp, &bo); 938 + if (r) 939 + return -EINVAL; 940 + 941 + r = amdgpu_bo_reserve(bo, false); 942 + if (r) 943 + goto error_reserve; 944 + 945 + offset = ALIGN(offset, PAGE_SIZE); 946 + for (i = 0; i < bo->placement.num_placement; ++i) { 947 + bo->placements[i].fpfn = offset >> PAGE_SHIFT; 948 + bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT; 949 + } 950 + 951 + ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem); 952 + r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx); 953 + if (r) 954 + goto error_pin; 955 + 956 + r = amdgpu_bo_pin_restricted(bo, 957 + AMDGPU_GEM_DOMAIN_VRAM, 958 + offset, 959 + offset + size); 960 + if (r) 961 + goto error_pin; 962 + 963 + if (bo_ptr) 964 + *bo_ptr = bo; 965 + 966 + amdgpu_bo_unreserve(bo); 967 + return r; 968 + 969 + error_pin: 970 + amdgpu_bo_unreserve(bo); 971 + error_reserve: 972 + amdgpu_bo_unref(&bo); 973 + return r; 974 + } 975 + 976 + /* alloc/realloc bps array */ 977 + static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, 978 + struct ras_err_handler_data *data, int pages) 979 + { 980 + unsigned int old_space = data->count + data->space_left; 981 + unsigned int new_space = old_space + pages; 982 + unsigned int align_space = 
ALIGN(new_space, 1024); 983 + void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); 984 + 985 + if (!tmp) 986 + return -ENOMEM; 987 + 988 + if (data->bps) { 989 + memcpy(tmp, data->bps, 990 + data->count * sizeof(*data->bps)); 991 + kfree(data->bps); 992 + } 993 + 994 + data->bps = tmp; 995 + data->space_left += align_space - old_space; 996 + return 0; 997 + } 998 + 999 + /* it deal with vram only. */ 1000 + int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, 1001 + unsigned long *bps, int pages) 1002 + { 1003 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1004 + struct ras_err_handler_data *data = con->eh_data; 1005 + int i = pages; 1006 + int ret = 0; 1007 + 1008 + if (!con || !data || !bps || pages <= 0) 1009 + return 0; 1010 + 1011 + mutex_lock(&con->recovery_lock); 1012 + if (!data) 1013 + goto out; 1014 + 1015 + if (data->space_left <= pages) 1016 + if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) { 1017 + ret = -ENOMEM; 1018 + goto out; 1019 + } 1020 + 1021 + while (i--) 1022 + data->bps[data->count++].bp = bps[i]; 1023 + 1024 + data->space_left -= pages; 1025 + out: 1026 + mutex_unlock(&con->recovery_lock); 1027 + 1028 + return ret; 1029 + } 1030 + 1031 + /* called in gpu recovery/init */ 1032 + int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) 1033 + { 1034 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1035 + struct ras_err_handler_data *data = con->eh_data; 1036 + uint64_t bp; 1037 + struct amdgpu_bo *bo; 1038 + int i; 1039 + 1040 + if (!con || !data) 1041 + return 0; 1042 + 1043 + mutex_lock(&con->recovery_lock); 1044 + /* reserve vram at driver post stage. 
*/ 1045 + for (i = data->last_reserved; i < data->count; i++) { 1046 + bp = data->bps[i].bp; 1047 + 1048 + if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT, 1049 + PAGE_SIZE, &bo)) 1050 + DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp); 1051 + 1052 + data->bps[i].bo = bo; 1053 + data->last_reserved = i + 1; 1054 + } 1055 + mutex_unlock(&con->recovery_lock); 1056 + return 0; 1057 + } 1058 + 1059 + /* called when driver unload */ 1060 + static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev) 1061 + { 1062 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1063 + struct ras_err_handler_data *data = con->eh_data; 1064 + struct amdgpu_bo *bo; 1065 + int i; 1066 + 1067 + if (!con || !data) 1068 + return 0; 1069 + 1070 + mutex_lock(&con->recovery_lock); 1071 + for (i = data->last_reserved - 1; i >= 0; i--) { 1072 + bo = data->bps[i].bo; 1073 + 1074 + amdgpu_ras_release_vram(adev, &bo); 1075 + 1076 + data->bps[i].bo = bo; 1077 + data->last_reserved = i; 1078 + } 1079 + mutex_unlock(&con->recovery_lock); 1080 + return 0; 1081 + } 1082 + 1083 + static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) 1084 + { 1085 + /* TODO 1086 + * write the array to eeprom when SMU disabled. 1087 + */ 1088 + return 0; 1089 + } 1090 + 1091 + static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) 1092 + { 1093 + /* TODO 1094 + * read the array to eeprom when SMU disabled. 
1095 + */ 1096 + return 0; 1097 + } 1098 + 1099 + static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) 1100 + { 1101 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1102 + struct ras_err_handler_data **data = &con->eh_data; 1103 + 1104 + *data = kmalloc(sizeof(**data), 1105 + GFP_KERNEL|__GFP_ZERO); 1106 + if (!*data) 1107 + return -ENOMEM; 1108 + 1109 + mutex_init(&con->recovery_lock); 1110 + INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); 1111 + atomic_set(&con->in_recovery, 0); 1112 + con->adev = adev; 1113 + 1114 + amdgpu_ras_load_bad_pages(adev); 1115 + amdgpu_ras_reserve_bad_pages(adev); 1116 + 1117 + return 0; 1118 + } 1119 + 1120 + static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) 1121 + { 1122 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1123 + struct ras_err_handler_data *data = con->eh_data; 1124 + 1125 + cancel_work_sync(&con->recovery_work); 1126 + amdgpu_ras_save_bad_pages(adev); 1127 + amdgpu_ras_release_bad_pages(adev); 1128 + 1129 + mutex_lock(&con->recovery_lock); 1130 + con->eh_data = NULL; 1131 + kfree(data->bps); 1132 + kfree(data); 1133 + mutex_unlock(&con->recovery_lock); 1134 + 1135 + return 0; 1136 + } 1137 + /* recovery end */ 1138 + 1139 + struct ras_DID_capability { 1140 + u16 did; 1141 + u8 rid; 1142 + u32 capability; 1143 + }; 1144 + 1145 + static const struct ras_DID_capability supported_DID_array[] = { 1146 + {0x66a0, 0x00, AMDGPU_RAS_BLOCK_MASK}, 1147 + {0x66a0, 0x02, AMDGPU_RAS_BLOCK_MASK}, 1148 + {0x66a1, 0x00, AMDGPU_RAS_BLOCK_MASK}, 1149 + {0x66a1, 0x01, AMDGPU_RAS_BLOCK_MASK}, 1150 + {0x66a1, 0x04, AMDGPU_RAS_BLOCK_MASK}, 1151 + {0x66a3, 0x00, AMDGPU_RAS_BLOCK_MASK}, 1152 + {0x66a7, 0x00, AMDGPU_RAS_BLOCK_MASK}, 1153 + }; 1154 + 1155 + static uint32_t amdgpu_ras_check_supported(struct amdgpu_device *adev) 1156 + { 1157 + /* TODO need check vbios table */ 1158 + int i; 1159 + int did = adev->pdev->device; 1160 + int rid = adev->pdev->revision; 1161 + 1162 + for (i = 0; i < 
ARRAY_SIZE(supported_DID_array); i++) { 1163 + if (did == supported_DID_array[i].did && 1164 + rid == supported_DID_array[i].rid) { 1165 + return supported_DID_array[i].capability; 1166 + } 1167 + } 1168 + return 0; 1169 + } 1170 + 1171 + int amdgpu_ras_init(struct amdgpu_device *adev) 1172 + { 1173 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1174 + uint32_t supported = amdgpu_ras_check_supported(adev); 1175 + 1176 + if (con || supported == 0) 1177 + return 0; 1178 + 1179 + con = kmalloc(sizeof(struct amdgpu_ras) + 1180 + sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT, 1181 + GFP_KERNEL|__GFP_ZERO); 1182 + if (!con) 1183 + return -ENOMEM; 1184 + 1185 + con->objs = (struct ras_manager *)(con + 1); 1186 + 1187 + amdgpu_ras_set_context(adev, con); 1188 + 1189 + con->supported = supported; 1190 + con->features = 0; 1191 + INIT_LIST_HEAD(&con->head); 1192 + 1193 + if (amdgpu_ras_recovery_init(adev)) 1194 + goto recovery_out; 1195 + 1196 + amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK; 1197 + 1198 + amdgpu_ras_enable_all_features(adev, 1); 1199 + 1200 + if (amdgpu_ras_fs_init(adev)) 1201 + goto fs_out; 1202 + 1203 + amdgpu_ras_self_test(adev); 1204 + return 0; 1205 + fs_out: 1206 + amdgpu_ras_recovery_fini(adev); 1207 + recovery_out: 1208 + amdgpu_ras_set_context(adev, NULL); 1209 + kfree(con); 1210 + 1211 + return -EINVAL; 1212 + } 1213 + 1214 + /* do some fini work before IP fini as dependence */ 1215 + int amdgpu_ras_pre_fini(struct amdgpu_device *adev) 1216 + { 1217 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1218 + 1219 + if (!con) 1220 + return 0; 1221 + 1222 + /* Need disable ras on all IPs here before ip [hw/sw]fini */ 1223 + amdgpu_ras_disable_all_features(adev, 0); 1224 + amdgpu_ras_recovery_fini(adev); 1225 + return 0; 1226 + } 1227 + 1228 + int amdgpu_ras_fini(struct amdgpu_device *adev) 1229 + { 1230 + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1231 + 1232 + if (!con) 1233 + return 0; 1234 + 1235 + 
amdgpu_ras_fs_fini(adev); 1236 + amdgpu_ras_interrupt_remove_all(adev); 1237 + 1238 + WARN(con->features, "Feature mask is not cleared"); 1239 + 1240 + if (con->features) 1241 + amdgpu_ras_disable_all_features(adev, 1); 1242 + 1243 + amdgpu_ras_set_context(adev, NULL); 1244 + kfree(con); 1245 + 1246 + return 0; 1247 + }
+217
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
··· 1 + /* 2 + * Copyright 2018 Advanced Micro Devices, Inc. 3 + * 4 + * Permission is hereby granted, free of charge, to any person obtaining a 5 + * copy of this software and associated documentation files (the "Software"), 6 + * to deal in the Software without restriction, including without limitation 7 + * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 + * and/or sell copies of the Software, and to permit persons to whom the 9 + * Software is furnished to do so, subject to the following conditions: 10 + * 11 + * The above copyright notice and this permission notice shall be included in 12 + * all copies or substantial portions of the Software. 13 + * 14 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 + * OTHER DEALINGS IN THE SOFTWARE. 
21 + * 22 + * 23 + */ 24 + #ifndef _AMDGPU_RAS_H 25 + #define _AMDGPU_RAS_H 26 + 27 + #include <linux/debugfs.h> 28 + #include <linux/list.h> 29 + #include "amdgpu.h" 30 + #include "amdgpu_psp.h" 31 + #include "ta_ras_if.h" 32 + 33 + enum amdgpu_ras_block { 34 + AMDGPU_RAS_BLOCK__UMC = 0, 35 + AMDGPU_RAS_BLOCK__SDMA, 36 + AMDGPU_RAS_BLOCK__GFX, 37 + AMDGPU_RAS_BLOCK__MMHUB, 38 + AMDGPU_RAS_BLOCK__ATHUB, 39 + AMDGPU_RAS_BLOCK__PCIE_BIF, 40 + AMDGPU_RAS_BLOCK__HDP, 41 + AMDGPU_RAS_BLOCK__XGMI_WAFL, 42 + AMDGPU_RAS_BLOCK__DF, 43 + AMDGPU_RAS_BLOCK__SMN, 44 + AMDGPU_RAS_BLOCK__SEM, 45 + AMDGPU_RAS_BLOCK__MP0, 46 + AMDGPU_RAS_BLOCK__MP1, 47 + AMDGPU_RAS_BLOCK__FUSE, 48 + 49 + AMDGPU_RAS_BLOCK__LAST 50 + }; 51 + 52 + #define AMDGPU_RAS_BLOCK_COUNT AMDGPU_RAS_BLOCK__LAST 53 + #define AMDGPU_RAS_BLOCK_MASK ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1) 54 + 55 + enum amdgpu_ras_error_type { 56 + AMDGPU_RAS_ERROR__NONE = 0, 57 + AMDGPU_RAS_ERROR__PARITY = 1, 58 + AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE = 2, 59 + AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE = 4, 60 + AMDGPU_RAS_ERROR__POISON = 8, 61 + }; 62 + 63 + enum amdgpu_ras_ret { 64 + AMDGPU_RAS_SUCCESS = 0, 65 + AMDGPU_RAS_FAIL, 66 + AMDGPU_RAS_UE, 67 + AMDGPU_RAS_CE, 68 + AMDGPU_RAS_PT, 69 + }; 70 + 71 + struct ras_common_if { 72 + enum amdgpu_ras_block block; 73 + enum amdgpu_ras_error_type type; 74 + uint32_t sub_block_index; 75 + /* block name */ 76 + char name[32]; 77 + }; 78 + 79 + typedef int (*ras_ih_cb)(struct amdgpu_device *adev, 80 + struct amdgpu_iv_entry *entry); 81 + 82 + struct amdgpu_ras { 83 + /* ras infrastructure */ 84 + uint32_t supported; 85 + uint32_t features; 86 + struct list_head head; 87 + /* debugfs */ 88 + struct dentry *dir; 89 + /* sysfs */ 90 + struct device_attribute features_attr; 91 + /* block array */ 92 + struct ras_manager *objs; 93 + 94 + /* gpu recovery */ 95 + struct work_struct recovery_work; 96 + atomic_t in_recovery; 97 + struct amdgpu_device *adev; 98 + /* error handler data */ 99 + struct 
ras_err_handler_data *eh_data; 100 + struct mutex recovery_lock; 101 + }; 102 + 103 + /* interfaces for IP */ 104 + 105 + struct ras_fs_if { 106 + struct ras_common_if head; 107 + char sysfs_name[32]; 108 + char debugfs_name[32]; 109 + }; 110 + 111 + struct ras_query_if { 112 + struct ras_common_if head; 113 + unsigned long ue_count; 114 + unsigned long ce_count; 115 + }; 116 + 117 + struct ras_inject_if { 118 + struct ras_common_if head; 119 + uint64_t address; 120 + uint64_t value; 121 + }; 122 + 123 + struct ras_cure_if { 124 + struct ras_common_if head; 125 + uint64_t address; 126 + }; 127 + 128 + struct ras_ih_if { 129 + struct ras_common_if head; 130 + ras_ih_cb cb; 131 + }; 132 + 133 + struct ras_dispatch_if { 134 + struct ras_common_if head; 135 + struct amdgpu_iv_entry *entry; 136 + }; 137 + 138 + /* work flow 139 + * vbios 140 + * 1: ras feature enable (enabled by default) 141 + * psp 142 + * 2: ras framework init (in ip_init) 143 + * IP 144 + * 3: IH add 145 + * 4: debugfs/sysfs create 146 + * 5: query/inject 147 + * 6: debugfs/sysfs remove 148 + * 7: IH remove 149 + * 8: feature disable 150 + */ 151 + 152 + #define amdgpu_ras_get_context(adev) ((adev)->psp.ras.ras) 153 + #define amdgpu_ras_set_context(adev, ras_con) ((adev)->psp.ras.ras = (ras_con)) 154 + 155 + /* check if ras is supported on block, say, sdma, gfx */ 156 + static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev, 157 + unsigned int block) 158 + { 159 + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 160 + 161 + return ras && (ras->supported & (1 << block)); 162 + } 163 + 164 + int amdgpu_ras_query_error_count(struct amdgpu_device *adev, 165 + bool is_ce); 166 + 167 + /* error handling functions */ 168 + int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, 169 + unsigned long *bps, int pages); 170 + 171 + int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev); 172 + 173 + static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, 174 + bool is_baco) 
175 + { 176 + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 177 + 178 + if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) 179 + schedule_work(&ras->recovery_work); 180 + return 0; 181 + } 182 + 183 + /* called in ip_init and ip_fini */ 184 + int amdgpu_ras_init(struct amdgpu_device *adev); 185 + int amdgpu_ras_fini(struct amdgpu_device *adev); 186 + int amdgpu_ras_pre_fini(struct amdgpu_device *adev); 187 + 188 + int amdgpu_ras_feature_enable(struct amdgpu_device *adev, 189 + struct ras_common_if *head, bool enable); 190 + 191 + int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, 192 + struct ras_fs_if *head); 193 + 194 + int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, 195 + struct ras_common_if *head); 196 + 197 + int amdgpu_ras_debugfs_create(struct amdgpu_device *adev, 198 + struct ras_fs_if *head); 199 + 200 + int amdgpu_ras_debugfs_remove(struct amdgpu_device *adev, 201 + struct ras_common_if *head); 202 + 203 + int amdgpu_ras_error_query(struct amdgpu_device *adev, 204 + struct ras_query_if *info); 205 + 206 + int amdgpu_ras_error_inject(struct amdgpu_device *adev, 207 + struct ras_inject_if *info); 208 + 209 + int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, 210 + struct ras_ih_if *info); 211 + 212 + int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, 213 + struct ras_ih_if *info); 214 + 215 + int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, 216 + struct ras_dispatch_if *info); 217 + #endif