Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amdgpu: refine ras related message print

Prefix RAS-related kernel log messages with PCI
device info by replacing DRM_INFO/WARN/ERROR with
dev_info/warn/err. This clearly tells the user which
GPU device a given RAS message refers to. Also reword
and extend some other RAS messages to make them
clearer and more user-friendly.

Suggested-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Guchun Chen <guchun.chen@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Guchun Chen and committed by
Alex Deucher
6952e99c 1f3ef0ef

+48 -29
+31 -20
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
··· 296 296 int ret = 0; 297 297 298 298 if (!amdgpu_ras_get_error_query_ready(adev)) { 299 - DRM_WARN("RAS WARN: error injection currently inaccessible\n"); 299 + dev_warn(adev->dev, "RAS WARN: error injection " 300 + "currently inaccessible\n"); 300 301 return size; 301 302 } 302 303 ··· 325 324 /* umc ce/ue error injection for a bad page is not allowed */ 326 325 if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) && 327 326 amdgpu_ras_check_bad_page(adev, data.inject.address)) { 328 - DRM_WARN("RAS WARN: 0x%llx has been marked as bad before error injection!\n", 327 + dev_warn(adev->dev, "RAS WARN: 0x%llx has been marked " 328 + "as bad before error injection!\n", 329 329 data.inject.address); 330 330 break; 331 331 } ··· 592 590 if (!amdgpu_ras_intr_triggered()) { 593 591 ret = psp_ras_enable_features(&adev->psp, &info, enable); 594 592 if (ret) { 595 - DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n", 593 + dev_err(adev->dev, "RAS ERROR: %s %s feature " 594 + "failed ret %d\n", 596 595 enable ? 
"enable":"disable", 597 596 ras_block_str(head->block), 598 597 ret); ··· 635 632 if (ret == -EINVAL) { 636 633 ret = __amdgpu_ras_feature_enable(adev, head, 1); 637 634 if (!ret) 638 - DRM_INFO("RAS INFO: %s setup object\n", 635 + dev_info(adev->dev, 636 + "RAS INFO: %s setup object\n", 639 637 ras_block_str(head->block)); 640 638 } 641 639 } else { ··· 762 758 info->ce_count = obj->err_data.ce_count; 763 759 764 760 if (err_data.ce_count) { 765 - dev_info(adev->dev, "%ld correctable errors detected in %s block\n", 766 - obj->err_data.ce_count, ras_block_str(info->head.block)); 761 + dev_info(adev->dev, "%ld correctable hardware errors " 762 + "detected in %s block, no user " 763 + "action is needed.\n", 764 + obj->err_data.ce_count, 765 + ras_block_str(info->head.block)); 767 766 } 768 767 if (err_data.ue_count) { 769 - dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n", 770 - obj->err_data.ue_count, ras_block_str(info->head.block)); 768 + dev_info(adev->dev, "%ld uncorrectable hardware errors " 769 + "detected in %s block\n", 770 + obj->err_data.ue_count, 771 + ras_block_str(info->head.block)); 771 772 } 772 773 773 774 return 0; ··· 816 807 ret = psp_ras_trigger_error(&adev->psp, &block_info); 817 808 break; 818 809 default: 819 - DRM_INFO("%s error injection is not supported yet\n", 810 + dev_info(adev->dev, "%s error injection is not supported yet\n", 820 811 ras_block_str(info->head.block)); 821 812 ret = -EINVAL; 822 813 } 823 814 824 815 if (ret) 825 - DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n", 816 + dev_err(adev->dev, "RAS ERROR: inject %s error failed ret %d\n", 826 817 ras_block_str(info->head.block), 827 818 ret); 828 819 ··· 1558 1549 &data->bps[control->num_recs], 1559 1550 true, 1560 1551 save_count)) { 1561 - DRM_ERROR("Failed to save EEPROM table data!"); 1552 + dev_err(adev->dev, "Failed to save EEPROM table data!"); 1562 1553 return -EIO; 1563 1554 } 1564 1555 ··· 1586 1577 1587 1578 if 
(amdgpu_ras_eeprom_process_recods(control, bps, false, 1588 1579 control->num_recs)) { 1589 - DRM_ERROR("Failed to load EEPROM table records!"); 1580 + dev_err(adev->dev, "Failed to load EEPROM table records!"); 1590 1581 ret = -EIO; 1591 1582 goto out; 1592 1583 } ··· 1660 1651 AMDGPU_GPU_PAGE_SIZE, 1661 1652 AMDGPU_GEM_DOMAIN_VRAM, 1662 1653 &bo, NULL)) 1663 - DRM_WARN("RAS WARN: reserve vram for retired page %llx fail\n", bp); 1654 + dev_warn(adev->dev, "RAS WARN: reserve vram for " 1655 + "retired page %llx fail\n", bp); 1664 1656 1665 1657 data->bps_bo[i] = bo; 1666 1658 data->last_reserved = i + 1; ··· 1749 1739 kfree(*data); 1750 1740 con->eh_data = NULL; 1751 1741 out: 1752 - DRM_WARN("Failed to initialize ras recovery!\n"); 1742 + dev_warn(adev->dev, "Failed to initialize ras recovery!\n"); 1753 1743 1754 1744 return ret; 1755 1745 } ··· 1811 1801 return; 1812 1802 1813 1803 if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { 1814 - DRM_INFO("HBM ECC is active.\n"); 1804 + dev_info(adev->dev, "HBM ECC is active.\n"); 1815 1805 *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC | 1816 1806 1 << AMDGPU_RAS_BLOCK__DF); 1817 1807 } else 1818 - DRM_INFO("HBM ECC is not presented.\n"); 1808 + dev_info(adev->dev, "HBM ECC is not presented.\n"); 1819 1809 1820 1810 if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { 1821 - DRM_INFO("SRAM ECC is active.\n"); 1811 + dev_info(adev->dev, "SRAM ECC is active.\n"); 1822 1812 *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC | 1823 1813 1 << AMDGPU_RAS_BLOCK__DF); 1824 1814 } else 1825 - DRM_INFO("SRAM ECC is not presented.\n"); 1815 + dev_info(adev->dev, "SRAM ECC is not presented.\n"); 1826 1816 1827 1817 /* hw_supported needs to be aligned with RAS block mask. 
*/ 1828 1818 *hw_supported &= AMDGPU_RAS_BLOCK_MASK; ··· 1879 1869 if (amdgpu_ras_fs_init(adev)) 1880 1870 goto fs_out; 1881 1871 1882 - DRM_INFO("RAS INFO: ras initialized successfully, " 1872 + dev_info(adev->dev, "RAS INFO: ras initialized successfully, " 1883 1873 "hardware ability[%x] ras_mask[%x]\n", 1884 1874 con->hw_supported, con->supported); 1885 1875 return 0; ··· 2065 2055 return; 2066 2056 2067 2057 if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { 2068 - DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n"); 2058 + dev_info(adev->dev, "uncorrectable hardware error" 2059 + "(ERREVENT_ATHUB_INTERRUPT) detected!\n"); 2069 2060 2070 2061 amdgpu_ras_reset_gpu(adev); 2071 2062 }
+6 -4
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
··· 110 110 * even NOMEM error is encountered 111 111 */ 112 112 if(!err_data->err_addr) 113 - DRM_WARN("Failed to alloc memory for umc error address record!\n"); 113 + dev_warn(adev->dev, "Failed to alloc memory for " 114 + "umc error address record!\n"); 114 115 115 116 /* umc query_ras_error_address is also responsible for clearing 116 117 * error status ··· 121 120 122 121 /* only uncorrectable error needs gpu reset */ 123 122 if (err_data->ue_count) { 124 - dev_info(adev->dev, "%ld uncorrectable errors detected in UMC block\n", 125 - err_data->ue_count); 123 + dev_info(adev->dev, "%ld uncorrectable hardware errors " 124 + "detected in UMC block\n", 125 + err_data->ue_count); 126 126 127 127 if (err_data->err_addr_cnt && 128 128 amdgpu_ras_add_bad_pages(adev, err_data->err_addr, 129 129 err_data->err_addr_cnt)) 130 - DRM_WARN("Failed to add ras bad page!\n"); 130 + dev_warn(adev->dev, "Failed to add ras bad page!\n"); 131 131 132 132 amdgpu_ras_reset_gpu(adev); 133 133 }
+11 -5
drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
··· 323 323 obj->err_data.ce_count += err_data.ce_count; 324 324 325 325 if (err_data.ce_count) 326 - DRM_INFO("%ld correctable errors detected in %s block\n", 327 - obj->err_data.ce_count, adev->nbio.ras_if->name); 326 + dev_info(adev->dev, "%ld correctable hardware " 327 + "errors detected in %s block, " 328 + "no user action is needed.\n", 329 + obj->err_data.ce_count, 330 + adev->nbio.ras_if->name); 328 331 329 332 if (err_data.ue_count) 330 - DRM_INFO("%ld uncorrectable errors detected in %s block\n", 331 - obj->err_data.ue_count, adev->nbio.ras_if->name); 333 + dev_info(adev->dev, "%ld uncorrectable hardware " 334 + "errors detected in %s block\n", 335 + obj->err_data.ue_count, 336 + adev->nbio.ras_if->name); 332 337 333 - DRM_WARN("RAS controller interrupt triggered by NBIF error\n"); 338 + dev_info(adev->dev, "RAS controller interrupt triggered " 339 + "by NBIF error\n"); 334 340 335 341 /* ras_controller_int is dedicated for nbif ras error, 336 342 * not the global interrupt for sync flood