Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

drm/amd/pm: add new gpu_metrics_v2_3 to acquire average temperature info

Add new gpu_metrics_v2_3 to acquire average temperature info from SMU metrics. Average temperature info should be obtainable through the gpu_metrics interface, but gpu_metrics_v2_2 only has members for current temperature info.
---
v1:
Only add average_temperature_gfx in gpu_metrics_v2_3.
v2:
Add average temperature members for soc, core and l3 to gpu_metrics_v2_3, placed at the end of the structure. Add code to read the average temperature info from the metrics table.
v3:
Merge v1 and v2 and rename the patch.
v4:
Merge v3. Add a firmware version check in vangogh_common_get_gpu_metrics to maintain backward compatibility, and rename the patch. Return ret directly when smu_cmn_get_smc_version fails.

Signed-off-by: Li Ma <li.ma@amd.com>
Reviewed-by: Evan Quan <evan.quan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Li Ma and committed by
Alex Deucher
0d6516ef ee108183

+209 -12
+58
drivers/gpu/drm/amd/include/kgd_pp_interface.h
··· 824 824 uint64_t indep_throttle_status; 825 825 }; 826 826 827 + struct gpu_metrics_v2_3 { 828 + struct metrics_table_header common_header; 829 + 830 + /* Temperature */ 831 + uint16_t temperature_gfx; // gfx temperature on APUs 832 + uint16_t temperature_soc; // soc temperature on APUs 833 + uint16_t temperature_core[8]; // CPU core temperature on APUs 834 + uint16_t temperature_l3[2]; 835 + 836 + /* Utilization */ 837 + uint16_t average_gfx_activity; 838 + uint16_t average_mm_activity; // UVD or VCN 839 + 840 + /* Driver attached timestamp (in ns) */ 841 + uint64_t system_clock_counter; 842 + 843 + /* Power/Energy */ 844 + uint16_t average_socket_power; // dGPU + APU power on A + A platform 845 + uint16_t average_cpu_power; 846 + uint16_t average_soc_power; 847 + uint16_t average_gfx_power; 848 + uint16_t average_core_power[8]; // CPU core power on APUs 849 + 850 + /* Average clocks */ 851 + uint16_t average_gfxclk_frequency; 852 + uint16_t average_socclk_frequency; 853 + uint16_t average_uclk_frequency; 854 + uint16_t average_fclk_frequency; 855 + uint16_t average_vclk_frequency; 856 + uint16_t average_dclk_frequency; 857 + 858 + /* Current clocks */ 859 + uint16_t current_gfxclk; 860 + uint16_t current_socclk; 861 + uint16_t current_uclk; 862 + uint16_t current_fclk; 863 + uint16_t current_vclk; 864 + uint16_t current_dclk; 865 + uint16_t current_coreclk[8]; // CPU core clocks 866 + uint16_t current_l3clk[2]; 867 + 868 + /* Throttle status (ASIC dependent) */ 869 + uint32_t throttle_status; 870 + 871 + /* Fans */ 872 + uint16_t fan_pwm; 873 + 874 + uint16_t padding[3]; 875 + 876 + /* Throttle status (ASIC independent) */ 877 + uint64_t indep_throttle_status; 878 + 879 + /* Average Temperature */ 880 + uint16_t average_temperature_gfx; // average gfx temperature on APUs 881 + uint16_t average_temperature_soc; // average soc temperature on APUs 882 + uint16_t average_temperature_core[8]; // average CPU core temperature on APUs 883 + uint16_t 
average_temperature_l3[2]; 884 + }; 827 885 #endif
+148 -12
drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c
··· 223 223 { 224 224 struct smu_table_context *smu_table = &smu->smu_table; 225 225 struct smu_table *tables = smu_table->tables; 226 - struct amdgpu_device *adev = smu->adev; 227 226 uint32_t if_version; 227 + uint32_t smu_version; 228 228 uint32_t ret = 0; 229 229 230 - ret = smu_cmn_get_smc_version(smu, &if_version, NULL); 230 + ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version); 231 231 if (ret) { 232 - dev_err(adev->dev, "Failed to get smu if version!\n"); 233 - goto err0_out; 232 + return ret; 234 233 } 235 234 236 235 SMU_TABLE_INIT(tables, SMU_TABLE_WATERMARKS, sizeof(Watermarks_t), ··· 254 255 goto err0_out; 255 256 smu_table->metrics_time = 0; 256 257 257 - smu_table->gpu_metrics_table_size = sizeof(struct gpu_metrics_v2_2); 258 + if (smu_version >= 0x043F3E00) 259 + smu_table->gpu_metrics_table_size = sizeof(struct gpu_metrics_v2_3); 260 + else 261 + smu_table->gpu_metrics_table_size = sizeof(struct gpu_metrics_v2_2); 258 262 smu_table->gpu_metrics_table = kzalloc(smu_table->gpu_metrics_table_size, GFP_KERNEL); 259 263 if (!smu_table->gpu_metrics_table) 260 264 goto err1_out; ··· 1650 1648 return 0; 1651 1649 } 1652 1650 1651 + static ssize_t vangogh_get_legacy_gpu_metrics_v2_3(struct smu_context *smu, 1652 + void **table) 1653 + { 1654 + struct smu_table_context *smu_table = &smu->smu_table; 1655 + struct gpu_metrics_v2_3 *gpu_metrics = 1656 + (struct gpu_metrics_v2_3 *)smu_table->gpu_metrics_table; 1657 + SmuMetrics_legacy_t metrics; 1658 + int ret = 0; 1659 + 1660 + ret = smu_cmn_get_metrics_table(smu, &metrics, true); 1661 + if (ret) 1662 + return ret; 1663 + 1664 + smu_cmn_init_soft_gpu_metrics(gpu_metrics, 2, 3); 1665 + 1666 + gpu_metrics->temperature_gfx = metrics.GfxTemperature; 1667 + gpu_metrics->temperature_soc = metrics.SocTemperature; 1668 + memcpy(&gpu_metrics->temperature_core[0], 1669 + &metrics.CoreTemperature[0], 1670 + sizeof(uint16_t) * 4); 1671 + gpu_metrics->temperature_l3[0] = metrics.L3Temperature[0]; 1672 + 1673 + 
gpu_metrics->average_gfx_activity = metrics.GfxActivity; 1674 + gpu_metrics->average_mm_activity = metrics.UvdActivity; 1675 + 1676 + gpu_metrics->average_socket_power = metrics.CurrentSocketPower; 1677 + gpu_metrics->average_cpu_power = metrics.Power[0]; 1678 + gpu_metrics->average_soc_power = metrics.Power[1]; 1679 + gpu_metrics->average_gfx_power = metrics.Power[2]; 1680 + memcpy(&gpu_metrics->average_core_power[0], 1681 + &metrics.CorePower[0], 1682 + sizeof(uint16_t) * 4); 1683 + 1684 + gpu_metrics->average_gfxclk_frequency = metrics.GfxclkFrequency; 1685 + gpu_metrics->average_socclk_frequency = metrics.SocclkFrequency; 1686 + gpu_metrics->average_uclk_frequency = metrics.MemclkFrequency; 1687 + gpu_metrics->average_fclk_frequency = metrics.MemclkFrequency; 1688 + gpu_metrics->average_vclk_frequency = metrics.VclkFrequency; 1689 + gpu_metrics->average_dclk_frequency = metrics.DclkFrequency; 1690 + 1691 + memcpy(&gpu_metrics->current_coreclk[0], 1692 + &metrics.CoreFrequency[0], 1693 + sizeof(uint16_t) * 4); 1694 + gpu_metrics->current_l3clk[0] = metrics.L3Frequency[0]; 1695 + 1696 + gpu_metrics->throttle_status = metrics.ThrottlerStatus; 1697 + gpu_metrics->indep_throttle_status = 1698 + smu_cmn_get_indep_throttler_status(metrics.ThrottlerStatus, 1699 + vangogh_throttler_map); 1700 + 1701 + gpu_metrics->system_clock_counter = ktime_get_boottime_ns(); 1702 + 1703 + *table = (void *)gpu_metrics; 1704 + 1705 + return sizeof(struct gpu_metrics_v2_3); 1706 + } 1707 + 1653 1708 static ssize_t vangogh_get_legacy_gpu_metrics(struct smu_context *smu, 1654 1709 void **table) 1655 1710 { ··· 1762 1703 *table = (void *)gpu_metrics; 1763 1704 1764 1705 return sizeof(struct gpu_metrics_v2_2); 1706 + } 1707 + 1708 + static ssize_t vangogh_get_gpu_metrics_v2_3(struct smu_context *smu, 1709 + void **table) 1710 + { 1711 + struct smu_table_context *smu_table = &smu->smu_table; 1712 + struct gpu_metrics_v2_3 *gpu_metrics = 1713 + (struct gpu_metrics_v2_3 
*)smu_table->gpu_metrics_table; 1714 + SmuMetrics_t metrics; 1715 + int ret = 0; 1716 + 1717 + ret = smu_cmn_get_metrics_table(smu, &metrics, true); 1718 + if (ret) 1719 + return ret; 1720 + 1721 + smu_cmn_init_soft_gpu_metrics(gpu_metrics, 2, 3); 1722 + 1723 + gpu_metrics->temperature_gfx = metrics.Current.GfxTemperature; 1724 + gpu_metrics->temperature_soc = metrics.Current.SocTemperature; 1725 + memcpy(&gpu_metrics->temperature_core[0], 1726 + &metrics.Current.CoreTemperature[0], 1727 + sizeof(uint16_t) * 4); 1728 + gpu_metrics->temperature_l3[0] = metrics.Current.L3Temperature[0]; 1729 + 1730 + gpu_metrics->average_temperature_gfx = metrics.Average.GfxTemperature; 1731 + gpu_metrics->average_temperature_soc = metrics.Average.SocTemperature; 1732 + memcpy(&gpu_metrics->average_temperature_core[0], 1733 + &metrics.Average.CoreTemperature[0], 1734 + sizeof(uint16_t) * 4); 1735 + gpu_metrics->average_temperature_l3[0] = metrics.Average.L3Temperature[0]; 1736 + 1737 + gpu_metrics->average_gfx_activity = metrics.Current.GfxActivity; 1738 + gpu_metrics->average_mm_activity = metrics.Current.UvdActivity; 1739 + 1740 + gpu_metrics->average_socket_power = metrics.Current.CurrentSocketPower; 1741 + gpu_metrics->average_cpu_power = metrics.Current.Power[0]; 1742 + gpu_metrics->average_soc_power = metrics.Current.Power[1]; 1743 + gpu_metrics->average_gfx_power = metrics.Current.Power[2]; 1744 + memcpy(&gpu_metrics->average_core_power[0], 1745 + &metrics.Average.CorePower[0], 1746 + sizeof(uint16_t) * 4); 1747 + 1748 + gpu_metrics->average_gfxclk_frequency = metrics.Average.GfxclkFrequency; 1749 + gpu_metrics->average_socclk_frequency = metrics.Average.SocclkFrequency; 1750 + gpu_metrics->average_uclk_frequency = metrics.Average.MemclkFrequency; 1751 + gpu_metrics->average_fclk_frequency = metrics.Average.MemclkFrequency; 1752 + gpu_metrics->average_vclk_frequency = metrics.Average.VclkFrequency; 1753 + gpu_metrics->average_dclk_frequency = metrics.Average.DclkFrequency; 
1754 + 1755 + gpu_metrics->current_gfxclk = metrics.Current.GfxclkFrequency; 1756 + gpu_metrics->current_socclk = metrics.Current.SocclkFrequency; 1757 + gpu_metrics->current_uclk = metrics.Current.MemclkFrequency; 1758 + gpu_metrics->current_fclk = metrics.Current.MemclkFrequency; 1759 + gpu_metrics->current_vclk = metrics.Current.VclkFrequency; 1760 + gpu_metrics->current_dclk = metrics.Current.DclkFrequency; 1761 + 1762 + memcpy(&gpu_metrics->current_coreclk[0], 1763 + &metrics.Current.CoreFrequency[0], 1764 + sizeof(uint16_t) * 4); 1765 + gpu_metrics->current_l3clk[0] = metrics.Current.L3Frequency[0]; 1766 + 1767 + gpu_metrics->throttle_status = metrics.Current.ThrottlerStatus; 1768 + gpu_metrics->indep_throttle_status = 1769 + smu_cmn_get_indep_throttler_status(metrics.Current.ThrottlerStatus, 1770 + vangogh_throttler_map); 1771 + 1772 + gpu_metrics->system_clock_counter = ktime_get_boottime_ns(); 1773 + 1774 + *table = (void *)gpu_metrics; 1775 + 1776 + return sizeof(struct gpu_metrics_v2_3); 1765 1777 } 1766 1778 1767 1779 static ssize_t vangogh_get_gpu_metrics(struct smu_context *smu, ··· 1902 1772 static ssize_t vangogh_common_get_gpu_metrics(struct smu_context *smu, 1903 1773 void **table) 1904 1774 { 1905 - struct amdgpu_device *adev = smu->adev; 1906 1775 uint32_t if_version; 1776 + uint32_t smu_version; 1907 1777 int ret = 0; 1908 1778 1909 - ret = smu_cmn_get_smc_version(smu, &if_version, NULL); 1779 + ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version); 1910 1780 if (ret) { 1911 - dev_err(adev->dev, "Failed to get smu if version!\n"); 1912 1781 return ret; 1913 1782 } 1914 1783 1915 - if (if_version < 0x3) 1916 - ret = vangogh_get_legacy_gpu_metrics(smu, table); 1917 - else 1918 - ret = vangogh_get_gpu_metrics(smu, table); 1784 + if (smu_version >= 0x043F3E00) { 1785 + if (if_version < 0x3) 1786 + ret = vangogh_get_legacy_gpu_metrics_v2_3(smu, table); 1787 + else 1788 + ret = vangogh_get_gpu_metrics_v2_3(smu, table); 1789 + } else { 1790 + 
if (if_version < 0x3) 1791 + ret = vangogh_get_legacy_gpu_metrics(smu, table); 1792 + else 1793 + ret = vangogh_get_gpu_metrics(smu, table); 1794 + } 1919 1795 1920 1796 return ret; 1921 1797 }
+3
drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
··· 969 969 case METRICS_VERSION(2, 2): 970 970 structure_size = sizeof(struct gpu_metrics_v2_2); 971 971 break; 972 + case METRICS_VERSION(2, 3): 973 + structure_size = sizeof(struct gpu_metrics_v2_3); 974 + break; 972 975 default: 973 976 return; 974 977 }