Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

cpupower: mperf monitor - Use TSC to calculate max frequency if possible

This makes the implementation independent of cpufreq drivers.
As a result, it also works on a Xen kernel, where the hypervisor
performs the frequency switching and idle-state entry.

Signed-off-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>

authored by

Thomas Renninger and committed by
Dominik Brodowski
2dfc818b 75f25bd3

+130 -47
+1 -1
tools/power/cpupower/Makefile
··· 24 24 25 25 # Set the following to `true' to make a unstripped, unoptimized 26 26 # binary. Leave this set to `false' for production use. 27 - DEBUG ?= false 27 + DEBUG ?= true 28 28 29 29 # make the build silent. Set this to something else to make it noisy again. 30 30 V ?= false
+129 -46
tools/power/cpupower/utils/idle_monitor/mperf_monitor.c
··· 22 22 23 23 #define MSR_TSC 0x10 24 24 25 + #define MSR_AMD_HWCR 0xc0010015 26 + 25 27 enum mperf_id { C0 = 0, Cx, AVG_FREQ, MPERF_CSTATE_COUNT }; 26 28 27 29 static int mperf_get_count_percent(unsigned int self_id, double *percent, 28 30 unsigned int cpu); 29 31 static int mperf_get_count_freq(unsigned int id, unsigned long long *count, 30 32 unsigned int cpu); 33 + static struct timespec time_start, time_end; 31 34 32 35 static cstate_t mperf_cstates[MPERF_CSTATE_COUNT] = { 33 36 { ··· 57 54 }, 58 55 }; 59 56 57 + enum MAX_FREQ_MODE { MAX_FREQ_SYSFS, MAX_FREQ_TSC_REF }; 58 + static int max_freq_mode; 59 + /* 60 + * The max frequency mperf is ticking at (in C0), either retrieved via: 61 + * 1) calculated after measurements if we know TSC ticks at mperf/P0 frequency 62 + * 2) cpufreq /sys/devices/.../cpu0/cpufreq/cpuinfo_max_freq at init time 63 + * 1. Is preferred as it also works without cpufreq subsystem (e.g. on Xen) 64 + */ 65 + static unsigned long max_frequency; 66 + 60 67 static unsigned long long tsc_at_measure_start; 61 68 static unsigned long long tsc_at_measure_end; 62 - static unsigned long max_frequency; 63 69 static unsigned long long *mperf_previous_count; 64 70 static unsigned long long *aperf_previous_count; 65 71 static unsigned long long *mperf_current_count; 66 72 static unsigned long long *aperf_current_count; 73 + 67 74 /* valid flag for all CPUs. 
If a MSR read failed it will be zero */ 68 75 static int *is_valid; 69 76 70 77 static int mperf_get_tsc(unsigned long long *tsc) 71 78 { 72 - return read_msr(0, MSR_TSC, tsc); 79 + int ret; 80 + ret = read_msr(0, MSR_TSC, tsc); 81 + if (ret) 82 + dprint("Reading TSC MSR failed, returning %llu\n", *tsc); 83 + return ret; 73 84 } 74 85 75 86 static int mperf_init_stats(unsigned int cpu) ··· 114 97 return 0; 115 98 } 116 99 117 - /* 118 - * get_average_perf() 119 - * 120 - * Returns the average performance (also considers boosted frequencies) 121 - * 122 - * Input: 123 - * aperf_diff: Difference of the aperf register over a time period 124 - * mperf_diff: Difference of the mperf register over the same time period 125 - * max_freq: Maximum frequency (P0) 126 - * 127 - * Returns: 128 - * Average performance over the time period 129 - */ 130 - static unsigned long get_average_perf(unsigned long long aperf_diff, 131 - unsigned long long mperf_diff) 132 - { 133 - unsigned int perf_percent = 0; 134 - if (((unsigned long)(-1) / 100) < aperf_diff) { 135 - int shift_count = 7; 136 - aperf_diff >>= shift_count; 137 - mperf_diff >>= shift_count; 138 - } 139 - perf_percent = (aperf_diff * 100) / mperf_diff; 140 - return (max_frequency * perf_percent) / 100; 141 - } 142 - 143 100 static int mperf_get_count_percent(unsigned int id, double *percent, 144 101 unsigned int cpu) 145 102 { 146 103 unsigned long long aperf_diff, mperf_diff, tsc_diff; 104 + unsigned long long timediff; 147 105 148 106 if (!is_valid[cpu]) 149 107 return -1; ··· 128 136 129 137 mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu]; 130 138 aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu]; 131 - tsc_diff = tsc_at_measure_end - tsc_at_measure_start; 132 139 133 - *percent = 100.0 * mperf_diff / tsc_diff; 134 - dprint("%s: mperf_diff: %llu, tsc_diff: %llu\n", 135 - mperf_cstates[id].name, mperf_diff, tsc_diff); 140 + if (max_freq_mode == MAX_FREQ_TSC_REF) { 141 + tsc_diff = 
tsc_at_measure_end - tsc_at_measure_start; 142 + *percent = 100.0 * mperf_diff / tsc_diff; 143 + dprint("%s: TSC Ref - mperf_diff: %llu, tsc_diff: %llu\n", 144 + mperf_cstates[id].name, mperf_diff, tsc_diff); 145 + } else if (max_freq_mode == MAX_FREQ_SYSFS) { 146 + timediff = timespec_diff_us(time_start, time_end); 147 + *percent = 100.0 * mperf_diff / timediff; 148 + dprint("%s: MAXFREQ - mperf_diff: %llu, time_diff: %llu\n", 149 + mperf_cstates[id].name, mperf_diff, timediff); 150 + } else 151 + return -1; 136 152 137 153 if (id == Cx) 138 154 *percent = 100.0 - *percent; ··· 154 154 static int mperf_get_count_freq(unsigned int id, unsigned long long *count, 155 155 unsigned int cpu) 156 156 { 157 - unsigned long long aperf_diff, mperf_diff; 157 + unsigned long long aperf_diff, mperf_diff, time_diff, tsc_diff; 158 158 159 159 if (id != AVG_FREQ) 160 160 return 1; ··· 165 165 mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu]; 166 166 aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu]; 167 167 168 - /* Return MHz for now, might want to return KHz if column width is more 169 - generic */ 170 - *count = get_average_perf(aperf_diff, mperf_diff) / 1000; 171 - dprint("%s: %llu\n", mperf_cstates[id].name, *count); 168 + if (max_freq_mode == MAX_FREQ_TSC_REF) { 169 + /* Calculate max_freq from TSC count */ 170 + tsc_diff = tsc_at_measure_end - tsc_at_measure_start; 171 + time_diff = timespec_diff_us(time_start, time_end); 172 + max_frequency = tsc_diff / time_diff; 173 + } 172 174 175 + *count = max_frequency * ((double)aperf_diff / mperf_diff); 176 + dprint("%s: Average freq based on %s maximum frequency:\n", 177 + mperf_cstates[id].name, 178 + (max_freq_mode == MAX_FREQ_TSC_REF) ? 
"TSC calculated" : "sysfs read"); 179 + dprint("%max_frequency: %lu", max_frequency); 180 + dprint("aperf_diff: %llu\n", aperf_diff); 181 + dprint("mperf_diff: %llu\n", mperf_diff); 182 + dprint("avg freq: %llu\n", *count); 173 183 return 0; 174 184 } 175 185 ··· 188 178 int cpu; 189 179 unsigned long long dbg; 190 180 181 + clock_gettime(CLOCK_REALTIME, &time_start); 191 182 mperf_get_tsc(&tsc_at_measure_start); 192 183 193 184 for (cpu = 0; cpu < cpu_count; cpu++) ··· 204 193 unsigned long long dbg; 205 194 int cpu; 206 195 207 - mperf_get_tsc(&tsc_at_measure_end); 208 - 209 196 for (cpu = 0; cpu < cpu_count; cpu++) 210 197 mperf_measure_stats(cpu); 198 + 199 + mperf_get_tsc(&tsc_at_measure_end); 200 + clock_gettime(CLOCK_REALTIME, &time_end); 211 201 212 202 mperf_get_tsc(&dbg); 213 203 dprint("TSC diff: %llu\n", dbg - tsc_at_measure_end); ··· 216 204 return 0; 217 205 } 218 206 219 - struct cpuidle_monitor mperf_monitor; 220 - 221 - struct cpuidle_monitor *mperf_register(void) 207 + /* 208 + * Mperf register is defined to tick at P0 (maximum) frequency 209 + * 210 + * Instead of reading out P0 which can be tricky to read out from HW, 211 + * we use TSC counter if it reliably ticks at P0/mperf frequency. 212 + * 213 + * Still try to fall back to: 214 + * /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq 215 + * on older Intel HW without invariant TSC feature. 216 + * Or on AMD machines where TSC does not tick at P0 (do not exist yet, but 217 + * it's still double checked (MSR_AMD_HWCR)). 218 + * 219 + * On these machines the user would still get useful mperf 220 + * stats when acpi-cpufreq driver is loaded. 
221 + */ 222 + static int init_maxfreq_mode(void) 222 223 { 224 + int ret; 225 + unsigned long long hwcr; 223 226 unsigned long min; 224 227 225 - if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_APERF)) 226 - return NULL; 228 + if (!cpupower_cpu_info.caps & CPUPOWER_CAP_INV_TSC) 229 + goto use_sysfs; 227 230 228 - /* Assume min/max all the same on all cores */ 231 + if (cpupower_cpu_info.vendor == X86_VENDOR_AMD) { 232 + /* MSR_AMD_HWCR tells us whether TSC runs at P0/mperf 233 + * freq. 234 + * A test whether hwcr is accessable/available would be: 235 + * (cpupower_cpu_info.family > 0x10 || 236 + * cpupower_cpu_info.family == 0x10 && 237 + * cpupower_cpu_info.model >= 0x2)) 238 + * This should be the case for all aperf/mperf 239 + * capable AMD machines and is therefore safe to test here. 240 + * Compare with Linus kernel git commit: acf01734b1747b1ec4 241 + */ 242 + ret = read_msr(0, MSR_AMD_HWCR, &hwcr); 243 + /* 244 + * If the MSR read failed, assume a Xen system that did 245 + * not explicitly provide access to it and assume TSC works 246 + */ 247 + if (ret != 0) { 248 + dprint("TSC read 0x%x failed - assume TSC working\n", 249 + MSR_AMD_HWCR); 250 + return 0; 251 + } else if (1 & (hwcr >> 24)) { 252 + max_freq_mode = MAX_FREQ_TSC_REF; 253 + return 0; 254 + } else { /* Use sysfs max frequency if available */ } 255 + } else if (cpupower_cpu_info.vendor == X86_VENDOR_INTEL) { 256 + /* 257 + * On Intel we assume mperf (in C0) is ticking at same 258 + * rate than TSC 259 + */ 260 + max_freq_mode = MAX_FREQ_TSC_REF; 261 + return 0; 262 + } 263 + use_sysfs: 229 264 if (cpufreq_get_hardware_limits(0, &min, &max_frequency)) { 230 265 dprint("Cannot retrieve max freq from cpufreq kernel " 231 266 "subsystem\n"); 232 - return NULL; 267 + return -1; 233 268 } 269 + max_freq_mode = MAX_FREQ_SYSFS; 270 + return 0; 271 + } 272 + 273 + /* 274 + * This monitor provides: 275 + * 276 + * 1) Average frequency a CPU resided in 277 + * This always works if the CPU has aperf/mperf 
capabilities 278 + * 279 + * 2) C0 and Cx (any sleep state) time a CPU resided in 280 + * Works if mperf timer stops ticking in sleep states which 281 + * seem to be the case on all current HW. 282 + * Both is directly retrieved from HW registers and is independent 283 + * from kernel statistics. 284 + */ 285 + struct cpuidle_monitor mperf_monitor; 286 + struct cpuidle_monitor *mperf_register(void) 287 + { 288 + if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_APERF)) 289 + return NULL; 290 + 291 + if (init_maxfreq_mode()) 292 + return NULL; 234 293 235 294 /* Free this at program termination */ 236 295 is_valid = calloc(cpu_count, sizeof(int));