Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

PM: EM: convert power field to micro-Watts precision and align drivers

The milli-Watts precision causes rounding errors while calculating
efficiency cost for each OPP. This is especially visible in the 'simple'
Energy Model (EM), where the power for each OPP is provided from OPP
framework. This can cause some OPPs to be marked inefficient, which
might not happen when using micro-Watts precision.

Update all EM users which access 'power' field and assume the value is
in milli-Watts.

Also solve an issue with a potential overflow in the calculation of the
energy estimation on 32bit machines. It's needed now since the power
value (and thus the 'cost' as well) is higher.

Example calculation which shows the rounding error and impact:

power = 'dyn-power-coeff' * volt_mV * volt_mV * freq_MHz

power_a_uW = (100 * 600mV * 600mV * 500MHz) / 10^6 = 18000
power_a_mW = (100 * 600mV * 600mV * 500MHz) / 10^9 = 18

power_b_uW = (100 * 605mV * 605mV * 600MHz) / 10^6 = 21961
power_b_mW = (100 * 605mV * 605mV * 600MHz) / 10^9 = 21

max_freq = 2000MHz

cost_a_mW = 18 * 2000MHz/500MHz = 72
cost_a_uW = 18000 * 2000MHz/500MHz = 72000

cost_b_mW = 21 * 2000MHz/600MHz = 70 // <- artificially better
cost_b_uW = 21961 * 2000MHz/600MHz = 73203

The 'cost_b_mW' (which is based on old milli-Watts) is misleadingly
better than the 'cost_b_uW' (this patch uses micro-Watts) and such
would have impact on the 'inefficient OPPs' information in the Cpufreq
framework. This patch set removes the rounding issue.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

authored by

Lukasz Luba and committed by
Rafael J. Wysocki
ae6ccaa6 32346491

+101 -44
+4 -3
drivers/cpufreq/mediatek-cpufreq-hw.c
··· 51 51 }; 52 52 53 53 static int __maybe_unused 54 - mtk_cpufreq_get_cpu_power(struct device *cpu_dev, unsigned long *mW, 54 + mtk_cpufreq_get_cpu_power(struct device *cpu_dev, unsigned long *uW, 55 55 unsigned long *KHz) 56 56 { 57 57 struct mtk_cpufreq_data *data; ··· 71 71 i--; 72 72 73 73 *KHz = data->table[i].frequency; 74 - *mW = readl_relaxed(data->reg_bases[REG_EM_POWER_TBL] + 75 - i * LUT_ROW_SIZE) / 1000; 74 + /* Provide micro-Watts value to the Energy Model */ 75 + *uW = readl_relaxed(data->reg_bases[REG_EM_POWER_TBL] + 76 + i * LUT_ROW_SIZE); 76 77 77 78 return 0; 78 79 }
+6
drivers/cpufreq/scmi-cpufreq.c
··· 19 19 #include <linux/slab.h> 20 20 #include <linux/scmi_protocol.h> 21 21 #include <linux/types.h> 22 + #include <linux/units.h> 22 23 23 24 struct scmi_data { 24 25 int domain_id; ··· 100 99 scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power, 101 100 unsigned long *KHz) 102 101 { 102 + bool power_scale_mw = perf_ops->power_scale_mw_get(ph); 103 103 unsigned long Hz; 104 104 int ret, domain; 105 105 ··· 113 111 ret = perf_ops->est_power_get(ph, domain, &Hz, power); 114 112 if (ret) 115 113 return ret; 114 + 115 + /* Provide bigger resolution power to the Energy Model */ 116 + if (power_scale_mw) 117 + *power *= MICROWATT_PER_MILLIWATT; 116 118 117 119 /* The EM framework specifies the frequency in KHz. */ 118 120 *KHz = Hz / 1000;
+8 -7
drivers/opp/of.c
··· 1443 1443 * It provides the power used by @dev at @kHz if it is the frequency of an 1444 1444 * existing OPP, or at the frequency of the first OPP above @kHz otherwise 1445 1445 * (see dev_pm_opp_find_freq_ceil()). This function updates @kHz to the ceiled 1446 - * frequency and @mW to the associated power. 1446 + * frequency and @uW to the associated power. 1447 1447 * 1448 1448 * Returns 0 on success or a proper -EINVAL value in case of error. 1449 1449 */ 1450 1450 static int __maybe_unused 1451 - _get_dt_power(struct device *dev, unsigned long *mW, unsigned long *kHz) 1451 + _get_dt_power(struct device *dev, unsigned long *uW, unsigned long *kHz) 1452 1452 { 1453 1453 struct dev_pm_opp *opp; 1454 1454 unsigned long opp_freq, opp_power; ··· 1465 1465 return -EINVAL; 1466 1466 1467 1467 *kHz = opp_freq / 1000; 1468 - *mW = opp_power / 1000; 1468 + *uW = opp_power; 1469 1469 1470 1470 return 0; 1471 1471 } ··· 1475 1475 * This computes the power estimated by @dev at @kHz if it is the frequency 1476 1476 * of an existing OPP, or at the frequency of the first OPP above @kHz otherwise 1477 1477 * (see dev_pm_opp_find_freq_ceil()). This function updates @kHz to the ceiled 1478 - * frequency and @mW to the associated power. The power is estimated as 1478 + * frequency and @uW to the associated power. The power is estimated as 1479 1479 * P = C * V^2 * f with C being the device's capacitance and V and f 1480 1480 * respectively the voltage and frequency of the OPP. 1481 1481 * 1482 1482 * Returns -EINVAL if the power calculation failed because of missing 1483 1483 * parameters, 0 otherwise. 
1484 1484 */ 1485 - static int __maybe_unused _get_power(struct device *dev, unsigned long *mW, 1485 + static int __maybe_unused _get_power(struct device *dev, unsigned long *uW, 1486 1486 unsigned long *kHz) 1487 1487 { 1488 1488 struct dev_pm_opp *opp; ··· 1512 1512 return -EINVAL; 1513 1513 1514 1514 tmp = (u64)cap * mV * mV * (Hz / 1000000); 1515 - do_div(tmp, 1000000000); 1515 + /* Provide power in micro-Watts */ 1516 + do_div(tmp, 1000000); 1516 1517 1517 - *mW = (unsigned long)tmp; 1518 + *uW = (unsigned long)tmp; 1518 1519 *kHz = Hz / 1000; 1519 1520 1520 1521 return 0;
+2 -3
drivers/powercap/dtpm_cpu.c
··· 53 53 54 54 for (i = 0; i < pd->nr_perf_states; i++) { 55 55 56 - power = pd->table[i].power * MICROWATT_PER_MILLIWATT * nr_cpus; 56 + power = pd->table[i].power * nr_cpus; 57 57 58 58 if (power > power_limit) 59 59 break; ··· 63 63 64 64 freq_qos_update_request(&dtpm_cpu->qos_req, freq); 65 65 66 - power_limit = pd->table[i - 1].power * 67 - MICROWATT_PER_MILLIWATT * nr_cpus; 66 + power_limit = pd->table[i - 1].power * nr_cpus; 68 67 69 68 return power_limit; 70 69 }
+11 -2
drivers/thermal/cpufreq_cooling.c
··· 21 21 #include <linux/pm_qos.h> 22 22 #include <linux/slab.h> 23 23 #include <linux/thermal.h> 24 + #include <linux/units.h> 24 25 25 26 #include <trace/events/thermal.h> 26 27 ··· 102 101 static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev, 103 102 u32 freq) 104 103 { 104 + unsigned long power_mw; 105 105 int i; 106 106 107 107 for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) { ··· 110 108 break; 111 109 } 112 110 113 - return cpufreq_cdev->em->table[i + 1].power; 111 + power_mw = cpufreq_cdev->em->table[i + 1].power; 112 + power_mw /= MICROWATT_PER_MILLIWATT; 113 + 114 + return power_mw; 114 115 } 115 116 116 117 static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev, 117 118 u32 power) 118 119 { 120 + unsigned long em_power_mw; 119 121 int i; 120 122 121 123 for (i = cpufreq_cdev->max_level; i > 0; i--) { 122 - if (power >= cpufreq_cdev->em->table[i].power) 124 + /* Convert EM power to milli-Watts to make safe comparison */ 125 + em_power_mw = cpufreq_cdev->em->table[i].power; 126 + em_power_mw /= MICROWATT_PER_MILLIWATT; 127 + if (power >= em_power_mw) 123 128 break; 124 129 } 125 130
+15 -4
drivers/thermal/devfreq_cooling.c
··· 200 200 res = dfc->power_ops->get_real_power(df, power, freq, voltage); 201 201 if (!res) { 202 202 state = dfc->capped_state; 203 + 204 + /* Convert EM power into milli-Watts first */ 203 205 dfc->res_util = dfc->em_pd->table[state].power; 206 + dfc->res_util /= MICROWATT_PER_MILLIWATT; 207 + 204 208 dfc->res_util *= SCALE_ERROR_MITIGATION; 205 209 206 210 if (*power > 1) ··· 222 218 223 219 _normalize_load(&status); 224 220 225 - /* Scale power for utilization */ 221 + /* Convert EM power into milli-Watts first */ 226 222 *power = dfc->em_pd->table[perf_idx].power; 223 + *power /= MICROWATT_PER_MILLIWATT; 224 + /* Scale power for utilization */ 227 225 *power *= status.busy_time; 228 226 *power >>= 10; 229 227 } ··· 250 244 251 245 perf_idx = dfc->max_state - state; 252 246 *power = dfc->em_pd->table[perf_idx].power; 247 + *power /= MICROWATT_PER_MILLIWATT; 253 248 254 249 return 0; 255 250 } ··· 261 254 struct devfreq_cooling_device *dfc = cdev->devdata; 262 255 struct devfreq *df = dfc->devfreq; 263 256 struct devfreq_dev_status status; 264 - unsigned long freq; 257 + unsigned long freq, em_power_mw; 265 258 s32 est_power; 266 259 int i; 267 260 ··· 286 279 * Find the first cooling state that is within the power 287 280 * budget. The EM power table is sorted ascending. 288 281 */ 289 - for (i = dfc->max_state; i > 0; i--) 290 - if (est_power >= dfc->em_pd->table[i].power) 282 + for (i = dfc->max_state; i > 0; i--) { 283 + /* Convert EM power to milli-Watts to make safe comparison */ 284 + em_power_mw = dfc->em_pd->table[i].power; 285 + em_power_mw /= MICROWATT_PER_MILLIWATT; 286 + if (est_power >= em_power_mw) 291 287 break; 288 + } 292 289 293 290 *state = dfc->max_state - i; 294 291 dfc->capped_state = *state;
+39 -17
include/linux/energy_model.h
··· 62 62 /* 63 63 * em_perf_domain flags: 64 64 * 65 - * EM_PERF_DOMAIN_MILLIWATTS: The power values are in milli-Watts or some 65 + * EM_PERF_DOMAIN_MICROWATTS: The power values are in micro-Watts or some 66 66 * other scale. 67 67 * 68 68 * EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states when estimating ··· 71 71 * EM_PERF_DOMAIN_ARTIFICIAL: The power values are artificial and might be 72 72 * created by platform missing real power information 73 73 */ 74 - #define EM_PERF_DOMAIN_MILLIWATTS BIT(0) 74 + #define EM_PERF_DOMAIN_MICROWATTS BIT(0) 75 75 #define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1) 76 76 #define EM_PERF_DOMAIN_ARTIFICIAL BIT(2) 77 77 ··· 79 79 #define em_is_artificial(em) ((em)->flags & EM_PERF_DOMAIN_ARTIFICIAL) 80 80 81 81 #ifdef CONFIG_ENERGY_MODEL 82 - #define EM_MAX_POWER 0xFFFF 82 + /* 83 + * The max power value in micro-Watts. The limit of 64 Watts is set as 84 + * a safety net to not overflow multiplications on 32bit platforms. The 85 + * 32bit value limit for total Perf Domain power implies a limit of 86 + * maximum CPUs in such domain to 64. 87 + */ 88 + #define EM_MAX_POWER (64000000) /* 64 Watts */ 83 89 84 90 /* 85 - * Increase resolution of energy estimation calculations for 64-bit 86 - * architectures. The extra resolution improves decision made by EAS for the 87 - * task placement when two Performance Domains might provide similar energy 88 - * estimation values (w/o better resolution the values could be equal). 89 - * 90 - * We increase resolution only if we have enough bits to allow this increased 91 - * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit 92 - * are pretty high and the returns do not justify the increased costs. 91 + * To avoid possible energy estimation overflow on 32bit machines add 92 + * limits to number of CPUs in the Perf. Domain. 93 + * We are safe on 64bit machine, thus some big number. 
93 94 */ 94 95 #ifdef CONFIG_64BIT 95 - #define em_scale_power(p) ((p) * 1000) 96 + #define EM_MAX_NUM_CPUS 4096 96 97 #else 97 - #define em_scale_power(p) (p) 98 + #define EM_MAX_NUM_CPUS 16 99 + #endif 100 + 101 + /* 102 + * To avoid an overflow on 32bit machines while calculating the energy 103 + * use a different order in the operation. First divide by the 'cpu_scale' 104 + * which would reduce big value stored in the 'cost' field, then multiply by 105 + * the 'sum_util'. This would allow to handle existing platforms, which have 106 + * e.g. power ~1.3 Watt at max freq, so the 'cost' value > 1mln micro-Watts. 107 + * In such scenario, where there are 4 CPUs in the Perf. Domain the 'sum_util' 108 + * could be 4096, then multiplication: 'cost' * 'sum_util' would overflow. 109 + * This reordering of operations has some limitations, we lose small 110 + * precision in the estimation (comparing to 64bit platform w/o reordering). 111 + * 112 + * We are safe on 64bit machine. 113 + */ 114 + #ifdef CONFIG_64BIT 115 + #define em_estimate_energy(cost, sum_util, scale_cpu) \ 116 + (((cost) * (sum_util)) / (scale_cpu)) 117 + #else 118 + #define em_estimate_energy(cost, sum_util, scale_cpu) \ 119 + (((cost) / (scale_cpu)) * (sum_util)) 98 120 #endif 99 121 100 122 struct em_data_callback { ··· 134 112 * and frequency. 135 113 * 136 114 * In case of CPUs, the power is the one of a single CPU in the domain, 137 - * expressed in milli-Watts or an abstract scale. It is expected to 115 + * expressed in micro-Watts or an abstract scale. It is expected to 138 116 * fit in the [0, EM_MAX_POWER] range. 139 117 * 140 118 * Return 0 on success. 
··· 170 148 struct em_perf_domain *em_pd_get(struct device *dev); 171 149 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, 172 150 struct em_data_callback *cb, cpumask_t *span, 173 - bool milliwatts); 151 + bool microwatts); 174 152 void em_dev_unregister_perf_domain(struct device *dev); 175 153 176 154 /** ··· 295 273 * pd_nrg = ------------------------ (4) 296 274 * scale_cpu 297 275 */ 298 - return ps->cost * sum_util / scale_cpu; 276 + return em_estimate_energy(ps->cost, sum_util, scale_cpu); 299 277 } 300 278 301 279 /** ··· 319 297 static inline 320 298 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, 321 299 struct em_data_callback *cb, cpumask_t *span, 322 - bool milliwatts) 300 + bool microwatts) 323 301 { 324 302 return -EINVAL; 325 303 }
+16 -8
kernel/power/energy_model.c
··· 145 145 146 146 /* 147 147 * The power returned by active_state() is expected to be 148 - * positive and to fit into 16 bits. 148 + * positive and be in range. 149 149 */ 150 150 if (!power || power > EM_MAX_POWER) { 151 151 dev_err(dev, "EM: invalid power: %lu\n", ··· 170 170 goto free_ps_table; 171 171 } 172 172 } else { 173 - power_res = em_scale_power(table[i].power); 173 + power_res = table[i].power; 174 174 cost = div64_u64(fmax * power_res, table[i].frequency); 175 175 } 176 176 ··· 201 201 { 202 202 struct em_perf_domain *pd; 203 203 struct device *cpu_dev; 204 - int cpu, ret; 204 + int cpu, ret, num_cpus; 205 205 206 206 if (_is_cpu_device(dev)) { 207 + num_cpus = cpumask_weight(cpus); 208 + 209 + /* Prevent max possible energy calculation to not overflow */ 210 + if (num_cpus > EM_MAX_NUM_CPUS) { 211 + dev_err(dev, "EM: too many CPUs, overflow possible\n"); 212 + return -EINVAL; 213 + } 214 + 207 215 pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); 208 216 if (!pd) 209 217 return -ENOMEM; ··· 322 314 * @cpus : Pointer to cpumask_t, which in case of a CPU device is 323 315 * obligatory. It can be taken from i.e. 'policy->cpus'. For other 324 316 * type of devices this should be set to NULL. 325 - * @milliwatts : Flag indicating that the power values are in milliWatts or 317 + * @microwatts : Flag indicating that the power values are in micro-Watts or 326 318 * in some other scale. It must be set properly. 327 319 * 328 320 * Create Energy Model tables for a performance domain using the callbacks 329 321 * defined in cb. 330 322 * 331 - * The @milliwatts is important to set with correct value. Some kernel 323 + * The @microwatts is important to set with correct value. Some kernel 332 324 * sub-systems might rely on this flag and check if all devices in the EM are 333 325 * using the same scale. 
334 326 * ··· 339 331 */ 340 332 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, 341 333 struct em_data_callback *cb, cpumask_t *cpus, 342 - bool milliwatts) 334 + bool microwatts) 343 335 { 344 336 unsigned long cap, prev_cap = 0; 345 337 unsigned long flags = 0; ··· 389 381 } 390 382 } 391 383 392 - if (milliwatts) 393 - flags |= EM_PERF_DOMAIN_MILLIWATTS; 384 + if (microwatts) 385 + flags |= EM_PERF_DOMAIN_MICROWATTS; 394 386 else if (cb->get_cost) 395 387 flags |= EM_PERF_DOMAIN_ARTIFICIAL; 396 388