PM: EM: convert power field to micro-Watts precision and align drivers

+4 -3

drivers/cpufreq/mediatek-cpufreq-hw.c

··· 51 51 }; 52 52 53 53 static int __maybe_unused 54 - mtk_cpufreq_get_cpu_power(struct device *cpu_dev, unsigned long *mW, 54 + mtk_cpufreq_get_cpu_power(struct device *cpu_dev, unsigned long *uW, 55 55 unsigned long *KHz) 56 56 { 57 57 struct mtk_cpufreq_data *data; ··· 71 71 i--; 72 72 73 73 *KHz = data->table[i].frequency; 74 - *mW = readl_relaxed(data->reg_bases[REG_EM_POWER_TBL] + 75 - i * LUT_ROW_SIZE) / 1000; 74 + /* Provide micro-Watts value to the Energy Model */ 75 + *uW = readl_relaxed(data->reg_bases[REG_EM_POWER_TBL] + 76 + i * LUT_ROW_SIZE); 76 77 77 78 return 0; 78 79 }

+6

drivers/cpufreq/scmi-cpufreq.c

··· 19 19 #include <linux/slab.h> 20 20 #include <linux/scmi_protocol.h> 21 21 #include <linux/types.h> 22 + #include <linux/units.h> 22 23 23 24 struct scmi_data { 24 25 int domain_id; ··· 100 99 scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power, 101 100 unsigned long *KHz) 102 101 { 102 + bool power_scale_mw = perf_ops->power_scale_mw_get(ph); 103 103 unsigned long Hz; 104 104 int ret, domain; 105 105 ··· 113 111 ret = perf_ops->est_power_get(ph, domain, &Hz, power); 114 112 if (ret) 115 113 return ret; 114 + 115 + /* Provide bigger resolution power to the Energy Model */ 116 + if (power_scale_mw) 117 + *power *= MICROWATT_PER_MILLIWATT; 116 118 117 119 /* The EM framework specifies the frequency in KHz. */ 118 120 *KHz = Hz / 1000;

+8 -7

drivers/opp/of.c

··· 1443 1443 * It provides the power used by @dev at @kHz if it is the frequency of an 1444 1444 * existing OPP, or at the frequency of the first OPP above @kHz otherwise 1445 1445 * (see dev_pm_opp_find_freq_ceil()). This function updates @kHz to the ceiled 1446 - * frequency and @mW to the associated power. 1446 + * frequency and @uW to the associated power. 1447 1447 * 1448 1448 * Returns 0 on success or a proper -EINVAL value in case of error. 1449 1449 */ 1450 1450 static int __maybe_unused 1451 - _get_dt_power(struct device *dev, unsigned long *mW, unsigned long *kHz) 1451 + _get_dt_power(struct device *dev, unsigned long *uW, unsigned long *kHz) 1452 1452 { 1453 1453 struct dev_pm_opp *opp; 1454 1454 unsigned long opp_freq, opp_power; ··· 1465 1465 return -EINVAL; 1466 1466 1467 1467 *kHz = opp_freq / 1000; 1468 - *mW = opp_power / 1000; 1468 + *uW = opp_power; 1469 1469 1470 1470 return 0; 1471 1471 } ··· 1475 1475 * This computes the power estimated by @dev at @kHz if it is the frequency 1476 1476 * of an existing OPP, or at the frequency of the first OPP above @kHz otherwise 1477 1477 * (see dev_pm_opp_find_freq_ceil()). This function updates @kHz to the ceiled 1478 - * frequency and @mW to the associated power. The power is estimated as 1478 + * frequency and @uW to the associated power. The power is estimated as 1479 1479 * P = C * V^2 * f with C being the device's capacitance and V and f 1480 1480 * respectively the voltage and frequency of the OPP. 1481 1481 * 1482 1482 * Returns -EINVAL if the power calculation failed because of missing 1483 1483 * parameters, 0 otherwise. 1484 1484 */ 1485 - static int __maybe_unused _get_power(struct device *dev, unsigned long *mW, 1485 + static int __maybe_unused _get_power(struct device *dev, unsigned long *uW, 1486 1486 unsigned long *kHz) 1487 1487 { 1488 1488 struct dev_pm_opp *opp; ··· 1512 1512 return -EINVAL; 1513 1513 1514 1514 tmp = (u64)cap * mV * mV * (Hz / 1000000); 1515 - do_div(tmp, 1000000000); 1515 + /* Provide power in micro-Watts */ 1516 + do_div(tmp, 1000000); 1516 1517 1517 - *mW = (unsigned long)tmp; 1518 + *uW = (unsigned long)tmp; 1518 1519 *kHz = Hz / 1000; 1519 1520 1520 1521 return 0;

+2 -3

drivers/powercap/dtpm_cpu.c

··· 53 53 54 54 for (i = 0; i < pd->nr_perf_states; i++) { 55 55 56 - power = pd->table[i].power * MICROWATT_PER_MILLIWATT * nr_cpus; 56 + power = pd->table[i].power * nr_cpus; 57 57 58 58 if (power > power_limit) 59 59 break; ··· 63 63 64 64 freq_qos_update_request(&dtpm_cpu->qos_req, freq); 65 65 66 - power_limit = pd->table[i - 1].power * 67 - MICROWATT_PER_MILLIWATT * nr_cpus; 66 + power_limit = pd->table[i - 1].power * nr_cpus; 68 67 69 68 return power_limit; 70 69 }

+11 -2

drivers/thermal/cpufreq_cooling.c

··· 21 21 #include <linux/pm_qos.h> 22 22 #include <linux/slab.h> 23 23 #include <linux/thermal.h> 24 + #include <linux/units.h> 24 25 25 26 #include <trace/events/thermal.h> 26 27 ··· 102 101 static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev, 103 102 u32 freq) 104 103 { 104 + unsigned long power_mw; 105 105 int i; 106 106 107 107 for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) { ··· 110 108 break; 111 109 } 112 110 113 - return cpufreq_cdev->em->table[i + 1].power; 111 + power_mw = cpufreq_cdev->em->table[i + 1].power; 112 + power_mw /= MICROWATT_PER_MILLIWATT; 113 + 114 + return power_mw; 114 115 } 115 116 116 117 static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev, 117 118 u32 power) 118 119 { 120 + unsigned long em_power_mw; 119 121 int i; 120 122 121 123 for (i = cpufreq_cdev->max_level; i > 0; i--) { 122 - if (power >= cpufreq_cdev->em->table[i].power) 124 + /* Convert EM power to milli-Watts to make safe comparison */ 125 + em_power_mw = cpufreq_cdev->em->table[i].power; 126 + em_power_mw /= MICROWATT_PER_MILLIWATT; 127 + if (power >= em_power_mw) 123 128 break; 124 129 } 125 130

+15 -4

drivers/thermal/devfreq_cooling.c

··· 200 200 res = dfc->power_ops->get_real_power(df, power, freq, voltage); 201 201 if (!res) { 202 202 state = dfc->capped_state; 203 + 204 + /* Convert EM power into milli-Watts first */ 203 205 dfc->res_util = dfc->em_pd->table[state].power; 206 + dfc->res_util /= MICROWATT_PER_MILLIWATT; 207 + 204 208 dfc->res_util *= SCALE_ERROR_MITIGATION; 205 209 206 210 if (*power > 1) ··· 222 218 223 219 _normalize_load(&status); 224 220 225 - /* Scale power for utilization */ 221 + /* Convert EM power into milli-Watts first */ 226 222 *power = dfc->em_pd->table[perf_idx].power; 223 + *power /= MICROWATT_PER_MILLIWATT; 224 + /* Scale power for utilization */ 227 225 *power *= status.busy_time; 228 226 *power >>= 10; 229 227 } ··· 250 244 251 245 perf_idx = dfc->max_state - state; 252 246 *power = dfc->em_pd->table[perf_idx].power; 247 + *power /= MICROWATT_PER_MILLIWATT; 253 248 254 249 return 0; 255 250 } ··· 261 254 struct devfreq_cooling_device *dfc = cdev->devdata; 262 255 struct devfreq *df = dfc->devfreq; 263 256 struct devfreq_dev_status status; 264 - unsigned long freq; 257 + unsigned long freq, em_power_mw; 265 258 s32 est_power; 266 259 int i; 267 260 ··· 286 279 * Find the first cooling state that is within the power 287 280 * budget. The EM power table is sorted ascending. 288 281 */ 289 - for (i = dfc->max_state; i > 0; i--) 290 - if (est_power >= dfc->em_pd->table[i].power) 282 + for (i = dfc->max_state; i > 0; i--) { 283 + /* Convert EM power to milli-Watts to make safe comparison */ 284 + em_power_mw = dfc->em_pd->table[i].power; 285 + em_power_mw /= MICROWATT_PER_MILLIWATT; 286 + if (est_power >= em_power_mw) 291 287 break; 288 + } 292 289 293 290 *state = dfc->max_state - i; 294 291 dfc->capped_state = *state;

+39 -17

include/linux/energy_model.h

··· 62 62 /* 63 63 * em_perf_domain flags: 64 64 * 65 - * EM_PERF_DOMAIN_MILLIWATTS: The power values are in milli-Watts or some 65 + * EM_PERF_DOMAIN_MICROWATTS: The power values are in micro-Watts or some 66 66 * other scale. 67 67 * 68 68 * EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states when estimating ··· 71 71 * EM_PERF_DOMAIN_ARTIFICIAL: The power values are artificial and might be 72 72 * created by platform missing real power information 73 73 */ 74 - #define EM_PERF_DOMAIN_MILLIWATTS BIT(0) 74 + #define EM_PERF_DOMAIN_MICROWATTS BIT(0) 75 75 #define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1) 76 76 #define EM_PERF_DOMAIN_ARTIFICIAL BIT(2) 77 77 ··· 79 79 #define em_is_artificial(em) ((em)->flags & EM_PERF_DOMAIN_ARTIFICIAL) 80 80 81 81 #ifdef CONFIG_ENERGY_MODEL 82 - #define EM_MAX_POWER 0xFFFF 82 + /* 83 + * The max power value in micro-Watts. The limit of 64 Watts is set as 84 + * a safety net to not overflow multiplications on 32bit platforms. The 85 + * 32bit value limit for total Perf Domain power implies a limit of 86 + * maximum CPUs in such domain to 64. 87 + */ 88 + #define EM_MAX_POWER (64000000) /* 64 Watts */ 83 89 84 90 /* 85 - * Increase resolution of energy estimation calculations for 64-bit 86 - * architectures. The extra resolution improves decision made by EAS for the 87 - * task placement when two Performance Domains might provide similar energy 88 - * estimation values (w/o better resolution the values could be equal). 89 - * 90 - * We increase resolution only if we have enough bits to allow this increased 91 - * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit 92 - * are pretty high and the returns do not justify the increased costs. 91 + * To avoid possible energy estimation overflow on 32bit machines add 92 + * limits to number of CPUs in the Perf. Domain. 93 + * We are safe on 64bit machine, thus some big number. 93 94 */ 94 95 #ifdef CONFIG_64BIT 95 - #define em_scale_power(p) ((p) * 1000) 96 + #define EM_MAX_NUM_CPUS 4096 96 97 #else 97 - #define em_scale_power(p) (p) 98 + #define EM_MAX_NUM_CPUS 16 99 + #endif 100 + 101 + /* 102 + * To avoid an overflow on 32bit machines while calculating the energy 103 + * use a different order in the operation. First divide by the 'cpu_scale' 104 + * which would reduce big value stored in the 'cost' field, then multiply by 105 + * the 'sum_util'. This would allow to handle existing platforms, which have 106 + * e.g. power ~1.3 Watt at max freq, so the 'cost' value > 1mln micro-Watts. 107 + * In such scenario, where there are 4 CPUs in the Perf. Domain the 'sum_util' 108 + * could be 4096, then multiplication: 'cost' * 'sum_util' would overflow. 109 + * This reordering of operations has some limitations, we lose small 110 + * precision in the estimation (comparing to 64bit platform w/o reordering). 111 + * 112 + * We are safe on 64bit machine. 113 + */ 114 + #ifdef CONFIG_64BIT 115 + #define em_estimate_energy(cost, sum_util, scale_cpu) \ 116 + (((cost) * (sum_util)) / (scale_cpu)) 117 + #else 118 + #define em_estimate_energy(cost, sum_util, scale_cpu) \ 119 + (((cost) / (scale_cpu)) * (sum_util)) 98 120 #endif 99 121 100 122 struct em_data_callback { ··· 134 112 * and frequency. 135 113 * 136 114 * In case of CPUs, the power is the one of a single CPU in the domain, 137 - * expressed in milli-Watts or an abstract scale. It is expected to 115 + * expressed in micro-Watts or an abstract scale. It is expected to 138 116 * fit in the [0, EM_MAX_POWER] range. 139 117 * 140 118 * Return 0 on success. ··· 170 148 struct em_perf_domain *em_pd_get(struct device *dev); 171 149 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, 172 150 struct em_data_callback *cb, cpumask_t *span, 173 - bool milliwatts); 151 + bool microwatts); 174 152 void em_dev_unregister_perf_domain(struct device *dev); 175 153 176 154 /** ··· 295 273 * pd_nrg = ------------------------ (4) 296 274 * scale_cpu 297 275 */ 298 - return ps->cost * sum_util / scale_cpu; 276 + return em_estimate_energy(ps->cost, sum_util, scale_cpu); 299 277 } 300 278 301 279 /** ··· 319 297 static inline 320 298 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, 321 299 struct em_data_callback *cb, cpumask_t *span, 322 - bool milliwatts) 300 + bool microwatts) 323 301 { 324 302 return -EINVAL; 325 303 }

+16 -8

kernel/power/energy_model.c

··· 145 145 146 146 /* 147 147 * The power returned by active_state() is expected to be 148 - * positive and to fit into 16 bits. 148 + * positive and be in range. 149 149 */ 150 150 if (!power || power > EM_MAX_POWER) { 151 151 dev_err(dev, "EM: invalid power: %lu\n", ··· 170 170 goto free_ps_table; 171 171 } 172 172 } else { 173 - power_res = em_scale_power(table[i].power); 173 + power_res = table[i].power; 174 174 cost = div64_u64(fmax * power_res, table[i].frequency); 175 175 } 176 176 ··· 201 201 { 202 202 struct em_perf_domain *pd; 203 203 struct device *cpu_dev; 204 - int cpu, ret; 204 + int cpu, ret, num_cpus; 205 205 206 206 if (_is_cpu_device(dev)) { 207 + num_cpus = cpumask_weight(cpus); 208 + 209 + /* Prevent max possible energy calculation to not overflow */ 210 + if (num_cpus > EM_MAX_NUM_CPUS) { 211 + dev_err(dev, "EM: too many CPUs, overflow possible\n"); 212 + return -EINVAL; 213 + } 214 + 207 215 pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); 208 216 if (!pd) 209 217 return -ENOMEM; ··· 322 314 * @cpus : Pointer to cpumask_t, which in case of a CPU device is 323 315 * obligatory. It can be taken from i.e. 'policy->cpus'. For other 324 316 * type of devices this should be set to NULL. 325 - * @milliwatts : Flag indicating that the power values are in milliWatts or 317 + * @microwatts : Flag indicating that the power values are in micro-Watts or 326 318 * in some other scale. It must be set properly. 327 319 * 328 320 * Create Energy Model tables for a performance domain using the callbacks 329 321 * defined in cb. 330 322 * 331 - * The @milliwatts is important to set with correct value. Some kernel 323 + * The @microwatts is important to set with correct value. Some kernel 332 324 * sub-systems might rely on this flag and check if all devices in the EM are 333 325 * using the same scale. 334 326 * ··· 339 331 */ 340 332 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, 341 333 struct em_data_callback *cb, cpumask_t *cpus, 342 - bool milliwatts) 334 + bool microwatts) 343 335 { 344 336 unsigned long cap, prev_cap = 0; 345 337 unsigned long flags = 0; ··· 389 381 } 390 382 } 391 383 392 - if (milliwatts) 393 - flags |= EM_PERF_DOMAIN_MILLIWATTS; 384 + if (microwatts) 385 + flags |= EM_PERF_DOMAIN_MICROWATTS; 394 386 else if (cb->get_cost) 395 387 flags |= EM_PERF_DOMAIN_ARTIFICIAL; 396 388