thermal: devfreq_cooling: add new interface for direct power read

This patch introduces a new interface for device drivers connected to
devfreq_cooling in the thermal framework: get_real_power().

Some devices have more sophisticated methods (like power counters)
to approximate the actual power that they use.
In the previous implementation we had a pre-calculated power
table which was then scaled by 'utilization' (the ratio of
'busy_time' to 'total_time', taken from devfreq 'last_status').

With this new interface the driver can provide more precise data
regarding actual power to the thermal governor every time the power
budget is calculated. We then use this value to calculate the real
resource utilization scaling factor.

Reviewed-by: Chris Diamand <chris.diamand@arm.com>
Acked-by: Javi Merino <javi.merino@kernel.org>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>

authored by Lukasz Luba and committed by Zhang Rui 2be83da8 e34cab4c

+101 -23
+82 -23
drivers/thermal/devfreq_cooling.c
··· 28 29 #include <trace/events/thermal.h> 30 31 static DEFINE_IDA(devfreq_ida); 32 33 /** ··· 47 * @freq_table_size: Size of the @freq_table and @power_table 48 * @power_ops: Pointer to devfreq_cooling_power, used to generate the 49 * @power_table. 50 */ 51 struct devfreq_cooling_device { 52 int id; ··· 63 u32 *freq_table; 64 size_t freq_table_size; 65 struct devfreq_cooling_power *power_ops; 66 }; 67 68 /** ··· 260 return power; 261 } 262 263 static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cdev, 264 struct thermal_zone_device *tz, 265 u32 *power) ··· 279 struct devfreq_dev_status *status = &df->last_status; 280 unsigned long state; 281 unsigned long freq = status->current_frequency; 282 - u32 dyn_power, static_power; 283 284 - /* Get dynamic power for state */ 285 state = freq_get_state(dfc, freq); 286 - if (state == THERMAL_CSTATE_INVALID) 287 - return -EAGAIN; 288 289 - dyn_power = dfc->power_table[state]; 290 291 - /* Scale dynamic power for utilization */ 292 - dyn_power = (dyn_power * status->busy_time) / status->total_time; 293 294 - /* Get static power */ 295 - static_power = get_static_power(dfc, freq); 296 297 trace_thermal_power_devfreq_get_power(cdev, status, freq, dyn_power, 298 static_power); 299 300 - *power = dyn_power + static_power; 301 - 302 return 0; 303 } 304 305 static int devfreq_cooling_state2power(struct thermal_cooling_device *cdev, ··· 360 unsigned long busy_time; 361 s32 dyn_power; 362 u32 static_power; 363 int i; 364 365 - static_power = get_static_power(dfc, freq); 366 367 - dyn_power = power - static_power; 368 - dyn_power = dyn_power > 0 ? dyn_power : 0; 369 370 - /* Scale dynamic power for utilization */ 371 - busy_time = status->busy_time ?: 1; 372 - dyn_power = (dyn_power * status->total_time) / busy_time; 373 374 /* 375 * Find the first cooling state that is within the power 376 * budget for dynamic power. 
377 */ 378 for (i = 0; i < dfc->freq_table_size - 1; i++) 379 - if (dyn_power >= dfc->power_table[i]) 380 break; 381 382 *state = i; 383 trace_thermal_power_devfreq_limit(cdev, freq, *state, power); 384 return 0; 385 } ··· 443 } 444 445 for (i = 0, freq = ULONG_MAX; i < num_opps; i++, freq--) { 446 - unsigned long power_dyn, voltage; 447 struct dev_pm_opp *opp; 448 449 opp = dev_pm_opp_find_freq_floor(dev, &freq); ··· 456 dev_pm_opp_put(opp); 457 458 if (dfc->power_ops) { 459 - power_dyn = get_dynamic_power(dfc, freq, voltage); 460 461 - dev_dbg(dev, "Dynamic power table: %lu MHz @ %lu mV: %lu = %lu mW\n", 462 - freq / 1000000, voltage, power_dyn, power_dyn); 463 464 - power_table[i] = power_dyn; 465 } 466 467 freq_table[i] = freq;
··· 28 29 #include <trace/events/thermal.h> 30 31 + #define SCALE_ERROR_MITIGATION 100 32 + 33 static DEFINE_IDA(devfreq_ida); 34 35 /** ··· 45 * @freq_table_size: Size of the @freq_table and @power_table 46 * @power_ops: Pointer to devfreq_cooling_power, used to generate the 47 * @power_table. 48 + * @res_util: Resource utilization scaling factor for the power. 49 + * It is multiplied by 100 to minimize the error. It is used 50 + * for estimation of the power budget instead of using 51 + * 'utilization' (which is 'busy_time / 'total_time'). 52 + * The 'res_util' range is from 100 to (power_table[state] * 100) 53 + * for the corresponding 'state'. 54 */ 55 struct devfreq_cooling_device { 56 int id; ··· 55 u32 *freq_table; 56 size_t freq_table_size; 57 struct devfreq_cooling_power *power_ops; 58 + u32 res_util; 59 + int capped_state; 60 }; 61 62 /** ··· 250 return power; 251 } 252 253 + 254 + static inline unsigned long get_total_power(struct devfreq_cooling_device *dfc, 255 + unsigned long freq, 256 + unsigned long voltage) 257 + { 258 + return get_static_power(dfc, freq) + get_dynamic_power(dfc, freq, 259 + voltage); 260 + } 261 + 262 + 263 static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cdev, 264 struct thermal_zone_device *tz, 265 u32 *power) ··· 259 struct devfreq_dev_status *status = &df->last_status; 260 unsigned long state; 261 unsigned long freq = status->current_frequency; 262 + unsigned long voltage; 263 + u32 dyn_power = 0; 264 + u32 static_power = 0; 265 + int res; 266 267 state = freq_get_state(dfc, freq); 268 + if (state == THERMAL_CSTATE_INVALID) { 269 + res = -EAGAIN; 270 + goto fail; 271 + } 272 273 + if (dfc->power_ops->get_real_power) { 274 + voltage = get_voltage(df, freq); 275 + if (voltage == 0) { 276 + res = -EINVAL; 277 + goto fail; 278 + } 279 280 + res = dfc->power_ops->get_real_power(df, power, freq, voltage); 281 + if (!res) { 282 + state = dfc->capped_state; 283 + dfc->res_util = dfc->power_table[state]; 
284 + dfc->res_util *= SCALE_ERROR_MITIGATION; 285 286 + if (*power > 1) 287 + dfc->res_util /= *power; 288 + } else { 289 + goto fail; 290 + } 291 + } else { 292 + dyn_power = dfc->power_table[state]; 293 + 294 + /* Scale dynamic power for utilization */ 295 + dyn_power *= status->busy_time; 296 + dyn_power /= status->total_time; 297 + /* Get static power */ 298 + static_power = get_static_power(dfc, freq); 299 + 300 + *power = dyn_power + static_power; 301 + } 302 303 trace_thermal_power_devfreq_get_power(cdev, status, freq, dyn_power, 304 static_power); 305 306 return 0; 307 + fail: 308 + /* It is safe to set max in this case */ 309 + dfc->res_util = SCALE_ERROR_MITIGATION; 310 + return res; 311 } 312 313 static int devfreq_cooling_state2power(struct thermal_cooling_device *cdev, ··· 312 unsigned long busy_time; 313 s32 dyn_power; 314 u32 static_power; 315 + s32 est_power; 316 int i; 317 318 + if (dfc->power_ops->get_real_power) { 319 + /* Scale for resource utilization */ 320 + est_power = power * dfc->res_util; 321 + est_power /= SCALE_ERROR_MITIGATION; 322 + } else { 323 + static_power = get_static_power(dfc, freq); 324 325 + dyn_power = power - static_power; 326 + dyn_power = dyn_power > 0 ? dyn_power : 0; 327 328 + /* Scale dynamic power for utilization */ 329 + busy_time = status->busy_time ?: 1; 330 + est_power = (dyn_power * status->total_time) / busy_time; 331 + } 332 333 /* 334 * Find the first cooling state that is within the power 335 * budget for dynamic power. 
336 */ 337 for (i = 0; i < dfc->freq_table_size - 1; i++) 338 + if (est_power >= dfc->power_table[i]) 339 break; 340 341 *state = i; 342 + dfc->capped_state = i; 343 trace_thermal_power_devfreq_limit(cdev, freq, *state, power); 344 return 0; 345 } ··· 387 } 388 389 for (i = 0, freq = ULONG_MAX; i < num_opps; i++, freq--) { 390 + unsigned long power, voltage; 391 struct dev_pm_opp *opp; 392 393 opp = dev_pm_opp_find_freq_floor(dev, &freq); ··· 400 dev_pm_opp_put(opp); 401 402 if (dfc->power_ops) { 403 + if (dfc->power_ops->get_real_power) 404 + power = get_total_power(dfc, freq, voltage); 405 + else 406 + power = get_dynamic_power(dfc, freq, voltage); 407 408 + dev_dbg(dev, "Power table: %lu MHz @ %lu mV: %lu = %lu mW\n", 409 + freq / 1000000, voltage, power, power); 410 411 + power_table[i] = power; 412 } 413 414 freq_table[i] = freq;
+19
include/linux/devfreq_cooling.h
··· 34 * If get_dynamic_power() is NULL, then the 35 * dynamic power is calculated as 36 * @dyn_power_coeff * frequency * voltage^2 37 */ 38 struct devfreq_cooling_power { 39 unsigned long (*get_static_power)(struct devfreq *devfreq, ··· 58 unsigned long (*get_dynamic_power)(struct devfreq *devfreq, 59 unsigned long freq, 60 unsigned long voltage); 61 unsigned long dyn_power_coeff; 62 }; 63
··· 34 * If get_dynamic_power() is NULL, then the 35 * dynamic power is calculated as 36 * @dyn_power_coeff * frequency * voltage^2 37 + * @get_real_power: When this is set, the framework uses it to ask the 38 + * device driver for the actual power. 39 + * Some devices have more sophisticated methods 40 + * (like power counters) to approximate the actual power 41 + * that they use. 42 + * This function provides more accurate data to the 43 + * thermal governor. When the driver does not provide 44 + * such function, framework just uses pre-calculated 45 + * table and scale the power by 'utilization' 46 + * (based on 'busy_time' and 'total_time' taken from 47 + * devfreq 'last_status'). 48 + * The value returned by this function must be lower 49 + * or equal than the maximum power value 50 + * for the current state 51 + * (which can be found in power_table[state]). 52 + * When this interface is used, the power_table holds 53 + * max total (static + dynamic) power value for each OPP. 54 */ 55 struct devfreq_cooling_power { 56 unsigned long (*get_static_power)(struct devfreq *devfreq, ··· 41 unsigned long (*get_dynamic_power)(struct devfreq *devfreq, 42 unsigned long freq, 43 unsigned long voltage); 44 + int (*get_real_power)(struct devfreq *df, u32 *power, 45 + unsigned long freq, unsigned long voltage); 46 unsigned long dyn_power_coeff; 47 }; 48