Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched/cpufreq: Rename arch_update_thermal_pressure() => arch_update_hw_pressure()

Now that cpufreq provides a pressure value to the scheduler, rename
arch_update_thermal_pressure into HW pressure to reflect that it returns
a pressure applied by HW (i.e. with a high frequency change) and not
always related to thermal mitigation but also generated by max current
limitation as an example. Such high frequency signal needs filtering to be
smoothed and provide a value that reflects the average available capacity
into the scheduler time scale.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Qais Yousef <qyousef@layalina.io>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://lore.kernel.org/r/20240326091616.3696851-5-vincent.guittot@linaro.org

authored by

Vincent Guittot and committed by
Ingo Molnar
d4dbc991 c281afe2

+77 -77
+3 -3
arch/arm/include/asm/topology.h
··· 22 22 /* Enable topology flag updates */ 23 23 #define arch_update_cpu_topology topology_update_cpu_topology 24 24 25 - /* Replace task scheduler's default thermal pressure API */ 26 - #define arch_scale_thermal_pressure topology_get_thermal_pressure 27 - #define arch_update_thermal_pressure topology_update_thermal_pressure 25 + /* Replace task scheduler's default HW pressure API */ 26 + #define arch_scale_hw_pressure topology_get_hw_pressure 27 + #define arch_update_hw_pressure topology_update_hw_pressure 28 28 29 29 #else 30 30
+3 -3
arch/arm64/include/asm/topology.h
··· 35 35 /* Enable topology flag updates */ 36 36 #define arch_update_cpu_topology topology_update_cpu_topology 37 37 38 - /* Replace task scheduler's default thermal pressure API */ 39 - #define arch_scale_thermal_pressure topology_get_thermal_pressure 40 - #define arch_update_thermal_pressure topology_update_thermal_pressure 38 + /* Replace task scheduler's default HW pressure API */ 39 + #define arch_scale_hw_pressure topology_get_hw_pressure 40 + #define arch_update_hw_pressure topology_update_hw_pressure 41 41 42 42 #include <asm-generic/topology.h> 43 43
+13 -13
drivers/base/arch_topology.c
··· 22 22 #include <linux/units.h> 23 23 24 24 #define CREATE_TRACE_POINTS 25 - #include <trace/events/thermal_pressure.h> 25 + #include <trace/events/hw_pressure.h> 26 26 27 27 static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); 28 28 static struct cpumask scale_freq_counters_mask; ··· 160 160 per_cpu(cpu_scale, cpu) = capacity; 161 161 } 162 162 163 - DEFINE_PER_CPU(unsigned long, thermal_pressure); 163 + DEFINE_PER_CPU(unsigned long, hw_pressure); 164 164 165 165 /** 166 - * topology_update_thermal_pressure() - Update thermal pressure for CPUs 166 + * topology_update_hw_pressure() - Update HW pressure for CPUs 167 167 * @cpus : The related CPUs for which capacity has been reduced 168 168 * @capped_freq : The maximum allowed frequency that CPUs can run at 169 169 * 170 - * Update the value of thermal pressure for all @cpus in the mask. The 170 + * Update the value of HW pressure for all @cpus in the mask. The 171 171 * cpumask should include all (online+offline) affected CPUs, to avoid 172 172 * operating on stale data when hot-plug is used for some CPUs. The 173 173 * @capped_freq reflects the currently allowed max CPUs frequency due to 174 - * thermal capping. It might be also a boost frequency value, which is bigger 174 + * HW capping. It might be also a boost frequency value, which is bigger 175 175 * than the internal 'capacity_freq_ref' max frequency. In such case the 176 176 * pressure value should simply be removed, since this is an indication that 177 - * there is no thermal throttling. The @capped_freq must be provided in kHz. 177 + * there is no HW throttling. The @capped_freq must be provided in kHz. 
178 178 */ 179 - void topology_update_thermal_pressure(const struct cpumask *cpus, 179 + void topology_update_hw_pressure(const struct cpumask *cpus, 180 180 unsigned long capped_freq) 181 181 { 182 - unsigned long max_capacity, capacity, th_pressure; 182 + unsigned long max_capacity, capacity, hw_pressure; 183 183 u32 max_freq; 184 184 int cpu; 185 185 ··· 189 189 190 190 /* 191 191 * Handle properly the boost frequencies, which should simply clean 192 - * the thermal pressure value. 192 + * the HW pressure value. 193 193 */ 194 194 if (max_freq <= capped_freq) 195 195 capacity = max_capacity; 196 196 else 197 197 capacity = mult_frac(max_capacity, capped_freq, max_freq); 198 198 199 - th_pressure = max_capacity - capacity; 199 + hw_pressure = max_capacity - capacity; 200 200 201 - trace_thermal_pressure_update(cpu, th_pressure); 201 + trace_hw_pressure_update(cpu, hw_pressure); 202 202 203 203 for_each_cpu(cpu, cpus) 204 - WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); 204 + WRITE_ONCE(per_cpu(hw_pressure, cpu), hw_pressure); 205 205 } 206 - EXPORT_SYMBOL_GPL(topology_update_thermal_pressure); 206 + EXPORT_SYMBOL_GPL(topology_update_hw_pressure); 207 207 208 208 static ssize_t cpu_capacity_show(struct device *dev, 209 209 struct device_attribute *attr,
+2 -2
drivers/cpufreq/qcom-cpufreq-hw.c
··· 347 347 348 348 throttled_freq = freq_hz / HZ_PER_KHZ; 349 349 350 - /* Update thermal pressure (the boost frequencies are accepted) */ 351 - arch_update_thermal_pressure(policy->related_cpus, throttled_freq); 350 + /* Update HW pressure (the boost frequencies are accepted) */ 351 + arch_update_hw_pressure(policy->related_cpus, throttled_freq); 352 352 353 353 /* 354 354 * In the unlikely case policy is unregistered do not enable
+4 -4
include/linux/arch_topology.h
··· 60 60 void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus); 61 61 void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus); 62 62 63 - DECLARE_PER_CPU(unsigned long, thermal_pressure); 63 + DECLARE_PER_CPU(unsigned long, hw_pressure); 64 64 65 - static inline unsigned long topology_get_thermal_pressure(int cpu) 65 + static inline unsigned long topology_get_hw_pressure(int cpu) 66 66 { 67 - return per_cpu(thermal_pressure, cpu); 67 + return per_cpu(hw_pressure, cpu); 68 68 } 69 69 70 - void topology_update_thermal_pressure(const struct cpumask *cpus, 70 + void topology_update_hw_pressure(const struct cpumask *cpus, 71 71 unsigned long capped_freq); 72 72 73 73 struct cpu_topology {
+4 -4
include/linux/sched/topology.h
··· 270 270 } 271 271 #endif 272 272 273 - #ifndef arch_scale_thermal_pressure 273 + #ifndef arch_scale_hw_pressure 274 274 static __always_inline 275 - unsigned long arch_scale_thermal_pressure(int cpu) 275 + unsigned long arch_scale_hw_pressure(int cpu) 276 276 { 277 277 return 0; 278 278 } 279 279 #endif 280 280 281 - #ifndef arch_update_thermal_pressure 281 + #ifndef arch_update_hw_pressure 282 282 static __always_inline 283 - void arch_update_thermal_pressure(const struct cpumask *cpus, 283 + void arch_update_hw_pressure(const struct cpumask *cpus, 284 284 unsigned long capped_frequency) 285 285 { } 286 286 #endif
+1 -1
include/trace/events/sched.h
··· 752 752 TP_PROTO(struct rq *rq), 753 753 TP_ARGS(rq)); 754 754 755 - DECLARE_TRACE(pelt_thermal_tp, 755 + DECLARE_TRACE(pelt_hw_tp, 756 756 TP_PROTO(struct rq *rq), 757 757 TP_ARGS(rq)); 758 758
+7 -7
include/trace/events/thermal_pressure.h include/trace/events/hw_pressure.h
··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 2 #undef TRACE_SYSTEM 3 - #define TRACE_SYSTEM thermal_pressure 3 + #define TRACE_SYSTEM hw_pressure 4 4 5 5 #if !defined(_TRACE_THERMAL_PRESSURE_H) || defined(TRACE_HEADER_MULTI_READ) 6 6 #define _TRACE_THERMAL_PRESSURE_H 7 7 8 8 #include <linux/tracepoint.h> 9 9 10 - TRACE_EVENT(thermal_pressure_update, 11 - TP_PROTO(int cpu, unsigned long thermal_pressure), 12 - TP_ARGS(cpu, thermal_pressure), 10 + TRACE_EVENT(hw_pressure_update, 11 + TP_PROTO(int cpu, unsigned long hw_pressure), 12 + TP_ARGS(cpu, hw_pressure), 13 13 14 14 TP_STRUCT__entry( 15 - __field(unsigned long, thermal_pressure) 15 + __field(unsigned long, hw_pressure) 16 16 __field(int, cpu) 17 17 ), 18 18 19 19 TP_fast_assign( 20 - __entry->thermal_pressure = thermal_pressure; 20 + __entry->hw_pressure = hw_pressure; 21 21 __entry->cpu = cpu; 22 22 ), 23 23 24 - TP_printk("cpu=%d thermal_pressure=%lu", __entry->cpu, __entry->thermal_pressure) 24 + TP_printk("cpu=%d hw_pressure=%lu", __entry->cpu, __entry->hw_pressure) 25 25 ); 26 26 #endif /* _TRACE_THERMAL_PRESSURE_H */ 27 27
+6 -6
init/Kconfig
··· 547 547 depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING 548 548 depends on SMP 549 549 550 - config SCHED_THERMAL_PRESSURE 550 + config SCHED_HW_PRESSURE 551 551 bool 552 552 default y if ARM && ARM_CPU_TOPOLOGY 553 553 default y if ARM64 554 554 depends on SMP 555 555 depends on CPU_FREQ_THERMAL 556 556 help 557 - Select this option to enable thermal pressure accounting in the 558 - scheduler. Thermal pressure is the value conveyed to the scheduler 557 + Select this option to enable HW pressure accounting in the 558 + scheduler. HW pressure is the value conveyed to the scheduler 559 559 that reflects the reduction in CPU compute capacity resulted from 560 - thermal throttling. Thermal throttling occurs when the performance of 561 - a CPU is capped due to high operating temperatures. 560 + HW throttling. HW throttling occurs when the performance of 561 + a CPU is capped due to high operating temperatures as an example. 562 562 563 563 If selected, the scheduler will be able to balance tasks accordingly, 564 564 i.e. put less load on throttled CPUs than on non/less throttled ones. 565 565 566 566 This requires the architecture to implement 567 - arch_update_thermal_pressure() and arch_scale_thermal_pressure(). 567 + arch_update_hw_pressure() and arch_scale_thermal_pressure(). 568 568 569 569 config BSD_PROCESS_ACCT 570 570 bool "BSD Process Accounting"
+4 -4
kernel/sched/core.c
··· 108 108 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); 109 109 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); 110 110 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); 111 - EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp); 111 + EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp); 112 112 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); 113 113 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); 114 114 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); ··· 5668 5668 struct rq *rq = cpu_rq(cpu); 5669 5669 struct task_struct *curr = rq->curr; 5670 5670 struct rq_flags rf; 5671 - unsigned long thermal_pressure; 5671 + unsigned long hw_pressure; 5672 5672 u64 resched_latency; 5673 5673 5674 5674 if (housekeeping_cpu(cpu, HK_TYPE_TICK)) ··· 5679 5679 rq_lock(rq, &rf); 5680 5680 5681 5681 update_rq_clock(rq); 5682 - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); 5683 - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); 5682 + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); 5683 + update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure); 5684 5684 curr->sched_class->task_tick(rq, curr, 0); 5685 5685 if (sched_feat(LATENCY_WARN)) 5686 5686 resched_latency = cpu_resched_latency(rq);
+8 -8
kernel/sched/fair.c
··· 78 78 79 79 const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 80 80 81 - int sched_thermal_decay_shift; 81 + int sched_hw_decay_shift; 82 82 static int __init setup_sched_thermal_decay_shift(char *str) 83 83 { 84 84 int _shift = 0; ··· 86 86 if (kstrtoint(str, 0, &_shift)) 87 87 pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n"); 88 88 89 - sched_thermal_decay_shift = clamp(_shift, 0, 10); 89 + sched_hw_decay_shift = clamp(_shift, 0, 10); 90 90 return 1; 91 91 } 92 92 __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); ··· 4969 4969 { 4970 4970 unsigned long capacity = arch_scale_cpu_capacity(cpu); 4971 4971 4972 - capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); 4972 + capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); 4973 4973 4974 4974 return capacity; 4975 4975 } ··· 5002 5002 * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it 5003 5003 * should fit a little cpu even if there's some pressure. 5004 5004 * 5005 - * Only exception is for thermal pressure since it has a direct impact 5005 + * Only exception is for HW or cpufreq pressure since it has a direct impact 5006 5006 * on available OPP of the system. 
5007 5007 * 5008 5008 * We honour it for uclamp_min only as a drop in performance level ··· 9324 9324 if (cpu_util_dl(rq)) 9325 9325 return true; 9326 9326 9327 - if (thermal_load_avg(rq)) 9327 + if (hw_load_avg(rq)) 9328 9328 return true; 9329 9329 9330 9330 if (cpu_util_irq(rq)) ··· 9354 9354 { 9355 9355 const struct sched_class *curr_class; 9356 9356 u64 now = rq_clock_pelt(rq); 9357 - unsigned long thermal_pressure; 9357 + unsigned long hw_pressure; 9358 9358 bool decayed; 9359 9359 9360 9360 /* ··· 9363 9363 */ 9364 9364 curr_class = rq->curr->sched_class; 9365 9365 9366 - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); 9366 + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); 9367 9367 9368 9368 decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | 9369 9369 update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | 9370 - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | 9370 + update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure) | 9371 9371 update_irq_load_avg(rq, 0); 9372 9372 9373 9373 if (others_have_blocked(rq))
+9 -9
kernel/sched/pelt.c
··· 384 384 return 0; 385 385 } 386 386 387 - #ifdef CONFIG_SCHED_THERMAL_PRESSURE 387 + #ifdef CONFIG_SCHED_HW_PRESSURE 388 388 /* 389 - * thermal: 389 + * hardware: 390 390 * 391 391 * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked 392 392 * 393 393 * util_avg and runnable_load_avg are not supported and meaningless. 394 394 * 395 395 * Unlike rt/dl utilization tracking that track time spent by a cpu 396 - * running a rt/dl task through util_avg, the average thermal pressure is 397 - * tracked through load_avg. This is because thermal pressure signal is 396 + * running a rt/dl task through util_avg, the average HW pressure is 397 + * tracked through load_avg. This is because HW pressure signal is 398 398 * time weighted "delta" capacity unlike util_avg which is binary. 399 399 * "delta capacity" = actual capacity - 400 - * capped capacity a cpu due to a thermal event. 400 + * capped capacity a cpu due to a HW event. 401 401 */ 402 402 403 - int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) 403 + int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) 404 404 { 405 - if (___update_load_sum(now, &rq->avg_thermal, 405 + if (___update_load_sum(now, &rq->avg_hw, 406 406 capacity, 407 407 capacity, 408 408 capacity)) { 409 - ___update_load_avg(&rq->avg_thermal, 1); 410 - trace_pelt_thermal_tp(rq); 409 + ___update_load_avg(&rq->avg_hw, 1); 410 + trace_pelt_hw_tp(rq); 411 411 return 1; 412 412 } 413 413
+8 -8
kernel/sched/pelt.h
··· 7 7 int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); 8 8 int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); 9 9 10 - #ifdef CONFIG_SCHED_THERMAL_PRESSURE 11 - int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); 10 + #ifdef CONFIG_SCHED_HW_PRESSURE 11 + int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity); 12 12 13 - static inline u64 thermal_load_avg(struct rq *rq) 13 + static inline u64 hw_load_avg(struct rq *rq) 14 14 { 15 - return READ_ONCE(rq->avg_thermal.load_avg); 15 + return READ_ONCE(rq->avg_hw.load_avg); 16 16 } 17 17 #else 18 18 static inline int 19 - update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) 19 + update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) 20 20 { 21 21 return 0; 22 22 } 23 23 24 - static inline u64 thermal_load_avg(struct rq *rq) 24 + static inline u64 hw_load_avg(struct rq *rq) 25 25 { 26 26 return 0; 27 27 } ··· 202 202 } 203 203 204 204 static inline int 205 - update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) 205 + update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) 206 206 { 207 207 return 0; 208 208 } 209 209 210 - static inline u64 thermal_load_avg(struct rq *rq) 210 + static inline u64 hw_load_avg(struct rq *rq) 211 211 { 212 212 return 0; 213 213 }
+5 -5
kernel/sched/sched.h
··· 1108 1108 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ 1109 1109 struct sched_avg avg_irq; 1110 1110 #endif 1111 - #ifdef CONFIG_SCHED_THERMAL_PRESSURE 1112 - struct sched_avg avg_thermal; 1111 + #ifdef CONFIG_SCHED_HW_PRESSURE 1112 + struct sched_avg avg_hw; 1113 1113 #endif 1114 1114 u64 idle_stamp; 1115 1115 u64 avg_idle; ··· 1561 1561 * 3 256 1562 1562 * 4 512 1563 1563 */ 1564 - extern int sched_thermal_decay_shift; 1564 + extern int sched_hw_decay_shift; 1565 1565 1566 - static inline u64 rq_clock_thermal(struct rq *rq) 1566 + static inline u64 rq_clock_hw(struct rq *rq) 1567 1567 { 1568 - return rq_clock_task(rq) >> sched_thermal_decay_shift; 1568 + return rq_clock_task(rq) >> sched_hw_decay_shift; 1569 1569 } 1570 1570 1571 1571 static inline void rq_clock_skip_update(struct rq *rq)