Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched/topology: Consolidate and clean up access to a CPU's max compute capacity

Remove the rq::cpu_capacity_orig field and use arch_scale_cpu_capacity()
instead.

The scheduler uses 3 methods to get access to a CPU's max compute capacity:

- arch_scale_cpu_capacity(cpu) which is the default way to get a CPU's capacity.

- cpu_capacity_orig field which is periodically updated with
arch_scale_cpu_capacity().

- capacity_orig_of(cpu) which encapsulates rq->cpu_capacity_orig.

There is no real need to save the value returned by arch_scale_cpu_capacity()
in struct rq. arch_scale_cpu_capacity() returns:

- either a per_cpu variable.

- or a const value for systems which have only one capacity.

Remove rq::cpu_capacity_orig and use arch_scale_cpu_capacity() everywhere.

No functional changes.

Some performance tests on Arm64:

- small SMP device (hikey): no noticeable changes
- HMP device (RB5): hackbench shows minor improvement (1-2%)
- large SMP (thx2): hackbench and tbench show minor improvement (1%)

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lore.kernel.org/r/20231009103621.374412-2-vincent.guittot@linaro.org

authored by

Vincent Guittot and committed by
Ingo Molnar
7bc26384 089768df

+25 -29
+7 -6
Documentation/scheduler/sched-capacity.rst
··· 39 39 ------------------- 40 40 41 41 Two different capacity values are used within the scheduler. A CPU's 42 - ``capacity_orig`` is its maximum attainable capacity, i.e. its maximum 43 - attainable performance level. A CPU's ``capacity`` is its ``capacity_orig`` to 44 - which some loss of available performance (e.g. time spent handling IRQs) is 45 - subtracted. 42 + ``original capacity`` is its maximum attainable capacity, i.e. its maximum 43 + attainable performance level. This original capacity is returned by 44 + the function arch_scale_cpu_capacity(). A CPU's ``capacity`` is its ``original 45 + capacity`` to which some loss of available performance (e.g. time spent 46 + handling IRQs) is subtracted. 46 47 47 48 Note that a CPU's ``capacity`` is solely intended to be used by the CFS class, 48 - while ``capacity_orig`` is class-agnostic. The rest of this document will use 49 - the term ``capacity`` interchangeably with ``capacity_orig`` for the sake of 49 + while ``original capacity`` is class-agnostic. The rest of this document will use 50 + the term ``capacity`` interchangeably with ``original capacity`` for the sake of 50 51 brevity. 51 52 52 53 1.3 Platform examples
+1 -1
kernel/sched/core.c
··· 9929 9929 #ifdef CONFIG_SMP 9930 9930 rq->sd = NULL; 9931 9931 rq->rd = NULL; 9932 - rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; 9932 + rq->cpu_capacity = SCHED_CAPACITY_SCALE; 9933 9933 rq->balance_callback = &balance_push_callback; 9934 9934 rq->active_balance = 0; 9935 9935 rq->next_balance = jiffies;
+1 -1
kernel/sched/cpudeadline.c
··· 131 131 if (!dl_task_fits_capacity(p, cpu)) { 132 132 cpumask_clear_cpu(cpu, later_mask); 133 133 134 - cap = capacity_orig_of(cpu); 134 + cap = arch_scale_cpu_capacity(cpu); 135 135 136 136 if (cap > max_cap || 137 137 (cpu == task_cpu(p) && cap == max_cap)) {
+2 -2
kernel/sched/deadline.c
··· 132 132 int i; 133 133 134 134 for_each_cpu_and(i, mask, cpu_active_mask) 135 - cap += capacity_orig_of(i); 135 + cap += arch_scale_cpu_capacity(i); 136 136 137 137 return cap; 138 138 } ··· 144 144 static inline unsigned long dl_bw_capacity(int i) 145 145 { 146 146 if (!sched_asym_cpucap_active() && 147 - capacity_orig_of(i) == SCHED_CAPACITY_SCALE) { 147 + arch_scale_cpu_capacity(i) == SCHED_CAPACITY_SCALE) { 148 148 return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; 149 149 } else { 150 150 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+8 -10
kernel/sched/fair.c
··· 4669 4669 * To avoid overestimation of actual task utilization, skip updates if 4670 4670 * we cannot grant there is idle time in this CPU. 4671 4671 */ 4672 - if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq)))) 4672 + if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) 4673 4673 return; 4674 4674 4675 4675 /* ··· 4717 4717 return fits; 4718 4718 4719 4719 /* 4720 - * We must use capacity_orig_of() for comparing against uclamp_min and 4720 + * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and 4721 4721 * uclamp_max. We only care about capacity pressure (by using 4722 4722 * capacity_of()) for comparing against the real util. 4723 4723 * 4724 4724 * If a task is boosted to 1024 for example, we don't want a tiny 4725 4725 * pressure to skew the check whether it fits a CPU or not. 4726 4726 * 4727 - * Similarly if a task is capped to capacity_orig_of(little_cpu), it 4727 + * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it 4728 4728 * should fit a little cpu even if there's some pressure. 4729 4729 * 4730 4730 * Only exception is for thermal pressure since it has a direct impact ··· 4736 4736 * For uclamp_max, we can tolerate a drop in performance level as the 4737 4737 * goal is to cap the task. So it's okay if it's getting less. 4738 4738 */ 4739 - capacity_orig = capacity_orig_of(cpu); 4739 + capacity_orig = arch_scale_cpu_capacity(cpu); 4740 4740 capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); 4741 4741 4742 4742 /* ··· 7217 7217 * Look for the CPU with best capacity. 7218 7218 */ 7219 7219 else if (fits < 0) 7220 - cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu)); 7220 + cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu)); 7221 7221 7222 7222 /* 7223 7223 * First, select CPU which fits better (-1 being better than 0). 
··· 7459 7459 util = max(util, util_est); 7460 7460 } 7461 7461 7462 - return min(util, capacity_orig_of(cpu)); 7462 + return min(util, arch_scale_cpu_capacity(cpu)); 7463 7463 } 7464 7464 7465 7465 unsigned long cpu_util_cfs(int cpu) ··· 9250 9250 unsigned long capacity = scale_rt_capacity(cpu); 9251 9251 struct sched_group *sdg = sd->groups; 9252 9252 9253 - cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); 9254 - 9255 9253 if (!capacity) 9256 9254 capacity = 1; 9257 9255 ··· 9325 9327 check_cpu_capacity(struct rq *rq, struct sched_domain *sd) 9326 9328 { 9327 9329 return ((rq->cpu_capacity * sd->imbalance_pct) < 9328 - (rq->cpu_capacity_orig * 100)); 9330 + (arch_scale_cpu_capacity(cpu_of(rq)) * 100)); 9329 9331 } 9330 9332 9331 9333 /* ··· 9336 9338 static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) 9337 9339 { 9338 9340 return rq->misfit_task_load && 9339 - (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || 9341 + (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity || 9340 9342 check_cpu_capacity(rq, sd)); 9341 9343 } 9342 9344
+1 -1
kernel/sched/rt.c
··· 471 471 min_cap = uclamp_eff_value(p, UCLAMP_MIN); 472 472 max_cap = uclamp_eff_value(p, UCLAMP_MAX); 473 473 474 - cpu_cap = capacity_orig_of(cpu); 474 + cpu_cap = arch_scale_cpu_capacity(cpu); 475 475 476 476 return cpu_cap >= min(min_cap, max_cap); 477 477 }
-6
kernel/sched/sched.h
··· 1033 1033 struct sched_domain __rcu *sd; 1034 1034 1035 1035 unsigned long cpu_capacity; 1036 - unsigned long cpu_capacity_orig; 1037 1036 1038 1037 struct balance_callback *balance_callback; 1039 1038 ··· 2966 2967 #endif 2967 2968 2968 2969 #ifdef CONFIG_SMP 2969 - static inline unsigned long capacity_orig_of(int cpu) 2970 - { 2971 - return cpu_rq(cpu)->cpu_capacity_orig; 2972 - } 2973 - 2974 2970 /** 2975 2971 * enum cpu_util_type - CPU utilization type 2976 2972 * @FREQUENCY_UTIL: Utilization used to select frequency
+5 -2
kernel/sched/topology.c
··· 2488 2488 /* Attach the domains */ 2489 2489 rcu_read_lock(); 2490 2490 for_each_cpu(i, cpu_map) { 2491 + unsigned long capacity; 2492 + 2491 2493 rq = cpu_rq(i); 2492 2494 sd = *per_cpu_ptr(d.sd, i); 2493 2495 2496 + capacity = arch_scale_cpu_capacity(i); 2494 2497 /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ 2495 - if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) 2496 - WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); 2498 + if (capacity > READ_ONCE(d.rd->max_cpu_capacity)) 2499 + WRITE_ONCE(d.rd->max_cpu_capacity, capacity); 2497 2500 2498 2501 cpu_attach_domain(sd, d.rd, i); 2499 2502 }