Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Borislav Petkov:
"Mostly minor things this time; some highlights:

- core-sched: Add 'Forced Idle' accounting; this makes it possible to track
how much CPU time is 'lost' due to core scheduling constraints.

- psi: Fix for MEM_FULL; a task running reclaim would be counted as a
runnable task and prevent MEM_FULL from being reported.

- cpuacct: Long standing fixes for some cgroup accounting issues.

- rt: Bandwidth timer could, under unusual circumstances, fail to be
armed, leading to indefinite throttling."

[ Description above by Peter Zijlstra ]

* tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/fair: Replace CFS internal cpu_util() with cpu_util_cfs()
sched/fair: Cleanup task_util and capacity type
sched/rt: Try to restart rt period timer when rt runtime exceeded
sched/fair: Document the slow path and fast path in select_task_rq_fair
sched/fair: Fix per-CPU kthread and wakee stacking for asym CPU capacity
sched/fair: Fix detection of per-CPU kthreads waking a task
sched/cpuacct: Make user/system times in cpuacct.stat more precise
sched/cpuacct: Fix user/system in shown cpuacct.usage*
cpuacct: Convert BUG_ON() to WARN_ON_ONCE()
cputime, cpuacct: Include guest time in user time in cpuacct.stat
psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim
sched/core: Forced idle accounting
psi: Add a missing SPDX license header
psi: Remove repeated verbose comment

+343 -181
+1
include/linux/psi.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 1 2 #ifndef _LINUX_PSI_H 2 3 #define _LINUX_PSI_H 3 4
+13 -1
include/linux/psi_types.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 1 2 #ifndef _LINUX_PSI_TYPES_H 2 3 #define _LINUX_PSI_TYPES_H 3 4 ··· 22 21 * don't have to special case any state tracking for it. 23 22 */ 24 23 NR_ONCPU, 25 - NR_PSI_TASK_COUNTS = 4, 24 + /* 25 + * For IO and CPU stalls the presence of running/oncpu tasks 26 + * in the domain means a partial rather than a full stall. 27 + * For memory it's not so simple because of page reclaimers: 28 + * they are running/oncpu while representing a stall. To tell 29 + * whether a domain has productivity left or not, we need to 30 + * distinguish between regular running (i.e. productive) 31 + * threads and memstall ones. 32 + */ 33 + NR_MEMSTALL_RUNNING, 34 + NR_PSI_TASK_COUNTS = 5, 26 35 }; 27 36 28 37 /* Task state bitmasks */ ··· 40 29 #define TSK_MEMSTALL (1 << NR_MEMSTALL) 41 30 #define TSK_RUNNING (1 << NR_RUNNING) 42 31 #define TSK_ONCPU (1 << NR_ONCPU) 32 + #define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING) 43 33 44 34 /* Resources that workloads could be stalled on */ 45 35 enum psi_res {
+4
include/linux/sched.h
··· 523 523 u64 nr_wakeups_affine_attempts; 524 524 u64 nr_wakeups_passive; 525 525 u64 nr_wakeups_idle; 526 + 527 + #ifdef CONFIG_SCHED_CORE 528 + u64 core_forceidle_sum; 526 529 #endif 530 + #endif /* CONFIG_SCHEDSTATS */ 527 531 } ____cacheline_aligned; 528 532 529 533 struct sched_entity {
+63 -21
kernel/sched/core.c
··· 144 144 return false; 145 145 146 146 /* flip prio, so high prio is leftmost */ 147 - if (prio_less(b, a, task_rq(a)->core->core_forceidle)) 147 + if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count)) 148 148 return true; 149 149 150 150 return false; ··· 181 181 rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less); 182 182 } 183 183 184 - void sched_core_dequeue(struct rq *rq, struct task_struct *p) 184 + void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) 185 185 { 186 186 rq->core->core_task_seq++; 187 187 188 - if (!sched_core_enqueued(p)) 189 - return; 188 + if (sched_core_enqueued(p)) { 189 + rb_erase(&p->core_node, &rq->core_tree); 190 + RB_CLEAR_NODE(&p->core_node); 191 + } 190 192 191 - rb_erase(&p->core_node, &rq->core_tree); 192 - RB_CLEAR_NODE(&p->core_node); 193 + /* 194 + * Migrating the last task off the cpu, with the cpu in forced idle 195 + * state. Reschedule to create an accounting edge for forced idle, 196 + * and re-examine whether the core is still in forced idle state. 
197 + */ 198 + if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 && 199 + rq->core->core_forceidle_count && rq->curr == rq->idle) 200 + resched_curr(rq); 193 201 } 194 202 195 203 /* ··· 288 280 for_each_cpu(t, smt_mask) 289 281 cpu_rq(t)->core_enabled = enabled; 290 282 283 + cpu_rq(cpu)->core->core_forceidle_start = 0; 284 + 291 285 sched_core_unlock(cpu, &flags); 292 286 293 287 cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask); ··· 374 364 #else /* !CONFIG_SCHED_CORE */ 375 365 376 366 static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } 377 - static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { } 367 + static inline void 368 + sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } 378 369 379 370 #endif /* CONFIG_SCHED_CORE */ 380 371 ··· 2016 2005 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 2017 2006 { 2018 2007 if (sched_core_enabled(rq)) 2019 - sched_core_dequeue(rq, p); 2008 + sched_core_dequeue(rq, p, flags); 2020 2009 2021 2010 if (!(flags & DEQUEUE_NOCLOCK)) 2022 2011 update_rq_clock(rq); ··· 5255 5244 if (sched_feat(LATENCY_WARN)) 5256 5245 resched_latency = cpu_resched_latency(rq); 5257 5246 calc_global_load_tick(rq); 5247 + sched_core_tick(rq); 5258 5248 5259 5249 rq_unlock(rq, &rf); 5260 5250 ··· 5668 5656 struct task_struct *next, *p, *max = NULL; 5669 5657 const struct cpumask *smt_mask; 5670 5658 bool fi_before = false; 5659 + bool core_clock_updated = (rq == rq->core); 5671 5660 unsigned long cookie; 5672 5661 int i, cpu, occ = 0; 5673 5662 struct rq *rq_i; ··· 5721 5708 5722 5709 /* reset state */ 5723 5710 rq->core->core_cookie = 0UL; 5724 - if (rq->core->core_forceidle) { 5711 + if (rq->core->core_forceidle_count) { 5712 + if (!core_clock_updated) { 5713 + update_rq_clock(rq->core); 5714 + core_clock_updated = true; 5715 + } 5716 + sched_core_account_forceidle(rq); 5717 + /* reset after accounting force idle */ 5718 + 
rq->core->core_forceidle_start = 0; 5719 + rq->core->core_forceidle_count = 0; 5720 + rq->core->core_forceidle_occupation = 0; 5725 5721 need_sync = true; 5726 5722 fi_before = true; 5727 - rq->core->core_forceidle = false; 5728 5723 } 5729 5724 5730 5725 /* ··· 5774 5753 for_each_cpu_wrap(i, smt_mask, cpu) { 5775 5754 rq_i = cpu_rq(i); 5776 5755 5777 - if (i != cpu) 5756 + /* 5757 + * Current cpu always has its clock updated on entrance to 5758 + * pick_next_task(). If the current cpu is not the core, 5759 + * the core may also have been updated above. 5760 + */ 5761 + if (i != cpu && (rq_i != rq->core || !core_clock_updated)) 5778 5762 update_rq_clock(rq_i); 5779 5763 5780 5764 p = rq_i->core_pick = pick_task(rq_i); ··· 5809 5783 5810 5784 if (p == rq_i->idle) { 5811 5785 if (rq_i->nr_running) { 5812 - rq->core->core_forceidle = true; 5786 + rq->core->core_forceidle_count++; 5813 5787 if (!fi_before) 5814 5788 rq->core->core_forceidle_seq++; 5815 5789 } 5816 5790 } else { 5817 5791 occ++; 5818 5792 } 5793 + } 5794 + 5795 + if (schedstat_enabled() && rq->core->core_forceidle_count) { 5796 + if (cookie) 5797 + rq->core->core_forceidle_start = rq_clock(rq->core); 5798 + rq->core->core_forceidle_occupation = occ; 5819 5799 } 5820 5800 5821 5801 rq->core->core_pick_seq = rq->core->core_task_seq; ··· 5860 5828 * 1 0 1 5861 5829 * 1 1 0 5862 5830 */ 5863 - if (!(fi_before && rq->core->core_forceidle)) 5864 - task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle); 5831 + if (!(fi_before && rq->core->core_forceidle_count)) 5832 + task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count); 5865 5833 5866 5834 rq_i->core_pick->core_occupation = occ; 5867 5835 ··· 6065 6033 goto unlock; 6066 6034 6067 6035 /* copy the shared state to the new leader */ 6068 - core_rq->core_task_seq = rq->core_task_seq; 6069 - core_rq->core_pick_seq = rq->core_pick_seq; 6070 - core_rq->core_cookie = rq->core_cookie; 6071 - core_rq->core_forceidle = 
rq->core_forceidle; 6072 - core_rq->core_forceidle_seq = rq->core_forceidle_seq; 6036 + core_rq->core_task_seq = rq->core_task_seq; 6037 + core_rq->core_pick_seq = rq->core_pick_seq; 6038 + core_rq->core_cookie = rq->core_cookie; 6039 + core_rq->core_forceidle_count = rq->core_forceidle_count; 6040 + core_rq->core_forceidle_seq = rq->core_forceidle_seq; 6041 + core_rq->core_forceidle_occupation = rq->core_forceidle_occupation; 6042 + 6043 + /* 6044 + * Accounting edge for forced idle is handled in pick_next_task(). 6045 + * Don't need another one here, since the hotplug thread shouldn't 6046 + * have a cookie. 6047 + */ 6048 + core_rq->core_forceidle_start = 0; 6073 6049 6074 6050 /* install new leader */ 6075 6051 for_each_cpu(t, smt_mask) { ··· 7166 7126 7167 7127 unsigned long sched_cpu_util(int cpu, unsigned long max) 7168 7128 { 7169 - return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max, 7129 + return effective_cpu_util(cpu, cpu_util_cfs(cpu), max, 7170 7130 ENERGY_UTIL, NULL); 7171 7131 } 7172 7132 #endif /* CONFIG_SMP */ ··· 9449 9409 rq->core_pick = NULL; 9450 9410 rq->core_enabled = 0; 9451 9411 rq->core_tree = RB_ROOT; 9452 - rq->core_forceidle = false; 9412 + rq->core_forceidle_count = 0; 9413 + rq->core_forceidle_occupation = 0; 9414 + rq->core_forceidle_start = 0; 9453 9415 9454 9416 rq->core_cookie = 0UL; 9455 9417 #endif
+65 -1
kernel/sched/core_sched.c
··· 73 73 74 74 enqueued = sched_core_enqueued(p); 75 75 if (enqueued) 76 - sched_core_dequeue(rq, p); 76 + sched_core_dequeue(rq, p, DEQUEUE_SAVE); 77 77 78 78 old_cookie = p->core_cookie; 79 79 p->core_cookie = cookie; ··· 85 85 * If task is currently running, it may not be compatible anymore after 86 86 * the cookie change, so enter the scheduler on its CPU to schedule it 87 87 * away. 88 + * 89 + * Note that it is possible that as a result of this cookie change, the 90 + * core has now entered/left forced idle state. Defer accounting to the 91 + * next scheduling edge, rather than always forcing a reschedule here. 88 92 */ 89 93 if (task_running(rq, p)) 90 94 resched_curr(rq); ··· 236 232 return err; 237 233 } 238 234 235 + #ifdef CONFIG_SCHEDSTATS 236 + 237 + /* REQUIRES: rq->core's clock recently updated. */ 238 + void __sched_core_account_forceidle(struct rq *rq) 239 + { 240 + const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); 241 + u64 delta, now = rq_clock(rq->core); 242 + struct rq *rq_i; 243 + struct task_struct *p; 244 + int i; 245 + 246 + lockdep_assert_rq_held(rq); 247 + 248 + WARN_ON_ONCE(!rq->core->core_forceidle_count); 249 + 250 + if (rq->core->core_forceidle_start == 0) 251 + return; 252 + 253 + delta = now - rq->core->core_forceidle_start; 254 + if (unlikely((s64)delta <= 0)) 255 + return; 256 + 257 + rq->core->core_forceidle_start = now; 258 + 259 + if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) { 260 + /* can't be forced idle without a running task */ 261 + } else if (rq->core->core_forceidle_count > 1 || 262 + rq->core->core_forceidle_occupation > 1) { 263 + /* 264 + * For larger SMT configurations, we need to scale the charged 265 + * forced idle amount since there can be more than one forced 266 + * idle sibling and more than one running cookied task. 
267 + */ 268 + delta *= rq->core->core_forceidle_count; 269 + delta = div_u64(delta, rq->core->core_forceidle_occupation); 270 + } 271 + 272 + for_each_cpu(i, smt_mask) { 273 + rq_i = cpu_rq(i); 274 + p = rq_i->core_pick ?: rq_i->curr; 275 + 276 + if (!p->core_cookie) 277 + continue; 278 + 279 + __schedstat_add(p->stats.core_forceidle_sum, delta); 280 + } 281 + } 282 + 283 + void __sched_core_tick(struct rq *rq) 284 + { 285 + if (!rq->core->core_forceidle_count) 286 + return; 287 + 288 + if (rq != rq->core) 289 + update_rq_clock(rq->core); 290 + 291 + __sched_core_account_forceidle(rq); 292 + } 293 + 294 + #endif /* CONFIG_SCHEDSTATS */
+49 -58
kernel/sched/cpuacct.c
··· 21 21 [CPUACCT_STAT_SYSTEM] = "system", 22 22 }; 23 23 24 - struct cpuacct_usage { 25 - u64 usages[CPUACCT_STAT_NSTATS]; 26 - }; 27 - 28 24 /* track CPU usage of a group of tasks and its child groups */ 29 25 struct cpuacct { 30 26 struct cgroup_subsys_state css; 31 27 /* cpuusage holds pointer to a u64-type object on every CPU */ 32 - struct cpuacct_usage __percpu *cpuusage; 28 + u64 __percpu *cpuusage; 33 29 struct kernel_cpustat __percpu *cpustat; 34 30 }; 35 31 ··· 45 49 return css_ca(ca->css.parent); 46 50 } 47 51 48 - static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage); 52 + static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); 49 53 static struct cpuacct root_cpuacct = { 50 54 .cpustat = &kernel_cpustat, 51 55 .cpuusage = &root_cpuacct_cpuusage, ··· 64 68 if (!ca) 65 69 goto out; 66 70 67 - ca->cpuusage = alloc_percpu(struct cpuacct_usage); 71 + ca->cpuusage = alloc_percpu(u64); 68 72 if (!ca->cpuusage) 69 73 goto out_free_ca; 70 74 ··· 95 99 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, 96 100 enum cpuacct_stat_index index) 97 101 { 98 - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 102 + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 103 + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; 99 104 u64 data; 100 105 101 106 /* 102 107 * We allow index == CPUACCT_STAT_NSTATS here to read 103 108 * the sum of usages. 
104 109 */ 105 - BUG_ON(index > CPUACCT_STAT_NSTATS); 110 + if (WARN_ON_ONCE(index > CPUACCT_STAT_NSTATS)) 111 + return 0; 106 112 107 113 #ifndef CONFIG_64BIT 108 114 /* ··· 113 115 raw_spin_rq_lock_irq(cpu_rq(cpu)); 114 116 #endif 115 117 116 - if (index == CPUACCT_STAT_NSTATS) { 117 - int i = 0; 118 - 119 - data = 0; 120 - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 121 - data += cpuusage->usages[i]; 122 - } else { 123 - data = cpuusage->usages[index]; 118 + switch (index) { 119 + case CPUACCT_STAT_USER: 120 + data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE]; 121 + break; 122 + case CPUACCT_STAT_SYSTEM: 123 + data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] + 124 + cpustat[CPUTIME_SOFTIRQ]; 125 + break; 126 + case CPUACCT_STAT_NSTATS: 127 + data = *cpuusage; 128 + break; 124 129 } 125 130 126 131 #ifndef CONFIG_64BIT ··· 133 132 return data; 134 133 } 135 134 136 - static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 135 + static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu) 137 136 { 138 - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 139 - int i; 137 + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 138 + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; 139 + 140 + /* Don't allow to reset global kernel_cpustat */ 141 + if (ca == &root_cpuacct) 142 + return; 140 143 141 144 #ifndef CONFIG_64BIT 142 145 /* ··· 148 143 */ 149 144 raw_spin_rq_lock_irq(cpu_rq(cpu)); 150 145 #endif 151 - 152 - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 153 - cpuusage->usages[i] = val; 146 + *cpuusage = 0; 147 + cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0; 148 + cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0; 149 + cpustat[CPUTIME_SOFTIRQ] = 0; 154 150 155 151 #ifndef CONFIG_64BIT 156 152 raw_spin_rq_unlock_irq(cpu_rq(cpu)); ··· 202 196 return -EINVAL; 203 197 204 198 for_each_possible_cpu(cpu) 205 - cpuacct_cpuusage_write(ca, cpu, 0); 199 + cpuacct_cpuusage_write(ca, cpu); 206 200 207 201 return 0; 208 202 } ··· 
249 243 seq_puts(m, "\n"); 250 244 251 245 for_each_possible_cpu(cpu) { 252 - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 253 - 254 246 seq_printf(m, "%d", cpu); 255 - 256 - for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { 257 - #ifndef CONFIG_64BIT 258 - /* 259 - * Take rq->lock to make 64-bit read safe on 32-bit 260 - * platforms. 261 - */ 262 - raw_spin_rq_lock_irq(cpu_rq(cpu)); 263 - #endif 264 - 265 - seq_printf(m, " %llu", cpuusage->usages[index]); 266 - 267 - #ifndef CONFIG_64BIT 268 - raw_spin_rq_unlock_irq(cpu_rq(cpu)); 269 - #endif 270 - } 247 + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) 248 + seq_printf(m, " %llu", 249 + cpuacct_cpuusage_read(ca, cpu, index)); 271 250 seq_puts(m, "\n"); 272 251 } 273 252 return 0; ··· 261 270 static int cpuacct_stats_show(struct seq_file *sf, void *v) 262 271 { 263 272 struct cpuacct *ca = css_ca(seq_css(sf)); 264 - s64 val[CPUACCT_STAT_NSTATS]; 273 + struct task_cputime cputime; 274 + u64 val[CPUACCT_STAT_NSTATS]; 265 275 int cpu; 266 276 int stat; 267 277 268 - memset(val, 0, sizeof(val)); 278 + memset(&cputime, 0, sizeof(cputime)); 269 279 for_each_possible_cpu(cpu) { 270 280 u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; 271 281 272 - val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; 273 - val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; 274 - val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; 275 - val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; 276 - val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; 282 + cputime.utime += cpustat[CPUTIME_USER]; 283 + cputime.utime += cpustat[CPUTIME_NICE]; 284 + cputime.stime += cpustat[CPUTIME_SYSTEM]; 285 + cputime.stime += cpustat[CPUTIME_IRQ]; 286 + cputime.stime += cpustat[CPUTIME_SOFTIRQ]; 287 + 288 + cputime.sum_exec_runtime += *per_cpu_ptr(ca->cpuusage, cpu); 277 289 } 278 290 291 + cputime_adjust(&cputime, &seq_css(sf)->cgroup->prev_cputime, 292 + &val[CPUACCT_STAT_USER], &val[CPUACCT_STAT_SYSTEM]); 293 + 279 294 for 
(stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { 280 - seq_printf(sf, "%s %lld\n", 281 - cpuacct_stat_desc[stat], 282 - (long long)nsec_to_clock_t(val[stat])); 295 + seq_printf(sf, "%s %llu\n", cpuacct_stat_desc[stat], 296 + nsec_to_clock_t(val[stat])); 283 297 } 284 298 285 299 return 0; ··· 335 339 void cpuacct_charge(struct task_struct *tsk, u64 cputime) 336 340 { 337 341 struct cpuacct *ca; 338 - int index = CPUACCT_STAT_SYSTEM; 339 - struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk); 340 - 341 - if (regs && user_mode(regs)) 342 - index = CPUACCT_STAT_USER; 343 342 344 343 rcu_read_lock(); 345 344 346 345 for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) 347 - __this_cpu_add(ca->cpuusage->usages[index], cputime); 346 + __this_cpu_add(*ca->cpuusage, cputime); 348 347 349 348 rcu_read_unlock(); 350 349 }
+1 -1
kernel/sched/cpufreq_schedutil.c
··· 168 168 169 169 sg_cpu->max = max; 170 170 sg_cpu->bw_dl = cpu_bw_dl(rq); 171 - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max, 171 + sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), max, 172 172 FREQUENCY_UTIL, NULL); 173 173 } 174 174
+2 -2
kernel/sched/cputime.c
··· 148 148 149 149 /* Add guest time to cpustat. */ 150 150 if (task_nice(p) > 0) { 151 - cpustat[CPUTIME_NICE] += cputime; 151 + task_group_account_field(p, CPUTIME_NICE, cputime); 152 152 cpustat[CPUTIME_GUEST_NICE] += cputime; 153 153 } else { 154 - cpustat[CPUTIME_USER] += cputime; 154 + task_group_account_field(p, CPUTIME_USER, cputime); 155 155 cpustat[CPUTIME_GUEST] += cputime; 156 156 } 157 157 }
+4
kernel/sched/debug.c
··· 1023 1023 1024 1024 __PN(avg_atom); 1025 1025 __PN(avg_per_cpu); 1026 + 1027 + #ifdef CONFIG_SCHED_CORE 1028 + PN_SCHEDSTAT(core_forceidle_sum); 1029 + #endif 1026 1030 } 1027 1031 1028 1032 __P(nr_switches);
+20 -67
kernel/sched/fair.c
··· 1502 1502 1503 1503 static unsigned long cpu_load(struct rq *rq); 1504 1504 static unsigned long cpu_runnable(struct rq *rq); 1505 - static unsigned long cpu_util(int cpu); 1506 1505 static inline long adjust_numa_imbalance(int imbalance, 1507 1506 int dst_running, int dst_weight); 1508 1507 ··· 1568 1569 1569 1570 ns->load += cpu_load(rq); 1570 1571 ns->runnable += cpu_runnable(rq); 1571 - ns->util += cpu_util(cpu); 1572 + ns->util += cpu_util_cfs(cpu); 1572 1573 ns->nr_running += rq->cfs.h_nr_running; 1573 1574 ns->compute_capacity += capacity_of(cpu); 1574 1575 ··· 3239 3240 * As is, the util number is not freq-invariant (we'd have to 3240 3241 * implement arch_scale_freq_capacity() for that). 3241 3242 * 3242 - * See cpu_util(). 3243 + * See cpu_util_cfs(). 3243 3244 */ 3244 3245 cpufreq_update_util(rq, flags); 3245 3246 } ··· 4069 4070 trace_sched_util_est_se_tp(&p->se); 4070 4071 } 4071 4072 4072 - static inline int task_fits_capacity(struct task_struct *p, long capacity) 4073 + static inline int task_fits_capacity(struct task_struct *p, 4074 + unsigned long capacity) 4073 4075 { 4074 4076 return fits_capacity(uclamp_task_util(p), capacity); 4075 4077 } ··· 5509 5509 #endif 5510 5510 5511 5511 #ifdef CONFIG_SMP 5512 - static inline unsigned long cpu_util(int cpu); 5513 - 5514 5512 static inline bool cpu_overutilized(int cpu) 5515 5513 { 5516 - return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); 5514 + return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu)); 5517 5515 } 5518 5516 5519 5517 static inline void update_overutilized_status(struct rq *rq) ··· 6343 6345 return best_cpu; 6344 6346 } 6345 6347 6346 - static inline bool asym_fits_capacity(int task_util, int cpu) 6348 + static inline bool asym_fits_capacity(unsigned long task_util, int cpu) 6347 6349 { 6348 6350 if (static_branch_unlikely(&sched_asym_cpucapacity)) 6349 6351 return fits_capacity(task_util, capacity_of(cpu)); ··· 6396 6398 * pattern is IO completions. 
6397 6399 */ 6398 6400 if (is_per_cpu_kthread(current) && 6401 + in_task() && 6399 6402 prev == smp_processor_id() && 6400 - this_rq()->nr_running <= 1) { 6403 + this_rq()->nr_running <= 1 && 6404 + asym_fits_capacity(task_util, prev)) { 6401 6405 return prev; 6402 6406 } 6403 6407 ··· 6456 6456 return target; 6457 6457 } 6458 6458 6459 - /** 6460 - * cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks. 6461 - * @cpu: the CPU to get the utilization of 6462 - * 6463 - * The unit of the return value must be the one of capacity so we can compare 6464 - * the utilization with the capacity of the CPU that is available for CFS task 6465 - * (ie cpu_capacity). 6466 - * 6467 - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the 6468 - * recent utilization of currently non-runnable tasks on a CPU. It represents 6469 - * the amount of utilization of a CPU in the range [0..capacity_orig] where 6470 - * capacity_orig is the cpu_capacity available at the highest frequency 6471 - * (arch_scale_freq_capacity()). 6472 - * The utilization of a CPU converges towards a sum equal to or less than the 6473 - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is 6474 - * the running time on this CPU scaled by capacity_curr. 6475 - * 6476 - * The estimated utilization of a CPU is defined to be the maximum between its 6477 - * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks 6478 - * currently RUNNABLE on that CPU. 6479 - * This allows to properly represent the expected utilization of a CPU which 6480 - * has just got a big task running since a long sleep period. At the same time 6481 - * however it preserves the benefits of the "blocked utilization" in 6482 - * describing the potential for other tasks waking up on the same CPU. 
6483 - * 6484 - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even 6485 - * higher than capacity_orig because of unfortunate rounding in 6486 - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until 6487 - * the average stabilizes with the new running time. We need to check that the 6488 - * utilization stays within the range of [0..capacity_orig] and cap it if 6489 - * necessary. Without utilization capping, a group could be seen as overloaded 6490 - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of 6491 - * available capacity. We allow utilization to overshoot capacity_curr (but not 6492 - * capacity_orig) as it useful for predicting the capacity required after task 6493 - * migrations (scheduler-driven DVFS). 6494 - * 6495 - * Return: the (estimated) utilization for the specified CPU 6496 - */ 6497 - static inline unsigned long cpu_util(int cpu) 6498 - { 6499 - struct cfs_rq *cfs_rq; 6500 - unsigned int util; 6501 - 6502 - cfs_rq = &cpu_rq(cpu)->cfs; 6503 - util = READ_ONCE(cfs_rq->avg.util_avg); 6504 - 6505 - if (sched_feat(UTIL_EST)) 6506 - util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); 6507 - 6508 - return min_t(unsigned long, util, capacity_orig_of(cpu)); 6509 - } 6510 - 6511 6459 /* 6512 6460 * cpu_util_without: compute cpu utilization without any contributions from *p 6513 6461 * @cpu: the CPU which utilization is requested ··· 6476 6528 6477 6529 /* Task has no contribution or is new */ 6478 6530 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) 6479 - return cpu_util(cpu); 6531 + return cpu_util_cfs(cpu); 6480 6532 6481 6533 cfs_rq = &cpu_rq(cpu)->cfs; 6482 6534 util = READ_ONCE(cfs_rq->avg.util_avg); ··· 6540 6592 /* 6541 6593 * Utilization (estimated) can exceed the CPU capacity, thus let's 6542 6594 * clamp to the maximum CPU capacity to ensure consistency with 6543 - * the cpu_util call. 6595 + * cpu_util. 
6544 6596 */ 6545 6597 return min_t(unsigned long, util, capacity_orig_of(cpu)); 6546 6598 } ··· 6572 6624 * During wake-up, the task isn't enqueued yet and doesn't 6573 6625 * appear in the cfs_rq->avg.util_est.enqueued of any rq, 6574 6626 * so just add it (if needed) to "simulate" what will be 6575 - * cpu_util() after the task has been enqueued. 6627 + * cpu_util after the task has been enqueued. 6576 6628 */ 6577 6629 if (dst_cpu == cpu) 6578 6630 util_est += _task_util_est(p); ··· 6863 6915 break; 6864 6916 } 6865 6917 6918 + /* 6919 + * Usually only true for WF_EXEC and WF_FORK, as sched_domains 6920 + * usually do not have SD_BALANCE_WAKE set. That means wakeup 6921 + * will usually go to the fast path. 6922 + */ 6866 6923 if (tmp->flags & sd_flag) 6867 6924 sd = tmp; 6868 6925 else if (!want_affine) ··· 8634 8681 struct rq *rq = cpu_rq(i); 8635 8682 8636 8683 sgs->group_load += cpu_load(rq); 8637 - sgs->group_util += cpu_util(i); 8684 + sgs->group_util += cpu_util_cfs(i); 8638 8685 sgs->group_runnable += cpu_runnable(rq); 8639 8686 sgs->sum_h_nr_running += rq->cfs.h_nr_running; 8640 8687 ··· 9652 9699 break; 9653 9700 9654 9701 case migrate_util: 9655 - util = cpu_util(cpu_of(rq)); 9702 + util = cpu_util_cfs(i); 9656 9703 9657 9704 /* 9658 9705 * Don't try to pull utilization from a CPU with one ··· 11021 11068 * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check 11022 11069 * if we need to give up the CPU. 11023 11070 */ 11024 - if (rq->core->core_forceidle && rq->cfs.nr_running == 1 && 11071 + if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 && 11025 11072 __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) 11026 11073 resched_curr(rq); 11027 11074 }
+29 -18
kernel/sched/psi.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 1 2 /* 2 3 * Pressure stall information for CPU, memory and IO 3 4 * ··· 35 34 * delayed on that resource such that nobody is advancing and the CPU 36 35 * goes idle. This leaves both workload and CPU unproductive. 37 36 * 38 - * Naturally, the FULL state doesn't exist for the CPU resource at the 39 - * system level, but exist at the cgroup level, means all non-idle tasks 40 - * in a cgroup are delayed on the CPU resource which used by others outside 41 - * of the cgroup or throttled by the cgroup cpu.max configuration. 42 - * 43 37 * SOME = nr_delayed_tasks != 0 44 - * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 38 + * FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0 39 + * 40 + * What it means for a task to be productive is defined differently 41 + * for each resource. For IO, productive means a running task. For 42 + * memory, productive means a running task that isn't a reclaimer. For 43 + * CPU, productive means an oncpu task. 44 + * 45 + * Naturally, the FULL state doesn't exist for the CPU resource at the 46 + * system level, but exist at the cgroup level. At the cgroup level, 47 + * FULL means all non-idle tasks in the cgroup are delayed on the CPU 48 + * resource which is being used by others outside of the cgroup or 49 + * throttled by the cgroup cpu.max configuration. 
45 50 * 46 51 * The percentage of wallclock time spent in those compound stall 47 52 * states gives pressure numbers between 0 and 100 for each resource, ··· 88 81 * 89 82 * threads = min(nr_nonidle_tasks, nr_cpus) 90 83 * SOME = min(nr_delayed_tasks / threads, 1) 91 - * FULL = (threads - min(nr_running_tasks, threads)) / threads 84 + * FULL = (threads - min(nr_productive_tasks, threads)) / threads 92 85 * 93 86 * For the 257 number crunchers on 256 CPUs, this yields: 94 87 * 95 88 * threads = min(257, 256) 96 89 * SOME = min(1 / 256, 1) = 0.4% 97 - * FULL = (256 - min(257, 256)) / 256 = 0% 90 + * FULL = (256 - min(256, 256)) / 256 = 0% 98 91 * 99 92 * For the 1 out of 4 memory-delayed tasks, this yields: 100 93 * ··· 119 112 * For each runqueue, we track: 120 113 * 121 114 * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) 122 - * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) 115 + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu]) 123 116 * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) 124 117 * 125 118 * and then periodically aggregate: ··· 240 233 case PSI_MEM_SOME: 241 234 return unlikely(tasks[NR_MEMSTALL]); 242 235 case PSI_MEM_FULL: 243 - return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]); 236 + return unlikely(tasks[NR_MEMSTALL] && 237 + tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); 244 238 case PSI_CPU_SOME: 245 239 return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]); 246 240 case PSI_CPU_FULL: ··· 718 710 if (groupc->tasks[t]) { 719 711 groupc->tasks[t]--; 720 712 } else if (!psi_bug) { 721 - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", 713 + printk_deferred(KERN_ERR "psi: task underflow! 
cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n", 722 714 cpu, t, groupc->tasks[0], 723 715 groupc->tasks[1], groupc->tasks[2], 724 - groupc->tasks[3], clear, set); 716 + groupc->tasks[3], groupc->tasks[4], 717 + clear, set); 725 718 psi_bug = 1; 726 719 } 727 720 } ··· 842 833 /* 843 834 * When switching between tasks that have an identical 844 835 * runtime state, the cgroup that contains both tasks 845 - * runtime state, the cgroup that contains both tasks 846 836 * we reach the first common ancestor. Iterate @next's 847 837 * ancestors only until we encounter @prev's ONCPU. 848 838 */ ··· 862 854 int clear = TSK_ONCPU, set = 0; 863 855 864 856 /* 865 - * When we're going to sleep, psi_dequeue() lets us handle 866 - * TSK_RUNNING and TSK_IOWAIT here, where we can combine it 867 - * with TSK_ONCPU and save walking common ancestors twice. 857 + * When we're going to sleep, psi_dequeue() lets us 858 + * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and 859 + * TSK_IOWAIT here, where we can combine it with 860 + * TSK_ONCPU and save walking common ancestors twice. 868 861 */ 869 862 if (sleep) { 870 863 clear |= TSK_RUNNING; 864 + if (prev->in_memstall) 865 + clear |= TSK_MEMSTALL_RUNNING; 871 866 if (prev->in_iowait) 872 867 set |= TSK_IOWAIT; 873 868 } ··· 919 908 rq = this_rq_lock_irq(&rf); 920 909 921 910 current->in_memstall = 1; 922 - psi_task_change(current, 0, TSK_MEMSTALL); 911 + psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); 923 912 924 913 rq_unlock_irq(rq, &rf); 925 914 } ··· 948 937 rq = this_rq_lock_irq(&rf); 949 938 950 939 current->in_memstall = 0; 951 - psi_task_change(current, TSK_MEMSTALL, 0); 940 + psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0); 952 941 953 942 rq_unlock_irq(rq, &rf); 954 943 }
+18 -5
kernel/sched/rt.c
··· 52 52 rt_b->rt_period_timer.function = sched_rt_period_timer; 53 53 } 54 54 55 - static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 55 + static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) 56 56 { 57 - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 58 - return; 59 - 60 57 raw_spin_lock(&rt_b->rt_runtime_lock); 61 58 if (!rt_b->rt_period_active) { 62 59 rt_b->rt_period_active = 1; ··· 70 73 HRTIMER_MODE_ABS_PINNED_HARD); 71 74 } 72 75 raw_spin_unlock(&rt_b->rt_runtime_lock); 76 + } 77 + 78 + static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 79 + { 80 + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 81 + return; 82 + 83 + do_start_rt_bandwidth(rt_b); 73 84 } 74 85 75 86 void init_rt_rq(struct rt_rq *rt_rq) ··· 1036 1031 1037 1032 for_each_sched_rt_entity(rt_se) { 1038 1033 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 1034 + int exceeded; 1039 1035 1040 1036 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 1041 1037 raw_spin_lock(&rt_rq->rt_runtime_lock); 1042 1038 rt_rq->rt_time += delta_exec; 1043 - if (sched_rt_runtime_exceeded(rt_rq)) 1039 + exceeded = sched_rt_runtime_exceeded(rt_rq); 1040 + if (exceeded) 1044 1041 resched_curr(rq); 1045 1042 raw_spin_unlock(&rt_rq->rt_runtime_lock); 1043 + if (exceeded) 1044 + do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); 1046 1045 } 1047 1046 } 1048 1047 } ··· 2920 2911 2921 2912 static void sched_rt_do_global(void) 2922 2913 { 2914 + unsigned long flags; 2915 + 2916 + raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 2923 2917 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 2924 2918 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); 2919 + raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 2925 2920 } 2926 2921 2927 2922 int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+70 -6
kernel/sched/sched.h
··· 1111 1111 unsigned int core_task_seq; 1112 1112 unsigned int core_pick_seq; 1113 1113 unsigned long core_cookie; 1114 - unsigned char core_forceidle; 1114 + unsigned int core_forceidle_count; 1115 1115 unsigned int core_forceidle_seq; 1116 + unsigned int core_forceidle_occupation; 1117 + u64 core_forceidle_start; 1116 1118 #endif 1117 1119 }; 1118 1120 ··· 1255 1253 } 1256 1254 1257 1255 extern void sched_core_enqueue(struct rq *rq, struct task_struct *p); 1258 - extern void sched_core_dequeue(struct rq *rq, struct task_struct *p); 1256 + extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); 1259 1257 1260 1258 extern void sched_core_get(void); 1261 1259 extern void sched_core_put(void); ··· 1855 1853 1856 1854 #include "stats.h" 1857 1855 #include "autogroup.h" 1856 + 1857 + #if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) 1858 + 1859 + extern void __sched_core_account_forceidle(struct rq *rq); 1860 + 1861 + static inline void sched_core_account_forceidle(struct rq *rq) 1862 + { 1863 + if (schedstat_enabled()) 1864 + __sched_core_account_forceidle(rq); 1865 + } 1866 + 1867 + extern void __sched_core_tick(struct rq *rq); 1868 + 1869 + static inline void sched_core_tick(struct rq *rq) 1870 + { 1871 + if (sched_core_enabled(rq) && schedstat_enabled()) 1872 + __sched_core_tick(rq); 1873 + } 1874 + 1875 + #else 1876 + 1877 + static inline void sched_core_account_forceidle(struct rq *rq) {} 1878 + 1879 + static inline void sched_core_tick(struct rq *rq) {} 1880 + 1881 + #endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */ 1858 1882 1859 1883 #ifdef CONFIG_CGROUP_SCHED 1860 1884 ··· 2966 2938 return READ_ONCE(rq->avg_dl.util_avg); 2967 2939 } 2968 2940 2969 - static inline unsigned long cpu_util_cfs(struct rq *rq) 2941 + /** 2942 + * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks. 2943 + * @cpu: the CPU to get the utilization for. 
2944 + * 2945 + * The unit of the return value must be the same as the one of CPU capacity 2946 + * so that CPU utilization can be compared with CPU capacity. 2947 + * 2948 + * CPU utilization is the sum of running time of runnable tasks plus the 2949 + * recent utilization of currently non-runnable tasks on that CPU. 2950 + * It represents the amount of CPU capacity currently used by CFS tasks in 2951 + * the range [0..max CPU capacity] with max CPU capacity being the CPU 2952 + * capacity at f_max. 2953 + * 2954 + * The estimated CPU utilization is defined as the maximum between CPU 2955 + * utilization and sum of the estimated utilization of the currently 2956 + * runnable tasks on that CPU. It preserves a utilization "snapshot" of 2957 + * previously-executed tasks, which helps better deduce how busy a CPU will 2958 + * be when a long-sleeping task wakes up. The contribution to CPU utilization 2959 + * of such a task would be significantly decayed at this point of time. 2960 + * 2961 + * CPU utilization can be higher than the current CPU capacity 2962 + * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because 2963 + * of rounding errors as well as task migrations or wakeups of new tasks. 2964 + * CPU utilization has to be capped to fit into the [0..max CPU capacity] 2965 + * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) 2966 + * could be seen as over-utilized even though CPU1 has 20% of spare CPU 2967 + * capacity. CPU utilization is allowed to overshoot current CPU capacity 2968 + * though since this is useful for predicting the CPU capacity required 2969 + * after task migrations (scheduler-driven DVFS). 2970 + * 2971 + * Return: (Estimated) utilization for the specified CPU. 
2972 + */ 2973 + static inline unsigned long cpu_util_cfs(int cpu) 2970 2974 { 2971 - unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); 2975 + struct cfs_rq *cfs_rq; 2976 + unsigned long util; 2977 + 2978 + cfs_rq = &cpu_rq(cpu)->cfs; 2979 + util = READ_ONCE(cfs_rq->avg.util_avg); 2972 2980 2973 2981 if (sched_feat(UTIL_EST)) { 2974 2982 util = max_t(unsigned long, util, 2975 - READ_ONCE(rq->cfs.avg.util_est.enqueued)); 2983 + READ_ONCE(cfs_rq->avg.util_est.enqueued)); 2976 2984 } 2977 2985 2978 - return util; 2986 + return min(util, capacity_orig_of(cpu)); 2979 2987 } 2980 2988 2981 2989 static inline unsigned long cpu_util_rt(struct rq *rq)
+4 -1
kernel/sched/stats.h
··· 118 118 if (static_branch_likely(&psi_disabled)) 119 119 return; 120 120 121 + if (p->in_memstall) 122 + set |= TSK_MEMSTALL_RUNNING; 123 + 121 124 if (!wakeup || p->sched_psi_wake_requeue) { 122 125 if (p->in_memstall) 123 126 set |= TSK_MEMSTALL; ··· 151 148 return; 152 149 153 150 if (p->in_memstall) 154 - clear |= TSK_MEMSTALL; 151 + clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); 155 152 156 153 psi_task_change(p, clear, 0); 157 154 }