Merge tag 'sched-core-2022-05-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

- Updates to scheduler metrics:
    - PELT fixes & enhancements
    - PSI fixes & enhancements
    - Refactor cpu_util_without()

- Updates to instrumentation/debugging:
    - Remove sched_trace_*() helper functions - can be done via debug
      info
    - Fix double update_rq_clock() warnings

- Introduce & use "preemption model accessors" to simplify some of the
  Kconfig complexity (a short illustrative sketch follows this list).

- Make softirq handling RT-safe.

- Misc smaller fixes & cleanups.
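
The "preemption model accessors" mentioned above are the preempt_model_none()/
preempt_model_voluntary()/preempt_model_full()/preempt_model_rt()/
preempt_model_preemptible() helpers that the ftrace and KCSAN hunks further
down switch to. A minimal sketch of the pattern, purely illustrative
(preempt_model_str() is a made-up helper, not something in the tree):

    /* Query the preemption model at run time instead of open-coding
     * #ifdef CONFIG_PREEMPT* ladders; with PREEMPT_DYNAMIC the model can
     * only be known at run time anyway, which is what the accessors handle. */
    static const char *preempt_model_str(void)
    {
            if (preempt_model_none())
                    return "server";
            if (preempt_model_voluntary())
                    return "desktop";
            if (preempt_model_full())
                    return "preempt";
            if (preempt_model_rt())
                    return "preempt_rt";
            return "unknown";
    }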

* tag 'sched-core-2022-05-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
topology: Remove unused cpu_cluster_mask()
sched: Reverse sched_class layout
sched/deadline: Remove superfluous rq clock update in push_dl_task()
sched/core: Avoid obvious double update_rq_clock warning
smp: Make softirq handling RT safe in flush_smp_call_function_queue()
smp: Rename flush_smp_call_function_from_idle()
sched: Fix missing prototype warnings
sched/fair: Remove cfs_rq_tg_path()
sched/fair: Remove sched_trace_*() helper functions
sched/fair: Refactor cpu_util_without()
sched/fair: Revise comment about lb decision matrix
sched/psi: report zeroes for CPU full at the system level
sched/fair: Delete useless condition in tg_unthrottle_up()
sched/fair: Fix cfs_rq_clock_pelt() for throttled cfs_rq
sched/fair: Move calculate of avg_load to a better location
mailmap: Update my email address to @redhat.com
MAINTAINERS: Add myself as scheduler topology reviewer
psi: Fix trigger being fired unexpectedly at initial
ftrace: Use preemption model accessors for trace header printout
kcsan: Use preemption model accessors

+219 -337
+1
.mailmap
···
  Vasily Averin <vasily.averin@linux.dev> <vvs@openvz.org>
  Vasily Averin <vasily.averin@linux.dev> <vvs@parallels.com>
  Vasily Averin <vasily.averin@linux.dev> <vvs@sw.ru>
+ Valentin Schneider <vschneid@redhat.com> <valentin.schneider@arm.com>
  Vinod Koul <vkoul@kernel.org> <vinod.koul@intel.com>
  Vinod Koul <vkoul@kernel.org> <vinod.koul@linux.intel.com>
  Vinod Koul <vkoul@kernel.org> <vkoul@infradead.org>
+4 -5
Documentation/accounting/psi.rst
···
  Pressure information for each resource is exported through the
  respective file in /proc/pressure/ -- cpu, memory, and io.

- The format for CPU is as such::
-
-   some avg10=0.00 avg60=0.00 avg300=0.00 total=0
-
- and for memory and IO::
+ The format is as such::

    some avg10=0.00 avg60=0.00 avg300=0.00 total=0
    full avg10=0.00 avg60=0.00 avg300=0.00 total=0
···
  situation from a state where some tasks are stalled but the CPU is
  still doing productive work. As such, time spent in this subset of the
  stall state is tracked separately and exported in the "full" averages.
+
+ CPU full is undefined at the system level, but has been reported
+ since 5.13, so it is set to zero for backward compatibility.

  The ratios (in %) are tracked as recent trends over ten, sixty, and
  three hundred second windows, which gives insight into short term events
+1
MAINTAINERS
···
  R:    Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
  R:    Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
  R:    Daniel Bristot de Oliveira <bristot@redhat.com> (SCHED_DEADLINE)
+ R:    Valentin Schneider <vschneid@redhat.com> (TOPOLOGY)
  L:    linux-kernel@vger.kernel.org
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
+6 -6
include/asm-generic/vmlinux.lds.h
···
   */
  #define SCHED_DATA                            \
          STRUCT_ALIGN();                       \
-         __begin_sched_classes = .;            \
-         *(__idle_sched_class)                 \
-         *(__fair_sched_class)                 \
-         *(__rt_sched_class)                   \
-         *(__dl_sched_class)                   \
+         __sched_class_highest = .;            \
          *(__stop_sched_class)                 \
-         __end_sched_classes = .;
+         *(__dl_sched_class)                   \
+         *(__rt_sched_class)                   \
+         *(__fair_sched_class)                 \
+         *(__idle_sched_class)                 \
+         __sched_class_lowest = .;

  /* The actual configuration determine if the init/exit sections
   * are handled as text/data or they can be discarded (which
+9
include/linux/interrupt.h
···
  asmlinkage void do_softirq(void);
  asmlinkage void __do_softirq(void);

+ #ifdef CONFIG_PREEMPT_RT
+ extern void do_softirq_post_smp_call_flush(unsigned int was_pending);
+ #else
+ static inline void do_softirq_post_smp_call_flush(unsigned int unused)
+ {
+         do_softirq();
+ }
+ #endif
+
  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
  extern void softirq_init(void);
  extern void __raise_softirq_irqoff(unsigned int nr);
+2 -14
include/linux/sched.h
···

  #endif

- const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);
- char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);
- int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);
-
- const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq);
- const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
- const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
-
- int sched_trace_rq_cpu(struct rq *rq);
- int sched_trace_rq_cpu_capacity(struct rq *rq);
- int sched_trace_rq_nr_running(struct rq *rq);
-
- const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
-
  #ifdef CONFIG_SCHED_CORE
  extern void sched_core_free(struct task_struct *tsk);
  extern void sched_core_fork(struct task_struct *p);
···
  static inline void sched_core_free(struct task_struct *tsk) { }
  static inline void sched_core_fork(struct task_struct *p) { }
  #endif
+
+ extern void sched_set_stop_task(int cpu, struct task_struct *stop);

  #endif
-7
include/linux/topology.h
···
  }
  #endif

- #if defined(CONFIG_SCHED_CLUSTER) && !defined(cpu_cluster_mask)
- static inline const struct cpumask *cpu_cluster_mask(int cpu)
- {
-         return topology_cluster_cpumask(cpu);
- }
- #endif
-
  static inline const struct cpumask *cpu_cpu_mask(int cpu)
  {
          return cpumask_of_node(cpu_to_node(cpu));
+3 -2
kernel/kcsan/kcsan_test.c
···
          else
                  nthreads *= 2;

-         if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
+         if (!preempt_model_preemptible() ||
+             !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
                  /*
                   * Without any preemption, keep 2 CPUs free for other tasks, one
                   * of which is the main test case function checking for
                   * completion or failure.
                   */
-                 const long min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0;
+                 const long min_unused_cpus = preempt_model_none() ? 2 : 0;
                  const long min_required_cpus = 2 + min_unused_cpus;

                  if (num_online_cpus() < min_required_cpus) {
+2
kernel/sched/build_policy.c
···
  /* Headers: */
  #include <linux/sched/clock.h>
  #include <linux/sched/cputime.h>
+ #include <linux/sched/hotplug.h>
  #include <linux/sched/posix-timers.h>
  #include <linux/sched/rt.h>

···
  #include <uapi/linux/sched/types.h>

  #include "sched.h"
+ #include "smp.h"

  #include "autogroup.h"
  #include "stats.h"
+1
kernel/sched/build_utility.c
···
  #include <linux/sched/debug.h>
  #include <linux/sched/isolation.h>
  #include <linux/sched/loadavg.h>
+ #include <linux/sched/nohz.h>
  #include <linux/sched/mm.h>
  #include <linux/sched/rseq_api.h>
  #include <linux/sched/task_stack.h>
+13 -10
kernel/sched/core.c
···
  #include <linux/topology.h>
  #include <linux/sched/clock.h>
  #include <linux/sched/cond_resched.h>
+ #include <linux/sched/cputime.h>
  #include <linux/sched/debug.h>
+ #include <linux/sched/hotplug.h>
+ #include <linux/sched/init.h>
  #include <linux/sched/isolation.h>
  #include <linux/sched/loadavg.h>
  #include <linux/sched/mm.h>
···
                  swap(rq1, rq2);

          raw_spin_rq_lock(rq1);
-         if (__rq_lockp(rq1) == __rq_lockp(rq2))
-                 return;
+         if (__rq_lockp(rq1) != __rq_lockp(rq2))
+                 raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);

-         raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+         double_rq_clock_clear_update(rq1, rq2);
  }
  #endif
···
  {
          if (p->sched_class == rq->curr->sched_class)
                  rq->curr->sched_class->check_preempt_curr(rq, p, flags);
-         else if (p->sched_class > rq->curr->sched_class)
+         else if (sched_class_above(p->sched_class, rq->curr->sched_class))
                  resched_curr(rq);

          /*
···
           * __migrate_task() such that we will not miss enforcing cpus_ptr
           * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
           */
-         flush_smp_call_function_from_idle();
+         flush_smp_call_function_queue();

          raw_spin_lock(&p->pi_lock);
          rq_lock(rq, &rf);
···
           * higher scheduling class, because otherwise those lose the
           * opportunity to pull in more work from other CPUs.
           */
-         if (likely(prev->sched_class <= &fair_sched_class &&
+         if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
                     rq->nr_running == rq->cfs.h_nr_running)) {

                  p = pick_next_task_fair(rq, prev, rf);
···
          int i;

          /* Make sure the linker didn't screw up */
-         BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
-                &fair_sched_class + 1 != &rt_sched_class ||
-                &rt_sched_class + 1   != &dl_sched_class);
+         BUG_ON(&idle_sched_class != &fair_sched_class + 1 ||
+                &fair_sched_class != &rt_sched_class + 1  ||
+                &rt_sched_class   != &dl_sched_class + 1);
  #ifdef CONFIG_SMP
-         BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
+         BUG_ON(&dl_sched_class != &stop_sched_class + 1);
  #endif

          wait_bit_init();
+4 -11
kernel/sched/deadline.c
···
          return (dl_se->runtime <= 0);
  }

- extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-
  /*
   * This function implements the GRUB accounting rule:
   * according to the GRUB reclaiming algorithm, the runtime is
···

  static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
  {
+         struct rq_flags rf;
          struct rq *rq;

          if (READ_ONCE(p->__state) != TASK_WAKING)
···
           * from try_to_wake_up(). Hence, p->pi_lock is locked, but
           * rq->lock is not... So, lock it
           */
-         raw_spin_rq_lock(rq);
+         rq_lock(rq, &rf);
          if (p->dl.dl_non_contending) {
                  update_rq_clock(rq);
                  sub_running_bw(&p->dl, &rq->dl);
···
                  put_task_struct(p);
          }
          sub_rq_bw(&p->dl, &rq->dl);
-         raw_spin_rq_unlock(rq);
+         rq_unlock(rq, &rf);
  }

  static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
···

          deactivate_task(rq, next_task, 0);
          set_task_cpu(next_task, later_rq->cpu);
-
-         /*
-          * Update the later_rq clock here, because the clock is used
-          * by the cpufreq_update_util() inside __add_running_bw().
-          */
-         update_rq_clock(later_rq);
-         activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
+         activate_task(later_rq, next_task, 0);
          ret = 1;

          resched_curr(later_rq);
+73 -231
kernel/sched/fair.c
···
  #include <linux/sched/cond_resched.h>
  #include <linux/sched/cputime.h>
  #include <linux/sched/isolation.h>
+ #include <linux/sched/nohz.h>

  #include <linux/cpuidle.h>
  #include <linux/interrupt.h>
···
  #define for_each_sched_entity(se) \
                  for (; se; se = se->parent)

- static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
- {
-         if (!path)
-                 return;
-
-         if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
-                 autogroup_path(cfs_rq->tg, path, len);
-         else if (cfs_rq && cfs_rq->tg->css.cgroup)
-                 cgroup_path(cfs_rq->tg->css.cgroup, path, len);
-         else
-                 strlcpy(path, "(null)", len);
- }
-
  static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
          struct rq *rq = rq_of(cfs_rq);
···

  #define for_each_sched_entity(se) \
                  for (; se; se = NULL)
-
- static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
- {
-         if (path)
-                 strlcpy(path, "(null)", len);
- }

  static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
···

          cfs_rq->throttle_count--;
          if (!cfs_rq->throttle_count) {
-                 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
-                                              cfs_rq->throttled_clock_task;
+                 cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+                                              cfs_rq->throttled_clock_pelt;

                  /* Add cfs_rq with load or one or more already running entities to the list */
-                 if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
+                 if (!cfs_rq_is_decayed(cfs_rq))
                          list_add_leaf_cfs_rq(cfs_rq);
          }

···

          /* group is entering throttled state, stop time */
          if (!cfs_rq->throttle_count) {
-                 cfs_rq->throttled_clock_task = rq_clock_task(rq);
+                 cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
                  list_del_leaf_cfs_rq(cfs_rq);
          }
          cfs_rq->throttle_count++;
···
          pcfs_rq = tg->parent->cfs_rq[cpu];

          cfs_rq->throttle_count = pcfs_rq->throttle_count;
-         cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+         cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
  }

  /* conditionally throttle active cfs_rq's from put_prev_entity() */
···
  }

  /*
+  * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
+  * (@dst_cpu = -1) or migrated to @dst_cpu.
+  */
+ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+ {
+         struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+         unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
+
+         /*
+          * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
+          * contribution. If @p migrates from another CPU to @cpu add its
+          * contribution. In all the other cases @cpu is not impacted by the
+          * migration so its util_avg is already correct.
+          */
+         if (task_cpu(p) == cpu && dst_cpu != cpu)
+                 lsub_positive(&util, task_util(p));
+         else if (task_cpu(p) != cpu && dst_cpu == cpu)
+                 util += task_util(p);
+
+         if (sched_feat(UTIL_EST)) {
+                 unsigned long util_est;
+
+                 util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+                 /*
+                  * During wake-up @p isn't enqueued yet and doesn't contribute
+                  * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
+                  * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
+                  * has been enqueued.
+                  *
+                  * During exec (@dst_cpu = -1) @p is enqueued and does
+                  * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
+                  * Remove it to "simulate" cpu_util without @p's contribution.
+                  *
+                  * Despite the task_on_rq_queued(@p) check there is still a
+                  * small window for a possible race when an exec
+                  * select_task_rq_fair() races with LB's detach_task().
+                  *
+                  * detach_task()
+                  *   deactivate_task()
+                  *     p->on_rq = TASK_ON_RQ_MIGRATING;
+                  *     -------------------------------- A
+                  *     dequeue_task()                    \
+                  *       dequeue_task_fair()              + Race Time
+                  *         util_est_dequeue()            /
+                  *     -------------------------------- B
+                  *
+                  * The additional check "current == p" is required to further
+                  * reduce the race window.
+                  */
+                 if (dst_cpu == cpu)
+                         util_est += _task_util_est(p);
+                 else if (unlikely(task_on_rq_queued(p) || current == p))
+                         lsub_positive(&util_est, _task_util_est(p));
+
+                 util = max(util, util_est);
+         }
+
+         return min(util, capacity_orig_of(cpu));
+ }
+
+ /*
   * cpu_util_without: compute cpu utilization without any contributions from *p
   * @cpu: the CPU which utilization is requested
   * @p: the task which utilization should be discounted
···
   */
  static unsigned long cpu_util_without(int cpu, struct task_struct *p)
  {
-         struct cfs_rq *cfs_rq;
-         unsigned int util;
-
          /* Task has no contribution or is new */
          if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
                  return cpu_util_cfs(cpu);

-         cfs_rq = &cpu_rq(cpu)->cfs;
-         util = READ_ONCE(cfs_rq->avg.util_avg);
-
-         /* Discount task's util from CPU's util */
-         lsub_positive(&util, task_util(p));
-
-         /*
-          * Covered cases:
···
-          */
-         if (sched_feat(UTIL_EST)) {
-                 unsigned int estimated =
-                         READ_ONCE(cfs_rq->avg.util_est.enqueued);
-
-                 /*
-                  * Despite the following checks we still have a small window
-                  * for a possible race, when an execl's select_task_rq_fair()
-                  * races with LB's detach_task():
-                  *
-                  *   detach_task()
-                  *     p->on_rq = TASK_ON_RQ_MIGRATING;
-                  *     ---------------------------------- A
-                  *     deactivate_task()                  \
-                  *       dequeue_task()                    + RaceTime
-                  *         util_est_dequeue()             /
-                  *     ---------------------------------- B
-                  *
-                  * The additional check on "current == p" it's required to
-                  * properly fix the execl regression and it helps in further
-                  * reducing the chances for the above race.
-                  */
-                 if (unlikely(task_on_rq_queued(p) || current == p))
-                         lsub_positive(&estimated, _task_util_est(p));
-
-                 util = max(util, estimated);
-         }
-
-         /*
-          * Utilization (estimated) can exceed the CPU capacity, thus let's
-          * clamp to the maximum CPU capacity to ensure consistency with
-          * cpu_util.
-          */
-         return min_t(unsigned long, util, capacity_orig_of(cpu));
- }
-
- /*
-  * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
-  * to @dst_cpu.
-  */
- static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
- {
-         struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
-         unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
-
-         /*
-          * If @p migrates from @cpu to another, remove its contribution. Or,
-          * if @p migrates from another CPU to @cpu, add its contribution. In
-          * the other cases, @cpu is not impacted by the migration, so the
-          * util_avg should already be correct.
-          */
-         if (task_cpu(p) == cpu && dst_cpu != cpu)
-                 lsub_positive(&util, task_util(p));
-         else if (task_cpu(p) != cpu && dst_cpu == cpu)
-                 util += task_util(p);
-
-         if (sched_feat(UTIL_EST)) {
-                 util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
-
-                 /*
-                  * During wake-up, the task isn't enqueued yet and doesn't
-                  * appear in the cfs_rq->avg.util_est.enqueued of any rq,
-                  * so just add it (if needed) to "simulate" what will be
-                  * cpu_util after the task has been enqueued.
-                  */
-                 if (dst_cpu == cpu)
-                         util_est += _task_util_est(p);
-
-                 util = max(util, util_est);
-         }
-
-         return min(util, capacity_orig_of(cpu));
+         return cpu_util_next(cpu, p, -1);
  }

  /*
···
          local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
                            local->group_capacity;

-         sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
-                         sds->total_capacity;
          /*
           * If the local group is more loaded than the selected
           * busiest group don't try to pull any tasks.
···
                  env->imbalance = 0;
                  return;
          }
+
+         sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
+                         sds->total_capacity;
  }

  /*
···
   * busiest \ local  has_spare  fully_busy  misfit  asym  imbalanced  overloaded
   * has_spare        nr_idle    balanced    N/A     N/A   balanced    balanced
   * fully_busy       nr_idle    nr_idle     N/A     N/A   balanced    balanced
-  * misfit_task      force      N/A         N/A     N/A   force       force
+  * misfit_task      force      N/A         N/A     N/A   N/A         N/A
   * asym_packing     force      force       N/A     N/A   force       force
   * imbalanced       force      force       N/A     N/A   force       force
   * overloaded       force      force       N/A     N/A   force       avg_load
···
  #endif /* SMP */

  }
-
- /*
-  * Helper functions to facilitate extracting info from tracepoints.
-  */
-
- const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
- {
- #ifdef CONFIG_SMP
-         return cfs_rq ? &cfs_rq->avg : NULL;
- #else
-         return NULL;
- #endif
- }
- EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
-
- char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
- {
-         if (!cfs_rq) {
-                 if (str)
-                         strlcpy(str, "(null)", len);
-                 else
-                         return NULL;
-         }
-
-         cfs_rq_tg_path(cfs_rq, str, len);
-         return str;
- }
- EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
-
- int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
- {
-         return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
- }
- EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
-
- const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
- {
- #ifdef CONFIG_SMP
-         return rq ? &rq->avg_rt : NULL;
- #else
-         return NULL;
- #endif
- }
- EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
-
- const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
- {
- #ifdef CONFIG_SMP
-         return rq ? &rq->avg_dl : NULL;
- #else
-         return NULL;
- #endif
- }
- EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
-
- const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
- {
- #if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
-         return rq ? &rq->avg_irq : NULL;
- #else
-         return NULL;
- #endif
- }
- EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
-
- int sched_trace_rq_cpu(struct rq *rq)
- {
-         return rq ? cpu_of(rq) : -1;
- }
- EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
-
- int sched_trace_rq_cpu_capacity(struct rq *rq)
- {
-         return rq ?
- #ifdef CONFIG_SMP
-                 rq->cpu_capacity
- #else
-                 SCHED_CAPACITY_SCALE
- #endif
-                 : -1;
- }
- EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
-
- const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
- {
- #ifdef CONFIG_SMP
-         return rd ? rd->span : NULL;
- #else
-         return NULL;
- #endif
- }
- EXPORT_SYMBOL_GPL(sched_trace_rd_span);
-
- int sched_trace_rq_nr_running(struct rq *rq)
- {
-         return rq ? rq->nr_running : -1;
- }
- EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
+1 -1
kernel/sched/idle.c
···
           * RCU relies on this call to be done outside of an RCU read-side
           * critical section.
           */
-         flush_smp_call_function_from_idle();
+         flush_smp_call_function_queue();
          schedule_idle();

          if (unlikely(klp_patch_pending(current)))
+2 -2
kernel/sched/pelt.h
···
  static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
  {
          if (unlikely(cfs_rq->throttle_count))
-                 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
+                 return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;

-         return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+         return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
  }
  #else
  static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+11 -7
kernel/sched/psi.c
···
          mutex_unlock(&group->avgs_lock);

          for (full = 0; full < 2; full++) {
-                 unsigned long avg[3];
-                 u64 total;
+                 unsigned long avg[3] = { 0, };
+                 u64 total = 0;
                  int w;

-                 for (w = 0; w < 3; w++)
-                         avg[w] = group->avg[res * 2 + full][w];
-                 total = div_u64(group->total[PSI_AVGS][res * 2 + full],
-                                 NSEC_PER_USEC);
+                 /* CPU FULL is undefined at the system level */
+                 if (!(group == &psi_system && res == PSI_CPU && full)) {
+                         for (w = 0; w < 3; w++)
+                                 avg[w] = group->avg[res * 2 + full][w];
+                         total = div_u64(group->total[PSI_AVGS][res * 2 + full],
+                                         NSEC_PER_USEC);
+                 }

                  seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
                             full ? "full" : "some",
···
          t->state = state;
          t->threshold = threshold_us * NSEC_PER_USEC;
          t->win.size = window_us * NSEC_PER_USEC;
-         window_reset(&t->win, 0, 0, 0);
+         window_reset(&t->win, sched_clock(),
+                      group->total[PSI_POLL][t->state], 0);

          t->event = 0;
          t->last_event_time = 0;
+3 -2
kernel/sched/rt.c
···
          int enqueue = 0;
          struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
          struct rq *rq = rq_of_rt_rq(rt_rq);
+         struct rq_flags rf;
          int skip;

          /*
···
          if (skip)
                  continue;

-         raw_spin_rq_lock(rq);
+         rq_lock(rq, &rf);
          update_rq_clock(rq);

          if (rt_rq->rt_time) {
···

          if (enqueue)
                  sched_rt_rq_enqueue(rt_rq);
-         raw_spin_rq_unlock(rq);
+         rq_unlock(rq, &rf);
  }

  if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
+36 -19
kernel/sched/sched.h
···
          s64                     runtime_remaining;

          u64                     throttled_clock;
-         u64                     throttled_clock_task;
-         u64                     throttled_clock_task_time;
+         u64                     throttled_clock_pelt;
+         u64                     throttled_clock_pelt_time;
          int                     throttled;
          int                     throttle_count;
          struct list_head        throttled_list;
···
  #endif

  extern int sched_update_scaling(void);
-
- extern void flush_smp_call_function_from_idle(void);
-
- #else /* !CONFIG_SMP: */
- static inline void flush_smp_call_function_from_idle(void) { }
- #endif
+ #endif /* CONFIG_SMP */

  #include "stats.h"
···
   *
   *   include/asm-generic/vmlinux.lds.h
   *
+  * *CAREFUL* they are laid out in *REVERSE* order!!!
+  *
   * Also enforce alignment on the instance, not the type, to guarantee layout.
   */
  #define DEFINE_SCHED_CLASS(name) \
···
          __section("__" #name "_sched_class")

  /* Defined in include/asm-generic/vmlinux.lds.h */
- extern struct sched_class __begin_sched_classes[];
- extern struct sched_class __end_sched_classes[];
-
- #define sched_class_highest (__end_sched_classes - 1)
- #define sched_class_lowest  (__begin_sched_classes - 1)
+ extern struct sched_class __sched_class_highest[];
+ extern struct sched_class __sched_class_lowest[];

  #define for_class_range(class, _from, _to) \
-         for (class = (_from); class != (_to); class--)
+         for (class = (_from); class < (_to); class++)

  #define for_each_class(class) \
-         for_class_range(class, sched_class_highest, sched_class_lowest)
+         for_class_range(class, __sched_class_highest, __sched_class_lowest)
+
+ #define sched_class_above(_a, _b)       ((_a) < (_b))

  extern const struct sched_class stop_sched_class;
  extern const struct sched_class dl_sched_class;
···

  extern struct rt_bandwidth def_rt_bandwidth;
  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);

  extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
  extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
···
  }
  #endif

+ #ifdef CONFIG_SCHED_DEBUG
+ /*
+  * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to
+  * acquire rq lock instead of rq_lock(). So at the end of these two functions
+  * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of
+  * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
+  */
+ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
+ {
+         rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+         /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */
+ #ifdef CONFIG_SMP
+         rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ #endif
+ }
+ #else
+ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
+ #endif

  #ifdef CONFIG_SMP
···
          __acquires(busiest->lock)
          __acquires(this_rq->lock)
  {
-         if (__rq_lockp(this_rq) == __rq_lockp(busiest))
+         if (__rq_lockp(this_rq) == __rq_lockp(busiest) ||
+             likely(raw_spin_rq_trylock(busiest))) {
+                 double_rq_clock_clear_update(this_rq, busiest);
                  return 0;
-
-         if (likely(raw_spin_rq_trylock(busiest)))
-                 return 0;
+         }

          if (rq_order_less(this_rq, busiest)) {
                  raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
+                 double_rq_clock_clear_update(this_rq, busiest);
                  return 0;
          }

···
          BUG_ON(rq1 != rq2);
          raw_spin_rq_lock(rq1);
          __acquire(rq2->lock);   /* Fake it out ;) */
+         double_rq_clock_clear_update(rq1, rq2);
  }

  /*
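
With the reversed section layout (see the vmlinux.lds.h hunk earlier), "higher
priority" now means "lower address", which is why sched_class_above() above is
a bare pointer comparison. A rough illustration of how iteration reads after
this change, mirroring the for_class_range()/for_each_class() definitions in
the hunk rather than adding anything new:

    /* Visits stop, dl, rt, fair and idle in priority order: the
     * highest-priority class now sits at the lowest address, so the walk
     * is a simple pointer increment and sched_class_above(a, b) reduces
     * to the address test (a < b). */
    const struct sched_class *class;

    for (class = __sched_class_highest; class < __sched_class_lowest; class++)
            ;       /* e.g. try class->pick_next_task(rq) here */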
+6
kernel/sched/smp.h
···
  extern void sched_ttwu_pending(void *arg);

  extern void send_call_function_single_ipi(int cpu);
+
+ #ifdef CONFIG_SMP
+ extern void flush_smp_call_function_queue(void);
+ #else
+ static inline void flush_smp_call_function_queue(void) { }
+ #endif
+24 -8
kernel/smp.c
···

  static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);

- static void flush_smp_call_function_queue(bool warn_cpu_offline);
+ static void __flush_smp_call_function_queue(bool warn_cpu_offline);

  int smpcfd_prepare_cpu(unsigned int cpu)
  {
···
           * ensure that the outgoing CPU doesn't go offline with work
           * still pending.
           */
-         flush_smp_call_function_queue(false);
+         __flush_smp_call_function_queue(false);
          irq_work_run();
          return 0;
  }
···
  {
          cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU,
                        smp_processor_id(), CFD_SEQ_GOTIPI);
-         flush_smp_call_function_queue(true);
+         __flush_smp_call_function_queue(true);
  }

  /**
-  * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+  * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks
   *
   * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
   *                    offline CPU. Skip this check if set to 'false'.
···
   * Loop through the call_single_queue and run all the queued callbacks.
   * Must be called with interrupts disabled.
   */
- static void flush_smp_call_function_queue(bool warn_cpu_offline)
+ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
  {
          call_single_data_t *csd, *csd_next;
          struct llist_node *entry, *prev;
···
                             smp_processor_id(), CFD_SEQ_HDLEND);
  }

- void flush_smp_call_function_from_idle(void)
+
+ /**
+  * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+  *                                 from task context (idle, migration thread)
+  *
+  * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it
+  * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by
+  * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to
+  * handle queued SMP function calls before scheduling.
+  *
+  * The migration thread has to ensure that an eventually pending wakeup has
+  * been handled before it migrates a task.
+  */
+ void flush_smp_call_function_queue(void)
  {
+         unsigned int was_pending;
          unsigned long flags;

          if (llist_empty(this_cpu_ptr(&call_single_queue)))
···
          cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU,
                        smp_processor_id(), CFD_SEQ_IDLE);
          local_irq_save(flags);
-         flush_smp_call_function_queue(true);
+         /* Get the already pending soft interrupts for RT enabled kernels */
+         was_pending = local_softirq_pending();
+         __flush_smp_call_function_queue(true);
          if (local_softirq_pending())
-                 do_softirq();
+                 do_softirq_post_smp_call_flush(was_pending);

          local_irq_restore(flags);
  }
+13
kernel/softirq.c
···
                  wakeup_softirqd();
  }

+ /*
+  * flush_smp_call_function_queue() can raise a soft interrupt in a function
+  * call. On RT kernels this is undesired and the only known functionality
+  * in the block layer which does this is disabled on RT. If soft interrupts
+  * get raised which haven't been raised before the flush, warn so it can be
+  * investigated.
+  */
+ void do_softirq_post_smp_call_flush(unsigned int was_pending)
+ {
+         if (WARN_ON_ONCE(was_pending != local_softirq_pending()))
+                 invoke_softirq();
+ }
+
  #else /* CONFIG_PREEMPT_RT */

  /*
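
Taken together, the interrupt.h, kernel/smp.c and kernel/softirq.c hunks make
the task-context flush path look roughly like the condensed sketch below (the
llist_empty() early exit and the cfd_seq tracing are omitted for brevity; see
the hunks above for the actual code):

    /* Called from the idle task or the migration thread, cf. kernel/smp.c */
    void flush_smp_call_function_queue(void)
    {
            unsigned int was_pending;
            unsigned long flags;

            local_irq_save(flags);
            /* Note which softirqs were already pending before the flush ... */
            was_pending = local_softirq_pending();
            __flush_smp_call_function_queue(true);   /* run queued callbacks */
            if (local_softirq_pending())
                    /*
                     * ... so that on RT this helper can warn when the flush
                     * itself raised new ones; on !RT it is simply do_softirq().
                     */
                    do_softirq_post_smp_call_flush(was_pending);
            local_irq_restore(flags);
    }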
-2
kernel/stop_machine.c
···
          kthread_park(stopper->thread);
  }

- extern void sched_set_stop_task(int cpu, struct task_struct *stop);
-
  static void cpu_stop_create(unsigned int cpu)
  {
          sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));
+4 -10
kernel/trace/trace.c
···
                     entries,
                     total,
                     buf->cpu,
- #if defined(CONFIG_PREEMPT_NONE)
-                    "server",
- #elif defined(CONFIG_PREEMPT_VOLUNTARY)
-                    "desktop",
- #elif defined(CONFIG_PREEMPT)
-                    "preempt",
- #elif defined(CONFIG_PREEMPT_RT)
-                    "preempt_rt",
- #else
+                    preempt_model_none()      ? "server" :
+                    preempt_model_voluntary() ? "desktop" :
+                    preempt_model_full()      ? "preempt" :
+                    preempt_model_rt()        ? "preempt_rt" :
                     "unknown",
- #endif
                     /* These are reserved for later use */
                     0, 0, 0, 0);
  #ifdef CONFIG_SMP