Merge tag 'sched-core-2022-05-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

- Updates to scheduler metrics:
    - PELT fixes & enhancements
    - PSI fixes & enhancements
    - Refactor cpu_util_without()

- Updates to instrumentation/debugging:
    - Remove sched_trace_*() helper functions - can be done via debug info
    - Fix double update_rq_clock() warnings

- Introduce & use "preemption model accessors" to simplify some of the
Kconfig complexity.

- Make softirq handling RT-safe.

- Misc smaller fixes & cleanups.

* tag 'sched-core-2022-05-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
topology: Remove unused cpu_cluster_mask()
sched: Reverse sched_class layout
sched/deadline: Remove superfluous rq clock update in push_dl_task()
sched/core: Avoid obvious double update_rq_clock warning
smp: Make softirq handling RT safe in flush_smp_call_function_queue()
smp: Rename flush_smp_call_function_from_idle()
sched: Fix missing prototype warnings
sched/fair: Remove cfs_rq_tg_path()
sched/fair: Remove sched_trace_*() helper functions
sched/fair: Refactor cpu_util_without()
sched/fair: Revise comment about lb decision matrix
sched/psi: report zeroes for CPU full at the system level
sched/fair: Delete useless condition in tg_unthrottle_up()
sched/fair: Fix cfs_rq_clock_pelt() for throttled cfs_rq
sched/fair: Move calculate of avg_load to a better location
mailmap: Update my email address to @redhat.com
MAINTAINERS: Add myself as scheduler topology reviewer
psi: Fix trigger being fired unexpectedly at initial
ftrace: Use preemption model accessors for trace header printout
kcsan: Use preemption model accessors

+219 -337
+1
.mailmap
··· 398 Vasily Averin <vasily.averin@linux.dev> <vvs@openvz.org> 399 Vasily Averin <vasily.averin@linux.dev> <vvs@parallels.com> 400 Vasily Averin <vasily.averin@linux.dev> <vvs@sw.ru> 401 Vinod Koul <vkoul@kernel.org> <vinod.koul@intel.com> 402 Vinod Koul <vkoul@kernel.org> <vinod.koul@linux.intel.com> 403 Vinod Koul <vkoul@kernel.org> <vkoul@infradead.org>
··· 398 Vasily Averin <vasily.averin@linux.dev> <vvs@openvz.org> 399 Vasily Averin <vasily.averin@linux.dev> <vvs@parallels.com> 400 Vasily Averin <vasily.averin@linux.dev> <vvs@sw.ru> 401 + Valentin Schneider <vschneid@redhat.com> <valentin.schneider@arm.com> 402 Vinod Koul <vkoul@kernel.org> <vinod.koul@intel.com> 403 Vinod Koul <vkoul@kernel.org> <vinod.koul@linux.intel.com> 404 Vinod Koul <vkoul@kernel.org> <vkoul@infradead.org>
+4 -5
Documentation/accounting/psi.rst
··· 37 Pressure information for each resource is exported through the 38 respective file in /proc/pressure/ -- cpu, memory, and io. 39 40 - The format for CPU is as such:: 41 - 42 - some avg10=0.00 avg60=0.00 avg300=0.00 total=0 43 - 44 - and for memory and IO:: 45 46 some avg10=0.00 avg60=0.00 avg300=0.00 total=0 47 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 ··· 53 situation from a state where some tasks are stalled but the CPU is 54 still doing productive work. As such, time spent in this subset of the 55 stall state is tracked separately and exported in the "full" averages. 56 57 The ratios (in %) are tracked as recent trends over ten, sixty, and 58 three hundred second windows, which gives insight into short term events
··· 37 Pressure information for each resource is exported through the 38 respective file in /proc/pressure/ -- cpu, memory, and io. 39 40 + The format is as such:: 41 42 some avg10=0.00 avg60=0.00 avg300=0.00 total=0 43 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 ··· 57 situation from a state where some tasks are stalled but the CPU is 58 still doing productive work. As such, time spent in this subset of the 59 stall state is tracked separately and exported in the "full" averages. 60 + 61 + CPU full is undefined at the system level, but has been reported 62 + since 5.13, so it is set to zero for backward compatibility. 63 64 The ratios (in %) are tracked as recent trends over ten, sixty, and 65 three hundred second windows, which gives insight into short term events
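A minimal userspace sketch (illustration only, not part of the patch) that dumps the system-level CPU pressure file; per the documentation change above, the "full" line is still printed but carries only zeroes:

    /* Dump /proc/pressure/cpu; the system-level "full" line is all zeroes. */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/pressure/cpu", "r");
            char line[256];

            if (!f) {
                    perror("/proc/pressure/cpu");
                    return 1;
            }
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);    /* "some ..." then "full avg10=0.00 ..." */
            fclose(f);
            return 0;
    }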
+1
MAINTAINERS
··· 17524 R: Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH) 17525 R: Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING) 17526 R: Daniel Bristot de Oliveira <bristot@redhat.com> (SCHED_DEADLINE) 17527 L: linux-kernel@vger.kernel.org 17528 S: Maintained 17529 T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
··· 17524 R: Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH) 17525 R: Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING) 17526 R: Daniel Bristot de Oliveira <bristot@redhat.com> (SCHED_DEADLINE) 17527 + R: Valentin Schneider <vschneid@redhat.com> (TOPOLOGY) 17528 L: linux-kernel@vger.kernel.org 17529 S: Maintained 17530 T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
+6 -6
include/asm-generic/vmlinux.lds.h
··· 126 */ 127 #define SCHED_DATA \ 128 STRUCT_ALIGN(); \ 129 - __begin_sched_classes = .; \ 130 - *(__idle_sched_class) \ 131 - *(__fair_sched_class) \ 132 - *(__rt_sched_class) \ 133 - *(__dl_sched_class) \ 134 *(__stop_sched_class) \ 135 - __end_sched_classes = .; 136 137 /* The actual configuration determine if the init/exit sections 138 * are handled as text/data or they can be discarded (which
··· 126 */ 127 #define SCHED_DATA \ 128 STRUCT_ALIGN(); \ 129 + __sched_class_highest = .; \ 130 *(__stop_sched_class) \ 131 + *(__dl_sched_class) \ 132 + *(__rt_sched_class) \ 133 + *(__fair_sched_class) \ 134 + *(__idle_sched_class) \ 135 + __sched_class_lowest = .; 136 137 /* The actual configuration determine if the init/exit sections 138 * are handled as text/data or they can be discarded (which
+9
include/linux/interrupt.h
··· 589 asmlinkage void do_softirq(void); 590 asmlinkage void __do_softirq(void); 591 592 extern void open_softirq(int nr, void (*action)(struct softirq_action *)); 593 extern void softirq_init(void); 594 extern void __raise_softirq_irqoff(unsigned int nr);
··· 589 asmlinkage void do_softirq(void); 590 asmlinkage void __do_softirq(void); 591 592 + #ifdef CONFIG_PREEMPT_RT 593 + extern void do_softirq_post_smp_call_flush(unsigned int was_pending); 594 + #else 595 + static inline void do_softirq_post_smp_call_flush(unsigned int unused) 596 + { 597 + do_softirq(); 598 + } 599 + #endif 600 + 601 extern void open_softirq(int nr, void (*action)(struct softirq_action *)); 602 extern void softirq_init(void); 603 extern void __raise_softirq_irqoff(unsigned int nr);
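A hedged sketch of how the new helper is meant to be used (the real caller lands in kernel/smp.c further down in this diff): snapshot the softirqs that were already pending, flush the queued SMP function calls, then hand anything still pending to do_softirq_post_smp_call_flush(), which simply runs do_softirq() on !RT and, on RT, warns if the flush itself raised new softirqs before invoking them:

    #include <linux/interrupt.h>

    /* Illustrative only; the flow mirrors the kernel/smp.c change below. */
    static void example_flush_then_handle_softirqs(void)
    {
            unsigned int was_pending;
            unsigned long flags;

            local_irq_save(flags);
            was_pending = local_softirq_pending();
            /* ... run the queued smp-call-function callbacks here ... */
            if (local_softirq_pending())
                    do_softirq_post_smp_call_flush(was_pending);
            local_irq_restore(flags);
    }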
+2 -14
include/linux/sched.h
··· 2382 2383 #endif 2384 2385 - const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq); 2386 - char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len); 2387 - int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq); 2388 - 2389 - const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq); 2390 - const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq); 2391 - const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq); 2392 - 2393 - int sched_trace_rq_cpu(struct rq *rq); 2394 - int sched_trace_rq_cpu_capacity(struct rq *rq); 2395 - int sched_trace_rq_nr_running(struct rq *rq); 2396 - 2397 - const struct cpumask *sched_trace_rd_span(struct root_domain *rd); 2398 - 2399 #ifdef CONFIG_SCHED_CORE 2400 extern void sched_core_free(struct task_struct *tsk); 2401 extern void sched_core_fork(struct task_struct *p); ··· 2391 static inline void sched_core_free(struct task_struct *tsk) { } 2392 static inline void sched_core_fork(struct task_struct *p) { } 2393 #endif 2394 2395 #endif
··· 2382 2383 #endif 2384 2385 #ifdef CONFIG_SCHED_CORE 2386 extern void sched_core_free(struct task_struct *tsk); 2387 extern void sched_core_fork(struct task_struct *p); ··· 2405 static inline void sched_core_free(struct task_struct *tsk) { } 2406 static inline void sched_core_fork(struct task_struct *p) { } 2407 #endif 2408 + 2409 + extern void sched_set_stop_task(int cpu, struct task_struct *stop); 2410 2411 #endif
-7
include/linux/topology.h
··· 240 } 241 #endif 242 243 - #if defined(CONFIG_SCHED_CLUSTER) && !defined(cpu_cluster_mask) 244 - static inline const struct cpumask *cpu_cluster_mask(int cpu) 245 - { 246 - return topology_cluster_cpumask(cpu); 247 - } 248 - #endif 249 - 250 static inline const struct cpumask *cpu_cpu_mask(int cpu) 251 { 252 return cpumask_of_node(cpu_to_node(cpu));
··· 240 } 241 #endif 242 243 static inline const struct cpumask *cpu_cpu_mask(int cpu) 244 { 245 return cpumask_of_node(cpu_to_node(cpu));
+3 -2
kernel/kcsan/kcsan_test.c
··· 1380 else 1381 nthreads *= 2; 1382 1383 - if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { 1384 /* 1385 * Without any preemption, keep 2 CPUs free for other tasks, one 1386 * of which is the main test case function checking for 1387 * completion or failure. 1388 */ 1389 - const long min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0; 1390 const long min_required_cpus = 2 + min_unused_cpus; 1391 1392 if (num_online_cpus() < min_required_cpus) {
··· 1380 else 1381 nthreads *= 2; 1382 1383 + if (!preempt_model_preemptible() || 1384 + !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { 1385 /* 1386 * Without any preemption, keep 2 CPUs free for other tasks, one 1387 * of which is the main test case function checking for 1388 * completion or failure. 1389 */ 1390 + const long min_unused_cpus = preempt_model_none() ? 2 : 0; 1391 const long min_required_cpus = 2 + min_unused_cpus; 1392 1393 if (num_online_cpus() < min_required_cpus) {
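The accessors used here (and in the ftrace change at the end of this series) replace direct CONFIG_PREEMPT* checks and stay correct when PREEMPT_DYNAMIC selects the model at boot. A hedged kernel-context sketch of how they read:

    #include <linux/printk.h>
    #include <linux/sched.h>

    /* Illustrative only: report the active preemption model. */
    static void example_report_preempt_model(void)
    {
            if (preempt_model_rt())
                    pr_info("preemption model: rt\n");
            else if (preempt_model_full())
                    pr_info("preemption model: full\n");
            else if (preempt_model_voluntary())
                    pr_info("preemption model: voluntary\n");
            else if (preempt_model_none())
                    pr_info("preemption model: none\n");

            /* True for both the full and rt models: */
            if (preempt_model_preemptible())
                    pr_info("kernel is preemptible\n");
    }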
+2
kernel/sched/build_policy.c
··· 15 /* Headers: */ 16 #include <linux/sched/clock.h> 17 #include <linux/sched/cputime.h> 18 #include <linux/sched/posix-timers.h> 19 #include <linux/sched/rt.h> 20 ··· 32 #include <uapi/linux/sched/types.h> 33 34 #include "sched.h" 35 36 #include "autogroup.h" 37 #include "stats.h"
··· 15 /* Headers: */ 16 #include <linux/sched/clock.h> 17 #include <linux/sched/cputime.h> 18 + #include <linux/sched/hotplug.h> 19 #include <linux/sched/posix-timers.h> 20 #include <linux/sched/rt.h> 21 ··· 31 #include <uapi/linux/sched/types.h> 32 33 #include "sched.h" 34 + #include "smp.h" 35 36 #include "autogroup.h" 37 #include "stats.h"
+1
kernel/sched/build_utility.c
··· 14 #include <linux/sched/debug.h> 15 #include <linux/sched/isolation.h> 16 #include <linux/sched/loadavg.h> 17 #include <linux/sched/mm.h> 18 #include <linux/sched/rseq_api.h> 19 #include <linux/sched/task_stack.h>
··· 14 #include <linux/sched/debug.h> 15 #include <linux/sched/isolation.h> 16 #include <linux/sched/loadavg.h> 17 + #include <linux/sched/nohz.h> 18 #include <linux/sched/mm.h> 19 #include <linux/sched/rseq_api.h> 20 #include <linux/sched/task_stack.h>
+13 -10
kernel/sched/core.c
··· 26 #include <linux/topology.h> 27 #include <linux/sched/clock.h> 28 #include <linux/sched/cond_resched.h> 29 #include <linux/sched/debug.h> 30 #include <linux/sched/isolation.h> 31 #include <linux/sched/loadavg.h> 32 #include <linux/sched/mm.h> ··· 613 swap(rq1, rq2); 614 615 raw_spin_rq_lock(rq1); 616 - if (__rq_lockp(rq1) == __rq_lockp(rq2)) 617 - return; 618 619 - raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); 620 } 621 #endif 622 ··· 2193 { 2194 if (p->sched_class == rq->curr->sched_class) 2195 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 2196 - else if (p->sched_class > rq->curr->sched_class) 2197 resched_curr(rq); 2198 2199 /* ··· 2411 * __migrate_task() such that we will not miss enforcing cpus_ptr 2412 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. 2413 */ 2414 - flush_smp_call_function_from_idle(); 2415 2416 raw_spin_lock(&p->pi_lock); 2417 rq_lock(rq, &rf); ··· 5692 * higher scheduling class, because otherwise those lose the 5693 * opportunity to pull in more work from other CPUs. 5694 */ 5695 - if (likely(prev->sched_class <= &fair_sched_class && 5696 rq->nr_running == rq->cfs.h_nr_running)) { 5697 5698 p = pick_next_task_fair(rq, prev, rf); ··· 9472 int i; 9473 9474 /* Make sure the linker didn't screw up */ 9475 - BUG_ON(&idle_sched_class + 1 != &fair_sched_class || 9476 - &fair_sched_class + 1 != &rt_sched_class || 9477 - &rt_sched_class + 1 != &dl_sched_class); 9478 #ifdef CONFIG_SMP 9479 - BUG_ON(&dl_sched_class + 1 != &stop_sched_class); 9480 #endif 9481 9482 wait_bit_init();
··· 26 #include <linux/topology.h> 27 #include <linux/sched/clock.h> 28 #include <linux/sched/cond_resched.h> 29 + #include <linux/sched/cputime.h> 30 #include <linux/sched/debug.h> 31 + #include <linux/sched/hotplug.h> 32 + #include <linux/sched/init.h> 33 #include <linux/sched/isolation.h> 34 #include <linux/sched/loadavg.h> 35 #include <linux/sched/mm.h> ··· 610 swap(rq1, rq2); 611 612 raw_spin_rq_lock(rq1); 613 + if (__rq_lockp(rq1) != __rq_lockp(rq2)) 614 + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); 615 616 + double_rq_clock_clear_update(rq1, rq2); 617 } 618 #endif 619 ··· 2190 { 2191 if (p->sched_class == rq->curr->sched_class) 2192 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 2193 + else if (sched_class_above(p->sched_class, rq->curr->sched_class)) 2194 resched_curr(rq); 2195 2196 /* ··· 2408 * __migrate_task() such that we will not miss enforcing cpus_ptr 2409 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. 2410 */ 2411 + flush_smp_call_function_queue(); 2412 2413 raw_spin_lock(&p->pi_lock); 2414 rq_lock(rq, &rf); ··· 5689 * higher scheduling class, because otherwise those lose the 5690 * opportunity to pull in more work from other CPUs. 5691 */ 5692 + if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && 5693 rq->nr_running == rq->cfs.h_nr_running)) { 5694 5695 p = pick_next_task_fair(rq, prev, rf); ··· 9469 int i; 9470 9471 /* Make sure the linker didn't screw up */ 9472 + BUG_ON(&idle_sched_class != &fair_sched_class + 1 || 9473 + &fair_sched_class != &rt_sched_class + 1 || 9474 + &rt_sched_class != &dl_sched_class + 1); 9475 #ifdef CONFIG_SMP 9476 + BUG_ON(&dl_sched_class != &stop_sched_class + 1); 9477 #endif 9478 9479 wait_bit_init();
+4 -11
kernel/sched/deadline.c
··· 1220 return (dl_se->runtime <= 0); 1221 } 1222 1223 - extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); 1224 - 1225 /* 1226 * This function implements the GRUB accounting rule: 1227 * according to the GRUB reclaiming algorithm, the runtime is ··· 1830 1831 static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) 1832 { 1833 struct rq *rq; 1834 1835 if (READ_ONCE(p->__state) != TASK_WAKING) ··· 1842 * from try_to_wake_up(). Hence, p->pi_lock is locked, but 1843 * rq->lock is not... So, lock it 1844 */ 1845 - raw_spin_rq_lock(rq); 1846 if (p->dl.dl_non_contending) { 1847 update_rq_clock(rq); 1848 sub_running_bw(&p->dl, &rq->dl); ··· 1858 put_task_struct(p); 1859 } 1860 sub_rq_bw(&p->dl, &rq->dl); 1861 - raw_spin_rq_unlock(rq); 1862 } 1863 1864 static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) ··· 2318 2319 deactivate_task(rq, next_task, 0); 2320 set_task_cpu(next_task, later_rq->cpu); 2321 - 2322 - /* 2323 - * Update the later_rq clock here, because the clock is used 2324 - * by the cpufreq_update_util() inside __add_running_bw(). 2325 - */ 2326 - update_rq_clock(later_rq); 2327 - activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); 2328 ret = 1; 2329 2330 resched_curr(later_rq);
··· 1220 return (dl_se->runtime <= 0); 1221 } 1222 1223 /* 1224 * This function implements the GRUB accounting rule: 1225 * according to the GRUB reclaiming algorithm, the runtime is ··· 1832 1833 static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) 1834 { 1835 + struct rq_flags rf; 1836 struct rq *rq; 1837 1838 if (READ_ONCE(p->__state) != TASK_WAKING) ··· 1843 * from try_to_wake_up(). Hence, p->pi_lock is locked, but 1844 * rq->lock is not... So, lock it 1845 */ 1846 + rq_lock(rq, &rf); 1847 if (p->dl.dl_non_contending) { 1848 update_rq_clock(rq); 1849 sub_running_bw(&p->dl, &rq->dl); ··· 1859 put_task_struct(p); 1860 } 1861 sub_rq_bw(&p->dl, &rq->dl); 1862 + rq_unlock(rq, &rf); 1863 } 1864 1865 static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) ··· 2319 2320 deactivate_task(rq, next_task, 0); 2321 set_task_cpu(next_task, later_rq->cpu); 2322 + activate_task(later_rq, next_task, 0); 2323 ret = 1; 2324 2325 resched_curr(later_rq);
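The deadline change above (and the rt.c change below) converge on taking the runqueue lock through rq_lock()/rq_unlock() with struct rq_flags instead of raw_spin_rq_lock(), so rq_pin_lock()'s clock-update bookkeeping is not bypassed. A hedged sketch of the pattern, assuming kernel/sched/sched.h context:

    /* Illustrative only: rq access with proper pinning and clock update. */
    static void example_touch_rq(struct rq *rq)
    {
            struct rq_flags rf;

            rq_lock(rq, &rf);
            update_rq_clock(rq);
            /* ... modify rq state that relies on an up-to-date clock ... */
            rq_unlock(rq, &rf);
    }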
+73 -231
kernel/sched/fair.c
··· 36 #include <linux/sched/cond_resched.h> 37 #include <linux/sched/cputime.h> 38 #include <linux/sched/isolation.h> 39 40 #include <linux/cpuidle.h> 41 #include <linux/interrupt.h> ··· 314 #define for_each_sched_entity(se) \ 315 for (; se; se = se->parent) 316 317 - static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) 318 - { 319 - if (!path) 320 - return; 321 - 322 - if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) 323 - autogroup_path(cfs_rq->tg, path, len); 324 - else if (cfs_rq && cfs_rq->tg->css.cgroup) 325 - cgroup_path(cfs_rq->tg->css.cgroup, path, len); 326 - else 327 - strlcpy(path, "(null)", len); 328 - } 329 - 330 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 331 { 332 struct rq *rq = rq_of(cfs_rq); ··· 480 481 #define for_each_sched_entity(se) \ 482 for (; se; se = NULL) 483 - 484 - static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) 485 - { 486 - if (path) 487 - strlcpy(path, "(null)", len); 488 - } 489 490 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 491 { ··· 4828 4829 cfs_rq->throttle_count--; 4830 if (!cfs_rq->throttle_count) { 4831 - cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - 4832 - cfs_rq->throttled_clock_task; 4833 4834 /* Add cfs_rq with load or one or more already running entities to the list */ 4835 - if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running) 4836 list_add_leaf_cfs_rq(cfs_rq); 4837 } 4838 ··· 4846 4847 /* group is entering throttled state, stop time */ 4848 if (!cfs_rq->throttle_count) { 4849 - cfs_rq->throttled_clock_task = rq_clock_task(rq); 4850 list_del_leaf_cfs_rq(cfs_rq); 4851 } 4852 cfs_rq->throttle_count++; ··· 5290 pcfs_rq = tg->parent->cfs_rq[cpu]; 5291 5292 cfs_rq->throttle_count = pcfs_rq->throttle_count; 5293 - cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); 5294 } 5295 5296 /* conditionally throttle active cfs_rq's from put_prev_entity() */ ··· 6526 } 6527 6528 /* 6529 * cpu_util_without: compute cpu utilization without any contributions from *p 6530 * @cpu: the CPU which utilization is requested 6531 * @p: the task which utilization should be discounted ··· 6602 */ 6603 static unsigned long cpu_util_without(int cpu, struct task_struct *p) 6604 { 6605 - struct cfs_rq *cfs_rq; 6606 - unsigned int util; 6607 - 6608 /* Task has no contribution or is new */ 6609 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) 6610 return cpu_util_cfs(cpu); 6611 6612 - cfs_rq = &cpu_rq(cpu)->cfs; 6613 - util = READ_ONCE(cfs_rq->avg.util_avg); 6614 - 6615 - /* Discount task's util from CPU's util */ 6616 - lsub_positive(&util, task_util(p)); 6617 - 6618 - /* 6619 - * Covered cases: 6620 - * 6621 - * a) if *p is the only task sleeping on this CPU, then: 6622 - * cpu_util (== task_util) > util_est (== 0) 6623 - * and thus we return: 6624 - * cpu_util_without = (cpu_util - task_util) = 0 6625 - * 6626 - * b) if other tasks are SLEEPING on this CPU, which is now exiting 6627 - * IDLE, then: 6628 - * cpu_util >= task_util 6629 - * cpu_util > util_est (== 0) 6630 - * and thus we discount *p's blocked utilization to return: 6631 - * cpu_util_without = (cpu_util - task_util) >= 0 6632 - * 6633 - * c) if other tasks are RUNNABLE on that CPU and 6634 - * util_est > cpu_util 6635 - * then we use util_est since it returns a more restrictive 6636 - * estimation of the spare capacity on that CPU, by just 6637 - * considering the expected utilization of tasks already 6638 - * runnable on that CPU. 
6639 - * 6640 - * Cases a) and b) are covered by the above code, while case c) is 6641 - * covered by the following code when estimated utilization is 6642 - * enabled. 6643 - */ 6644 - if (sched_feat(UTIL_EST)) { 6645 - unsigned int estimated = 6646 - READ_ONCE(cfs_rq->avg.util_est.enqueued); 6647 - 6648 - /* 6649 - * Despite the following checks we still have a small window 6650 - * for a possible race, when an execl's select_task_rq_fair() 6651 - * races with LB's detach_task(): 6652 - * 6653 - * detach_task() 6654 - * p->on_rq = TASK_ON_RQ_MIGRATING; 6655 - * ---------------------------------- A 6656 - * deactivate_task() \ 6657 - * dequeue_task() + RaceTime 6658 - * util_est_dequeue() / 6659 - * ---------------------------------- B 6660 - * 6661 - * The additional check on "current == p" it's required to 6662 - * properly fix the execl regression and it helps in further 6663 - * reducing the chances for the above race. 6664 - */ 6665 - if (unlikely(task_on_rq_queued(p) || current == p)) 6666 - lsub_positive(&estimated, _task_util_est(p)); 6667 - 6668 - util = max(util, estimated); 6669 - } 6670 - 6671 - /* 6672 - * Utilization (estimated) can exceed the CPU capacity, thus let's 6673 - * clamp to the maximum CPU capacity to ensure consistency with 6674 - * cpu_util. 6675 - */ 6676 - return min_t(unsigned long, util, capacity_orig_of(cpu)); 6677 - } 6678 - 6679 - /* 6680 - * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) 6681 - * to @dst_cpu. 6682 - */ 6683 - static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) 6684 - { 6685 - struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; 6686 - unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg); 6687 - 6688 - /* 6689 - * If @p migrates from @cpu to another, remove its contribution. Or, 6690 - * if @p migrates from another CPU to @cpu, add its contribution. In 6691 - * the other cases, @cpu is not impacted by the migration, so the 6692 - * util_avg should already be correct. 6693 - */ 6694 - if (task_cpu(p) == cpu && dst_cpu != cpu) 6695 - lsub_positive(&util, task_util(p)); 6696 - else if (task_cpu(p) != cpu && dst_cpu == cpu) 6697 - util += task_util(p); 6698 - 6699 - if (sched_feat(UTIL_EST)) { 6700 - util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); 6701 - 6702 - /* 6703 - * During wake-up, the task isn't enqueued yet and doesn't 6704 - * appear in the cfs_rq->avg.util_est.enqueued of any rq, 6705 - * so just add it (if needed) to "simulate" what will be 6706 - * cpu_util after the task has been enqueued. 6707 - */ 6708 - if (dst_cpu == cpu) 6709 - util_est += _task_util_est(p); 6710 - 6711 - util = max(util, util_est); 6712 - } 6713 - 6714 - return min(util, capacity_orig_of(cpu)); 6715 } 6716 6717 /* ··· 9399 local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / 9400 local->group_capacity; 9401 9402 - sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / 9403 - sds->total_capacity; 9404 /* 9405 * If the local group is more loaded than the selected 9406 * busiest group don't try to pull any tasks. 
··· 9407 env->imbalance = 0; 9408 return; 9409 } 9410 } 9411 9412 /* ··· 9435 * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded 9436 * has_spare nr_idle balanced N/A N/A balanced balanced 9437 * fully_busy nr_idle nr_idle N/A N/A balanced balanced 9438 - * misfit_task force N/A N/A N/A force force 9439 * asym_packing force force N/A N/A force force 9440 * imbalanced force force N/A N/A force force 9441 * overloaded force force N/A N/A force avg_load ··· 11821 #endif /* SMP */ 11822 11823 } 11824 - 11825 - /* 11826 - * Helper functions to facilitate extracting info from tracepoints. 11827 - */ 11828 - 11829 - const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) 11830 - { 11831 - #ifdef CONFIG_SMP 11832 - return cfs_rq ? &cfs_rq->avg : NULL; 11833 - #else 11834 - return NULL; 11835 - #endif 11836 - } 11837 - EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); 11838 - 11839 - char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) 11840 - { 11841 - if (!cfs_rq) { 11842 - if (str) 11843 - strlcpy(str, "(null)", len); 11844 - else 11845 - return NULL; 11846 - } 11847 - 11848 - cfs_rq_tg_path(cfs_rq, str, len); 11849 - return str; 11850 - } 11851 - EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); 11852 - 11853 - int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) 11854 - { 11855 - return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; 11856 - } 11857 - EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); 11858 - 11859 - const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) 11860 - { 11861 - #ifdef CONFIG_SMP 11862 - return rq ? &rq->avg_rt : NULL; 11863 - #else 11864 - return NULL; 11865 - #endif 11866 - } 11867 - EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); 11868 - 11869 - const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) 11870 - { 11871 - #ifdef CONFIG_SMP 11872 - return rq ? &rq->avg_dl : NULL; 11873 - #else 11874 - return NULL; 11875 - #endif 11876 - } 11877 - EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); 11878 - 11879 - const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) 11880 - { 11881 - #if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) 11882 - return rq ? &rq->avg_irq : NULL; 11883 - #else 11884 - return NULL; 11885 - #endif 11886 - } 11887 - EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); 11888 - 11889 - int sched_trace_rq_cpu(struct rq *rq) 11890 - { 11891 - return rq ? cpu_of(rq) : -1; 11892 - } 11893 - EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); 11894 - 11895 - int sched_trace_rq_cpu_capacity(struct rq *rq) 11896 - { 11897 - return rq ? 11898 - #ifdef CONFIG_SMP 11899 - rq->cpu_capacity 11900 - #else 11901 - SCHED_CAPACITY_SCALE 11902 - #endif 11903 - : -1; 11904 - } 11905 - EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity); 11906 - 11907 - const struct cpumask *sched_trace_rd_span(struct root_domain *rd) 11908 - { 11909 - #ifdef CONFIG_SMP 11910 - return rd ? rd->span : NULL; 11911 - #else 11912 - return NULL; 11913 - #endif 11914 - } 11915 - EXPORT_SYMBOL_GPL(sched_trace_rd_span); 11916 - 11917 - int sched_trace_rq_nr_running(struct rq *rq) 11918 - { 11919 - return rq ? rq->nr_running : -1; 11920 - } 11921 - EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
··· 36 #include <linux/sched/cond_resched.h> 37 #include <linux/sched/cputime.h> 38 #include <linux/sched/isolation.h> 39 + #include <linux/sched/nohz.h> 40 41 #include <linux/cpuidle.h> 42 #include <linux/interrupt.h> ··· 313 #define for_each_sched_entity(se) \ 314 for (; se; se = se->parent) 315 316 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 317 { 318 struct rq *rq = rq_of(cfs_rq); ··· 492 493 #define for_each_sched_entity(se) \ 494 for (; se; se = NULL) 495 496 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 497 { ··· 4846 4847 cfs_rq->throttle_count--; 4848 if (!cfs_rq->throttle_count) { 4849 + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - 4850 + cfs_rq->throttled_clock_pelt; 4851 4852 /* Add cfs_rq with load or one or more already running entities to the list */ 4853 + if (!cfs_rq_is_decayed(cfs_rq)) 4854 list_add_leaf_cfs_rq(cfs_rq); 4855 } 4856 ··· 4864 4865 /* group is entering throttled state, stop time */ 4866 if (!cfs_rq->throttle_count) { 4867 + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); 4868 list_del_leaf_cfs_rq(cfs_rq); 4869 } 4870 cfs_rq->throttle_count++; ··· 5308 pcfs_rq = tg->parent->cfs_rq[cpu]; 5309 5310 cfs_rq->throttle_count = pcfs_rq->throttle_count; 5311 + cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); 5312 } 5313 5314 /* conditionally throttle active cfs_rq's from put_prev_entity() */ ··· 6544 } 6545 6546 /* 6547 + * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu 6548 + * (@dst_cpu = -1) or migrated to @dst_cpu. 6549 + */ 6550 + static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) 6551 + { 6552 + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; 6553 + unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); 6554 + 6555 + /* 6556 + * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its 6557 + * contribution. If @p migrates from another CPU to @cpu add its 6558 + * contribution. In all the other cases @cpu is not impacted by the 6559 + * migration so its util_avg is already correct. 6560 + */ 6561 + if (task_cpu(p) == cpu && dst_cpu != cpu) 6562 + lsub_positive(&util, task_util(p)); 6563 + else if (task_cpu(p) != cpu && dst_cpu == cpu) 6564 + util += task_util(p); 6565 + 6566 + if (sched_feat(UTIL_EST)) { 6567 + unsigned long util_est; 6568 + 6569 + util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); 6570 + 6571 + /* 6572 + * During wake-up @p isn't enqueued yet and doesn't contribute 6573 + * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. 6574 + * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p 6575 + * has been enqueued. 6576 + * 6577 + * During exec (@dst_cpu = -1) @p is enqueued and does 6578 + * contribute to cpu_rq(cpu)->cfs.util_est.enqueued. 6579 + * Remove it to "simulate" cpu_util without @p's contribution. 6580 + * 6581 + * Despite the task_on_rq_queued(@p) check there is still a 6582 + * small window for a possible race when an exec 6583 + * select_task_rq_fair() races with LB's detach_task(). 6584 + * 6585 + * detach_task() 6586 + * deactivate_task() 6587 + * p->on_rq = TASK_ON_RQ_MIGRATING; 6588 + * -------------------------------- A 6589 + * dequeue_task() \ 6590 + * dequeue_task_fair() + Race Time 6591 + * util_est_dequeue() / 6592 + * -------------------------------- B 6593 + * 6594 + * The additional check "current == p" is required to further 6595 + * reduce the race window. 
6596 + */ 6597 + if (dst_cpu == cpu) 6598 + util_est += _task_util_est(p); 6599 + else if (unlikely(task_on_rq_queued(p) || current == p)) 6600 + lsub_positive(&util_est, _task_util_est(p)); 6601 + 6602 + util = max(util, util_est); 6603 + } 6604 + 6605 + return min(util, capacity_orig_of(cpu)); 6606 + } 6607 + 6608 + /* 6609 * cpu_util_without: compute cpu utilization without any contributions from *p 6610 * @cpu: the CPU which utilization is requested 6611 * @p: the task which utilization should be discounted ··· 6558 */ 6559 static unsigned long cpu_util_without(int cpu, struct task_struct *p) 6560 { 6561 /* Task has no contribution or is new */ 6562 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) 6563 return cpu_util_cfs(cpu); 6564 6565 + return cpu_util_next(cpu, p, -1); 6566 } 6567 6568 /* ··· 9460 local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / 9461 local->group_capacity; 9462 9463 /* 9464 * If the local group is more loaded than the selected 9465 * busiest group don't try to pull any tasks. ··· 9470 env->imbalance = 0; 9471 return; 9472 } 9473 + 9474 + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / 9475 + sds->total_capacity; 9476 } 9477 9478 /* ··· 9495 * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded 9496 * has_spare nr_idle balanced N/A N/A balanced balanced 9497 * fully_busy nr_idle nr_idle N/A N/A balanced balanced 9498 + * misfit_task force N/A N/A N/A N/A N/A 9499 * asym_packing force force N/A N/A force force 9500 * imbalanced force force N/A N/A force force 9501 * overloaded force force N/A N/A force avg_load ··· 11881 #endif /* SMP */ 11882 11883 }
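With the refactor above, both "utilization of @cpu without @p" and "utilization of @cpu if @p moved to @dst_cpu" are answered by cpu_util_next(); cpu_util_without() is now just the dst_cpu == -1 case. A hedged fair.c-internal sketch (example_spare_cap_if_moved() is a hypothetical helper, not part of the patch):

    /* Predicted spare capacity of @cpu if @p were placed on @dst_cpu. */
    static unsigned long example_spare_cap_if_moved(int cpu, struct task_struct *p,
                                                    int dst_cpu)
    {
            unsigned long util = cpu_util_next(cpu, p, dst_cpu);
            unsigned long cap = capacity_orig_of(cpu);

            return cap > util ? cap - util : 0;
    }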
+1 -1
kernel/sched/idle.c
··· 327 * RCU relies on this call to be done outside of an RCU read-side 328 * critical section. 329 */ 330 - flush_smp_call_function_from_idle(); 331 schedule_idle(); 332 333 if (unlikely(klp_patch_pending(current)))
··· 327 * RCU relies on this call to be done outside of an RCU read-side 328 * critical section. 329 */ 330 + flush_smp_call_function_queue(); 331 schedule_idle(); 332 333 if (unlikely(klp_patch_pending(current)))
+2 -2
kernel/sched/pelt.h
··· 145 static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) 146 { 147 if (unlikely(cfs_rq->throttle_count)) 148 - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; 149 150 - return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; 151 } 152 #else 153 static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
··· 145 static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) 146 { 147 if (unlikely(cfs_rq->throttle_count)) 148 + return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; 149 150 + return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; 151 } 152 #else 153 static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+11 -7
kernel/sched/psi.c
··· 1060 mutex_unlock(&group->avgs_lock); 1061 1062 for (full = 0; full < 2; full++) { 1063 - unsigned long avg[3]; 1064 - u64 total; 1065 int w; 1066 1067 - for (w = 0; w < 3; w++) 1068 - avg[w] = group->avg[res * 2 + full][w]; 1069 - total = div_u64(group->total[PSI_AVGS][res * 2 + full], 1070 - NSEC_PER_USEC); 1071 1072 seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", 1073 full ? "full" : "some", ··· 1120 t->state = state; 1121 t->threshold = threshold_us * NSEC_PER_USEC; 1122 t->win.size = window_us * NSEC_PER_USEC; 1123 - window_reset(&t->win, 0, 0, 0); 1124 1125 t->event = 0; 1126 t->last_event_time = 0;
··· 1060 mutex_unlock(&group->avgs_lock); 1061 1062 for (full = 0; full < 2; full++) { 1063 + unsigned long avg[3] = { 0, }; 1064 + u64 total = 0; 1065 int w; 1066 1067 + /* CPU FULL is undefined at the system level */ 1068 + if (!(group == &psi_system && res == PSI_CPU && full)) { 1069 + for (w = 0; w < 3; w++) 1070 + avg[w] = group->avg[res * 2 + full][w]; 1071 + total = div_u64(group->total[PSI_AVGS][res * 2 + full], 1072 + NSEC_PER_USEC); 1073 + } 1074 1075 seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", 1076 full ? "full" : "some", ··· 1117 t->state = state; 1118 t->threshold = threshold_us * NSEC_PER_USEC; 1119 t->win.size = window_us * NSEC_PER_USEC; 1120 + window_reset(&t->win, sched_clock(), 1121 + group->total[PSI_POLL][t->state], 0); 1122 1123 t->event = 0; 1124 t->last_event_time = 0;
+3 -2
kernel/sched/rt.c
··· 871 int enqueue = 0; 872 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 873 struct rq *rq = rq_of_rt_rq(rt_rq); 874 int skip; 875 876 /* ··· 886 if (skip) 887 continue; 888 889 - raw_spin_rq_lock(rq); 890 update_rq_clock(rq); 891 892 if (rt_rq->rt_time) { ··· 924 925 if (enqueue) 926 sched_rt_rq_enqueue(rt_rq); 927 - raw_spin_rq_unlock(rq); 928 } 929 930 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
··· 871 int enqueue = 0; 872 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 873 struct rq *rq = rq_of_rt_rq(rt_rq); 874 + struct rq_flags rf; 875 int skip; 876 877 /* ··· 885 if (skip) 886 continue; 887 888 + rq_lock(rq, &rf); 889 update_rq_clock(rq); 890 891 if (rt_rq->rt_time) { ··· 923 924 if (enqueue) 925 sched_rt_rq_enqueue(rt_rq); 926 + rq_unlock(rq, &rf); 927 } 928 929 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
+36 -19
kernel/sched/sched.h
··· 603 s64 runtime_remaining; 604 605 u64 throttled_clock; 606 - u64 throttled_clock_task; 607 - u64 throttled_clock_task_time; 608 int throttled; 609 int throttle_count; 610 struct list_head throttled_list; ··· 1827 #endif 1828 1829 extern int sched_update_scaling(void); 1830 - 1831 - extern void flush_smp_call_function_from_idle(void); 1832 - 1833 - #else /* !CONFIG_SMP: */ 1834 - static inline void flush_smp_call_function_from_idle(void) { } 1835 - #endif 1836 1837 #include "stats.h" 1838 ··· 2177 * 2178 * include/asm-generic/vmlinux.lds.h 2179 * 2180 * Also enforce alignment on the instance, not the type, to guarantee layout. 2181 */ 2182 #define DEFINE_SCHED_CLASS(name) \ ··· 2187 __section("__" #name "_sched_class") 2188 2189 /* Defined in include/asm-generic/vmlinux.lds.h */ 2190 - extern struct sched_class __begin_sched_classes[]; 2191 - extern struct sched_class __end_sched_classes[]; 2192 - 2193 - #define sched_class_highest (__end_sched_classes - 1) 2194 - #define sched_class_lowest (__begin_sched_classes - 1) 2195 2196 #define for_class_range(class, _from, _to) \ 2197 - for (class = (_from); class != (_to); class--) 2198 2199 #define for_each_class(class) \ 2200 - for_class_range(class, sched_class_highest, sched_class_lowest) 2201 2202 extern const struct sched_class stop_sched_class; 2203 extern const struct sched_class dl_sched_class; ··· 2305 2306 extern struct rt_bandwidth def_rt_bandwidth; 2307 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 2308 2309 extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); 2310 extern void init_dl_task_timer(struct sched_dl_entity *dl_se); ··· 2475 } 2476 #endif 2477 2478 2479 #ifdef CONFIG_SMP 2480 ··· 2558 __acquires(busiest->lock) 2559 __acquires(this_rq->lock) 2560 { 2561 - if (__rq_lockp(this_rq) == __rq_lockp(busiest)) 2562 return 0; 2563 - 2564 - if (likely(raw_spin_rq_trylock(busiest))) 2565 - return 0; 2566 2567 if (rq_order_less(this_rq, busiest)) { 2568 raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING); 2569 return 0; 2570 } 2571 ··· 2660 BUG_ON(rq1 != rq2); 2661 raw_spin_rq_lock(rq1); 2662 __acquire(rq2->lock); /* Fake it out ;) */ 2663 } 2664 2665 /*
··· 603 s64 runtime_remaining; 604 605 u64 throttled_clock; 606 + u64 throttled_clock_pelt; 607 + u64 throttled_clock_pelt_time; 608 int throttled; 609 int throttle_count; 610 struct list_head throttled_list; ··· 1827 #endif 1828 1829 extern int sched_update_scaling(void); 1830 + #endif /* CONFIG_SMP */ 1831 1832 #include "stats.h" 1833 ··· 2182 * 2183 * include/asm-generic/vmlinux.lds.h 2184 * 2185 + * *CAREFUL* they are laid out in *REVERSE* order!!! 2186 + * 2187 * Also enforce alignment on the instance, not the type, to guarantee layout. 2188 */ 2189 #define DEFINE_SCHED_CLASS(name) \ ··· 2190 __section("__" #name "_sched_class") 2191 2192 /* Defined in include/asm-generic/vmlinux.lds.h */ 2193 + extern struct sched_class __sched_class_highest[]; 2194 + extern struct sched_class __sched_class_lowest[]; 2195 2196 #define for_class_range(class, _from, _to) \ 2197 + for (class = (_from); class < (_to); class++) 2198 2199 #define for_each_class(class) \ 2200 + for_class_range(class, __sched_class_highest, __sched_class_lowest) 2201 + 2202 + #define sched_class_above(_a, _b) ((_a) < (_b)) 2203 2204 extern const struct sched_class stop_sched_class; 2205 extern const struct sched_class dl_sched_class; ··· 2309 2310 extern struct rt_bandwidth def_rt_bandwidth; 2311 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 2312 + extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); 2313 2314 extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); 2315 extern void init_dl_task_timer(struct sched_dl_entity *dl_se); ··· 2478 } 2479 #endif 2480 2481 + #ifdef CONFIG_SCHED_DEBUG 2482 + /* 2483 + * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to 2484 + * acquire rq lock instead of rq_lock(). So at the end of these two functions 2485 + * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of 2486 + * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning. 2487 + */ 2488 + static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) 2489 + { 2490 + rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); 2491 + /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */ 2492 + #ifdef CONFIG_SMP 2493 + rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); 2494 + #endif 2495 + } 2496 + #else 2497 + static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {} 2498 + #endif 2499 2500 #ifdef CONFIG_SMP 2501 ··· 2543 __acquires(busiest->lock) 2544 __acquires(this_rq->lock) 2545 { 2546 + if (__rq_lockp(this_rq) == __rq_lockp(busiest) || 2547 + likely(raw_spin_rq_trylock(busiest))) { 2548 + double_rq_clock_clear_update(this_rq, busiest); 2549 return 0; 2550 + } 2551 2552 if (rq_order_less(this_rq, busiest)) { 2553 raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING); 2554 + double_rq_clock_clear_update(this_rq, busiest); 2555 return 0; 2556 } 2557 ··· 2644 BUG_ON(rq1 != rq2); 2645 raw_spin_rq_lock(rq1); 2646 __acquire(rq2->lock); /* Fake it out ;) */ 2647 + double_rq_clock_clear_update(rq1, rq2); 2648 } 2649 2650 /*
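With the reversed layout the highest-priority class now has the lowest address, so sched_class_above() is a plain pointer comparison and for_each_class() walks addresses upward: stop -> dl -> rt -> fair -> idle. A hedged kernel/sched/ sketch (both functions are hypothetical illustrations):

    /* Does @p's class preempt @curr's class? */
    static bool example_class_preempts(const struct task_struct *p,
                                       const struct task_struct *curr)
    {
            return sched_class_above(p->sched_class, curr->sched_class);
    }

    /* Walk the classes from highest to lowest priority. */
    static int example_count_sched_classes(void)
    {
            const struct sched_class *class;
            int n = 0;

            for_each_class(class)
                    n++;

            return n;       /* 5 with CONFIG_SMP (stop class), otherwise 4 */
    }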
+6
kernel/sched/smp.h
··· 7 extern void sched_ttwu_pending(void *arg); 8 9 extern void send_call_function_single_ipi(int cpu);
··· 7 extern void sched_ttwu_pending(void *arg); 8 9 extern void send_call_function_single_ipi(int cpu); 10 + 11 + #ifdef CONFIG_SMP 12 + extern void flush_smp_call_function_queue(void); 13 + #else 14 + static inline void flush_smp_call_function_queue(void) { } 15 + #endif
+24 -8
kernel/smp.c
··· 96 97 static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); 98 99 - static void flush_smp_call_function_queue(bool warn_cpu_offline); 100 101 int smpcfd_prepare_cpu(unsigned int cpu) 102 { ··· 141 * ensure that the outgoing CPU doesn't go offline with work 142 * still pending. 143 */ 144 - flush_smp_call_function_queue(false); 145 irq_work_run(); 146 return 0; 147 } ··· 544 { 545 cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU, 546 smp_processor_id(), CFD_SEQ_GOTIPI); 547 - flush_smp_call_function_queue(true); 548 } 549 550 /** 551 - * flush_smp_call_function_queue - Flush pending smp-call-function callbacks 552 * 553 * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an 554 * offline CPU. Skip this check if set to 'false'. ··· 561 * Loop through the call_single_queue and run all the queued callbacks. 562 * Must be called with interrupts disabled. 563 */ 564 - static void flush_smp_call_function_queue(bool warn_cpu_offline) 565 { 566 call_single_data_t *csd, *csd_next; 567 struct llist_node *entry, *prev; ··· 684 smp_processor_id(), CFD_SEQ_HDLEND); 685 } 686 687 - void flush_smp_call_function_from_idle(void) 688 { 689 unsigned long flags; 690 691 if (llist_empty(this_cpu_ptr(&call_single_queue))) ··· 708 cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, 709 smp_processor_id(), CFD_SEQ_IDLE); 710 local_irq_save(flags); 711 - flush_smp_call_function_queue(true); 712 if (local_softirq_pending()) 713 - do_softirq(); 714 715 local_irq_restore(flags); 716 }
··· 96 97 static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); 98 99 + static void __flush_smp_call_function_queue(bool warn_cpu_offline); 100 101 int smpcfd_prepare_cpu(unsigned int cpu) 102 { ··· 141 * ensure that the outgoing CPU doesn't go offline with work 142 * still pending. 143 */ 144 + __flush_smp_call_function_queue(false); 145 irq_work_run(); 146 return 0; 147 } ··· 544 { 545 cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU, 546 smp_processor_id(), CFD_SEQ_GOTIPI); 547 + __flush_smp_call_function_queue(true); 548 } 549 550 /** 551 + * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks 552 * 553 * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an 554 * offline CPU. Skip this check if set to 'false'. ··· 561 * Loop through the call_single_queue and run all the queued callbacks. 562 * Must be called with interrupts disabled. 563 */ 564 + static void __flush_smp_call_function_queue(bool warn_cpu_offline) 565 { 566 call_single_data_t *csd, *csd_next; 567 struct llist_node *entry, *prev; ··· 684 smp_processor_id(), CFD_SEQ_HDLEND); 685 } 686 687 + 688 + /** 689 + * flush_smp_call_function_queue - Flush pending smp-call-function callbacks 690 + * from task context (idle, migration thread) 691 + * 692 + * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it 693 + * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by 694 + * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to 695 + * handle queued SMP function calls before scheduling. 696 + * 697 + * The migration thread has to ensure that an eventually pending wakeup has 698 + * been handled before it migrates a task. 699 + */ 700 + void flush_smp_call_function_queue(void) 701 { 702 + unsigned int was_pending; 703 unsigned long flags; 704 705 if (llist_empty(this_cpu_ptr(&call_single_queue))) ··· 694 cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, 695 smp_processor_id(), CFD_SEQ_IDLE); 696 local_irq_save(flags); 697 + /* Get the already pending soft interrupts for RT enabled kernels */ 698 + was_pending = local_softirq_pending(); 699 + __flush_smp_call_function_queue(true); 700 if (local_softirq_pending()) 701 + do_softirq_post_smp_call_flush(was_pending); 702 703 local_irq_restore(flags); 704 }
+13
kernel/softirq.c
··· 294 wakeup_softirqd(); 295 } 296 297 #else /* CONFIG_PREEMPT_RT */ 298 299 /*
··· 294 wakeup_softirqd(); 295 } 296 297 + /* 298 + * flush_smp_call_function_queue() can raise a soft interrupt in a function 299 + * call. On RT kernels this is undesired and the only known functionality 300 + * in the block layer which does this is disabled on RT. If soft interrupts 301 + * get raised which haven't been raised before the flush, warn so it can be 302 + * investigated. 303 + */ 304 + void do_softirq_post_smp_call_flush(unsigned int was_pending) 305 + { 306 + if (WARN_ON_ONCE(was_pending != local_softirq_pending())) 307 + invoke_softirq(); 308 + } 309 + 310 #else /* CONFIG_PREEMPT_RT */ 311 312 /*
-2
kernel/stop_machine.c
··· 535 kthread_park(stopper->thread); 536 } 537 538 - extern void sched_set_stop_task(int cpu, struct task_struct *stop); 539 - 540 static void cpu_stop_create(unsigned int cpu) 541 { 542 sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));
··· 535 kthread_park(stopper->thread); 536 } 537 538 static void cpu_stop_create(unsigned int cpu) 539 { 540 sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));
+4 -10
kernel/trace/trace.c
··· 4289 entries, 4290 total, 4291 buf->cpu, 4292 - #if defined(CONFIG_PREEMPT_NONE) 4293 - "server", 4294 - #elif defined(CONFIG_PREEMPT_VOLUNTARY) 4295 - "desktop", 4296 - #elif defined(CONFIG_PREEMPT) 4297 - "preempt", 4298 - #elif defined(CONFIG_PREEMPT_RT) 4299 - "preempt_rt", 4300 - #else 4301 "unknown", 4302 - #endif 4303 /* These are reserved for later use */ 4304 0, 0, 0, 0); 4305 #ifdef CONFIG_SMP
··· 4289 entries, 4290 total, 4291 buf->cpu, 4292 + preempt_model_none() ? "server" : 4293 + preempt_model_voluntary() ? "desktop" : 4294 + preempt_model_full() ? "preempt" : 4295 + preempt_model_rt() ? "preempt_rt" : 4296 "unknown", 4297 /* These are reserved for later use */ 4298 0, 0, 0, 0); 4299 #ifdef CONFIG_SMP