Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

+28 -12

Documentation/admin-guide/kernel-parameters.txt

··· 1730 1730 isapnp= [ISAPNP] 1731 1731 Format: <RDP>,<reset>,<pci_scan>,<verbosity> 1732 1732 1733 - isolcpus= [KNL,SMP] Isolate CPUs from the general scheduler. 1734 - The argument is a cpu list, as described above. 1733 + isolcpus= [KNL,SMP] Isolate a given set of CPUs from disturbance. 1734 + [Deprecated - use cpusets instead] 1735 + Format: [flag-list,]<cpu-list> 1735 1736 1736 - This option can be used to specify one or more CPUs 1737 - to isolate from the general SMP balancing and scheduling 1738 - algorithms. You can move a process onto or off an 1739 - "isolated" CPU via the CPU affinity syscalls or cpuset. 1740 - <cpu number> begins at 0 and the maximum value is 1741 - "number of CPUs in system - 1". 1737 + Specify one or more CPUs to isolate from disturbances 1738 + specified in the flag list (default: domain): 1742 1739 1743 - This option is the preferred way to isolate CPUs. The 1744 - alternative -- manually setting the CPU mask of all 1745 - tasks in the system -- can cause problems and 1746 - suboptimal load balancer performance. 1740 + nohz 1741 + Disable the tick when a single task runs. 1742 + domain 1743 + Isolate from the general SMP balancing and scheduling 1744 + algorithms. Note that performing domain isolation this way 1745 + is irreversible: it's not possible to bring back a CPU to 1746 + the domains once isolated through isolcpus. It's strongly 1747 + advised to use cpusets instead to disable scheduler load 1748 + balancing through the "cpuset.sched_load_balance" file. 1749 + It offers a much more flexible interface where CPUs can 1750 + move in and out of an isolated set anytime. 1751 + 1752 + You can move a process onto or off an "isolated" CPU via 1753 + the CPU affinity syscalls or cpuset. 1754 + <cpu number> begins at 0 and the maximum value is 1755 + "number of CPUs in system - 1". 1756 + 1757 + The format of <cpu-list> is described above. 
1758 + 1759 + 1747 1760 1748 1761 iucv= [HW,NET] 1749 1762 ··· 4222 4209 Used to run time disable IRQ_TIME_ACCOUNTING on any 4223 4210 platforms where RDTSC is slow and this accounting 4224 4211 can add overhead. 4212 + [x86] unstable: mark the TSC clocksource as unstable. This 4213 + marks the TSC unconditionally unstable at bootup and 4214 + avoids any further wobbles once the TSC watchdog notices. 4225 4215 4226 4216 turbografx.map[2|3]= [HW,JOY] 4227 4217 TurboGraFX parallel port interface

+10 -1

drivers/base/cpu.c

··· 18 18 #include <linux/cpufeature.h> 19 19 #include <linux/tick.h> 20 20 #include <linux/pm_qos.h> 21 + #include <linux/sched/isolation.h> 21 22 22 23 #include "base.h" 23 24 ··· 272 271 struct device_attribute *attr, char *buf) 273 272 { 274 273 int n = 0, len = PAGE_SIZE-2; 274 + cpumask_var_t isolated; 275 275 276 - n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(cpu_isolated_map)); 276 + if (!alloc_cpumask_var(&isolated, GFP_KERNEL)) 277 + return -ENOMEM; 278 + 279 + cpumask_andnot(isolated, cpu_possible_mask, 280 + housekeeping_cpumask(HK_FLAG_DOMAIN)); 281 + n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(isolated)); 282 + 283 + free_cpumask_var(isolated); 277 284 278 285 return n; 279 286 }

+3 -3

drivers/net/ethernet/tile/tilegx.c

··· 40 40 #include <linux/tcp.h> 41 41 #include <linux/net_tstamp.h> 42 42 #include <linux/ptp_clock_kernel.h> 43 - #include <linux/tick.h> 43 + #include <linux/sched/isolation.h> 44 44 45 45 #include <asm/checksum.h> 46 46 #include <asm/homecache.h> ··· 2270 2270 tile_net_dev_init(name, mac); 2271 2271 2272 2272 if (!network_cpus_init()) 2273 - cpumask_and(&network_cpus_map, housekeeping_cpumask(), 2274 - cpu_online_mask); 2273 + cpumask_and(&network_cpus_map, 2274 + housekeeping_cpumask(HK_FLAG_MISC), cpu_online_mask); 2275 2275 2276 2276 return 0; 2277 2277 }

+1 -1

fs/proc/array.c

··· 138 138 static inline const char *get_task_state(struct task_struct *tsk) 139 139 { 140 140 BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array)); 141 - return task_state_array[__get_task_state(tsk)]; 141 + return task_state_array[task_state_index(tsk)]; 142 142 } 143 143 144 144 static inline int get_task_umask(struct task_struct *tsk)

+16

include/linux/cpumask.h

··· 131 131 return 0; 132 132 } 133 133 134 + static inline unsigned int cpumask_last(const struct cpumask *srcp) 135 + { 136 + return 0; 137 + } 138 + 134 139 /* Valid inputs for n are -1 and 0. */ 135 140 static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) 136 141 { ··· 182 177 static inline unsigned int cpumask_first(const struct cpumask *srcp) 183 178 { 184 179 return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); 180 + } 181 + 182 + /** 183 + * cpumask_last - get the last CPU in a cpumask 184 + * @srcp: - the cpumask pointer 185 + * 186 + * Returns >= nr_cpumask_bits if no CPUs set. 187 + */ 188 + static inline unsigned int cpumask_last(const struct cpumask *srcp) 189 + { 190 + return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits); 185 191 } 186 192 187 193 unsigned int cpumask_next(int n, const struct cpumask *srcp);

+2 -1

include/linux/ioprio.h

··· 3 3 #define IOPRIO_H 4 4 5 5 #include <linux/sched.h> 6 + #include <linux/sched/rt.h> 6 7 #include <linux/iocontext.h> 7 8 8 9 /* ··· 64 63 { 65 64 if (task->policy == SCHED_IDLE) 66 65 return IOPRIO_CLASS_IDLE; 67 - else if (task->policy == SCHED_FIFO || task->policy == SCHED_RR) 66 + else if (task_is_realtime(task)) 68 67 return IOPRIO_CLASS_RT; 69 68 else 70 69 return IOPRIO_CLASS_BE;

+10 -9

include/linux/sched.h

··· 166 166 /* Task command name length: */ 167 167 #define TASK_COMM_LEN 16 168 168 169 - extern cpumask_var_t cpu_isolated_map; 170 - 171 169 extern void scheduler_tick(void); 172 170 173 171 #define MAX_SCHEDULE_TIMEOUT LONG_MAX ··· 330 332 struct sched_avg { 331 333 u64 last_update_time; 332 334 u64 load_sum; 335 + u64 runnable_load_sum; 333 336 u32 util_sum; 334 337 u32 period_contrib; 335 338 unsigned long load_avg; 339 + unsigned long runnable_load_avg; 336 340 unsigned long util_avg; 337 341 }; 338 342 ··· 377 377 struct sched_entity { 378 378 /* For load-balancing: */ 379 379 struct load_weight load; 380 + unsigned long runnable_weight; 380 381 struct rb_node run_node; 381 382 struct list_head group_node; 382 383 unsigned int on_rq; ··· 473 472 * conditions between the inactive timer handler and the wakeup 474 473 * code. 475 474 */ 476 - int dl_throttled; 477 - int dl_boosted; 478 - int dl_yielded; 479 - int dl_non_contending; 475 + int dl_throttled : 1; 476 + int dl_boosted : 1; 477 + int dl_yielded : 1; 478 + int dl_non_contending : 1; 480 479 481 480 /* 482 481 * Bandwidth enforcement timer. 
Each -deadline task has its ··· 1247 1246 #define TASK_REPORT_IDLE (TASK_REPORT + 1) 1248 1247 #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) 1249 1248 1250 - static inline unsigned int __get_task_state(struct task_struct *tsk) 1249 + static inline unsigned int task_state_index(struct task_struct *tsk) 1251 1250 { 1252 1251 unsigned int tsk_state = READ_ONCE(tsk->state); 1253 1252 unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; ··· 1260 1259 return fls(state); 1261 1260 } 1262 1261 1263 - static inline char __task_state_to_char(unsigned int state) 1262 + static inline char task_index_to_char(unsigned int state) 1264 1263 { 1265 1264 static const char state_char[] = "RSDTtXZPI"; 1266 1265 ··· 1271 1270 1272 1271 static inline char task_state_to_char(struct task_struct *tsk) 1273 1272 { 1274 - return __task_state_to_char(__get_task_state(tsk)); 1273 + return task_index_to_char(task_state_index(tsk)); 1275 1274 } 1276 1275 1277 1276 /**

+51

include/linux/sched/isolation.h

··· 1 + #ifndef _LINUX_SCHED_ISOLATION_H 2 + #define _LINUX_SCHED_ISOLATION_H 3 + 4 + #include <linux/cpumask.h> 5 + #include <linux/init.h> 6 + #include <linux/tick.h> 7 + 8 + enum hk_flags { 9 + HK_FLAG_TIMER = 1, 10 + HK_FLAG_RCU = (1 << 1), 11 + HK_FLAG_MISC = (1 << 2), 12 + HK_FLAG_SCHED = (1 << 3), 13 + HK_FLAG_TICK = (1 << 4), 14 + HK_FLAG_DOMAIN = (1 << 5), 15 + }; 16 + 17 + #ifdef CONFIG_CPU_ISOLATION 18 + DECLARE_STATIC_KEY_FALSE(housekeeping_overriden); 19 + extern int housekeeping_any_cpu(enum hk_flags flags); 20 + extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags); 21 + extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags); 22 + extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags); 23 + extern void __init housekeeping_init(void); 24 + 25 + #else 26 + 27 + static inline int housekeeping_any_cpu(enum hk_flags flags) 28 + { 29 + return smp_processor_id(); 30 + } 31 + 32 + static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags) 33 + { 34 + return cpu_possible_mask; 35 + } 36 + 37 + static inline void housekeeping_affine(struct task_struct *t, 38 + enum hk_flags flags) { } 39 + static inline void housekeeping_init(void) { } 40 + #endif /* CONFIG_CPU_ISOLATION */ 41 + 42 + static inline bool housekeeping_cpu(int cpu, enum hk_flags flags) 43 + { 44 + #ifdef CONFIG_CPU_ISOLATION 45 + if (static_branch_unlikely(&housekeeping_overriden)) 46 + return housekeeping_test_cpu(cpu, flags); 47 + #endif 48 + return true; 49 + } 50 + 51 + #endif /* _LINUX_SCHED_ISOLATION_H */

+11

include/linux/sched/rt.h

··· 18 18 return rt_prio(p->prio); 19 19 } 20 20 21 + static inline bool task_is_realtime(struct task_struct *tsk) 22 + { 23 + int policy = tsk->policy; 24 + 25 + if (policy == SCHED_FIFO || policy == SCHED_RR) 26 + return true; 27 + if (policy == SCHED_DEADLINE) 28 + return true; 29 + return false; 30 + } 31 + 21 32 #ifdef CONFIG_RT_MUTEXES 22 33 /* 23 34 * Must hold either p->pi_lock or task_rq(p)->lock.

+3 -3

include/linux/sched/sysctl.h

··· 38 38 extern unsigned int sysctl_numa_balancing_scan_size; 39 39 40 40 #ifdef CONFIG_SCHED_DEBUG 41 - extern unsigned int sysctl_sched_migration_cost; 42 - extern unsigned int sysctl_sched_nr_migrate; 43 - extern unsigned int sysctl_sched_time_avg; 41 + extern __read_mostly unsigned int sysctl_sched_migration_cost; 42 + extern __read_mostly unsigned int sysctl_sched_nr_migrate; 43 + extern __read_mostly unsigned int sysctl_sched_time_avg; 44 44 45 45 int sched_proc_update_handler(struct ctl_table *table, int write, 46 46 void __user *buffer, size_t *length,

+2 -37

include/linux/tick.h

··· 138 138 #ifdef CONFIG_NO_HZ_FULL 139 139 extern bool tick_nohz_full_running; 140 140 extern cpumask_var_t tick_nohz_full_mask; 141 - extern cpumask_var_t housekeeping_mask; 142 141 143 142 static inline bool tick_nohz_full_enabled(void) 144 143 { ··· 159 160 { 160 161 if (tick_nohz_full_enabled()) 161 162 cpumask_or(mask, mask, tick_nohz_full_mask); 162 - } 163 - 164 - static inline int housekeeping_any_cpu(void) 165 - { 166 - return cpumask_any_and(housekeeping_mask, cpu_online_mask); 167 163 } 168 164 169 165 extern void tick_nohz_dep_set(enum tick_dep_bits bit); ··· 229 235 230 236 extern void tick_nohz_full_kick_cpu(int cpu); 231 237 extern void __tick_nohz_task_switch(void); 238 + extern void __init tick_nohz_full_setup(cpumask_var_t cpumask); 232 239 #else 233 - static inline int housekeeping_any_cpu(void) 234 - { 235 - return smp_processor_id(); 236 - } 237 240 static inline bool tick_nohz_full_enabled(void) { return false; } 238 241 static inline bool tick_nohz_full_cpu(int cpu) { return false; } 239 242 static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } ··· 250 259 251 260 static inline void tick_nohz_full_kick_cpu(int cpu) { } 252 261 static inline void __tick_nohz_task_switch(void) { } 262 + static inline void tick_nohz_full_setup(cpumask_var_t cpumask) { } 253 263 #endif 254 - 255 - static inline const struct cpumask *housekeeping_cpumask(void) 256 - { 257 - #ifdef CONFIG_NO_HZ_FULL 258 - if (tick_nohz_full_enabled()) 259 - return housekeeping_mask; 260 - #endif 261 - return cpu_possible_mask; 262 - } 263 - 264 - static inline bool is_housekeeping_cpu(int cpu) 265 - { 266 - #ifdef CONFIG_NO_HZ_FULL 267 - if (tick_nohz_full_enabled()) 268 - return cpumask_test_cpu(cpu, housekeeping_mask); 269 - #endif 270 - return true; 271 - } 272 - 273 - static inline void housekeeping_affine(struct task_struct *t) 274 - { 275 - #ifdef CONFIG_NO_HZ_FULL 276 - if (tick_nohz_full_enabled()) 277 - set_cpus_allowed_ptr(t, housekeeping_mask); 278 - 
279 - #endif 280 - } 281 264 282 265 static inline void tick_nohz_task_switch(void) 283 266 {

+1 -1

include/trace/events/sched.h

··· 118 118 if (preempt) 119 119 return TASK_STATE_MAX; 120 120 121 - return __get_task_state(p); 121 + return task_state_index(p); 122 122 } 123 123 #endif /* CREATE_TRACE_POINTS */ 124 124

+7

init/Kconfig

··· 472 472 473 473 endmenu # "CPU/Task time and stats accounting" 474 474 475 + config CPU_ISOLATION 476 + bool "CPU isolation" 477 + help 478 + Make sure that CPUs running critical tasks are not disturbed by 479 + any source of "noise" such as unbound workqueues, timers, kthreads... 480 + Unbound jobs get offloaded to housekeeping CPUs. 481 + 475 482 source "kernel/rcu/Kconfig" 476 483 477 484 config BUILD_BIN2C

+2

init/main.c

··· 46 46 #include <linux/cgroup.h> 47 47 #include <linux/efi.h> 48 48 #include <linux/tick.h> 49 + #include <linux/sched/isolation.h> 49 50 #include <linux/interrupt.h> 50 51 #include <linux/taskstats_kern.h> 51 52 #include <linux/delayacct.h> ··· 607 606 early_irq_init(); 608 607 init_IRQ(); 609 608 tick_init(); 609 + housekeeping_init(); 610 610 rcu_init_nohz(); 611 611 init_timers(); 612 612 hrtimers_init();

+5 -10

kernel/cgroup/cpuset.c

··· 57 57 #include <linux/backing-dev.h> 58 58 #include <linux/sort.h> 59 59 #include <linux/oom.h> 60 - 60 + #include <linux/sched/isolation.h> 61 61 #include <linux/uaccess.h> 62 62 #include <linux/atomic.h> 63 63 #include <linux/mutex.h> ··· 656 656 int csn; /* how many cpuset ptrs in csa so far */ 657 657 int i, j, k; /* indices for partition finding loops */ 658 658 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ 659 - cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ 660 659 struct sched_domain_attr *dattr; /* attributes for custom domains */ 661 660 int ndoms = 0; /* number of sched domains in result */ 662 661 int nslot; /* next empty doms[] struct cpumask slot */ ··· 664 665 doms = NULL; 665 666 dattr = NULL; 666 667 csa = NULL; 667 - 668 - if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) 669 - goto done; 670 - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 671 668 672 669 /* Special case for the 99% of systems with one, full, sched domain */ 673 670 if (is_sched_load_balance(&top_cpuset)) { ··· 678 683 update_domain_attr_tree(dattr, &top_cpuset); 679 684 } 680 685 cpumask_and(doms[0], top_cpuset.effective_cpus, 681 - non_isolated_cpus); 686 + housekeeping_cpumask(HK_FLAG_DOMAIN)); 682 687 683 688 goto done; 684 689 } ··· 702 707 */ 703 708 if (!cpumask_empty(cp->cpus_allowed) && 704 709 !(is_sched_load_balance(cp) && 705 - cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) 710 + cpumask_intersects(cp->cpus_allowed, 711 + housekeeping_cpumask(HK_FLAG_DOMAIN)))) 706 712 continue; 707 713 708 714 if (is_sched_load_balance(cp)) ··· 785 789 786 790 if (apn == b->pn) { 787 791 cpumask_or(dp, dp, b->effective_cpus); 788 - cpumask_and(dp, dp, non_isolated_cpus); 792 + cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN)); 789 793 if (dattr) 790 794 update_domain_attr_tree(dattr + nslot, b); 791 795 ··· 798 802 BUG_ON(nslot != ndoms); 799 803 800 804 done: 801 - 
free_cpumask_var(non_isolated_cpus); 802 805 kfree(csa); 803 806 804 807 /*

+2 -1

kernel/rcu/tree_plugin.h

··· 29 29 #include <linux/oom.h> 30 30 #include <linux/sched/debug.h> 31 31 #include <linux/smpboot.h> 32 + #include <linux/sched/isolation.h> 32 33 #include <uapi/linux/sched/types.h> 33 34 #include "../time/tick-internal.h" 34 35 ··· 2588 2587 2589 2588 if (!tick_nohz_full_enabled()) 2590 2589 return; 2591 - housekeeping_affine(current); 2590 + housekeeping_affine(current, HK_FLAG_RCU); 2592 2591 } 2593 2592 2594 2593 /* Record the current task on dyntick-idle entry. */

+2 -1

kernel/rcu/update.c

··· 51 51 #include <linux/kthread.h> 52 52 #include <linux/tick.h> 53 53 #include <linux/rcupdate_wait.h> 54 + #include <linux/sched/isolation.h> 54 55 55 56 #define CREATE_TRACE_POINTS 56 57 ··· 715 714 LIST_HEAD(rcu_tasks_holdouts); 716 715 717 716 /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ 718 - housekeeping_affine(current); 717 + housekeeping_affine(current, HK_FLAG_RCU); 719 718 720 719 /* 721 720 * Each pass through the following loop makes one check for

+1

kernel/sched/Makefile

··· 27 27 obj-$(CONFIG_CPU_FREQ) += cpufreq.o 28 28 obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o 29 29 obj-$(CONFIG_MEMBARRIER) += membarrier.o 30 + obj-$(CONFIG_CPU_ISOLATION) += isolation.o

+27 -29

kernel/sched/core.c

··· 26 26 #include <linux/profile.h> 27 27 #include <linux/security.h> 28 28 #include <linux/syscalls.h> 29 + #include <linux/sched/isolation.h> 29 30 30 31 #include <asm/switch_to.h> 31 32 #include <asm/tlb.h> ··· 43 42 44 43 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 45 44 45 + #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) 46 46 /* 47 47 * Debugging: various feature bits 48 + * 49 + * If SCHED_DEBUG is disabled, each compilation unit has its own copy of 50 + * sysctl_sched_features, defined in sched.h, to allow constants propagation 51 + * at compile time and compiler optimization based on features default. 48 52 */ 49 - 50 53 #define SCHED_FEAT(name, enabled) \ 51 54 (1UL << __SCHED_FEAT_##name) * enabled | 52 - 53 55 const_debug unsigned int sysctl_sched_features = 54 56 #include "features.h" 55 57 0; 56 - 57 58 #undef SCHED_FEAT 59 + #endif 58 60 59 61 /* 60 62 * Number of tasks to iterate in a single balance run. ··· 86 82 * default: 0.95s 87 83 */ 88 84 int sysctl_sched_rt_runtime = 950000; 89 - 90 - /* CPUs with isolated domains */ 91 - cpumask_var_t cpu_isolated_map; 92 85 93 86 /* 94 87 * __task_rq_lock - lock the rq @p resides on. 
··· 526 525 int i, cpu = smp_processor_id(); 527 526 struct sched_domain *sd; 528 527 529 - if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) 528 + if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) 530 529 return cpu; 531 530 532 531 rcu_read_lock(); ··· 535 534 if (cpu == i) 536 535 continue; 537 536 538 - if (!idle_cpu(i) && is_housekeeping_cpu(i)) { 537 + if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { 539 538 cpu = i; 540 539 goto unlock; 541 540 } 542 541 } 543 542 } 544 543 545 - if (!is_housekeeping_cpu(cpu)) 546 - cpu = housekeeping_any_cpu(); 544 + if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) 545 + cpu = housekeeping_any_cpu(HK_FLAG_TIMER); 547 546 unlock: 548 547 rcu_read_unlock(); 549 548 return cpu; ··· 733 732 } 734 733 #endif 735 734 736 - static void set_load_weight(struct task_struct *p) 735 + static void set_load_weight(struct task_struct *p, bool update_load) 737 736 { 738 737 int prio = p->static_prio - MAX_RT_PRIO; 739 738 struct load_weight *load = &p->se.load; ··· 747 746 return; 748 747 } 749 748 750 - load->weight = scale_load(sched_prio_to_weight[prio]); 751 - load->inv_weight = sched_prio_to_wmult[prio]; 749 + /* 750 + * SCHED_OTHER tasks have to update their load when changing their 751 + * weight 752 + */ 753 + if (update_load && p->sched_class == &fair_sched_class) { 754 + reweight_task(p, prio); 755 + } else { 756 + load->weight = scale_load(sched_prio_to_weight[prio]); 757 + load->inv_weight = sched_prio_to_wmult[prio]; 758 + } 752 759 } 753 760 754 761 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) ··· 2366 2357 p->static_prio = NICE_TO_PRIO(0); 2367 2358 2368 2359 p->prio = p->normal_prio = __normal_prio(p); 2369 - set_load_weight(p); 2360 + set_load_weight(p, false); 2370 2361 2371 2362 /* 2372 2363 * We don't need the reset flag anymore after the fork. 
It has ··· 3813 3804 put_prev_task(rq, p); 3814 3805 3815 3806 p->static_prio = NICE_TO_PRIO(nice); 3816 - set_load_weight(p); 3807 + set_load_weight(p, true); 3817 3808 old_prio = p->prio; 3818 3809 p->prio = effective_prio(p); 3819 3810 delta = p->prio - old_prio; ··· 3970 3961 */ 3971 3962 p->rt_priority = attr->sched_priority; 3972 3963 p->normal_prio = normal_prio(p); 3973 - set_load_weight(p); 3964 + set_load_weight(p, true); 3974 3965 } 3975 3966 3976 3967 /* Actually do priority change: must hold pi & rq lock. */ ··· 5736 5727 5737 5728 void __init sched_init_smp(void) 5738 5729 { 5739 - cpumask_var_t non_isolated_cpus; 5740 - 5741 - alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 5742 - 5743 5730 sched_init_numa(); 5744 5731 5745 5732 /* ··· 5745 5740 */ 5746 5741 mutex_lock(&sched_domains_mutex); 5747 5742 sched_init_domains(cpu_active_mask); 5748 - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 5749 - if (cpumask_empty(non_isolated_cpus)) 5750 - cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 5751 5743 mutex_unlock(&sched_domains_mutex); 5752 5744 5753 5745 /* Move init over to a non-isolated CPU */ 5754 - if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 5746 + if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) 5755 5747 BUG(); 5756 5748 sched_init_granularity(); 5757 - free_cpumask_var(non_isolated_cpus); 5758 5749 5759 5750 init_sched_rt_class(); 5760 5751 init_sched_dl_class(); ··· 5935 5934 atomic_set(&rq->nr_iowait, 0); 5936 5935 } 5937 5936 5938 - set_load_weight(&init_task); 5937 + set_load_weight(&init_task, false); 5939 5938 5940 5939 /* 5941 5940 * The boot idle thread does lazy MMU switching as well: ··· 5954 5953 calc_load_update = jiffies + LOAD_FREQ; 5955 5954 5956 5955 #ifdef CONFIG_SMP 5957 - /* May be allocated at isolcpus cmdline parse time */ 5958 - if (cpu_isolated_map == NULL) 5959 - zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 5960 5956 
idle_thread_set_boot_cpu(); 5961 5957 set_cpu_rq_start_time(smp_processor_id()); 5962 5958 #endif

+9 -12

kernel/sched/deadline.c

··· 243 243 if (p->state == TASK_DEAD) 244 244 sub_rq_bw(p->dl.dl_bw, &rq->dl); 245 245 raw_spin_lock(&dl_b->lock); 246 - __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); 246 + __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); 247 247 __dl_clear_params(p); 248 248 raw_spin_unlock(&dl_b->lock); 249 249 } ··· 1210 1210 } 1211 1211 1212 1212 raw_spin_lock(&dl_b->lock); 1213 - __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); 1213 + __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); 1214 1214 raw_spin_unlock(&dl_b->lock); 1215 1215 __dl_clear_params(p); 1216 1216 ··· 1365 1365 update_dl_entity(dl_se, pi_se); 1366 1366 } else if (flags & ENQUEUE_REPLENISH) { 1367 1367 replenish_dl_entity(dl_se, pi_se); 1368 + } else if ((flags & ENQUEUE_RESTORE) && 1369 + dl_time_before(dl_se->deadline, 1370 + rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { 1371 + setup_new_dl_entity(dl_se); 1368 1372 } 1369 1373 1370 1374 __enqueue_dl_entity(dl_se); ··· 2171 2167 * until we complete the update. 2172 2168 */ 2173 2169 raw_spin_lock(&src_dl_b->lock); 2174 - __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); 2170 + __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); 2175 2171 raw_spin_unlock(&src_dl_b->lock); 2176 2172 } 2177 2173 ··· 2260 2256 2261 2257 return; 2262 2258 } 2263 - /* 2264 - * If p is boosted we already updated its params in 2265 - * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), 2266 - * p's deadline being now already after rq_clock(rq). 
2267 - */ 2268 - if (dl_time_before(p->dl.deadline, rq_clock(rq))) 2269 - setup_new_dl_entity(&p->dl); 2270 2259 2271 2260 if (rq->curr != p) { 2272 2261 #ifdef CONFIG_SMP ··· 2449 2452 if (dl_policy(policy) && !task_has_dl_policy(p) && 2450 2453 !__dl_overflow(dl_b, cpus, 0, new_bw)) { 2451 2454 if (hrtimer_active(&p->dl.inactive_timer)) 2452 - __dl_clear(dl_b, p->dl.dl_bw, cpus); 2455 + __dl_sub(dl_b, p->dl.dl_bw, cpus); 2453 2456 __dl_add(dl_b, new_bw, cpus); 2454 2457 err = 0; 2455 2458 } else if (dl_policy(policy) && task_has_dl_policy(p) && ··· 2461 2464 * But this would require to set the task's "inactive 2462 2465 * timer" when the task is not inactive. 2463 2466 */ 2464 - __dl_clear(dl_b, p->dl.dl_bw, cpus); 2467 + __dl_sub(dl_b, p->dl.dl_bw, cpus); 2465 2468 __dl_add(dl_b, new_bw, cpus); 2466 2469 dl_change_utilization(p, new_bw); 2467 2470 err = 0;

+13 -5

kernel/sched/debug.c

··· 441 441 P_SCHEDSTAT(se->statistics.wait_count); 442 442 } 443 443 P(se->load.weight); 444 + P(se->runnable_weight); 444 445 #ifdef CONFIG_SMP 445 446 P(se->avg.load_avg); 446 447 P(se->avg.util_avg); 448 + P(se->avg.runnable_load_avg); 447 449 #endif 448 450 449 451 #undef PN_SCHEDSTAT ··· 560 558 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); 561 559 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 562 560 #ifdef CONFIG_SMP 561 + SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight); 563 562 SEQ_printf(m, " .%-30s: %lu\n", "load_avg", 564 563 cfs_rq->avg.load_avg); 565 564 SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", 566 - cfs_rq->runnable_load_avg); 565 + cfs_rq->avg.runnable_load_avg); 567 566 SEQ_printf(m, " .%-30s: %lu\n", "util_avg", 568 567 cfs_rq->avg.util_avg); 569 - SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", 570 - atomic_long_read(&cfs_rq->removed_load_avg)); 571 - SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg", 572 - atomic_long_read(&cfs_rq->removed_util_avg)); 568 + SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", 569 + cfs_rq->removed.load_avg); 570 + SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", 571 + cfs_rq->removed.util_avg); 572 + SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum", 573 + cfs_rq->removed.runnable_sum); 573 574 #ifdef CONFIG_FAIR_GROUP_SCHED 574 575 SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", 575 576 cfs_rq->tg_load_avg_contrib); ··· 1009 1004 "nr_involuntary_switches", (long long)p->nivcsw); 1010 1005 1011 1006 P(se.load.weight); 1007 + P(se.runnable_weight); 1012 1008 #ifdef CONFIG_SMP 1013 1009 P(se.avg.load_sum); 1010 + P(se.avg.runnable_load_sum); 1014 1011 P(se.avg.util_sum); 1015 1012 P(se.avg.load_avg); 1013 + P(se.avg.runnable_load_avg); 1016 1014 P(se.avg.util_avg); 1017 1015 P(se.avg.last_update_time); 1018 1016 #endif

+711 -380

kernel/sched/fair.c

··· 33 33 #include <linux/mempolicy.h> 34 34 #include <linux/migrate.h> 35 35 #include <linux/task_work.h> 36 + #include <linux/sched/isolation.h> 36 37 37 38 #include <trace/events/sched.h> 38 39 ··· 718 717 { 719 718 struct sched_avg *sa = &se->avg; 720 719 721 - sa->last_update_time = 0; 722 - /* 723 - * sched_avg's period_contrib should be strictly less then 1024, so 724 - * we give it 1023 to make sure it is almost a period (1024us), and 725 - * will definitely be update (after enqueue). 726 - */ 727 - sa->period_contrib = 1023; 720 + memset(sa, 0, sizeof(*sa)); 721 + 728 722 /* 729 723 * Tasks are intialized with full load to be seen as heavy tasks until 730 724 * they get a chance to stabilize to their real load level. ··· 727 731 * nothing has been attached to the task group yet. 728 732 */ 729 733 if (entity_is_task(se)) 730 - sa->load_avg = scale_load_down(se->load.weight); 731 - sa->load_sum = sa->load_avg * LOAD_AVG_MAX; 732 - /* 733 - * At this point, util_avg won't be used in select_task_rq_fair anyway 734 - */ 735 - sa->util_avg = 0; 736 - sa->util_sum = 0; 734 + sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); 735 + 736 + se->runnable_weight = se->load.weight; 737 + 737 738 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ 738 739 } 739 740 ··· 778 785 } else { 779 786 sa->util_avg = cap; 780 787 } 781 - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; 782 788 } 783 789 784 790 if (entity_is_task(se)) { ··· 2018 2026 delta = runtime - p->last_sum_exec_runtime; 2019 2027 *period = now - p->last_task_numa_placement; 2020 2028 } else { 2021 - delta = p->se.avg.load_sum / p->se.load.weight; 2029 + delta = p->se.avg.load_sum; 2022 2030 *period = LOAD_AVG_MAX; 2023 2031 } 2024 2032 ··· 2685 2693 cfs_rq->nr_running--; 2686 2694 } 2687 2695 2696 + /* 2697 + * Signed add and clamp on underflow. 2698 + * 2699 + * Explicitly do a load-store to ensure the intermediate value never hits 2700 + * memory. 
This allows lockless observations without ever seeing the negative 2701 + * values. 2702 + */ 2703 + #define add_positive(_ptr, _val) do { \ 2704 + typeof(_ptr) ptr = (_ptr); \ 2705 + typeof(_val) val = (_val); \ 2706 + typeof(*ptr) res, var = READ_ONCE(*ptr); \ 2707 + \ 2708 + res = var + val; \ 2709 + \ 2710 + if (val < 0 && res > var) \ 2711 + res = 0; \ 2712 + \ 2713 + WRITE_ONCE(*ptr, res); \ 2714 + } while (0) 2715 + 2716 + /* 2717 + * Unsigned subtract and clamp on underflow. 2718 + * 2719 + * Explicitly do a load-store to ensure the intermediate value never hits 2720 + * memory. This allows lockless observations without ever seeing the negative 2721 + * values. 2722 + */ 2723 + #define sub_positive(_ptr, _val) do { \ 2724 + typeof(_ptr) ptr = (_ptr); \ 2725 + typeof(*ptr) val = (_val); \ 2726 + typeof(*ptr) res, var = READ_ONCE(*ptr); \ 2727 + res = var - val; \ 2728 + if (res > var) \ 2729 + res = 0; \ 2730 + WRITE_ONCE(*ptr, res); \ 2731 + } while (0) 2732 + 2733 + #ifdef CONFIG_SMP 2734 + /* 2735 + * XXX we want to get rid of these helpers and use the full load resolution. 
2736 + */ 2737 + static inline long se_weight(struct sched_entity *se) 2738 + { 2739 + return scale_load_down(se->load.weight); 2740 + } 2741 + 2742 + static inline long se_runnable(struct sched_entity *se) 2743 + { 2744 + return scale_load_down(se->runnable_weight); 2745 + } 2746 + 2747 + static inline void 2748 + enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2749 + { 2750 + cfs_rq->runnable_weight += se->runnable_weight; 2751 + 2752 + cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; 2753 + cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; 2754 + } 2755 + 2756 + static inline void 2757 + dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2758 + { 2759 + cfs_rq->runnable_weight -= se->runnable_weight; 2760 + 2761 + sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); 2762 + sub_positive(&cfs_rq->avg.runnable_load_sum, 2763 + se_runnable(se) * se->avg.runnable_load_sum); 2764 + } 2765 + 2766 + static inline void 2767 + enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2768 + { 2769 + cfs_rq->avg.load_avg += se->avg.load_avg; 2770 + cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; 2771 + } 2772 + 2773 + static inline void 2774 + dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2775 + { 2776 + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); 2777 + sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); 2778 + } 2779 + #else 2780 + static inline void 2781 + enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 2782 + static inline void 2783 + dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 2784 + static inline void 2785 + enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 2786 + static inline void 2787 + dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 2788 + #endif 2789 + 2790 + static void 
reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 2791 + unsigned long weight, unsigned long runnable) 2792 + { 2793 + if (se->on_rq) { 2794 + /* commit outstanding execution time */ 2795 + if (cfs_rq->curr == se) 2796 + update_curr(cfs_rq); 2797 + account_entity_dequeue(cfs_rq, se); 2798 + dequeue_runnable_load_avg(cfs_rq, se); 2799 + } 2800 + dequeue_load_avg(cfs_rq, se); 2801 + 2802 + se->runnable_weight = runnable; 2803 + update_load_set(&se->load, weight); 2804 + 2805 + #ifdef CONFIG_SMP 2806 + do { 2807 + u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; 2808 + 2809 + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); 2810 + se->avg.runnable_load_avg = 2811 + div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider); 2812 + } while (0); 2813 + #endif 2814 + 2815 + enqueue_load_avg(cfs_rq, se); 2816 + if (se->on_rq) { 2817 + account_entity_enqueue(cfs_rq, se); 2818 + enqueue_runnable_load_avg(cfs_rq, se); 2819 + } 2820 + } 2821 + 2822 + void reweight_task(struct task_struct *p, int prio) 2823 + { 2824 + struct sched_entity *se = &p->se; 2825 + struct cfs_rq *cfs_rq = cfs_rq_of(se); 2826 + struct load_weight *load = &se->load; 2827 + unsigned long weight = scale_load(sched_prio_to_weight[prio]); 2828 + 2829 + reweight_entity(cfs_rq, se, weight, weight); 2830 + load->inv_weight = sched_prio_to_wmult[prio]; 2831 + } 2832 + 2688 2833 #ifdef CONFIG_FAIR_GROUP_SCHED 2689 2834 # ifdef CONFIG_SMP 2690 - static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) 2835 + /* 2836 + * All this does is approximate the hierarchical proportion which includes that 2837 + * global sum we all love to hate. 2838 + * 2839 + * That is, the weight of a group entity, is the proportional share of the 2840 + * group weight based on the group runqueue weights. 
That is: 2841 + * 2842 + * tg->weight * grq->load.weight 2843 + * ge->load.weight = ----------------------------- (1) 2844 + * \Sum grq->load.weight 2845 + * 2846 + * Now, because computing that sum is prohibitively expensive (been 2847 + * there, done that) we approximate it with this average stuff. The average 2848 + * moves slower and therefore the approximation is cheaper and more stable. 2849 + * 2850 + * So instead of the above, we substitute: 2851 + * 2852 + * grq->load.weight -> grq->avg.load_avg (2) 2853 + * 2854 + * which yields the following: 2855 + * 2856 + * tg->weight * grq->avg.load_avg 2857 + * ge->load.weight = ------------------------------ (3) 2858 + * tg->load_avg 2859 + * 2860 + * Where: tg->load_avg ~= \Sum grq->avg.load_avg 2861 + * 2862 + * That is shares_avg, and it is right (given the approximation (2)). 2863 + * 2864 + * The problem with it is that because the average is slow -- it was designed 2865 + * to be exactly that of course -- this leads to transients in boundary 2866 + * conditions. Specifically, the case where the group was idle and we start the 2867 + * one task. It takes time for our CPU's grq->avg.load_avg to build up, 2868 + * yielding bad latency etc. 2869 + * 2870 + * Now, in that special case (1) reduces to: 2871 + * 2872 + * tg->weight * grq->load.weight 2873 + * ge->load.weight = ----------------------------- = tg->weight (4) 2874 + * grq->load.weight 2875 + * 2876 + * That is, the sum collapses because all other CPUs are idle; the UP scenario.
2877 + * 2878 + * So what we do is modify our approximation (3) to approach (4) in the (near) 2879 + * UP case, like: 2880 + * 2881 + * ge->load.weight = 2882 + * 2883 + * tg->weight * grq->load.weight 2884 + * --------------------------------------------------- (5) 2885 + * tg->load_avg - grq->avg.load_avg + grq->load.weight 2886 + * 2887 + * But because grq->load.weight can drop to 0, resulting in a divide by zero, 2888 + * we need to use grq->avg.load_avg as its lower bound, which then gives: 2889 + * 2890 + * 2891 + * tg->weight * grq->load.weight 2892 + * ge->load.weight = ----------------------------- (6) 2893 + * tg_load_avg' 2894 + * 2895 + * Where: 2896 + * 2897 + * tg_load_avg' = tg->load_avg - grq->avg.load_avg + 2898 + * max(grq->load.weight, grq->avg.load_avg) 2899 + * 2900 + * And that is shares_weight and is icky. In the (near) UP case it approaches 2901 + * (4) while in the normal case it approaches (3). It consistently 2902 + * overestimates the ge->load.weight and therefore: 2903 + * 2904 + * \Sum ge->load.weight >= tg->weight 2905 + * 2906 + * hence icky! 2907 + */ 2908 + static long calc_group_shares(struct cfs_rq *cfs_rq) 2691 2909 { 2692 - long tg_weight, load, shares; 2910 + long tg_weight, tg_shares, load, shares; 2911 + struct task_group *tg = cfs_rq->tg; 2693 2912 2694 - /* 2695 - * This really should be: cfs_rq->avg.load_avg, but instead we use 2696 - * cfs_rq->load.weight, which is its upper bound. This helps ramp up 2697 - * the shares for small weight interactive tasks. 
2698 - */ 2699 - load = scale_load_down(cfs_rq->load.weight); 2913 + tg_shares = READ_ONCE(tg->shares); 2914 + 2915 + load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg); 2700 2916 2701 2917 tg_weight = atomic_long_read(&tg->load_avg); 2702 2918 ··· 2912 2712 tg_weight -= cfs_rq->tg_load_avg_contrib; 2913 2713 tg_weight += load; 2914 2714 2915 - shares = (tg->shares * load); 2715 + shares = (tg_shares * load); 2916 2716 if (tg_weight) 2917 2717 shares /= tg_weight; 2918 2718 ··· 2928 2728 * case no task is runnable on a CPU MIN_SHARES=2 should be returned 2929 2729 * instead of 0. 2930 2730 */ 2931 - if (shares < MIN_SHARES) 2932 - shares = MIN_SHARES; 2933 - if (shares > tg->shares) 2934 - shares = tg->shares; 2935 - 2936 - return shares; 2731 + return clamp_t(long, shares, MIN_SHARES, tg_shares); 2937 2732 } 2938 - # else /* CONFIG_SMP */ 2939 - static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) 2733 + 2734 + /* 2735 + * This calculates the effective runnable weight for a group entity based on 2736 + * the group entity weight calculated above. 2737 + * 2738 + * Because of the above approximation (2), our group entity weight is 2739 + * a load_avg based ratio (3). This means that it includes blocked load and 2740 + * does not represent the runnable weight. 2741 + * 2742 + * Approximate the group entity's runnable weight per ratio from the group 2743 + * runqueue: 2744 + * 2745 + * grq->avg.runnable_load_avg 2746 + * ge->runnable_weight = ge->load.weight * -------------------------- (7) 2747 + * grq->avg.load_avg 2748 + * 2749 + * However, analogous to above, since the avg numbers are slow, this leads to 2750 + * transients in the from-idle case.
Instead we use: 2751 + * 2752 + * ge->runnable_weight = ge->load.weight * 2753 + * 2754 + * max(grq->avg.runnable_load_avg, grq->runnable_weight) 2755 + * ----------------------------------------------------- (8) 2756 + * max(grq->avg.load_avg, grq->load.weight) 2757 + * 2758 + * Where these max() serve both to use the 'instant' values to fix the slow 2759 + * from-idle and avoid the /0 on to-idle, similar to (6). 2760 + */ 2761 + static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) 2940 2762 { 2941 - return tg->shares; 2763 + long runnable, load_avg; 2764 + 2765 + load_avg = max(cfs_rq->avg.load_avg, 2766 + scale_load_down(cfs_rq->load.weight)); 2767 + 2768 + runnable = max(cfs_rq->avg.runnable_load_avg, 2769 + scale_load_down(cfs_rq->runnable_weight)); 2770 + 2771 + runnable *= shares; 2772 + if (load_avg) 2773 + runnable /= load_avg; 2774 + 2775 + return clamp_t(long, runnable, MIN_SHARES, shares); 2942 2776 } 2943 2777 # endif /* CONFIG_SMP */ 2944 2778 2945 - static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 2946 - unsigned long weight) 2947 - { 2948 - if (se->on_rq) { 2949 - /* commit outstanding execution time */ 2950 - if (cfs_rq->curr == se) 2951 - update_curr(cfs_rq); 2952 - account_entity_dequeue(cfs_rq, se); 2953 - } 2954 - 2955 - update_load_set(&se->load, weight); 2956 - 2957 - if (se->on_rq) 2958 - account_entity_enqueue(cfs_rq, se); 2959 - } 2960 - 2961 2779 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); 2962 2780 2963 - static void update_cfs_shares(struct sched_entity *se) 2781 + /* 2782 + * Recomputes the group entity based on the current state of its group 2783 + * runqueue. 
2784 + */ 2785 + static void update_cfs_group(struct sched_entity *se) 2964 2786 { 2965 - struct cfs_rq *cfs_rq = group_cfs_rq(se); 2966 - struct task_group *tg; 2967 - long shares; 2787 + struct cfs_rq *gcfs_rq = group_cfs_rq(se); 2788 + long shares, runnable; 2968 2789 2969 - if (!cfs_rq) 2790 + if (!gcfs_rq) 2970 2791 return; 2971 2792 2972 - if (throttled_hierarchy(cfs_rq)) 2793 + if (throttled_hierarchy(gcfs_rq)) 2973 2794 return; 2974 - 2975 - tg = cfs_rq->tg; 2976 2795 2977 2796 #ifndef CONFIG_SMP 2978 - if (likely(se->load.weight == tg->shares)) 2979 - return; 2980 - #endif 2981 - shares = calc_cfs_shares(cfs_rq, tg); 2797 + runnable = shares = READ_ONCE(gcfs_rq->tg->shares); 2982 2798 2983 - reweight_entity(cfs_rq_of(se), se, shares); 2799 + if (likely(se->load.weight == shares)) 2800 + return; 2801 + #else 2802 + shares = calc_group_shares(gcfs_rq); 2803 + runnable = calc_group_runnable(gcfs_rq, shares); 2804 + #endif 2805 + 2806 + reweight_entity(cfs_rq_of(se), se, shares, runnable); 2984 2807 } 2985 2808 2986 2809 #else /* CONFIG_FAIR_GROUP_SCHED */ 2987 - static inline void update_cfs_shares(struct sched_entity *se) 2810 + static inline void update_cfs_group(struct sched_entity *se) 2988 2811 { 2989 2812 } 2990 2813 #endif /* CONFIG_FAIR_GROUP_SCHED */ ··· 3116 2893 */ 3117 2894 static __always_inline u32 3118 2895 accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, 3119 - unsigned long weight, int running, struct cfs_rq *cfs_rq) 2896 + unsigned long load, unsigned long runnable, int running) 3120 2897 { 3121 2898 unsigned long scale_freq, scale_cpu; 3122 2899 u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ ··· 3133 2910 */ 3134 2911 if (periods) { 3135 2912 sa->load_sum = decay_load(sa->load_sum, periods); 3136 - if (cfs_rq) { 3137 - cfs_rq->runnable_load_sum = 3138 - decay_load(cfs_rq->runnable_load_sum, periods); 3139 - } 2913 + sa->runnable_load_sum = 2914 + decay_load(sa->runnable_load_sum, periods); 3140 2915 sa->util_sum = 
decay_load((u64)(sa->util_sum), periods); 3141 2916 3142 2917 /* ··· 3147 2926 sa->period_contrib = delta; 3148 2927 3149 2928 contrib = cap_scale(contrib, scale_freq); 3150 - if (weight) { 3151 - sa->load_sum += weight * contrib; 3152 - if (cfs_rq) 3153 - cfs_rq->runnable_load_sum += weight * contrib; 3154 - } 2929 + if (load) 2930 + sa->load_sum += load * contrib; 2931 + if (runnable) 2932 + sa->runnable_load_sum += runnable * contrib; 3155 2933 if (running) 3156 2934 sa->util_sum += contrib * scale_cpu; 3157 2935 ··· 3186 2966 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] 3187 2967 */ 3188 2968 static __always_inline int 3189 - ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, 3190 - unsigned long weight, int running, struct cfs_rq *cfs_rq) 2969 + ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, 2970 + unsigned long load, unsigned long runnable, int running) 3191 2971 { 3192 2972 u64 delta; 3193 2973 ··· 3220 3000 * this happens during idle_balance() which calls 3221 3001 * update_blocked_averages() 3222 3002 */ 3223 - if (!weight) 3224 - running = 0; 3003 + if (!load) 3004 + runnable = running = 0; 3225 3005 3226 3006 /* 3227 3007 * Now we know we crossed measurement unit boundaries. The *_avg ··· 3230 3010 * Step 1: accumulate *_sum since last_update_time. If we haven't 3231 3011 * crossed period boundaries, finish. 3232 3012 */ 3233 - if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) 3013 + if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) 3234 3014 return 0; 3235 - 3236 - /* 3237 - * Step 2: update *_avg. 
3238 - */ 3239 - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); 3240 - if (cfs_rq) { 3241 - cfs_rq->runnable_load_avg = 3242 - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); 3243 - } 3244 - sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib); 3245 3015 3246 3016 return 1; 3247 3017 } 3248 3018 3019 + static __always_inline void 3020 + ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) 3021 + { 3022 + u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; 3023 + 3024 + /* 3025 + * Step 2: update *_avg. 3026 + */ 3027 + sa->load_avg = div_u64(load * sa->load_sum, divider); 3028 + sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); 3029 + sa->util_avg = sa->util_sum / divider; 3030 + } 3031 + 3032 + /* 3033 + * sched_entity: 3034 + * 3035 + * task: 3036 + * se_runnable() == se_weight() 3037 + * 3038 + * group: [ see update_cfs_group() ] 3039 + * se_weight() = tg->weight * grq->load_avg / tg->load_avg 3040 + * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg 3041 + * 3042 + * load_sum := runnable_sum 3043 + * load_avg = se_weight(se) * runnable_avg 3044 + * 3045 + * runnable_load_sum := runnable_sum 3046 + * runnable_load_avg = se_runnable(se) * runnable_avg 3047 + * 3048 + * XXX collapse load_sum and runnable_load_sum 3049 + * 3050 + * cfs_rq: 3051 + * 3052 + * load_sum = \Sum se_weight(se) * se->avg.load_sum 3053 + * load_avg = \Sum se->avg.load_avg 3054 + * 3055 + * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum 3056 + * runnable_load_avg = \Sum se->avg.runnable_load_avg 3057 + */ 3058 + 3249 3059 static int 3250 3060 __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) 3251 3061 { 3252 - return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL); 3062 + if (entity_is_task(se)) 3063 + se->runnable_weight = se->load.weight; 3064 + 3065 + if (___update_load_sum(now,
cpu, &se->avg, 0, 0, 0)) { 3066 + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 3067 + return 1; 3068 + } 3069 + 3070 + return 0; 3253 3071 } 3254 3072 3255 3073 static int 3256 3074 __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) 3257 3075 { 3258 - return ___update_load_avg(now, cpu, &se->avg, 3259 - se->on_rq * scale_load_down(se->load.weight), 3260 - cfs_rq->curr == se, NULL); 3076 + if (entity_is_task(se)) 3077 + se->runnable_weight = se->load.weight; 3078 + 3079 + if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, 3080 + cfs_rq->curr == se)) { 3081 + 3082 + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 3083 + return 1; 3084 + } 3085 + 3086 + return 0; 3261 3087 } 3262 3088 3263 3089 static int 3264 3090 __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) 3265 3091 { 3266 - return ___update_load_avg(now, cpu, &cfs_rq->avg, 3267 - scale_load_down(cfs_rq->load.weight), 3268 - cfs_rq->curr != NULL, cfs_rq); 3269 - } 3092 + if (___update_load_sum(now, cpu, &cfs_rq->avg, 3093 + scale_load_down(cfs_rq->load.weight), 3094 + scale_load_down(cfs_rq->runnable_weight), 3095 + cfs_rq->curr != NULL)) { 3270 3096 3271 - /* 3272 - * Signed add and clamp on underflow. 3273 - * 3274 - * Explicitly do a load-store to ensure the intermediate value never hits 3275 - * memory. This allows lockless observations without ever seeing the negative 3276 - * values. 
3277 - */ 3278 - #define add_positive(_ptr, _val) do { \ 3279 - typeof(_ptr) ptr = (_ptr); \ 3280 - typeof(_val) val = (_val); \ 3281 - typeof(*ptr) res, var = READ_ONCE(*ptr); \ 3282 - \ 3283 - res = var + val; \ 3284 - \ 3285 - if (val < 0 && res > var) \ 3286 - res = 0; \ 3287 - \ 3288 - WRITE_ONCE(*ptr, res); \ 3289 - } while (0) 3097 + ___update_load_avg(&cfs_rq->avg, 1, 1); 3098 + return 1; 3099 + } 3100 + 3101 + return 0; 3102 + } 3290 3103 3291 3104 #ifdef CONFIG_FAIR_GROUP_SCHED 3292 3105 /** ··· 3402 3149 se->avg.last_update_time = n_last_update_time; 3403 3150 } 3404 3151 3405 - /* Take into account change of utilization of a child task group */ 3152 + 3153 + /* 3154 + * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to 3155 + * propagate its contribution. The key to this propagation is the invariant 3156 + * that for each group: 3157 + * 3158 + * ge->avg == grq->avg (1) 3159 + * 3160 + * _IFF_ we look at the pure running and runnable sums. Because they 3161 + * represent the very same entity, just at different points in the hierarchy. 3162 + * 3163 + * 3164 + * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and 3165 + * simply copies the running sum over. 3166 + * 3167 + * However, update_tg_cfs_runnable() is more complex. So we have: 3168 + * 3169 + * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2) 3170 + * 3171 + * And since, like util, the runnable part should be directly transferable, 3172 + * the following would _appear_ to be the straight forward approach: 3173 + * 3174 + * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3) 3175 + * 3176 + * And per (1) we have: 3177 + * 3178 + * ge->avg.runnable_avg == grq->avg.runnable_avg 3179 + * 3180 + * Which gives: 3181 + * 3182 + * ge->load.weight * grq->avg.load_avg 3183 + * ge->avg.load_avg = ----------------------------------- (4) 3184 + * grq->load.weight 3185 + * 3186 + * Except that is wrong!
3187 + * 3188 + * Because while for entities historical weight is not important and we 3189 + * really only care about our future and therefore can consider a pure 3190 + * runnable sum, runqueues can NOT do this. 3191 + * 3192 + * We specifically want runqueues to have a load_avg that includes 3193 + * historical weights. Those represent the blocked load, the load we expect 3194 + * to (shortly) return to us. This only works by keeping the weights as 3195 + * integral part of the sum. We therefore cannot decompose as per (3). 3196 + * 3197 + * OK, so what then? 3198 + * 3199 + * 3200 + * Another way to look at things is: 3201 + * 3202 + * grq->avg.load_avg = \Sum se->avg.load_avg 3203 + * 3204 + * Therefore, per (2): 3205 + * 3206 + * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg 3207 + * 3208 + * And the very thing we're propagating is a change in that sum (someone 3209 + * joined/left). So we can easily know the runnable change, which would be, per 3210 + * (2) the already tracked se->load_avg divided by the corresponding 3211 + * se->weight. 
3212 + * 3213 + * Basically (4) but in differential form: 3214 + * 3215 + * d(runnable_avg) += se->avg.load_avg / se->load.weight 3216 + * (5) 3217 + * ge->avg.load_avg += ge->load.weight * d(runnable_avg) 3218 + */ 3219 + 3406 3220 static inline void 3407 - update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) 3221 + update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 3408 3222 { 3409 - struct cfs_rq *gcfs_rq = group_cfs_rq(se); 3410 3223 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; 3411 3224 3412 3225 /* Nothing to update */ ··· 3488 3169 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; 3489 3170 } 3490 3171 3491 - /* Take into account change of load of a child task group */ 3492 3172 static inline void 3493 - update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) 3173 + update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 3494 3174 { 3495 - struct cfs_rq *gcfs_rq = group_cfs_rq(se); 3496 - long delta, load = gcfs_rq->avg.load_avg; 3175 + long runnable_sum = gcfs_rq->prop_runnable_sum; 3176 + long runnable_load_avg, load_avg; 3177 + s64 runnable_load_sum, load_sum; 3497 3178 3498 - /* 3499 - * If the load of group cfs_rq is null, the load of the 3500 - * sched_entity will also be null so we can skip the formula 3501 - */ 3502 - if (load) { 3503 - long tg_load; 3504 - 3505 - /* Get tg's load and ensure tg_load > 0 */ 3506 - tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; 3507 - 3508 - /* Ensure tg_load >= load and updated with current load*/ 3509 - tg_load -= gcfs_rq->tg_load_avg_contrib; 3510 - tg_load += load; 3511 - 3512 - /* 3513 - * We need to compute a correction term in the case that the 3514 - * task group is consuming more CPU than a task of equal 3515 - * weight. A task with a weight equals to tg->shares will have 3516 - * a load less or equal to scale_load_down(tg->shares). 
3517 - * Similarly, the sched_entities that represent the task group 3518 - * at parent level, can't have a load higher than 3519 - * scale_load_down(tg->shares). And the Sum of sched_entities' 3520 - * load must be <= scale_load_down(tg->shares). 3521 - */ 3522 - if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { 3523 - /* scale gcfs_rq's load into tg's shares*/ 3524 - load *= scale_load_down(gcfs_rq->tg->shares); 3525 - load /= tg_load; 3526 - } 3527 - } 3528 - 3529 - delta = load - se->avg.load_avg; 3530 - 3531 - /* Nothing to update */ 3532 - if (!delta) 3179 + if (!runnable_sum) 3533 3180 return; 3534 3181 3535 - /* Set new sched_entity's load */ 3536 - se->avg.load_avg = load; 3537 - se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; 3182 + gcfs_rq->prop_runnable_sum = 0; 3538 3183 3539 - /* Update parent cfs_rq load */ 3540 - add_positive(&cfs_rq->avg.load_avg, delta); 3541 - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; 3184 + load_sum = (s64)se_weight(se) * runnable_sum; 3185 + load_avg = div_s64(load_sum, LOAD_AVG_MAX); 3542 3186 3543 - /* 3544 - * If the sched_entity is already enqueued, we also have to update the 3545 - * runnable load avg. 
3546 - */ 3187 + add_positive(&se->avg.load_sum, runnable_sum); 3188 + add_positive(&se->avg.load_avg, load_avg); 3189 + 3190 + add_positive(&cfs_rq->avg.load_avg, load_avg); 3191 + add_positive(&cfs_rq->avg.load_sum, load_sum); 3192 + 3193 + runnable_load_sum = (s64)se_runnable(se) * runnable_sum; 3194 + runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); 3195 + 3196 + add_positive(&se->avg.runnable_load_sum, runnable_sum); 3197 + add_positive(&se->avg.runnable_load_avg, runnable_load_avg); 3198 + 3547 3199 if (se->on_rq) { 3548 - /* Update parent cfs_rq runnable_load_avg */ 3549 - add_positive(&cfs_rq->runnable_load_avg, delta); 3550 - cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; 3200 + add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg); 3201 + add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum); 3551 3202 } 3552 3203 } 3553 3204 3554 - static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) 3205 + static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) 3555 3206 { 3556 - cfs_rq->propagate_avg = 1; 3557 - } 3558 - 3559 - static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) 3560 - { 3561 - struct cfs_rq *cfs_rq = group_cfs_rq(se); 3562 - 3563 - if (!cfs_rq->propagate_avg) 3564 - return 0; 3565 - 3566 - cfs_rq->propagate_avg = 0; 3567 - return 1; 3207 + cfs_rq->propagate = 1; 3208 + cfs_rq->prop_runnable_sum += runnable_sum; 3568 3209 } 3569 3210 3570 3211 /* Update task and its cfs_rq load average */ 3571 3212 static inline int propagate_entity_load_avg(struct sched_entity *se) 3572 3213 { 3573 - struct cfs_rq *cfs_rq; 3214 + struct cfs_rq *cfs_rq, *gcfs_rq; 3574 3215 3575 3216 if (entity_is_task(se)) 3576 3217 return 0; 3577 3218 3578 - if (!test_and_clear_tg_cfs_propagate(se)) 3219 + gcfs_rq = group_cfs_rq(se); 3220 + if (!gcfs_rq->propagate) 3579 3221 return 0; 3222 + 3223 + gcfs_rq->propagate = 0; 3580 3224 3581 3225 cfs_rq = cfs_rq_of(se); 3582 
3226 3583 - set_tg_cfs_propagate(cfs_rq); 3227 + add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum); 3584 3228 3585 - update_tg_cfs_util(cfs_rq, se); 3586 - update_tg_cfs_load(cfs_rq, se); 3229 + update_tg_cfs_util(cfs_rq, se, gcfs_rq); 3230 + update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); 3587 3231 3588 3232 return 1; 3589 3233 } ··· 3570 3288 * If there is a pending propagation, we have to update the load and 3571 3289 * the utilization of the sched_entity: 3572 3290 */ 3573 - if (gcfs_rq->propagate_avg) 3291 + if (gcfs_rq->propagate) 3574 3292 return false; 3575 3293 3576 3294 /* ··· 3590 3308 return 0; 3591 3309 } 3592 3310 3593 - static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} 3311 + static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} 3594 3312 3595 3313 #endif /* CONFIG_FAIR_GROUP_SCHED */ 3596 - 3597 - /* 3598 - * Unsigned subtract and clamp on underflow. 3599 - * 3600 - * Explicitly do a load-store to ensure the intermediate value never hits 3601 - * memory. This allows lockless observations without ever seeing the negative 3602 - * values. 
3603 - */ 3604 - #define sub_positive(_ptr, _val) do { \ 3605 - typeof(_ptr) ptr = (_ptr); \ 3606 - typeof(*ptr) val = (_val); \ 3607 - typeof(*ptr) res, var = READ_ONCE(*ptr); \ 3608 - res = var - val; \ 3609 - if (res > var) \ 3610 - res = 0; \ 3611 - WRITE_ONCE(*ptr, res); \ 3612 - } while (0) 3613 3314 3614 3315 /** 3615 3316 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages ··· 3613 3348 static inline int 3614 3349 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) 3615 3350 { 3351 + unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0; 3616 3352 struct sched_avg *sa = &cfs_rq->avg; 3617 - int decayed, removed_load = 0, removed_util = 0; 3353 + int decayed = 0; 3618 3354 3619 - if (atomic_long_read(&cfs_rq->removed_load_avg)) { 3620 - s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); 3355 + if (cfs_rq->removed.nr) { 3356 + unsigned long r; 3357 + u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; 3358 + 3359 + raw_spin_lock(&cfs_rq->removed.lock); 3360 + swap(cfs_rq->removed.util_avg, removed_util); 3361 + swap(cfs_rq->removed.load_avg, removed_load); 3362 + swap(cfs_rq->removed.runnable_sum, removed_runnable_sum); 3363 + cfs_rq->removed.nr = 0; 3364 + raw_spin_unlock(&cfs_rq->removed.lock); 3365 + 3366 + r = removed_load; 3621 3367 sub_positive(&sa->load_avg, r); 3622 - sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); 3623 - removed_load = 1; 3624 - set_tg_cfs_propagate(cfs_rq); 3625 - } 3368 + sub_positive(&sa->load_sum, r * divider); 3626 3369 3627 - if (atomic_long_read(&cfs_rq->removed_util_avg)) { 3628 - long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); 3370 + r = removed_util; 3629 3371 sub_positive(&sa->util_avg, r); 3630 - sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); 3631 - removed_util = 1; 3632 - set_tg_cfs_propagate(cfs_rq); 3372 + sub_positive(&sa->util_sum, r * divider); 3373 + 3374 + add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); 3375 + 3376 + decayed = 1; 3633 3377 } 3634 
3378 3635 - decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); 3379 + decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); 3636 3380 3637 3381 #ifndef CONFIG_64BIT 3638 3382 smp_wmb(); 3639 3383 cfs_rq->load_last_update_time_copy = sa->last_update_time; 3640 3384 #endif 3641 3385 3642 - if (decayed || removed_util) 3386 + if (decayed) 3643 3387 cfs_rq_util_change(cfs_rq); 3644 3388 3645 - return decayed || removed_load; 3389 + return decayed; 3390 + } 3391 + 3392 + /** 3393 + * attach_entity_load_avg - attach this entity to its cfs_rq load avg 3394 + * @cfs_rq: cfs_rq to attach to 3395 + * @se: sched_entity to attach 3396 + * 3397 + * Must call update_cfs_rq_load_avg() before this, since we rely on 3398 + * cfs_rq->avg.last_update_time being current. 3399 + */ 3400 + static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3401 + { 3402 + u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; 3403 + 3404 + /* 3405 + * When we attach the @se to the @cfs_rq, we must align the decay 3406 + * window because without that, really weird and wonderful things can 3407 + * happen. 3408 + * 3409 + * XXX illustrate 3410 + */ 3411 + se->avg.last_update_time = cfs_rq->avg.last_update_time; 3412 + se->avg.period_contrib = cfs_rq->avg.period_contrib; 3413 + 3414 + /* 3415 + * Hell(o) Nasty stuff.. we need to recompute _sum based on the new 3416 + * period_contrib. This isn't strictly correct, but since we're 3417 + * entirely outside of the PELT hierarchy, nobody cares if we truncate 3418 + * _sum a little. 
3419 + */ 3420 + se->avg.util_sum = se->avg.util_avg * divider; 3421 + 3422 + se->avg.load_sum = divider; 3423 + if (se_weight(se)) { 3424 + se->avg.load_sum = 3425 + div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); 3426 + } 3427 + 3428 + se->avg.runnable_load_sum = se->avg.load_sum; 3429 + 3430 + enqueue_load_avg(cfs_rq, se); 3431 + cfs_rq->avg.util_avg += se->avg.util_avg; 3432 + cfs_rq->avg.util_sum += se->avg.util_sum; 3433 + 3434 + add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); 3435 + 3436 + cfs_rq_util_change(cfs_rq); 3437 + } 3438 + 3439 + /** 3440 + * detach_entity_load_avg - detach this entity from its cfs_rq load avg 3441 + * @cfs_rq: cfs_rq to detach from 3442 + * @se: sched_entity to detach 3443 + * 3444 + * Must call update_cfs_rq_load_avg() before this, since we rely on 3445 + * cfs_rq->avg.last_update_time being current. 3446 + */ 3447 + static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3448 + { 3449 + dequeue_load_avg(cfs_rq, se); 3450 + sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); 3451 + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); 3452 + 3453 + add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); 3454 + 3455 + cfs_rq_util_change(cfs_rq); 3646 3456 } 3647 3457 3648 3458 /* ··· 3725 3385 */ 3726 3386 #define UPDATE_TG 0x1 3727 3387 #define SKIP_AGE_LOAD 0x2 3388 + #define DO_ATTACH 0x4 3728 3389 3729 3390 /* Update task and its cfs_rq load average */ 3730 - static inline void update_load_avg(struct sched_entity *se, int flags) 3391 + static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 3731 3392 { 3732 - struct cfs_rq *cfs_rq = cfs_rq_of(se); 3733 3393 u64 now = cfs_rq_clock_task(cfs_rq); 3734 3394 struct rq *rq = rq_of(cfs_rq); 3735 3395 int cpu = cpu_of(rq); ··· 3745 3405 decayed = update_cfs_rq_load_avg(now, cfs_rq); 3746 3406 decayed |= propagate_entity_load_avg(se); 3747 3407 3748 - if (decayed && (flags & UPDATE_TG)) 3749 - 
update_tg_load_avg(cfs_rq, 0); 3750 - } 3408 + if (!se->avg.last_update_time && (flags & DO_ATTACH)) { 3751 3409 3752 - /** 3753 - * attach_entity_load_avg - attach this entity to its cfs_rq load avg 3754 - * @cfs_rq: cfs_rq to attach to 3755 - * @se: sched_entity to attach 3756 - * 3757 - * Must call update_cfs_rq_load_avg() before this, since we rely on 3758 - * cfs_rq->avg.last_update_time being current. 3759 - */ 3760 - static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3761 - { 3762 - se->avg.last_update_time = cfs_rq->avg.last_update_time; 3763 - cfs_rq->avg.load_avg += se->avg.load_avg; 3764 - cfs_rq->avg.load_sum += se->avg.load_sum; 3765 - cfs_rq->avg.util_avg += se->avg.util_avg; 3766 - cfs_rq->avg.util_sum += se->avg.util_sum; 3767 - set_tg_cfs_propagate(cfs_rq); 3768 - 3769 - cfs_rq_util_change(cfs_rq); 3770 - } 3771 - 3772 - /** 3773 - * detach_entity_load_avg - detach this entity from its cfs_rq load avg 3774 - * @cfs_rq: cfs_rq to detach from 3775 - * @se: sched_entity to detach 3776 - * 3777 - * Must call update_cfs_rq_load_avg() before this, since we rely on 3778 - * cfs_rq->avg.last_update_time being current. 
3779 - */ 3780 - static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3781 - { 3782 - 3783 - sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); 3784 - sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); 3785 - sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); 3786 - sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); 3787 - set_tg_cfs_propagate(cfs_rq); 3788 - 3789 - cfs_rq_util_change(cfs_rq); 3790 - } 3791 - 3792 - /* Add the load generated by se into cfs_rq's load average */ 3793 - static inline void 3794 - enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3795 - { 3796 - struct sched_avg *sa = &se->avg; 3797 - 3798 - cfs_rq->runnable_load_avg += sa->load_avg; 3799 - cfs_rq->runnable_load_sum += sa->load_sum; 3800 - 3801 - if (!sa->last_update_time) { 3802 3410 attach_entity_load_avg(cfs_rq, se); 3803 3411 update_tg_load_avg(cfs_rq, 0); 3804 - } 3805 - } 3806 3412 3807 - /* Remove the runnable load generated by se from cfs_rq's runnable load average */ 3808 - static inline void 3809 - dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3810 - { 3811 - cfs_rq->runnable_load_avg = 3812 - max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); 3813 - cfs_rq->runnable_load_sum = 3814 - max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); 3413 + } else if (decayed && (flags & UPDATE_TG)) 3414 + update_tg_load_avg(cfs_rq, 0); 3815 3415 } 3816 3416 3817 3417 #ifndef CONFIG_64BIT ··· 3795 3515 void remove_entity_load_avg(struct sched_entity *se) 3796 3516 { 3797 3517 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3518 + unsigned long flags; 3798 3519 3799 3520 /* 3800 3521 * tasks cannot exit without having gone through wake_up_new_task() -> ··· 3808 3527 */ 3809 3528 3810 3529 sync_entity_load_avg(se); 3811 - atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); 3812 - atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); 3530 + 3531 + 
raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags); 3532 + ++cfs_rq->removed.nr; 3533 + cfs_rq->removed.util_avg += se->avg.util_avg; 3534 + cfs_rq->removed.load_avg += se->avg.load_avg; 3535 + cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */ 3536 + raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); 3813 3537 } 3814 3538 3815 3539 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) 3816 3540 { 3817 - return cfs_rq->runnable_load_avg; 3541 + return cfs_rq->avg.runnable_load_avg; 3818 3542 } 3819 3543 3820 3544 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) ··· 3839 3553 3840 3554 #define UPDATE_TG 0x0 3841 3555 #define SKIP_AGE_LOAD 0x0 3556 + #define DO_ATTACH 0x0 3842 3557 3843 - static inline void update_load_avg(struct sched_entity *se, int not_used1) 3558 + static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) 3844 3559 { 3845 - cfs_rq_util_change(cfs_rq_of(se)); 3560 + cfs_rq_util_change(cfs_rq); 3846 3561 } 3847 3562 3848 - static inline void 3849 - enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 3850 - static inline void 3851 - dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 3852 3563 static inline void remove_entity_load_avg(struct sched_entity *se) {} 3853 3564 3854 3565 static inline void ··· 3990 3707 * its group cfs_rq 3991 3708 * - Add its new weight to cfs_rq->load.weight 3992 3709 */ 3993 - update_load_avg(se, UPDATE_TG); 3994 - enqueue_entity_load_avg(cfs_rq, se); 3995 - update_cfs_shares(se); 3710 + update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); 3711 + update_cfs_group(se); 3712 + enqueue_runnable_load_avg(cfs_rq, se); 3996 3713 account_entity_enqueue(cfs_rq, se); 3997 3714 3998 3715 if (flags & ENQUEUE_WAKEUP) ··· 4074 3791 * - For group entity, update its weight to reflect the new share 4075 3792 * of its group cfs_rq. 
4076 3793 */ 4077 - update_load_avg(se, UPDATE_TG); 4078 - dequeue_entity_load_avg(cfs_rq, se); 3794 + update_load_avg(cfs_rq, se, UPDATE_TG); 3795 + dequeue_runnable_load_avg(cfs_rq, se); 4079 3796 4080 3797 update_stats_dequeue(cfs_rq, se, flags); 4081 3798 ··· 4098 3815 /* return excess runtime on last dequeue */ 4099 3816 return_cfs_rq_runtime(cfs_rq); 4100 3817 4101 - update_cfs_shares(se); 3818 + update_cfs_group(se); 4102 3819 4103 3820 /* 4104 3821 * Now advance min_vruntime if @se was the entity holding it back, ··· 4162 3879 */ 4163 3880 update_stats_wait_end(cfs_rq, se); 4164 3881 __dequeue_entity(cfs_rq, se); 4165 - update_load_avg(se, UPDATE_TG); 3882 + update_load_avg(cfs_rq, se, UPDATE_TG); 4166 3883 } 4167 3884 4168 3885 update_stats_curr_start(cfs_rq, se); ··· 4264 3981 /* Put 'current' back into the tree. */ 4265 3982 __enqueue_entity(cfs_rq, prev); 4266 3983 /* in !on_rq case, update occurred at dequeue */ 4267 - update_load_avg(prev, 0); 3984 + update_load_avg(cfs_rq, prev, 0); 4268 3985 } 4269 3986 cfs_rq->curr = NULL; 4270 3987 } ··· 4280 3997 /* 4281 3998 * Ensure that runnable average is periodically updated. 
4282 3999 */ 4283 - update_load_avg(curr, UPDATE_TG); 4284 - update_cfs_shares(curr); 4000 + update_load_avg(cfs_rq, curr, UPDATE_TG); 4001 + update_cfs_group(curr); 4285 4002 4286 4003 #ifdef CONFIG_SCHED_HRTICK 4287 4004 /* ··· 5198 4915 if (cfs_rq_throttled(cfs_rq)) 5199 4916 break; 5200 4917 5201 - update_load_avg(se, UPDATE_TG); 5202 - update_cfs_shares(se); 4918 + update_load_avg(cfs_rq, se, UPDATE_TG); 4919 + update_cfs_group(se); 5203 4920 } 5204 4921 5205 4922 if (!se) ··· 5257 4974 if (cfs_rq_throttled(cfs_rq)) 5258 4975 break; 5259 4976 5260 - update_load_avg(se, UPDATE_TG); 5261 - update_cfs_shares(se); 4977 + update_load_avg(cfs_rq, se, UPDATE_TG); 4978 + update_cfs_group(se); 5262 4979 } 5263 4980 5264 4981 if (!se) ··· 5732 5449 /* 5733 5450 * find_idlest_group finds and returns the least busy CPU group within the 5734 5451 * domain. 5452 + * 5453 + * Assumes p is allowed on at least one CPU in sd. 5735 5454 */ 5736 5455 static struct sched_group * 5737 5456 find_idlest_group(struct sched_domain *sd, struct task_struct *p, ··· 5741 5456 { 5742 5457 struct sched_group *idlest = NULL, *group = sd->groups; 5743 5458 struct sched_group *most_spare_sg = NULL; 5744 - unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0; 5745 - unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0; 5459 + unsigned long min_runnable_load = ULONG_MAX; 5460 + unsigned long this_runnable_load = ULONG_MAX; 5461 + unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; 5746 5462 unsigned long most_spare = 0, this_spare = 0; 5747 5463 int load_idx = sd->forkexec_idx; 5748 5464 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; ··· 5864 5578 } 5865 5579 5866 5580 /* 5867 - * find_idlest_cpu - find the idlest cpu among the cpus in group. 5581 + * find_idlest_group_cpu - find the idlest cpu among the cpus in group. 
5868 5582 */ 5869 5583 static int 5870 - find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 5584 + find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 5871 5585 { 5872 5586 unsigned long load, min_load = ULONG_MAX; 5873 5587 unsigned int min_exit_latency = UINT_MAX; ··· 5914 5628 } 5915 5629 5916 5630 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; 5631 + } 5632 + 5633 + static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, 5634 + int cpu, int prev_cpu, int sd_flag) 5635 + { 5636 + int new_cpu = cpu; 5637 + 5638 + if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) 5639 + return prev_cpu; 5640 + 5641 + while (sd) { 5642 + struct sched_group *group; 5643 + struct sched_domain *tmp; 5644 + int weight; 5645 + 5646 + if (!(sd->flags & sd_flag)) { 5647 + sd = sd->child; 5648 + continue; 5649 + } 5650 + 5651 + group = find_idlest_group(sd, p, cpu, sd_flag); 5652 + if (!group) { 5653 + sd = sd->child; 5654 + continue; 5655 + } 5656 + 5657 + new_cpu = find_idlest_group_cpu(group, p, cpu); 5658 + if (new_cpu == cpu) { 5659 + /* Now try balancing at a lower domain level of cpu */ 5660 + sd = sd->child; 5661 + continue; 5662 + } 5663 + 5664 + /* Now try balancing at a lower domain level of new_cpu */ 5665 + cpu = new_cpu; 5666 + weight = sd->span_weight; 5667 + sd = NULL; 5668 + for_each_domain(cpu, tmp) { 5669 + if (weight <= tmp->span_weight) 5670 + break; 5671 + if (tmp->flags & sd_flag) 5672 + sd = tmp; 5673 + } 5674 + /* while loop will break here if sd == NULL */ 5675 + } 5676 + 5677 + return new_cpu; 5917 5678 } 5918 5679 5919 5680 #ifdef CONFIG_SCHED_SMT ··· 6315 5982 new_cpu = cpu; 6316 5983 } 6317 5984 5985 + if (sd && !(sd_flag & SD_BALANCE_FORK)) { 5986 + /* 5987 + * We're going to need the task's util for capacity_spare_wake 5988 + * in find_idlest_group. Sync it up to prev_cpu's 5989 + * last_update_time. 
5990 + */ 5991 + sync_entity_load_avg(&p->se); 5992 + } 5993 + 6318 5994 if (!sd) { 6319 - pick_cpu: 5995 + pick_cpu: 6320 5996 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ 6321 5997 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); 6322 5998 6323 - } else while (sd) { 6324 - struct sched_group *group; 6325 - int weight; 6326 - 6327 - if (!(sd->flags & sd_flag)) { 6328 - sd = sd->child; 6329 - continue; 6330 - } 6331 - 6332 - group = find_idlest_group(sd, p, cpu, sd_flag); 6333 - if (!group) { 6334 - sd = sd->child; 6335 - continue; 6336 - } 6337 - 6338 - new_cpu = find_idlest_cpu(group, p, cpu); 6339 - if (new_cpu == -1 || new_cpu == cpu) { 6340 - /* Now try balancing at a lower domain level of cpu */ 6341 - sd = sd->child; 6342 - continue; 6343 - } 6344 - 6345 - /* Now try balancing at a lower domain level of new_cpu */ 6346 - cpu = new_cpu; 6347 - weight = sd->span_weight; 6348 - sd = NULL; 6349 - for_each_domain(cpu, tmp) { 6350 - if (weight <= tmp->span_weight) 6351 - break; 6352 - if (tmp->flags & sd_flag) 6353 - sd = tmp; 6354 - } 6355 - /* while loop will break here if sd == NULL */ 5999 + } else { 6000 + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); 6356 6001 } 6357 6002 rcu_read_unlock(); 6358 6003 6359 6004 return new_cpu; 6360 6005 } 6006 + 6007 + static void detach_entity_cfs_rq(struct sched_entity *se); 6361 6008 6362 6009 /* 6363 6010 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and ··· 6372 6059 se->vruntime -= min_vruntime; 6373 6060 } 6374 6061 6375 - /* 6376 - * We are supposed to update the task to "current" time, then its up to date 6377 - * and ready to go to new CPU/cfs_rq. But we have difficulty in getting 6378 - * what current time is, so simply throw away the out-of-date time. This 6379 - * will result in the wakee task is less decayed, but giving the wakee more 6380 - * load sounds not bad. 
6381 - */ 6382 - remove_entity_load_avg(&p->se); 6062 + if (p->on_rq == TASK_ON_RQ_MIGRATING) { 6063 + /* 6064 + * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' 6065 + * rq->lock and can modify state directly. 6066 + */ 6067 + lockdep_assert_held(&task_rq(p)->lock); 6068 + detach_entity_cfs_rq(&p->se); 6069 + 6070 + } else { 6071 + /* 6072 + * We are supposed to update the task to "current" time, then 6073 + * its up to date and ready to go to new CPU/cfs_rq. But we 6074 + * have difficulty in getting what current time is, so simply 6075 + * throw away the out-of-date time. This will result in the 6076 + * wakee task is less decayed, but giving the wakee more load 6077 + * sounds not bad. 6078 + */ 6079 + remove_entity_load_avg(&p->se); 6080 + } 6383 6081 6384 6082 /* Tell new CPU we are migrated */ 6385 6083 p->se.avg.last_update_time = 0; ··· 6658 6334 set_next_entity(cfs_rq, se); 6659 6335 } 6660 6336 6661 - if (hrtick_enabled(rq)) 6662 - hrtick_start_fair(rq, p); 6663 - 6664 - return p; 6337 + goto done; 6665 6338 simple: 6666 6339 #endif 6667 6340 ··· 6671 6350 } while (cfs_rq); 6672 6351 6673 6352 p = task_of(se); 6353 + 6354 + done: __maybe_unused 6355 + #ifdef CONFIG_SMP 6356 + /* 6357 + * Move the next running task to the front of 6358 + * the list, so our cfs_tasks list becomes MRU 6359 + * one. 
6360 + */ 6361 + list_move(&p->se.group_node, &rq->cfs_tasks); 6362 + #endif 6674 6363 6675 6364 if (hrtick_enabled(rq)) 6676 6365 hrtick_start_fair(rq, p); ··· 7117 6786 */ 7118 6787 static struct task_struct *detach_one_task(struct lb_env *env) 7119 6788 { 7120 - struct task_struct *p, *n; 6789 + struct task_struct *p; 7121 6790 7122 6791 lockdep_assert_held(&env->src_rq->lock); 7123 6792 7124 - list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { 6793 + list_for_each_entry_reverse(p, 6794 + &env->src_rq->cfs_tasks, se.group_node) { 7125 6795 if (!can_migrate_task(p, env)) 7126 6796 continue; 7127 6797 ··· 7168 6836 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) 7169 6837 break; 7170 6838 7171 - p = list_first_entry(tasks, struct task_struct, se.group_node); 6839 + p = list_last_entry(tasks, struct task_struct, se.group_node); 7172 6840 7173 6841 env->loop++; 7174 6842 /* We've more or less seen every task there is, call it quits */ ··· 7218 6886 7219 6887 continue; 7220 6888 next: 7221 - list_move_tail(&p->se.group_node, tasks); 6889 + list_move(&p->se.group_node, tasks); 7222 6890 } 7223 6891 7224 6892 /* ··· 7294 6962 if (cfs_rq->avg.util_sum) 7295 6963 return false; 7296 6964 7297 - if (cfs_rq->runnable_load_sum) 6965 + if (cfs_rq->avg.runnable_load_sum) 7298 6966 return false; 7299 6967 7300 6968 return true; ··· 7326 6994 /* Propagate pending load changes to the parent, if any: */ 7327 6995 se = cfs_rq->tg->se[cpu]; 7328 6996 if (se && !skip_blocked_update(se)) 7329 - update_load_avg(se, 0); 6997 + update_load_avg(cfs_rq_of(se), se, 0); 7330 6998 7331 6999 /* 7332 7000 * There can be a lot of idle CPU cgroups. 
Don't let fully ··· 8207 7875 if (busiest->group_type == group_imbalanced) 8208 7876 goto force_balance; 8209 7877 8210 - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 8211 - if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && 7878 + /* 7879 + * When dst_cpu is idle, prevent SMP nice and/or asymmetric group 7880 + * capacities from resulting in underutilization due to avg_load. 7881 + */ 7882 + if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && 8212 7883 busiest->group_no_capacity) 8213 7884 goto force_balance; 8214 7885 ··· 9028 8693 return; 9029 8694 9030 8695 /* Spare idle load balancing on CPUs that don't want to be disturbed: */ 9031 - if (!is_housekeeping_cpu(cpu)) 8696 + if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) 9032 8697 return; 9033 8698 9034 8699 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) ··· 9493 9158 if (cfs_rq_throttled(cfs_rq)) 9494 9159 break; 9495 9160 9496 - update_load_avg(se, UPDATE_TG); 9161 + update_load_avg(cfs_rq, se, UPDATE_TG); 9497 9162 } 9498 9163 } 9499 9164 #else ··· 9505 9170 struct cfs_rq *cfs_rq = cfs_rq_of(se); 9506 9171 9507 9172 /* Catch up with the cfs_rq and remove our load when we leave */ 9508 - update_load_avg(se, 0); 9173 + update_load_avg(cfs_rq, se, 0); 9509 9174 detach_entity_load_avg(cfs_rq, se); 9510 9175 update_tg_load_avg(cfs_rq, false); 9511 9176 propagate_entity_cfs_rq(se); ··· 9524 9189 #endif 9525 9190 9526 9191 /* Synchronize entity with its cfs_rq */ 9527 - update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); 9192 + update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 
0 : SKIP_AGE_LOAD); 9528 9193 attach_entity_load_avg(cfs_rq, se); 9529 9194 update_tg_load_avg(cfs_rq, false); 9530 9195 propagate_entity_cfs_rq(se); ··· 9606 9271 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 9607 9272 #endif 9608 9273 #ifdef CONFIG_SMP 9609 - #ifdef CONFIG_FAIR_GROUP_SCHED 9610 - cfs_rq->propagate_avg = 0; 9611 - #endif 9612 - atomic_long_set(&cfs_rq->removed_load_avg, 0); 9613 - atomic_long_set(&cfs_rq->removed_util_avg, 0); 9274 + raw_spin_lock_init(&cfs_rq->removed.lock); 9614 9275 #endif 9615 9276 } 9616 9277 ··· 9804 9473 rq_lock_irqsave(rq, &rf); 9805 9474 update_rq_clock(rq); 9806 9475 for_each_sched_entity(se) { 9807 - update_load_avg(se, UPDATE_TG); 9808 - update_cfs_shares(se); 9476 + update_load_avg(cfs_rq_of(se), se, UPDATE_TG); 9477 + update_cfs_group(se); 9809 9478 } 9810 9479 rq_unlock_irqrestore(rq, &rf); 9811 9480 }

+2 -2

kernel/sched/idle.c

··· 209 209 */ 210 210 static void do_idle(void) 211 211 { 212 + int cpu = smp_processor_id(); 212 213 /* 213 214 * If the arch has a polling bit, we maintain an invariant: 214 215 * ··· 220 219 */ 221 220 222 221 __current_set_polling(); 223 - quiet_vmstat(); 224 222 tick_nohz_idle_enter(); 225 223 226 224 while (!need_resched()) { 227 225 check_pgt_cache(); 228 226 rmb(); 229 227 230 - if (cpu_is_offline(smp_processor_id())) { 228 + if (cpu_is_offline(cpu)) { 231 229 cpuhp_report_idle_dead(); 232 230 arch_cpu_idle_dead(); 233 231 }

+155

kernel/sched/isolation.c

··· 1 + /* 2 + * Housekeeping management. Manage the targets for routine code that can run on 3 + * any CPU: unbound workqueues, timers, kthreads and any offloadable work. 4 + * 5 + * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker 6 + * 7 + */ 8 + 9 + #include <linux/sched/isolation.h> 10 + #include <linux/tick.h> 11 + #include <linux/init.h> 12 + #include <linux/kernel.h> 13 + #include <linux/static_key.h> 14 + #include <linux/ctype.h> 15 + 16 + DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); 17 + EXPORT_SYMBOL_GPL(housekeeping_overriden); 18 + static cpumask_var_t housekeeping_mask; 19 + static unsigned int housekeeping_flags; 20 + 21 + int housekeeping_any_cpu(enum hk_flags flags) 22 + { 23 + if (static_branch_unlikely(&housekeeping_overriden)) 24 + if (housekeeping_flags & flags) 25 + return cpumask_any_and(housekeeping_mask, cpu_online_mask); 26 + return smp_processor_id(); 27 + } 28 + EXPORT_SYMBOL_GPL(housekeeping_any_cpu); 29 + 30 + const struct cpumask *housekeeping_cpumask(enum hk_flags flags) 31 + { 32 + if (static_branch_unlikely(&housekeeping_overriden)) 33 + if (housekeeping_flags & flags) 34 + return housekeeping_mask; 35 + return cpu_possible_mask; 36 + } 37 + EXPORT_SYMBOL_GPL(housekeeping_cpumask); 38 + 39 + void housekeeping_affine(struct task_struct *t, enum hk_flags flags) 40 + { 41 + if (static_branch_unlikely(&housekeeping_overriden)) 42 + if (housekeeping_flags & flags) 43 + set_cpus_allowed_ptr(t, housekeeping_mask); 44 + } 45 + EXPORT_SYMBOL_GPL(housekeeping_affine); 46 + 47 + bool housekeeping_test_cpu(int cpu, enum hk_flags flags) 48 + { 49 + if (static_branch_unlikely(&housekeeping_overriden)) 50 + if (housekeeping_flags & flags) 51 + return cpumask_test_cpu(cpu, housekeeping_mask); 52 + return true; 53 + } 54 + EXPORT_SYMBOL_GPL(housekeeping_test_cpu); 55 + 56 + void __init housekeeping_init(void) 57 + { 58 + if (!housekeeping_flags) 59 + return; 60 + 61 + static_branch_enable(&housekeeping_overriden); 62 + 63 + /* We need at 
least one CPU to handle housekeeping work */ 64 + WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); 65 + } 66 + 67 + static int __init housekeeping_setup(char *str, enum hk_flags flags) 68 + { 69 + cpumask_var_t non_housekeeping_mask; 70 + int err; 71 + 72 + alloc_bootmem_cpumask_var(&non_housekeeping_mask); 73 + err = cpulist_parse(str, non_housekeeping_mask); 74 + if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) { 75 + pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n"); 76 + free_bootmem_cpumask_var(non_housekeeping_mask); 77 + return 0; 78 + } 79 + 80 + if (!housekeeping_flags) { 81 + alloc_bootmem_cpumask_var(&housekeeping_mask); 82 + cpumask_andnot(housekeeping_mask, 83 + cpu_possible_mask, non_housekeeping_mask); 84 + if (cpumask_empty(housekeeping_mask)) 85 + cpumask_set_cpu(smp_processor_id(), housekeeping_mask); 86 + } else { 87 + cpumask_var_t tmp; 88 + 89 + alloc_bootmem_cpumask_var(&tmp); 90 + cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); 91 + if (!cpumask_equal(tmp, housekeeping_mask)) { 92 + pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); 93 + free_bootmem_cpumask_var(tmp); 94 + free_bootmem_cpumask_var(non_housekeeping_mask); 95 + return 0; 96 + } 97 + free_bootmem_cpumask_var(tmp); 98 + } 99 + 100 + if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { 101 + if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { 102 + tick_nohz_full_setup(non_housekeeping_mask); 103 + } else { 104 + pr_warn("Housekeeping: nohz unsupported." 
105 + " Build with CONFIG_NO_HZ_FULL\n"); 106 + free_bootmem_cpumask_var(non_housekeeping_mask); 107 + return 0; 108 + } 109 + } 110 + 111 + housekeeping_flags |= flags; 112 + 113 + free_bootmem_cpumask_var(non_housekeeping_mask); 114 + 115 + return 1; 116 + } 117 + 118 + static int __init housekeeping_nohz_full_setup(char *str) 119 + { 120 + unsigned int flags; 121 + 122 + flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; 123 + 124 + return housekeeping_setup(str, flags); 125 + } 126 + __setup("nohz_full=", housekeeping_nohz_full_setup); 127 + 128 + static int __init housekeeping_isolcpus_setup(char *str) 129 + { 130 + unsigned int flags = 0; 131 + 132 + while (isalpha(*str)) { 133 + if (!strncmp(str, "nohz,", 5)) { 134 + str += 5; 135 + flags |= HK_FLAG_TICK; 136 + continue; 137 + } 138 + 139 + if (!strncmp(str, "domain,", 7)) { 140 + str += 7; 141 + flags |= HK_FLAG_DOMAIN; 142 + continue; 143 + } 144 + 145 + pr_warn("isolcpus: Error, unknown flag\n"); 146 + return 0; 147 + } 148 + 149 + /* Default behaviour for isolcpus without flags */ 150 + if (!flags) 151 + flags |= HK_FLAG_DOMAIN; 152 + 153 + return housekeeping_setup(str, flags); 154 + } 155 + __setup("isolcpus=", housekeeping_isolcpus_setup);

+118 -204

kernel/sched/rt.c

··· 74 74 raw_spin_unlock(&rt_b->rt_runtime_lock); 75 75 } 76 76 77 - #if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI) 78 - static void push_irq_work_func(struct irq_work *work); 79 - #endif 80 - 81 77 void init_rt_rq(struct rt_rq *rt_rq) 82 78 { 83 79 struct rt_prio_array *array; ··· 93 97 rt_rq->rt_nr_migratory = 0; 94 98 rt_rq->overloaded = 0; 95 99 plist_head_init(&rt_rq->pushable_tasks); 96 - 97 - #ifdef HAVE_RT_PUSH_IPI 98 - rt_rq->push_flags = 0; 99 - rt_rq->push_cpu = nr_cpu_ids; 100 - raw_spin_lock_init(&rt_rq->push_lock); 101 - init_irq_work(&rt_rq->push_work, push_irq_work_func); 102 - #endif 103 100 #endif /* CONFIG_SMP */ 104 101 /* We start is dequeued state, because no RT tasks are queued */ 105 102 rt_rq->rt_queued = 0; ··· 1865 1876 } 1866 1877 1867 1878 #ifdef HAVE_RT_PUSH_IPI 1868 - /* 1869 - * The search for the next cpu always starts at rq->cpu and ends 1870 - * when we reach rq->cpu again. It will never return rq->cpu. 1871 - * This returns the next cpu to check, or nr_cpu_ids if the loop 1872 - * is complete. 1873 - * 1874 - * rq->rt.push_cpu holds the last cpu returned by this function, 1875 - * or if this is the first instance, it must hold rq->cpu. 1876 - */ 1877 - static int rto_next_cpu(struct rq *rq) 1878 - { 1879 - int prev_cpu = rq->rt.push_cpu; 1880 - int cpu; 1881 - 1882 - cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); 1883 - 1884 - /* 1885 - * If the previous cpu is less than the rq's CPU, then it already 1886 - * passed the end of the mask, and has started from the beginning. 1887 - * We end if the next CPU is greater or equal to rq's CPU. 1888 - */ 1889 - if (prev_cpu < rq->cpu) { 1890 - if (cpu >= rq->cpu) 1891 - return nr_cpu_ids; 1892 - 1893 - } else if (cpu >= nr_cpu_ids) { 1894 - /* 1895 - * We passed the end of the mask, start at the beginning. 1896 - * If the result is greater or equal to the rq's CPU, then 1897 - * the loop is finished. 
1898 - */ 1899 - cpu = cpumask_first(rq->rd->rto_mask); 1900 - if (cpu >= rq->cpu) 1901 - return nr_cpu_ids; 1902 - } 1903 - rq->rt.push_cpu = cpu; 1904 - 1905 - /* Return cpu to let the caller know if the loop is finished or not */ 1906 - return cpu; 1907 - } 1908 - 1909 - static int find_next_push_cpu(struct rq *rq) 1910 - { 1911 - struct rq *next_rq; 1912 - int cpu; 1913 - 1914 - while (1) { 1915 - cpu = rto_next_cpu(rq); 1916 - if (cpu >= nr_cpu_ids) 1917 - break; 1918 - next_rq = cpu_rq(cpu); 1919 - 1920 - /* Make sure the next rq can push to this rq */ 1921 - if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) 1922 - break; 1923 - } 1924 - 1925 - return cpu; 1926 - } 1927 - 1928 - #define RT_PUSH_IPI_EXECUTING 1 1929 - #define RT_PUSH_IPI_RESTART 2 1930 1879 1931 1880 /* 1932 1881 * When a high priority task schedules out from a CPU and a lower priority ··· 1874 1947 * tasks queued on it (overloaded) needs to be notified that a CPU has opened 1875 1948 * up that may be able to run one of its non-running queued RT tasks. 1876 1949 * 1877 - * On large CPU boxes, there's the case that several CPUs could schedule 1878 - * a lower priority task at the same time, in which case it will look for 1879 - * any overloaded CPUs that it could pull a task from. To do this, the runqueue 1880 - * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting 1881 - * for a single overloaded CPU's runqueue lock can produce a large latency. 1882 - * (This has actually been observed on large boxes running cyclictest). 1883 - * Instead of taking the runqueue lock of the overloaded CPU, each of the 1884 - * CPUs that scheduled a lower priority task simply sends an IPI to the 1885 - * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with 1886 - * lots of contention. 
The overloaded CPU will look to push its non-running 1887 - * RT task off, and if it does, it can then ignore the other IPIs coming 1888 - * in, and just pass those IPIs off to any other overloaded CPU. 1950 + * All CPUs with overloaded RT tasks need to be notified as there is currently 1951 + * no way to know which of these CPUs have the highest priority task waiting 1952 + * to run. Instead of trying to take a spinlock on each of these CPUs, 1953 + * which has been shown to cause large latency when done on machines with many 1954 + * CPUs, an IPI is sent to the CPUs to have them push off the overloaded 1955 + * RT tasks waiting to run. 1889 1956 * 1890 - * When a CPU schedules a lower priority task, it only sends an IPI to 1891 - * the "next" CPU that has overloaded RT tasks. This prevents IPI storms, 1892 - * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with 1893 - * RT overloaded tasks, would cause 100 IPIs to go out at once. 1957 + * Just sending an IPI to each of the CPUs is also an issue, as on large 1958 + * count CPU machines, this can cause an IPI storm on a CPU, especially 1959 + * if it's the only CPU with multiple RT tasks queued, and a large number 1960 + * of CPUs are scheduling a lower priority task at the same time. 1894 1961 * 1895 - * The overloaded RT CPU, when receiving an IPI, will try to push off its 1896 - * overloaded RT tasks and then send an IPI to the next CPU that has 1897 - * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks 1898 - * have completed. Just because a CPU may have pushed off its own overloaded 1899 - * RT task does not mean it should stop sending the IPI around to other 1900 - * overloaded CPUs. There may be another RT task waiting to run on one of 1901 - * those CPUs that are of higher priority than the one that was just 1902 - * pushed. 1962 + * Each root domain has its own irq work function that can iterate over 1963 + * all CPUs with RT overloaded tasks. 
Since all CPUs with overloaded RT 1964 + * tasks must be checked if there's one or many CPUs that are lowering 1965 + * their priority, there's a single irq work iterator that will try to 1966 + * push off RT tasks that are waiting to run. 1967 * 1904 - * An optimization that could possibly be made is to make a CPU array similar 1905 - * to the cpupri array mask of all running RT tasks, but for the overloaded 1906 - * case, then the IPI could be sent to only the CPU with the highest priority 1907 - * RT task waiting, and that CPU could send off further IPIs to the CPU with 1908 - * the next highest waiting task. Since the overloaded case is much less likely 1909 - * to happen, the complexity of this implementation may not be worth it. 1910 - * Instead, just send an IPI around to all overloaded CPUs. 1968 + * When a CPU schedules a lower priority task, it will kick off the 1969 + * irq work iterator that will jump to each CPU with overloaded RT tasks. 1970 + * As it only takes the first CPU that schedules a lower priority task 1971 + * to start the process, the rto_loop_start variable is incremented and if 1972 + * the atomic result is one, then that CPU will try to take the rto_lock. 1973 + * This prevents high contention on the lock as the process handles all 1974 + * CPUs scheduling lower priority tasks. 1975 * 1912 - * The rq->rt.push_flags holds the status of the IPI that is going around. 1913 - * A run queue can only send out a single IPI at a time. The possible flags 1914 - * for rq->rt.push_flags are: 1976 + * All CPUs that are scheduling a lower priority task will increment the 1977 + * rto_loop_next variable. This will make sure that the irq work iterator 1978 + * checks all RT overloaded CPUs whenever a CPU schedules a new lower 1979 + * priority task, even if the iterator is in the middle of a scan. Incrementing 1980 + * the rto_loop_next will cause the iterator to perform another scan. 
1915 1981 * 1916 - * (None or zero): No IPI is going around for the current rq 1917 - * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around 1918 - * RT_PUSH_IPI_RESTART: The priority of the running task for the rq 1919 - * has changed, and the IPI should restart 1920 - * circulating the overloaded CPUs again. 1921 - * 1922 - * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated 1923 - * before sending to the next CPU. 1924 - * 1925 - * Instead of having all CPUs that schedule a lower priority task send 1926 - * an IPI to the same "first" CPU in the RT overload mask, they send it 1927 - * to the next overloaded CPU after their own CPU. This helps distribute 1928 - * the work when there's more than one overloaded CPU and multiple CPUs 1929 - * scheduling in lower priority tasks. 1930 - * 1931 - * When a rq schedules a lower priority task than what was currently 1932 - * running, the next CPU with overloaded RT tasks is examined first. 1933 - * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower 1934 - * priority task, it will send an IPI first to CPU 5, then CPU 5 will 1935 - * send to CPU 1 if it is still overloaded. CPU 1 will clear the 1936 - * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set. 1937 - * 1938 - * The first CPU to notice IPI_RESTART is set, will clear that flag and then 1939 - * send an IPI to the next overloaded CPU after the rq->cpu and not the next 1940 - * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3 1941 - * schedules a lower priority task, and the IPI_RESTART gets set while the 1942 - * handling is being done on CPU 5, it will clear the flag and send it back to 1943 - * CPU 4 instead of CPU 1. 1944 - * 1945 - * Note, the above logic can be disabled by turning off the sched_feature 1946 - * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be 1947 - * taken by the CPU requesting a pull and the waiting RT task will be pulled 1948 - * by that CPU. 
This may be fine for machines with few CPUs. 1949 1982 */ 1950 - static void tell_cpu_to_push(struct rq *rq) 1983 + static int rto_next_cpu(struct rq *rq) 1951 1984 { 1985 + struct root_domain *rd = rq->rd; 1986 + int next; 1952 1987 int cpu; 1953 1988 1954 - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { 1955 - raw_spin_lock(&rq->rt.push_lock); 1956 - /* Make sure it's still executing */ 1957 - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { 1958 - /* 1959 - * Tell the IPI to restart the loop as things have 1960 - * changed since it started. 1961 - */ 1962 - rq->rt.push_flags |= RT_PUSH_IPI_RESTART; 1963 - raw_spin_unlock(&rq->rt.push_lock); 1964 - return; 1965 - } 1966 - raw_spin_unlock(&rq->rt.push_lock); 1989 + /* 1990 + * When starting the IPI RT pushing, the rto_cpu is set to -1, 1991 + * rto_next_cpu() will simply return the first CPU found in 1992 + * the rto_mask. 1993 + * 1994 + * If rto_next_cpu() is called while rto_cpu is a valid cpu, it 1995 + * will return the next CPU found in the rto_mask. 1996 + * 1997 + * If there are no more CPUs left in the rto_mask, then a check is made 1998 + * against rto_loop and rto_loop_next. rto_loop is only updated with 1999 + * the rto_lock held, but any CPU may increment the rto_loop_next 2000 + * without any locking. 2001 + */ 2002 + for (;;) { 2003 + 2004 + /* When rto_cpu is -1 this acts like cpumask_first() */ 2005 + cpu = cpumask_next(rd->rto_cpu, rd->rto_mask); 2006 + 2007 + rd->rto_cpu = cpu; 2008 + 2009 + if (cpu < nr_cpu_ids) 2010 + return cpu; 2011 + 2012 + rd->rto_cpu = -1; 2013 + 2014 + /* 2015 + * ACQUIRE ensures we see the @rto_mask changes 2016 + * made prior to the @next value observed. 2017 + * 2018 + * Matches WMB in rt_set_overload(). 
2019 + */ 2020 + next = atomic_read_acquire(&rd->rto_loop_next); 2021 + 2022 + if (rd->rto_loop == next) 2023 + break; 2024 + 2025 + rd->rto_loop = next; 1967 2026 } 1968 2027 1969 - /* When here, there's no IPI going around */ 2028 + return -1; 2029 + } 1970 2030 1971 - rq->rt.push_cpu = rq->cpu; 1972 - cpu = find_next_push_cpu(rq); 1973 - if (cpu >= nr_cpu_ids) 2031 + static inline bool rto_start_trylock(atomic_t *v) 2032 + { 2033 + return !atomic_cmpxchg_acquire(v, 0, 1); 2034 + } 2035 + 2036 + static inline void rto_start_unlock(atomic_t *v) 2037 + { 2038 + atomic_set_release(v, 0); 2039 + } 2040 + 2041 + static void tell_cpu_to_push(struct rq *rq) 2042 + { 2043 + int cpu = -1; 2044 + 2045 + /* Keep the loop going if the IPI is currently active */ 2046 + atomic_inc(&rq->rd->rto_loop_next); 2047 + 2048 + /* Only one CPU can initiate a loop at a time */ 2049 + if (!rto_start_trylock(&rq->rd->rto_loop_start)) 1974 2050 return; 1975 2051 1976 - rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; 2052 + raw_spin_lock(&rq->rd->rto_lock); 1977 2053 1978 - irq_work_queue_on(&rq->rt.push_work, cpu); 2054 + /* 2055 + * The rto_cpu is updated under the lock, if it has a valid cpu 2056 + * then the IPI is still running and will continue due to the 2057 + * update to loop_next, and nothing needs to be done here. 2058 + * Otherwise it is finishing up and an ipi needs to be sent. 
2059 + */ 2060 + if (rq->rd->rto_cpu < 0) 2061 + cpu = rto_next_cpu(rq); 2062 + 2063 + raw_spin_unlock(&rq->rd->rto_lock); 2064 + 2065 + rto_start_unlock(&rq->rd->rto_loop_start); 2066 + 2067 + if (cpu >= 0) 2068 + irq_work_queue_on(&rq->rd->rto_push_work, cpu); 1979 2069 } 1980 2070 1981 2071 /* Called from hardirq context */ 1982 - static void try_to_push_tasks(void *arg) 2072 + void rto_push_irq_work_func(struct irq_work *work) 1983 2073 { 1984 - struct rt_rq *rt_rq = arg; 1985 - struct rq *rq, *src_rq; 1986 - int this_cpu; 2074 + struct rq *rq; 1987 2075 int cpu; 1988 2076 1989 - this_cpu = rt_rq->push_cpu; 2077 + rq = this_rq(); 1990 2078 1991 - /* Paranoid check */ 1992 - BUG_ON(this_cpu != smp_processor_id()); 1993 - 1994 - rq = cpu_rq(this_cpu); 1995 - src_rq = rq_of_rt_rq(rt_rq); 1996 - 1997 - again: 2079 + /* 2080 + * We do not need to grab the lock to check for has_pushable_tasks. 2081 + * When it gets updated, a check is made if a push is possible. 2082 + */ 1998 2083 if (has_pushable_tasks(rq)) { 1999 2084 raw_spin_lock(&rq->lock); 2000 - push_rt_task(rq); 2085 + push_rt_tasks(rq); 2001 2086 raw_spin_unlock(&rq->lock); 2002 2087 } 2003 2088 2089 + raw_spin_lock(&rq->rd->rto_lock); 2090 + 2004 2091 /* Pass the IPI to the next rt overloaded queue */ 2005 - raw_spin_lock(&rt_rq->push_lock); 2006 - /* 2007 - * If the source queue changed since the IPI went out, 2008 - * we need to restart the search from that CPU again. 
2009 - */ 2010 - if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { 2011 - rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; 2012 - rt_rq->push_cpu = src_rq->cpu; 2013 - } 2092 + cpu = rto_next_cpu(rq); 2014 2093 2015 - cpu = find_next_push_cpu(src_rq); 2094 + raw_spin_unlock(&rq->rd->rto_lock); 2016 2095 2017 - if (cpu >= nr_cpu_ids) 2018 - rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; 2019 - raw_spin_unlock(&rt_rq->push_lock); 2020 - 2021 - if (cpu >= nr_cpu_ids) 2096 + if (cpu < 0) 2022 2097 return; 2023 2098 2024 - /* 2025 - * It is possible that a restart caused this CPU to be 2026 - * chosen again. Don't bother with an IPI, just see if we 2027 - * have more to push. 2028 - */ 2029 - if (unlikely(cpu == rq->cpu)) 2030 - goto again; 2031 - 2032 2099 /* Try the next RT overloaded CPU */ 2033 - irq_work_queue_on(&rt_rq->push_work, cpu); 2034 - } 2035 - 2036 - static void push_irq_work_func(struct irq_work *work) 2037 - { 2038 - struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); 2039 - 2040 - try_to_push_tasks(rt_rq); 2100 + irq_work_queue_on(&rq->rd->rto_push_work, cpu); 2041 2101 } 2042 2102 #endif /* HAVE_RT_PUSH_IPI */ 2043 2103

+54 -19

kernel/sched/sched.h

··· 227 227 static inline void __dl_update(struct dl_bw *dl_b, s64 bw); 228 228 229 229 static inline 230 - void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus) 230 + void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) 231 231 { 232 232 dl_b->total_bw -= tsk_bw; 233 233 __dl_update(dl_b, (s32)tsk_bw / cpus); ··· 256 256 extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); 257 257 extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); 258 258 extern bool __checkparam_dl(const struct sched_attr *attr); 259 - extern void __dl_clear_params(struct task_struct *p); 260 259 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); 261 260 extern int dl_task_can_attach(struct task_struct *p, 262 261 const struct cpumask *cs_cpus_allowed); ··· 418 419 /* CFS-related fields in a runqueue */ 419 420 struct cfs_rq { 420 421 struct load_weight load; 422 + unsigned long runnable_weight; 421 423 unsigned int nr_running, h_nr_running; 422 424 423 425 u64 exec_clock; ··· 444 444 * CFS load tracking 445 445 */ 446 446 struct sched_avg avg; 447 - u64 runnable_load_sum; 448 - unsigned long runnable_load_avg; 449 - #ifdef CONFIG_FAIR_GROUP_SCHED 450 - unsigned long tg_load_avg_contrib; 451 - unsigned long propagate_avg; 452 - #endif 453 - atomic_long_t removed_load_avg, removed_util_avg; 454 447 #ifndef CONFIG_64BIT 455 448 u64 load_last_update_time_copy; 456 449 #endif 450 + struct { 451 + raw_spinlock_t lock ____cacheline_aligned; 452 + int nr; 453 + unsigned long load_avg; 454 + unsigned long util_avg; 455 + unsigned long runnable_sum; 456 + } removed; 457 457 458 458 #ifdef CONFIG_FAIR_GROUP_SCHED 459 + unsigned long tg_load_avg_contrib; 460 + long propagate; 461 + long prop_runnable_sum; 462 + 459 463 /* 460 464 * h_load = weight * f(tg) 461 465 * ··· 506 502 } 507 503 508 504 /* RT IPI pull logic requires IRQ_WORK */ 509 - #ifdef CONFIG_IRQ_WORK 505 + #if defined(CONFIG_IRQ_WORK) && 
defined(CONFIG_SMP) 510 506 # define HAVE_RT_PUSH_IPI 511 507 #endif 512 508 ··· 528 524 unsigned long rt_nr_total; 529 525 int overloaded; 530 526 struct plist_head pushable_tasks; 531 - #ifdef HAVE_RT_PUSH_IPI 532 - int push_flags; 533 - int push_cpu; 534 - struct irq_work push_work; 535 - raw_spinlock_t push_lock; 536 - #endif 537 527 #endif /* CONFIG_SMP */ 538 528 int rt_queued; 539 529 ··· 636 638 struct dl_bw dl_bw; 637 639 struct cpudl cpudl; 638 640 641 + #ifdef HAVE_RT_PUSH_IPI 642 + /* 643 + * For IPI pull requests, loop across the rto_mask. 644 + */ 645 + struct irq_work rto_push_work; 646 + raw_spinlock_t rto_lock; 647 + /* These are only updated and read within rto_lock */ 648 + int rto_loop; 649 + int rto_cpu; 650 + /* These atomics are updated outside of a lock */ 651 + atomic_t rto_loop_next; 652 + atomic_t rto_loop_start; 653 + #endif 639 654 /* 640 655 * The "RT overload" flag: it gets set if a CPU has more than 641 656 * one runnable RT task. ··· 666 655 extern int sched_init_domains(const struct cpumask *cpu_map); 667 656 extern void rq_attach_root(struct rq *rq, struct root_domain *rd); 668 657 658 + #ifdef HAVE_RT_PUSH_IPI 659 + extern void rto_push_irq_work_func(struct irq_work *work); 660 + #endif 669 661 #endif /* CONFIG_SMP */ 670 662 671 663 /* ··· 1233 1219 # define const_debug const 1234 1220 #endif 1235 1221 1236 - extern const_debug unsigned int sysctl_sched_features; 1237 - 1238 1222 #define SCHED_FEAT(name, enabled) \ 1239 1223 __SCHED_FEAT_##name , 1240 1224 ··· 1244 1232 #undef SCHED_FEAT 1245 1233 1246 1234 #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) 1235 + 1236 + /* 1237 + * To support run-time toggling of sched features, all the translation units 1238 + * (but core.c) reference the sysctl_sched_features defined in core.c. 
1239 + */ 1240 + extern const_debug unsigned int sysctl_sched_features; 1241 + 1247 1242 #define SCHED_FEAT(name, enabled) \ 1248 1243 static __always_inline bool static_branch_##name(struct static_key *key) \ 1249 1244 { \ ··· 1258 1239 } 1259 1240 1260 1241 #include "features.h" 1261 - 1262 1242 #undef SCHED_FEAT 1263 1243 1264 1244 extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; 1265 1245 #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) 1246 + 1266 1247 #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ 1248 + 1249 + /* 1250 + * Each translation unit has its own copy of sysctl_sched_features to allow 1251 + * constants propagation at compile time and compiler optimization based on 1252 + * features default. 1253 + */ 1254 + #define SCHED_FEAT(name, enabled) \ 1255 + (1UL << __SCHED_FEAT_##name) * enabled | 1256 + static const_debug __maybe_unused unsigned int sysctl_sched_features = 1257 + #include "features.h" 1258 + 0; 1259 + #undef SCHED_FEAT 1260 + 1267 1261 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 1262 + 1268 1263 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ 1269 1264 1270 1265 extern struct static_key_false sched_numa_balancing; ··· 1562 1529 extern void init_sched_dl_class(void); 1563 1530 extern void init_sched_rt_class(void); 1564 1531 extern void init_sched_fair_class(void); 1532 + 1533 + extern void reweight_task(struct task_struct *p, int prio); 1565 1534 1566 1535 extern void resched_curr(struct rq *rq); 1567 1536 extern void resched_cpu(int cpu);

+28 -21

kernel/sched/topology.c

··· 4 4 */ 5 5 #include <linux/sched.h> 6 6 #include <linux/mutex.h> 7 + #include <linux/sched/isolation.h> 7 8 8 9 #include "sched.h" 9 10 ··· 270 269 if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 271 270 goto free_dlo_mask; 272 271 272 + #ifdef HAVE_RT_PUSH_IPI 273 + rd->rto_cpu = -1; 274 + raw_spin_lock_init(&rd->rto_lock); 275 + init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); 276 + #endif 277 + 273 278 init_dl_bw(&rd->dl_bw); 274 279 if (cpudl_init(&rd->cpudl) != 0) 275 280 goto free_rto_mask; ··· 470 463 471 464 update_top_cache_domain(cpu); 472 465 } 473 - 474 - /* Setup the mask of CPUs configured for isolated domains */ 475 - static int __init isolated_cpu_setup(char *str) 476 - { 477 - int ret; 478 - 479 - alloc_bootmem_cpumask_var(&cpu_isolated_map); 480 - ret = cpulist_parse(str, cpu_isolated_map); 481 - if (ret) { 482 - pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids); 483 - return 0; 484 - } 485 - return 1; 486 - } 487 - __setup("isolcpus=", isolated_cpu_setup); 488 466 489 467 struct s_data { 490 468 struct sched_domain ** __percpu sd; ··· 1150 1158 sd->smt_gain = 1178; /* ~15% */ 1151 1159 1152 1160 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { 1161 + sd->flags |= SD_PREFER_SIBLING; 1153 1162 sd->imbalance_pct = 117; 1154 1163 sd->cache_nice_tries = 1; 1155 1164 sd->busy_idx = 2; ··· 1325 1332 if (!sched_domains_numa_distance) 1326 1333 return; 1327 1334 1335 + /* Includes NUMA identity node at level 0. */ 1336 + sched_domains_numa_distance[level++] = curr_distance; 1337 + sched_domains_numa_levels = level; 1338 + 1328 1339 /* 1329 1340 * O(nr_nodes^2) deduplicating selection sort -- in order to find the 1330 1341 * unique distances in the node_distance() table. ··· 1376 1379 return; 1377 1380 1378 1381 /* 1379 - * 'level' contains the number of unique distances, excluding the 1380 - * identity distance node_distance(i,i). 
1382 + * 'level' contains the number of unique distances 1381 1383 * 1382 1384 * The sched_domains_numa_distance[] array includes the actual distance 1383 1385 * numbers. ··· 1438 1442 tl[i] = sched_domain_topology[i]; 1439 1443 1440 1444 /* 1445 + * Add the NUMA identity distance, aka single NODE. 1446 + */ 1447 + tl[i++] = (struct sched_domain_topology_level){ 1448 + .mask = sd_numa_mask, 1449 + .numa_level = 0, 1450 + SD_INIT_NAME(NODE) 1451 + }; 1452 + 1453 + /* 1441 1454 * .. and append 'j' levels of NUMA goodness. 1442 1455 */ 1443 - for (j = 0; j < level; i++, j++) { 1456 + for (j = 1; j < level; i++, j++) { 1444 1457 tl[i] = (struct sched_domain_topology_level){ 1445 1458 .mask = sd_numa_mask, 1446 1459 .sd_flags = cpu_numa_flags, ··· 1779 1774 doms_cur = alloc_sched_domains(ndoms_cur); 1780 1775 if (!doms_cur) 1781 1776 doms_cur = &fallback_doms; 1782 - cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 1777 + cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN)); 1783 1778 err = build_sched_domains(doms_cur[0], NULL); 1784 1779 register_sched_domain_sysctl(); 1785 1780 ··· 1862 1857 doms_new = alloc_sched_domains(1); 1863 1858 if (doms_new) { 1864 1859 n = 1; 1865 - cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 1860 + cpumask_and(doms_new[0], cpu_active_mask, 1861 + housekeeping_cpumask(HK_FLAG_DOMAIN)); 1866 1862 } 1867 1863 } else { 1868 1864 n = ndoms_new; ··· 1886 1880 if (!doms_new) { 1887 1881 n = 0; 1888 1882 doms_new = &fallback_doms; 1889 - cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 1883 + cpumask_and(doms_new[0], cpu_active_mask, 1884 + housekeeping_cpumask(HK_FLAG_DOMAIN)); 1890 1885 } 1891 1886 1892 1887 /* Build new domains: */

+5 -28

kernel/time/tick-sched.c

··· 27 27 #include <linux/irq_work.h> 28 28 #include <linux/posix-timers.h> 29 29 #include <linux/context_tracking.h> 30 + #include <linux/mm.h> 30 31 31 32 #include <asm/irq_regs.h> 32 33 ··· 166 165 167 166 #ifdef CONFIG_NO_HZ_FULL 168 167 cpumask_var_t tick_nohz_full_mask; 169 - cpumask_var_t housekeeping_mask; 170 168 bool tick_nohz_full_running; 171 169 static atomic_t tick_dep_mask; 172 170 ··· 385 385 local_irq_restore(flags); 386 386 } 387 387 388 - /* Parse the boot-time nohz CPU list from the kernel parameters. */ 389 - static int __init tick_nohz_full_setup(char *str) 388 + /* Get the boot-time nohz CPU list from the kernel parameters. */ 389 + void __init tick_nohz_full_setup(cpumask_var_t cpumask) 390 390 { 391 391 alloc_bootmem_cpumask_var(&tick_nohz_full_mask); 392 - if (cpulist_parse(str, tick_nohz_full_mask) < 0) { 393 - pr_warn("NO_HZ: Incorrect nohz_full cpumask\n"); 394 - free_bootmem_cpumask_var(tick_nohz_full_mask); 395 - return 1; 396 - } 392 + cpumask_copy(tick_nohz_full_mask, cpumask); 397 393 tick_nohz_full_running = true; 398 - 399 - return 1; 400 394 } 401 - __setup("nohz_full=", tick_nohz_full_setup); 402 395 403 396 static int tick_nohz_cpu_down(unsigned int cpu) 404 397 { ··· 430 437 return; 431 438 } 432 439 433 - if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { 434 - WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); 435 - cpumask_clear(tick_nohz_full_mask); 436 - tick_nohz_full_running = false; 437 - return; 438 - } 439 - 440 440 /* 441 441 * Full dynticks uses irq work to drive the tick rescheduling on safe 442 442 * locking contexts. 
But then we need irq work to raise its own ··· 438 452 if (!arch_irq_work_has_interrupt()) { 439 453 pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); 440 454 cpumask_clear(tick_nohz_full_mask); 441 - cpumask_copy(housekeeping_mask, cpu_possible_mask); 442 455 tick_nohz_full_running = false; 443 456 return; 444 457 } ··· 450 465 cpumask_clear_cpu(cpu, tick_nohz_full_mask); 451 466 } 452 467 453 - cpumask_andnot(housekeeping_mask, 454 - cpu_possible_mask, tick_nohz_full_mask); 455 - 456 468 for_each_cpu(cpu, tick_nohz_full_mask) 457 469 context_tracking_cpu_set(cpu); 458 470 ··· 459 477 WARN_ON(ret < 0); 460 478 pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", 461 479 cpumask_pr_args(tick_nohz_full_mask)); 462 - 463 - /* 464 - * We need at least one CPU to handle housekeeping work such 465 - * as timekeeping, unbound timers, workqueues, ... 466 - */ 467 - WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); 468 480 } 469 481 #endif 470 482 ··· 763 787 if (!ts->tick_stopped) { 764 788 calc_load_nohz_start(); 765 789 cpu_load_update_nohz_start(); 790 + quiet_vmstat(); 766 791 767 792 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 768 793 ts->tick_stopped = 1;

+6 -6

kernel/trace/trace_output.c

··· 921 921 922 922 trace_assign_type(field, iter->ent); 923 923 924 - T = __task_state_to_char(field->next_state); 925 - S = __task_state_to_char(field->prev_state); 924 + T = task_index_to_char(field->next_state); 925 + S = task_index_to_char(field->prev_state); 926 926 trace_find_cmdline(field->next_pid, comm); 927 927 trace_seq_printf(&iter->seq, 928 928 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", ··· 957 957 trace_assign_type(field, iter->ent); 958 958 959 959 if (!S) 960 - S = __task_state_to_char(field->prev_state); 961 - T = __task_state_to_char(field->next_state); 960 + S = task_index_to_char(field->prev_state); 961 + T = task_index_to_char(field->next_state); 962 962 trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", 963 963 field->prev_pid, 964 964 field->prev_prio, ··· 993 993 trace_assign_type(field, iter->ent); 994 994 995 995 if (!S) 996 - S = __task_state_to_char(field->prev_state); 997 - T = __task_state_to_char(field->next_state); 996 + S = task_index_to_char(field->prev_state); 997 + T = task_index_to_char(field->next_state); 998 998 999 999 SEQ_PUT_HEX_FIELD(s, field->prev_pid); 1000 1000 SEQ_PUT_HEX_FIELD(s, field->prev_prio);

+4 -4

kernel/trace/trace_sched_wakeup.c

··· 398 398 entry = ring_buffer_event_data(event); 399 399 entry->prev_pid = prev->pid; 400 400 entry->prev_prio = prev->prio; 401 - entry->prev_state = __get_task_state(prev); 401 + entry->prev_state = task_state_index(prev); 402 402 entry->next_pid = next->pid; 403 403 entry->next_prio = next->prio; 404 - entry->next_state = __get_task_state(next); 404 + entry->next_state = task_state_index(next); 405 405 entry->next_cpu = task_cpu(next); 406 406 407 407 if (!call_filter_check_discard(call, entry, buffer, event)) ··· 426 426 entry = ring_buffer_event_data(event); 427 427 entry->prev_pid = curr->pid; 428 428 entry->prev_prio = curr->prio; 429 - entry->prev_state = __get_task_state(curr); 429 + entry->prev_state = task_state_index(curr); 430 430 entry->next_pid = wakee->pid; 431 431 entry->next_prio = wakee->prio; 432 - entry->next_state = __get_task_state(wakee); 432 + entry->next_state = task_state_index(wakee); 433 433 entry->next_cpu = task_cpu(wakee); 434 434 435 435 if (!call_filter_check_discard(call, entry, buffer, event))

+5 -8

kernel/watchdog.c

··· 25 25 #include <linux/workqueue.h> 26 26 #include <linux/sched/clock.h> 27 27 #include <linux/sched/debug.h> 28 + #include <linux/sched/isolation.h> 28 29 29 30 #include <asm/irq_regs.h> 30 31 #include <linux/kvm_para.h> ··· 775 774 776 775 void __init lockup_detector_init(void) 777 776 { 778 - #ifdef CONFIG_NO_HZ_FULL 779 - if (tick_nohz_full_enabled()) { 777 + if (tick_nohz_full_enabled()) 780 778 pr_info("Disabling watchdog on nohz_full cores by default\n"); 781 - cpumask_copy(&watchdog_cpumask, housekeeping_mask); 782 - } else 783 - cpumask_copy(&watchdog_cpumask, cpu_possible_mask); 784 - #else 785 - cpumask_copy(&watchdog_cpumask, cpu_possible_mask); 786 - #endif 779 + 780 + cpumask_copy(&watchdog_cpumask, 781 + housekeeping_cpumask(HK_FLAG_TIMER)); 787 782 788 783 if (!watchdog_nmi_probe()) 789 784 nmi_watchdog_available = true;