Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
"The main changes in this cycle are:

- Make schedstats a runtime tunable (disabled by default) and
optimize it via static keys.

As most distributions enable CONFIG_SCHEDSTATS=y due to its
instrumentation value, this is a nice performance enhancement.
(Mel Gorman)

- Implement 'simple waitqueues' (swait): these are just pure
waitqueues without any of the more complex features of full-blown
waitqueues (callbacks, wake flags, wake keys, etc.). Simple
waitqueues have less memory overhead and are faster.

Use simple waitqueues in the RCU code (in 4 different places) and
for handling KVM vCPU wakeups.

(Peter Zijlstra, Daniel Wagner, Thomas Gleixner, Paul Gortmaker,
Marcelo Tosatti)

- sched/numa enhancements (Rik van Riel)

- NOHZ performance enhancements (Rik van Riel)

- Various sched/deadline enhancements (Steven Rostedt)

- Various fixes (Peter Zijlstra)

- ... and a number of other fixes, cleanups and smaller enhancements"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (29 commits)
sched/cputime: Fix steal_account_process_tick() to always return jiffies
sched/deadline: Remove dl_new from struct sched_dl_entity
Revert "kbuild: Add option to turn incompatible pointer check into error"
sched/deadline: Remove superfluous call to switched_to_dl()
sched/debug: Fix preempt_disable_ip recording for preempt_disable()
sched, time: Switch VIRT_CPU_ACCOUNTING_GEN to jiffy granularity
time, acct: Drop irq save & restore from __acct_update_integrals()
acct, time: Change indentation in __acct_update_integrals()
sched, time: Remove non-power-of-two divides from __acct_update_integrals()
sched/rt: Kick RT bandwidth timer immediately on start up
sched/debug: Add deadline scheduler bandwidth ratio to /proc/sched_debug
sched/debug: Move sched_domain_sysctl to debug.c
sched/debug: Move the /sys/kernel/debug/sched_features file setup into debug.c
sched/rt: Fix PI handling vs. sched_setscheduler()
sched/core: Remove duplicated sched_group_set_shares() prototype
sched/fair: Consolidate nohz CPU load update code
sched/fair: Avoid using decay_load_missed() with a negative value
sched/deadline: Always calculate end of period on sched_yield()
sched/cgroup: Fix cgroup entity load tracking tear-down
rcu: Use simple wait queues where possible in rcutree
...

+1298 -713
+5
Documentation/kernel-parameters.txt
··· 3532 3532 3533 3533 sched_debug [KNL] Enables verbose scheduler debug messages. 3534 3534 3535 + schedstats= [KNL,X86] Enable or disable scheduler statistics. 3536 + Allowed values are enable and disable. This feature 3537 + incurs a small amount of overhead in the scheduler 3538 + but is useful for debugging and performance tuning. 3539 + 3535 3540 skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate 3536 3541 xtime_lock contention on larger systems, and/or RCU lock 3537 3542 contention on all systems with CONFIG_MAXSMP set.
+8
Documentation/sysctl/kernel.txt
··· 773 773 774 774 ============================================================== 775 775 776 + sched_schedstats: 777 + 778 + Enables/disables scheduler statistics. Enabling this feature 779 + incurs a small amount of overhead in the scheduler but is 780 + useful for debugging and performance tuning. 781 + 782 + ============================================================== 783 + 776 784 sg-big-buff: 777 785 778 786 This file shows the size of the generic SCSI (sg) buffer.
+4 -4
arch/arm/kvm/arm.c
··· 506 506 struct kvm_vcpu *vcpu; 507 507 508 508 kvm_for_each_vcpu(i, vcpu, kvm) { 509 - wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); 509 + struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); 510 510 511 511 vcpu->arch.pause = false; 512 - wake_up_interruptible(wq); 512 + swake_up(wq); 513 513 } 514 514 } 515 515 516 516 static void vcpu_sleep(struct kvm_vcpu *vcpu) 517 517 { 518 - wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); 518 + struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); 519 519 520 - wait_event_interruptible(*wq, ((!vcpu->arch.power_off) && 520 + swait_event_interruptible(*wq, ((!vcpu->arch.power_off) && 521 521 (!vcpu->arch.pause))); 522 522 } 523 523
+2 -2
arch/arm/kvm/psci.c
··· 70 70 { 71 71 struct kvm *kvm = source_vcpu->kvm; 72 72 struct kvm_vcpu *vcpu = NULL; 73 - wait_queue_head_t *wq; 73 + struct swait_queue_head *wq; 74 74 unsigned long cpu_id; 75 75 unsigned long context_id; 76 76 phys_addr_t target_pc; ··· 119 119 smp_mb(); /* Make sure the above is visible */ 120 120 121 121 wq = kvm_arch_vcpu_wq(vcpu); 122 - wake_up_interruptible(wq); 122 + swake_up(wq); 123 123 124 124 return PSCI_RET_SUCCESS; 125 125 }
+4 -4
arch/mips/kvm/mips.c
··· 445 445 446 446 dvcpu->arch.wait = 0; 447 447 448 - if (waitqueue_active(&dvcpu->wq)) 449 - wake_up_interruptible(&dvcpu->wq); 448 + if (swait_active(&dvcpu->wq)) 449 + swake_up(&dvcpu->wq); 450 450 451 451 return 0; 452 452 } ··· 1174 1174 kvm_mips_callbacks->queue_timer_int(vcpu); 1175 1175 1176 1176 vcpu->arch.wait = 0; 1177 - if (waitqueue_active(&vcpu->wq)) 1178 - wake_up_interruptible(&vcpu->wq); 1177 + if (swait_active(&vcpu->wq)) 1178 + swake_up(&vcpu->wq); 1179 1179 } 1180 1180 1181 1181 /* low level hrtimer wake routine */
+2 -2
arch/powerpc/include/asm/kvm_host.h
··· 289 289 struct list_head runnable_threads; 290 290 struct list_head preempt_list; 291 291 spinlock_t lock; 292 - wait_queue_head_t wq; 292 + struct swait_queue_head wq; 293 293 spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ 294 294 u64 stolen_tb; 295 295 u64 preempt_tb; ··· 629 629 u8 prodded; 630 630 u32 last_inst; 631 631 632 - wait_queue_head_t *wqp; 632 + struct swait_queue_head *wqp; 633 633 struct kvmppc_vcore *vcore; 634 634 int ret; 635 635 int trap;
+11 -12
arch/powerpc/kvm/book3s_hv.c
··· 114 114 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) 115 115 { 116 116 int cpu; 117 - wait_queue_head_t *wqp; 117 + struct swait_queue_head *wqp; 118 118 119 119 wqp = kvm_arch_vcpu_wq(vcpu); 120 - if (waitqueue_active(wqp)) { 121 - wake_up_interruptible(wqp); 120 + if (swait_active(wqp)) { 121 + swake_up(wqp); 122 122 ++vcpu->stat.halt_wakeup; 123 123 } 124 124 ··· 701 701 tvcpu->arch.prodded = 1; 702 702 smp_mb(); 703 703 if (vcpu->arch.ceded) { 704 - if (waitqueue_active(&vcpu->wq)) { 705 - wake_up_interruptible(&vcpu->wq); 704 + if (swait_active(&vcpu->wq)) { 705 + swake_up(&vcpu->wq); 706 706 vcpu->stat.halt_wakeup++; 707 707 } 708 708 } ··· 1459 1459 INIT_LIST_HEAD(&vcore->runnable_threads); 1460 1460 spin_lock_init(&vcore->lock); 1461 1461 spin_lock_init(&vcore->stoltb_lock); 1462 - init_waitqueue_head(&vcore->wq); 1462 + init_swait_queue_head(&vcore->wq); 1463 1463 vcore->preempt_tb = TB_NIL; 1464 1464 vcore->lpcr = kvm->arch.lpcr; 1465 1465 vcore->first_vcpuid = core * threads_per_subcore; ··· 2531 2531 { 2532 2532 struct kvm_vcpu *vcpu; 2533 2533 int do_sleep = 1; 2534 + DECLARE_SWAITQUEUE(wait); 2534 2535 2535 - DEFINE_WAIT(wait); 2536 - 2537 - prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); 2536 + prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); 2538 2537 2539 2538 /* 2540 2539 * Check one last time for pending exceptions and ceded state after ··· 2547 2548 } 2548 2549 2549 2550 if (!do_sleep) { 2550 - finish_wait(&vc->wq, &wait); 2551 + finish_swait(&vc->wq, &wait); 2551 2552 return; 2552 2553 } 2553 2554 ··· 2555 2556 trace_kvmppc_vcore_blocked(vc, 0); 2556 2557 spin_unlock(&vc->lock); 2557 2558 schedule(); 2558 - finish_wait(&vc->wq, &wait); 2559 + finish_swait(&vc->wq, &wait); 2559 2560 spin_lock(&vc->lock); 2560 2561 vc->vcore_state = VCORE_INACTIVE; 2561 2562 trace_kvmppc_vcore_blocked(vc, 1); ··· 2611 2612 kvmppc_start_thread(vcpu, vc); 2612 2613 trace_kvm_guest_enter(vcpu); 2613 2614 } else if (vc->vcore_state == 
VCORE_SLEEPING) { 2614 - wake_up(&vc->wq); 2615 + swake_up(&vc->wq); 2615 2616 } 2616 2617 2617 2618 }
+1 -1
arch/s390/include/asm/kvm_host.h
··· 467 467 struct kvm_s390_local_interrupt { 468 468 spinlock_t lock; 469 469 struct kvm_s390_float_interrupt *float_int; 470 - wait_queue_head_t *wq; 470 + struct swait_queue_head *wq; 471 471 atomic_t *cpuflags; 472 472 DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS); 473 473 struct kvm_s390_irq_payload irq;
+2 -2
arch/s390/kvm/interrupt.c
··· 966 966 967 967 void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) 968 968 { 969 - if (waitqueue_active(&vcpu->wq)) { 969 + if (swait_active(&vcpu->wq)) { 970 970 /* 971 971 * The vcpu gave up the cpu voluntarily, mark it as a good 972 972 * yield-candidate. 973 973 */ 974 974 vcpu->preempted = true; 975 - wake_up_interruptible(&vcpu->wq); 975 + swake_up(&vcpu->wq); 976 976 vcpu->stat.halt_wakeup++; 977 977 } 978 978 }
+3 -3
arch/x86/kvm/lapic.c
··· 1195 1195 static void apic_timer_expired(struct kvm_lapic *apic) 1196 1196 { 1197 1197 struct kvm_vcpu *vcpu = apic->vcpu; 1198 - wait_queue_head_t *q = &vcpu->wq; 1198 + struct swait_queue_head *q = &vcpu->wq; 1199 1199 struct kvm_timer *ktimer = &apic->lapic_timer; 1200 1200 1201 1201 if (atomic_read(&apic->lapic_timer.pending)) ··· 1204 1204 atomic_inc(&apic->lapic_timer.pending); 1205 1205 kvm_set_pending_timer(vcpu); 1206 1206 1207 - if (waitqueue_active(q)) 1208 - wake_up_interruptible(q); 1207 + if (swait_active(q)) 1208 + swake_up(q); 1209 1209 1210 1210 if (apic_lvtt_tscdeadline(apic)) 1211 1211 ktimer->expired_tscdeadline = ktimer->tscdeadline;
+12
include/linux/ftrace.h
··· 713 713 #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5)) 714 714 #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6)) 715 715 716 + static inline unsigned long get_lock_parent_ip(void) 717 + { 718 + unsigned long addr = CALLER_ADDR0; 719 + 720 + if (!in_lock_functions(addr)) 721 + return addr; 722 + addr = CALLER_ADDR1; 723 + if (!in_lock_functions(addr)) 724 + return addr; 725 + return CALLER_ADDR2; 726 + } 727 + 716 728 #ifdef CONFIG_IRQSOFF_TRACER 717 729 extern void time_hardirqs_on(unsigned long a0, unsigned long a1); 718 730 extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
+3 -2
include/linux/kvm_host.h
··· 25 25 #include <linux/irqflags.h> 26 26 #include <linux/context_tracking.h> 27 27 #include <linux/irqbypass.h> 28 + #include <linux/swait.h> 28 29 #include <asm/signal.h> 29 30 30 31 #include <linux/kvm.h> ··· 219 218 int fpu_active; 220 219 int guest_fpu_loaded, guest_xcr0_loaded; 221 220 unsigned char fpu_counter; 222 - wait_queue_head_t wq; 221 + struct swait_queue_head wq; 223 222 struct pid *pid; 224 223 int sigset_active; 225 224 sigset_t sigset; ··· 783 782 } 784 783 #endif 785 784 786 - static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) 785 + static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) 787 786 { 788 787 #ifdef __KVM_HAVE_ARCH_WQP 789 788 return vcpu->arch.wqp;
+3
include/linux/latencytop.h
··· 37 37 38 38 void clear_all_latency_tracing(struct task_struct *p); 39 39 40 + extern int sysctl_latencytop(struct ctl_table *table, int write, 41 + void __user *buffer, size_t *lenp, loff_t *ppos); 42 + 40 43 #else 41 44 42 45 static inline void
+7 -7
include/linux/sched.h
··· 182 182 static inline void update_cpu_load_nohz(int active) { } 183 183 #endif 184 184 185 - extern unsigned long get_parent_ip(unsigned long addr); 186 - 187 185 extern void dump_cpu_task(int cpu); 188 186 189 187 struct seq_file; ··· 918 920 #endif 919 921 } 920 922 923 + #ifdef CONFIG_SCHEDSTATS 924 + void force_schedstat_enabled(void); 925 + #endif 926 + 921 927 enum cpu_idle_type { 922 928 CPU_IDLE, 923 929 CPU_NOT_IDLE, ··· 1291 1289 unsigned long timeout; 1292 1290 unsigned long watchdog_stamp; 1293 1291 unsigned int time_slice; 1292 + unsigned short on_rq; 1293 + unsigned short on_list; 1294 1294 1295 1295 struct sched_rt_entity *back; 1296 1296 #ifdef CONFIG_RT_GROUP_SCHED ··· 1333 1329 * task has to wait for a replenishment to be performed at the 1334 1330 * next firing of dl_timer. 1335 1331 * 1336 - * @dl_new tells if a new instance arrived. If so we must 1337 - * start executing it with full runtime and reset its absolute 1338 - * deadline; 1339 - * 1340 1332 * @dl_boosted tells if we are boosted due to DI. If so we are 1341 1333 * outside bandwidth enforcement mechanism (but only until we 1342 1334 * exit the critical section); ··· 1340 1340 * @dl_yielded tells if task gave up the cpu before consuming 1341 1341 * all its available runtime during the last job. 1342 1342 */ 1343 - int dl_throttled, dl_new, dl_boosted, dl_yielded; 1343 + int dl_throttled, dl_boosted, dl_yielded; 1344 1344 1345 1345 /* 1346 1346 * Bandwidth enforcement timer. Each -deadline task has its
+4
include/linux/sched/sysctl.h
··· 95 95 void __user *buffer, size_t *lenp, 96 96 loff_t *ppos); 97 97 98 + extern int sysctl_schedstats(struct ctl_table *table, int write, 99 + void __user *buffer, size_t *lenp, 100 + loff_t *ppos); 101 + 98 102 #endif /* _SCHED_SYSCTL_H */
+172
include/linux/swait.h
··· 1 + #ifndef _LINUX_SWAIT_H 2 + #define _LINUX_SWAIT_H 3 + 4 + #include <linux/list.h> 5 + #include <linux/stddef.h> 6 + #include <linux/spinlock.h> 7 + #include <asm/current.h> 8 + 9 + /* 10 + * Simple wait queues 11 + * 12 + * While these are very similar to the other/complex wait queues (wait.h) the 13 + * most important difference is that the simple waitqueue allows for 14 + * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold 15 + * times. 16 + * 17 + * In order to make this so, we had to drop a fair number of features of the 18 + * other waitqueue code; notably: 19 + * 20 + * - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue; 21 + * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right 22 + * sleeper state. 23 + * 24 + * - the exclusive mode; because this requires preserving the list order 25 + * and this is hard. 26 + * 27 + * - custom wake functions; because you cannot give any guarantees about 28 + * random code. 29 + * 30 + * As a side effect of this; the data structures are slimmer. 31 + * 32 + * One would recommend using this wait queue where possible. 
33 + */ 34 + 35 + struct task_struct; 36 + 37 + struct swait_queue_head { 38 + raw_spinlock_t lock; 39 + struct list_head task_list; 40 + }; 41 + 42 + struct swait_queue { 43 + struct task_struct *task; 44 + struct list_head task_list; 45 + }; 46 + 47 + #define __SWAITQUEUE_INITIALIZER(name) { \ 48 + .task = current, \ 49 + .task_list = LIST_HEAD_INIT((name).task_list), \ 50 + } 51 + 52 + #define DECLARE_SWAITQUEUE(name) \ 53 + struct swait_queue name = __SWAITQUEUE_INITIALIZER(name) 54 + 55 + #define __SWAIT_QUEUE_HEAD_INITIALIZER(name) { \ 56 + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ 57 + .task_list = LIST_HEAD_INIT((name).task_list), \ 58 + } 59 + 60 + #define DECLARE_SWAIT_QUEUE_HEAD(name) \ 61 + struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name) 62 + 63 + extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name, 64 + struct lock_class_key *key); 65 + 66 + #define init_swait_queue_head(q) \ 67 + do { \ 68 + static struct lock_class_key __key; \ 69 + __init_swait_queue_head((q), #q, &__key); \ 70 + } while (0) 71 + 72 + #ifdef CONFIG_LOCKDEP 73 + # define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ 74 + ({ init_swait_queue_head(&name); name; }) 75 + # define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \ 76 + struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name) 77 + #else 78 + # define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \ 79 + DECLARE_SWAIT_QUEUE_HEAD(name) 80 + #endif 81 + 82 + static inline int swait_active(struct swait_queue_head *q) 83 + { 84 + return !list_empty(&q->task_list); 85 + } 86 + 87 + extern void swake_up(struct swait_queue_head *q); 88 + extern void swake_up_all(struct swait_queue_head *q); 89 + extern void swake_up_locked(struct swait_queue_head *q); 90 + 91 + extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); 92 + extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state); 93 + extern long prepare_to_swait_event(struct 
swait_queue_head *q, struct swait_queue *wait, int state); 94 + 95 + extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait); 96 + extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait); 97 + 98 + /* as per ___wait_event() but for swait, therefore "exclusive == 0" */ 99 + #define ___swait_event(wq, condition, state, ret, cmd) \ 100 + ({ \ 101 + struct swait_queue __wait; \ 102 + long __ret = ret; \ 103 + \ 104 + INIT_LIST_HEAD(&__wait.task_list); \ 105 + for (;;) { \ 106 + long __int = prepare_to_swait_event(&wq, &__wait, state);\ 107 + \ 108 + if (condition) \ 109 + break; \ 110 + \ 111 + if (___wait_is_interruptible(state) && __int) { \ 112 + __ret = __int; \ 113 + break; \ 114 + } \ 115 + \ 116 + cmd; \ 117 + } \ 118 + finish_swait(&wq, &__wait); \ 119 + __ret; \ 120 + }) 121 + 122 + #define __swait_event(wq, condition) \ 123 + (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \ 124 + schedule()) 125 + 126 + #define swait_event(wq, condition) \ 127 + do { \ 128 + if (condition) \ 129 + break; \ 130 + __swait_event(wq, condition); \ 131 + } while (0) 132 + 133 + #define __swait_event_timeout(wq, condition, timeout) \ 134 + ___swait_event(wq, ___wait_cond_timeout(condition), \ 135 + TASK_UNINTERRUPTIBLE, timeout, \ 136 + __ret = schedule_timeout(__ret)) 137 + 138 + #define swait_event_timeout(wq, condition, timeout) \ 139 + ({ \ 140 + long __ret = timeout; \ 141 + if (!___wait_cond_timeout(condition)) \ 142 + __ret = __swait_event_timeout(wq, condition, timeout); \ 143 + __ret; \ 144 + }) 145 + 146 + #define __swait_event_interruptible(wq, condition) \ 147 + ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \ 148 + schedule()) 149 + 150 + #define swait_event_interruptible(wq, condition) \ 151 + ({ \ 152 + int __ret = 0; \ 153 + if (!(condition)) \ 154 + __ret = __swait_event_interruptible(wq, condition); \ 155 + __ret; \ 156 + }) 157 + 158 + #define __swait_event_interruptible_timeout(wq, condition, 
timeout) \ 159 + ___swait_event(wq, ___wait_cond_timeout(condition), \ 160 + TASK_INTERRUPTIBLE, timeout, \ 161 + __ret = schedule_timeout(__ret)) 162 + 163 + #define swait_event_interruptible_timeout(wq, condition, timeout) \ 164 + ({ \ 165 + long __ret = timeout; \ 166 + if (!___wait_cond_timeout(condition)) \ 167 + __ret = __swait_event_interruptible_timeout(wq, \ 168 + condition, timeout); \ 169 + __ret; \ 170 + }) 171 + 172 + #endif /* _LINUX_SWAIT_H */
+1 -1
include/linux/wait.h
··· 338 338 schedule(); try_to_freeze()) 339 339 340 340 /** 341 - * wait_event - sleep (or freeze) until a condition gets true 341 + * wait_event_freezable - sleep (or freeze) until a condition gets true 342 342 * @wq: the waitqueue to wait on 343 343 * @condition: a C expression for the event to wait for 344 344 *
+13 -1
kernel/latencytop.c
··· 47 47 * of times) 48 48 */ 49 49 50 - #include <linux/latencytop.h> 51 50 #include <linux/kallsyms.h> 52 51 #include <linux/seq_file.h> 53 52 #include <linux/notifier.h> 54 53 #include <linux/spinlock.h> 55 54 #include <linux/proc_fs.h> 55 + #include <linux/latencytop.h> 56 56 #include <linux/export.h> 57 57 #include <linux/sched.h> 58 58 #include <linux/list.h> ··· 288 288 { 289 289 proc_create("latency_stats", 0644, NULL, &lstats_fops); 290 290 return 0; 291 + } 292 + 293 + int sysctl_latencytop(struct ctl_table *table, int write, 294 + void __user *buffer, size_t *lenp, loff_t *ppos) 295 + { 296 + int err; 297 + 298 + err = proc_dointvec(table, write, buffer, lenp, ppos); 299 + if (latencytop_enabled) 300 + force_schedstat_enabled(); 301 + 302 + return err; 291 303 } 292 304 device_initcall(init_lstats_procfs);
+1
kernel/profile.c
··· 59 59 60 60 if (!strncmp(str, sleepstr, strlen(sleepstr))) { 61 61 #ifdef CONFIG_SCHEDSTATS 62 + force_schedstat_enabled(); 62 63 prof_on = SLEEP_PROFILING; 63 64 if (str[strlen(sleepstr)] == ',') 64 65 str += strlen(sleepstr) + 1;
+13 -11
kernel/rcu/tree.c
··· 1614 1614 int needmore; 1615 1615 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1616 1616 1617 - rcu_nocb_gp_cleanup(rsp, rnp); 1618 1617 rnp->need_future_gp[c & 0x1] = 0; 1619 1618 needmore = rnp->need_future_gp[(c + 1) & 0x1]; 1620 1619 trace_rcu_future_gp(rnp, rdp, c, ··· 1634 1635 !READ_ONCE(rsp->gp_flags) || 1635 1636 !rsp->gp_kthread) 1636 1637 return; 1637 - wake_up(&rsp->gp_wq); 1638 + swake_up(&rsp->gp_wq); 1638 1639 } 1639 1640 1640 1641 /* ··· 2009 2010 int nocb = 0; 2010 2011 struct rcu_data *rdp; 2011 2012 struct rcu_node *rnp = rcu_get_root(rsp); 2013 + struct swait_queue_head *sq; 2012 2014 2013 2015 WRITE_ONCE(rsp->gp_activity, jiffies); 2014 2016 raw_spin_lock_irq_rcu_node(rnp); ··· 2046 2046 needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; 2047 2047 /* smp_mb() provided by prior unlock-lock pair. */ 2048 2048 nocb += rcu_future_gp_cleanup(rsp, rnp); 2049 + sq = rcu_nocb_gp_get(rnp); 2049 2050 raw_spin_unlock_irq(&rnp->lock); 2051 + rcu_nocb_gp_cleanup(sq); 2050 2052 cond_resched_rcu_qs(); 2051 2053 WRITE_ONCE(rsp->gp_activity, jiffies); 2052 2054 rcu_gp_slow(rsp, gp_cleanup_delay); ··· 2094 2092 READ_ONCE(rsp->gpnum), 2095 2093 TPS("reqwait")); 2096 2094 rsp->gp_state = RCU_GP_WAIT_GPS; 2097 - wait_event_interruptible(rsp->gp_wq, 2095 + swait_event_interruptible(rsp->gp_wq, 2098 2096 READ_ONCE(rsp->gp_flags) & 2099 2097 RCU_GP_FLAG_INIT); 2100 2098 rsp->gp_state = RCU_GP_DONE_GPS; ··· 2124 2122 READ_ONCE(rsp->gpnum), 2125 2123 TPS("fqswait")); 2126 2124 rsp->gp_state = RCU_GP_WAIT_FQS; 2127 - ret = wait_event_interruptible_timeout(rsp->gp_wq, 2125 + ret = swait_event_interruptible_timeout(rsp->gp_wq, 2128 2126 rcu_gp_fqs_check_wake(rsp, &gf), j); 2129 2127 rsp->gp_state = RCU_GP_DOING_FQS; 2130 2128 /* Locking provides needed memory barriers. 
*/ ··· 2248 2246 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 2249 2247 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); 2250 2248 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 2251 - rcu_gp_kthread_wake(rsp); 2249 + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ 2252 2250 } 2253 2251 2254 2252 /* ··· 2902 2900 } 2903 2901 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); 2904 2902 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2905 - rcu_gp_kthread_wake(rsp); 2903 + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ 2906 2904 } 2907 2905 2908 2906 /* ··· 3531 3529 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3532 3530 if (wake) { 3533 3531 smp_mb(); /* EGP done before wake_up(). */ 3534 - wake_up(&rsp->expedited_wq); 3532 + swake_up(&rsp->expedited_wq); 3535 3533 } 3536 3534 break; 3537 3535 } ··· 3782 3780 jiffies_start = jiffies; 3783 3781 3784 3782 for (;;) { 3785 - ret = wait_event_interruptible_timeout( 3783 + ret = swait_event_timeout( 3786 3784 rsp->expedited_wq, 3787 3785 sync_rcu_preempt_exp_done(rnp_root), 3788 3786 jiffies_stall); ··· 3790 3788 return; 3791 3789 if (ret < 0) { 3792 3790 /* Hit a signal, disable CPU stall warnings. */ 3793 - wait_event(rsp->expedited_wq, 3791 + swait_event(rsp->expedited_wq, 3794 3792 sync_rcu_preempt_exp_done(rnp_root)); 3795 3793 return; 3796 3794 } ··· 4484 4482 } 4485 4483 } 4486 4484 4487 - init_waitqueue_head(&rsp->gp_wq); 4488 - init_waitqueue_head(&rsp->expedited_wq); 4485 + init_swait_queue_head(&rsp->gp_wq); 4486 + init_swait_queue_head(&rsp->expedited_wq); 4489 4487 rnp = rsp->level[rcu_num_lvls - 1]; 4490 4488 for_each_possible_cpu(i) { 4491 4489 while (i > rnp->grphi)
+7 -5
kernel/rcu/tree.h
··· 27 27 #include <linux/threads.h> 28 28 #include <linux/cpumask.h> 29 29 #include <linux/seqlock.h> 30 + #include <linux/swait.h> 30 31 #include <linux/stop_machine.h> 31 32 32 33 /* ··· 244 243 /* Refused to boost: not sure why, though. */ 245 244 /* This can happen due to race conditions. */ 246 245 #ifdef CONFIG_RCU_NOCB_CPU 247 - wait_queue_head_t nocb_gp_wq[2]; 246 + struct swait_queue_head nocb_gp_wq[2]; 248 247 /* Place for rcu_nocb_kthread() to wait GP. */ 249 248 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 250 249 int need_future_gp[2]; ··· 400 399 atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ 401 400 struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ 402 401 struct rcu_head **nocb_follower_tail; 403 - wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ 402 + struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ 404 403 struct task_struct *nocb_kthread; 405 404 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ 406 405 ··· 479 478 unsigned long gpnum; /* Current gp number. */ 480 479 unsigned long completed; /* # of last completed gp. */ 481 480 struct task_struct *gp_kthread; /* Task for grace periods. */ 482 - wait_queue_head_t gp_wq; /* Where GP task waits. */ 481 + struct swait_queue_head gp_wq; /* Where GP task waits. */ 483 482 short gp_flags; /* Commands for GP task. */ 484 483 short gp_state; /* GP kthread sleep state. */ 485 484 ··· 507 506 unsigned long expedited_sequence; /* Take a ticket. */ 508 507 atomic_long_t expedited_normal; /* # fallbacks to normal. */ 509 508 atomic_t expedited_need_qs; /* # CPUs left to check in. */ 510 - wait_queue_head_t expedited_wq; /* Wait for check-ins. */ 509 + struct swait_queue_head expedited_wq; /* Wait for check-ins. */ 511 510 int ncpus_snap; /* # CPUs seen last time. 
*/ 512 511 513 512 unsigned long jiffies_force_qs; /* Time at which to invoke */ ··· 622 621 static void increment_cpu_stall_ticks(void); 623 622 static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); 624 623 static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); 625 - static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); 624 + static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); 625 + static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); 626 626 static void rcu_init_one_nocb(struct rcu_node *rnp); 627 627 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 628 628 bool lazy, unsigned long flags);
+21 -11
kernel/rcu/tree_plugin.h
··· 1811 1811 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended 1812 1812 * grace period. 1813 1813 */ 1814 - static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 1814 + static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) 1815 1815 { 1816 - wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); 1816 + swake_up_all(sq); 1817 1817 } 1818 1818 1819 1819 /* ··· 1829 1829 rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; 1830 1830 } 1831 1831 1832 + static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) 1833 + { 1834 + return &rnp->nocb_gp_wq[rnp->completed & 0x1]; 1835 + } 1836 + 1832 1837 static void rcu_init_one_nocb(struct rcu_node *rnp) 1833 1838 { 1834 - init_waitqueue_head(&rnp->nocb_gp_wq[0]); 1835 - init_waitqueue_head(&rnp->nocb_gp_wq[1]); 1839 + init_swait_queue_head(&rnp->nocb_gp_wq[0]); 1840 + init_swait_queue_head(&rnp->nocb_gp_wq[1]); 1836 1841 } 1837 1842 1838 1843 #ifndef CONFIG_RCU_NOCB_CPU_ALL ··· 1862 1857 if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { 1863 1858 /* Prior smp_mb__after_atomic() orders against prior enqueue. */ 1864 1859 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); 1865 - wake_up(&rdp_leader->nocb_wq); 1860 + swake_up(&rdp_leader->nocb_wq); 1866 1861 } 1867 1862 } 1868 1863 ··· 2074 2069 */ 2075 2070 trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); 2076 2071 for (;;) { 2077 - wait_event_interruptible( 2072 + swait_event_interruptible( 2078 2073 rnp->nocb_gp_wq[c & 0x1], 2079 2074 (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); 2080 2075 if (likely(d)) ··· 2102 2097 /* Wait for callbacks to appear. */ 2103 2098 if (!rcu_nocb_poll) { 2104 2099 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); 2105 - wait_event_interruptible(my_rdp->nocb_wq, 2100 + swait_event_interruptible(my_rdp->nocb_wq, 2106 2101 !READ_ONCE(my_rdp->nocb_leader_sleep)); 2107 2102 /* Memory barrier handled by smp_mb() calls below and repoll. 
*/ 2108 2103 } else if (firsttime) { ··· 2177 2172 * List was empty, wake up the follower. 2178 2173 * Memory barriers supplied by atomic_long_add(). 2179 2174 */ 2180 - wake_up(&rdp->nocb_wq); 2175 + swake_up(&rdp->nocb_wq); 2181 2176 } 2182 2177 } 2183 2178 ··· 2198 2193 if (!rcu_nocb_poll) { 2199 2194 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2200 2195 "FollowerSleep"); 2201 - wait_event_interruptible(rdp->nocb_wq, 2196 + swait_event_interruptible(rdp->nocb_wq, 2202 2197 READ_ONCE(rdp->nocb_follower_head)); 2203 2198 } else if (firsttime) { 2204 2199 /* Don't drown trace log with "Poll"! */ ··· 2357 2352 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2358 2353 { 2359 2354 rdp->nocb_tail = &rdp->nocb_head; 2360 - init_waitqueue_head(&rdp->nocb_wq); 2355 + init_swait_queue_head(&rdp->nocb_wq); 2361 2356 rdp->nocb_follower_tail = &rdp->nocb_follower_head; 2362 2357 } 2363 2358 ··· 2507 2502 return false; 2508 2503 } 2509 2504 2510 - static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 2505 + static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) 2511 2506 { 2512 2507 } 2513 2508 2514 2509 static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) 2515 2510 { 2511 + } 2512 + 2513 + static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) 2514 + { 2515 + return NULL; 2516 2516 } 2517 2517 2518 2518 static void rcu_init_one_nocb(struct rcu_node *rnp)
+1 -1
kernel/sched/Makefile
··· 13 13 14 14 obj-y += core.o loadavg.o clock.o cputime.o 15 15 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o 16 - obj-y += wait.o completion.o idle.o 16 + obj-y += wait.o swait.o completion.o idle.o 17 17 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o 18 18 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 19 19 obj-$(CONFIG_SCHEDSTATS) += stats.o
+100 -349
kernel/sched/core.c
··· 67 67 #include <linux/pagemap.h> 68 68 #include <linux/hrtimer.h> 69 69 #include <linux/tick.h> 70 - #include <linux/debugfs.h> 71 70 #include <linux/ctype.h> 72 71 #include <linux/ftrace.h> 73 72 #include <linux/slab.h> 74 73 #include <linux/init_task.h> 75 - #include <linux/binfmts.h> 76 74 #include <linux/context_tracking.h> 77 75 #include <linux/compiler.h> 78 76 ··· 122 124 0; 123 125 124 126 #undef SCHED_FEAT 125 - 126 - #ifdef CONFIG_SCHED_DEBUG 127 - #define SCHED_FEAT(name, enabled) \ 128 - #name , 129 - 130 - static const char * const sched_feat_names[] = { 131 - #include "features.h" 132 - }; 133 - 134 - #undef SCHED_FEAT 135 - 136 - static int sched_feat_show(struct seq_file *m, void *v) 137 - { 138 - int i; 139 - 140 - for (i = 0; i < __SCHED_FEAT_NR; i++) { 141 - if (!(sysctl_sched_features & (1UL << i))) 142 - seq_puts(m, "NO_"); 143 - seq_printf(m, "%s ", sched_feat_names[i]); 144 - } 145 - seq_puts(m, "\n"); 146 - 147 - return 0; 148 - } 149 - 150 - #ifdef HAVE_JUMP_LABEL 151 - 152 - #define jump_label_key__true STATIC_KEY_INIT_TRUE 153 - #define jump_label_key__false STATIC_KEY_INIT_FALSE 154 - 155 - #define SCHED_FEAT(name, enabled) \ 156 - jump_label_key__##enabled , 157 - 158 - struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { 159 - #include "features.h" 160 - }; 161 - 162 - #undef SCHED_FEAT 163 - 164 - static void sched_feat_disable(int i) 165 - { 166 - static_key_disable(&sched_feat_keys[i]); 167 - } 168 - 169 - static void sched_feat_enable(int i) 170 - { 171 - static_key_enable(&sched_feat_keys[i]); 172 - } 173 - #else 174 - static void sched_feat_disable(int i) { }; 175 - static void sched_feat_enable(int i) { }; 176 - #endif /* HAVE_JUMP_LABEL */ 177 - 178 - static int sched_feat_set(char *cmp) 179 - { 180 - int i; 181 - int neg = 0; 182 - 183 - if (strncmp(cmp, "NO_", 3) == 0) { 184 - neg = 1; 185 - cmp += 3; 186 - } 187 - 188 - for (i = 0; i < __SCHED_FEAT_NR; i++) { 189 - if (strcmp(cmp, sched_feat_names[i]) == 0) { 190 - if 
(neg) { 191 - sysctl_sched_features &= ~(1UL << i); 192 - sched_feat_disable(i); 193 - } else { 194 - sysctl_sched_features |= (1UL << i); 195 - sched_feat_enable(i); 196 - } 197 - break; 198 - } 199 - } 200 - 201 - return i; 202 - } 203 - 204 - static ssize_t 205 - sched_feat_write(struct file *filp, const char __user *ubuf, 206 - size_t cnt, loff_t *ppos) 207 - { 208 - char buf[64]; 209 - char *cmp; 210 - int i; 211 - struct inode *inode; 212 - 213 - if (cnt > 63) 214 - cnt = 63; 215 - 216 - if (copy_from_user(&buf, ubuf, cnt)) 217 - return -EFAULT; 218 - 219 - buf[cnt] = 0; 220 - cmp = strstrip(buf); 221 - 222 - /* Ensure the static_key remains in a consistent state */ 223 - inode = file_inode(filp); 224 - inode_lock(inode); 225 - i = sched_feat_set(cmp); 226 - inode_unlock(inode); 227 - if (i == __SCHED_FEAT_NR) 228 - return -EINVAL; 229 - 230 - *ppos += cnt; 231 - 232 - return cnt; 233 - } 234 - 235 - static int sched_feat_open(struct inode *inode, struct file *filp) 236 - { 237 - return single_open(filp, sched_feat_show, NULL); 238 - } 239 - 240 - static const struct file_operations sched_feat_fops = { 241 - .open = sched_feat_open, 242 - .write = sched_feat_write, 243 - .read = seq_read, 244 - .llseek = seq_lseek, 245 - .release = single_release, 246 - }; 247 - 248 - static __init int sched_init_debug(void) 249 - { 250 - debugfs_create_file("sched_features", 0644, NULL, NULL, 251 - &sched_feat_fops); 252 - 253 - return 0; 254 - } 255 - late_initcall(sched_init_debug); 256 - #endif /* CONFIG_SCHED_DEBUG */ 257 127 258 128 /* 259 129 * Number of tasks to iterate in a single balance run. 
··· 1960 2094 1961 2095 ttwu_queue(p, cpu); 1962 2096 stat: 1963 - ttwu_stat(p, cpu, wake_flags); 2097 + if (schedstat_enabled()) 2098 + ttwu_stat(p, cpu, wake_flags); 1964 2099 out: 1965 2100 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1966 2101 ··· 2009 2142 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2010 2143 2011 2144 ttwu_do_wakeup(rq, p, 0); 2012 - ttwu_stat(p, smp_processor_id(), 0); 2145 + if (schedstat_enabled()) 2146 + ttwu_stat(p, smp_processor_id(), 0); 2013 2147 out: 2014 2148 raw_spin_unlock(&p->pi_lock); 2015 2149 } ··· 2052 2184 dl_se->dl_bw = 0; 2053 2185 2054 2186 dl_se->dl_throttled = 0; 2055 - dl_se->dl_new = 1; 2056 2187 dl_se->dl_yielded = 0; 2057 2188 } 2058 2189 ··· 2078 2211 #endif 2079 2212 2080 2213 #ifdef CONFIG_SCHEDSTATS 2214 + /* Even if schedstat is disabled, there should not be garbage */ 2081 2215 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2082 2216 #endif 2083 2217 ··· 2087 2219 __dl_clear_params(p); 2088 2220 2089 2221 INIT_LIST_HEAD(&p->rt.run_list); 2222 + p->rt.timeout = 0; 2223 + p->rt.time_slice = sched_rr_timeslice; 2224 + p->rt.on_rq = 0; 2225 + p->rt.on_list = 0; 2090 2226 2091 2227 #ifdef CONFIG_PREEMPT_NOTIFIERS 2092 2228 INIT_HLIST_HEAD(&p->preempt_notifiers); ··· 2149 2277 return err; 2150 2278 if (write) 2151 2279 set_numabalancing_state(state); 2280 + return err; 2281 + } 2282 + #endif 2283 + #endif 2284 + 2285 + DEFINE_STATIC_KEY_FALSE(sched_schedstats); 2286 + 2287 + #ifdef CONFIG_SCHEDSTATS 2288 + static void set_schedstats(bool enabled) 2289 + { 2290 + if (enabled) 2291 + static_branch_enable(&sched_schedstats); 2292 + else 2293 + static_branch_disable(&sched_schedstats); 2294 + } 2295 + 2296 + void force_schedstat_enabled(void) 2297 + { 2298 + if (!schedstat_enabled()) { 2299 + pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); 2300 + static_branch_enable(&sched_schedstats); 2301 + } 2302 + } 2303 + 2304 + static int __init setup_schedstats(char *str) 2305 + { 
2306 + int ret = 0; 2307 + if (!str) 2308 + goto out; 2309 + 2310 + if (!strcmp(str, "enable")) { 2311 + set_schedstats(true); 2312 + ret = 1; 2313 + } else if (!strcmp(str, "disable")) { 2314 + set_schedstats(false); 2315 + ret = 1; 2316 + } 2317 + out: 2318 + if (!ret) 2319 + pr_warn("Unable to parse schedstats=\n"); 2320 + 2321 + return ret; 2322 + } 2323 + __setup("schedstats=", setup_schedstats); 2324 + 2325 + #ifdef CONFIG_PROC_SYSCTL 2326 + int sysctl_schedstats(struct ctl_table *table, int write, 2327 + void __user *buffer, size_t *lenp, loff_t *ppos) 2328 + { 2329 + struct ctl_table t; 2330 + int err; 2331 + int state = static_branch_likely(&sched_schedstats); 2332 + 2333 + if (write && !capable(CAP_SYS_ADMIN)) 2334 + return -EPERM; 2335 + 2336 + t = *table; 2337 + t.data = &state; 2338 + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 2339 + if (err < 0) 2340 + return err; 2341 + if (write) 2342 + set_schedstats(state); 2152 2343 return err; 2153 2344 } 2154 2345 #endif ··· 2946 3011 } 2947 3012 #endif 2948 3013 2949 - notrace unsigned long get_parent_ip(unsigned long addr) 2950 - { 2951 - if (in_lock_functions(addr)) { 2952 - addr = CALLER_ADDR2; 2953 - if (in_lock_functions(addr)) 2954 - addr = CALLER_ADDR3; 2955 - } 2956 - return addr; 2957 - } 2958 - 2959 3014 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2960 3015 defined(CONFIG_PREEMPT_TRACER)) 2961 3016 ··· 2967 3042 PREEMPT_MASK - 10); 2968 3043 #endif 2969 3044 if (preempt_count() == val) { 2970 - unsigned long ip = get_parent_ip(CALLER_ADDR1); 3045 + unsigned long ip = get_lock_parent_ip(); 2971 3046 #ifdef CONFIG_DEBUG_PREEMPT 2972 3047 current->preempt_disable_ip = ip; 2973 3048 #endif ··· 2994 3069 #endif 2995 3070 2996 3071 if (preempt_count() == val) 2997 - trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 3072 + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); 2998 3073 __preempt_count_sub(val); 2999 3074 } 3000 3075 
EXPORT_SYMBOL(preempt_count_sub); ··· 3206 3281 3207 3282 trace_sched_switch(preempt, prev, next); 3208 3283 rq = context_switch(rq, prev, next); /* unlocks the rq */ 3209 - cpu = cpu_of(rq); 3210 3284 } else { 3211 3285 lockdep_unpin_lock(&rq->lock); 3212 3286 raw_spin_unlock_irq(&rq->lock); ··· 3391 3467 */ 3392 3468 void rt_mutex_setprio(struct task_struct *p, int prio) 3393 3469 { 3394 - int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; 3470 + int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; 3395 3471 struct rq *rq; 3396 3472 const struct sched_class *prev_class; 3397 3473 ··· 3419 3495 3420 3496 trace_sched_pi_setprio(p, prio); 3421 3497 oldprio = p->prio; 3498 + 3499 + if (oldprio == prio) 3500 + queue_flag &= ~DEQUEUE_MOVE; 3501 + 3422 3502 prev_class = p->sched_class; 3423 3503 queued = task_on_rq_queued(p); 3424 3504 running = task_current(rq, p); 3425 3505 if (queued) 3426 - dequeue_task(rq, p, DEQUEUE_SAVE); 3506 + dequeue_task(rq, p, queue_flag); 3427 3507 if (running) 3428 3508 put_prev_task(rq, p); 3429 3509 ··· 3445 3517 if (!dl_prio(p->normal_prio) || 3446 3518 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { 3447 3519 p->dl.dl_boosted = 1; 3448 - enqueue_flag |= ENQUEUE_REPLENISH; 3520 + queue_flag |= ENQUEUE_REPLENISH; 3449 3521 } else 3450 3522 p->dl.dl_boosted = 0; 3451 3523 p->sched_class = &dl_sched_class; ··· 3453 3525 if (dl_prio(oldprio)) 3454 3526 p->dl.dl_boosted = 0; 3455 3527 if (oldprio < prio) 3456 - enqueue_flag |= ENQUEUE_HEAD; 3528 + queue_flag |= ENQUEUE_HEAD; 3457 3529 p->sched_class = &rt_sched_class; 3458 3530 } else { 3459 3531 if (dl_prio(oldprio)) ··· 3468 3540 if (running) 3469 3541 p->sched_class->set_curr_task(rq); 3470 3542 if (queued) 3471 - enqueue_task(rq, p, enqueue_flag); 3543 + enqueue_task(rq, p, queue_flag); 3472 3544 3473 3545 check_class_changed(rq, p, prev_class, oldprio); 3474 3546 out_unlock: ··· 3824 3896 const struct sched_class *prev_class; 3825 3897 struct rq 
*rq; 3826 3898 int reset_on_fork; 3899 + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 3827 3900 3828 3901 /* may grab non-irq protected spin_locks */ 3829 3902 BUG_ON(in_interrupt()); ··· 4007 4078 * itself. 4008 4079 */ 4009 4080 new_effective_prio = rt_mutex_get_effective_prio(p, newprio); 4010 - if (new_effective_prio == oldprio) { 4011 - __setscheduler_params(p, attr); 4012 - task_rq_unlock(rq, p, &flags); 4013 - return 0; 4014 - } 4081 + if (new_effective_prio == oldprio) 4082 + queue_flags &= ~DEQUEUE_MOVE; 4015 4083 } 4016 4084 4017 4085 queued = task_on_rq_queued(p); 4018 4086 running = task_current(rq, p); 4019 4087 if (queued) 4020 - dequeue_task(rq, p, DEQUEUE_SAVE); 4088 + dequeue_task(rq, p, queue_flags); 4021 4089 if (running) 4022 4090 put_prev_task(rq, p); 4023 4091 ··· 4024 4098 if (running) 4025 4099 p->sched_class->set_curr_task(rq); 4026 4100 if (queued) { 4027 - int enqueue_flags = ENQUEUE_RESTORE; 4028 4101 /* 4029 4102 * We enqueue to tail when the priority of a task is 4030 4103 * increased (user space view). 
4031 4104 */ 4032 - if (oldprio <= p->prio) 4033 - enqueue_flags |= ENQUEUE_HEAD; 4105 + if (oldprio < p->prio) 4106 + queue_flags |= ENQUEUE_HEAD; 4034 4107 4035 - enqueue_task(rq, p, enqueue_flags); 4108 + enqueue_task(rq, p, queue_flags); 4036 4109 } 4037 4110 4038 4111 check_class_changed(rq, p, prev_class, oldprio); ··· 5333 5408 } 5334 5409 #endif /* CONFIG_HOTPLUG_CPU */ 5335 5410 5336 - #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5337 - 5338 - static struct ctl_table sd_ctl_dir[] = { 5339 - { 5340 - .procname = "sched_domain", 5341 - .mode = 0555, 5342 - }, 5343 - {} 5344 - }; 5345 - 5346 - static struct ctl_table sd_ctl_root[] = { 5347 - { 5348 - .procname = "kernel", 5349 - .mode = 0555, 5350 - .child = sd_ctl_dir, 5351 - }, 5352 - {} 5353 - }; 5354 - 5355 - static struct ctl_table *sd_alloc_ctl_entry(int n) 5356 - { 5357 - struct ctl_table *entry = 5358 - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 5359 - 5360 - return entry; 5361 - } 5362 - 5363 - static void sd_free_ctl_entry(struct ctl_table **tablep) 5364 - { 5365 - struct ctl_table *entry; 5366 - 5367 - /* 5368 - * In the intermediate directories, both the child directory and 5369 - * procname are dynamically allocated and could fail but the mode 5370 - * will always be set. In the lowest directory the names are 5371 - * static strings and all have proc handlers. 
5372 - */ 5373 - for (entry = *tablep; entry->mode; entry++) { 5374 - if (entry->child) 5375 - sd_free_ctl_entry(&entry->child); 5376 - if (entry->proc_handler == NULL) 5377 - kfree(entry->procname); 5378 - } 5379 - 5380 - kfree(*tablep); 5381 - *tablep = NULL; 5382 - } 5383 - 5384 - static int min_load_idx = 0; 5385 - static int max_load_idx = CPU_LOAD_IDX_MAX-1; 5386 - 5387 - static void 5388 - set_table_entry(struct ctl_table *entry, 5389 - const char *procname, void *data, int maxlen, 5390 - umode_t mode, proc_handler *proc_handler, 5391 - bool load_idx) 5392 - { 5393 - entry->procname = procname; 5394 - entry->data = data; 5395 - entry->maxlen = maxlen; 5396 - entry->mode = mode; 5397 - entry->proc_handler = proc_handler; 5398 - 5399 - if (load_idx) { 5400 - entry->extra1 = &min_load_idx; 5401 - entry->extra2 = &max_load_idx; 5402 - } 5403 - } 5404 - 5405 - static struct ctl_table * 5406 - sd_alloc_ctl_domain_table(struct sched_domain *sd) 5407 - { 5408 - struct ctl_table *table = sd_alloc_ctl_entry(14); 5409 - 5410 - if (table == NULL) 5411 - return NULL; 5412 - 5413 - set_table_entry(&table[0], "min_interval", &sd->min_interval, 5414 - sizeof(long), 0644, proc_doulongvec_minmax, false); 5415 - set_table_entry(&table[1], "max_interval", &sd->max_interval, 5416 - sizeof(long), 0644, proc_doulongvec_minmax, false); 5417 - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 5418 - sizeof(int), 0644, proc_dointvec_minmax, true); 5419 - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 5420 - sizeof(int), 0644, proc_dointvec_minmax, true); 5421 - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 5422 - sizeof(int), 0644, proc_dointvec_minmax, true); 5423 - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 5424 - sizeof(int), 0644, proc_dointvec_minmax, true); 5425 - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 5426 - sizeof(int), 0644, proc_dointvec_minmax, true); 5427 - set_table_entry(&table[7], "busy_factor", 
&sd->busy_factor, 5428 - sizeof(int), 0644, proc_dointvec_minmax, false); 5429 - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 5430 - sizeof(int), 0644, proc_dointvec_minmax, false); 5431 - set_table_entry(&table[9], "cache_nice_tries", 5432 - &sd->cache_nice_tries, 5433 - sizeof(int), 0644, proc_dointvec_minmax, false); 5434 - set_table_entry(&table[10], "flags", &sd->flags, 5435 - sizeof(int), 0644, proc_dointvec_minmax, false); 5436 - set_table_entry(&table[11], "max_newidle_lb_cost", 5437 - &sd->max_newidle_lb_cost, 5438 - sizeof(long), 0644, proc_doulongvec_minmax, false); 5439 - set_table_entry(&table[12], "name", sd->name, 5440 - CORENAME_MAX_SIZE, 0444, proc_dostring, false); 5441 - /* &table[13] is terminator */ 5442 - 5443 - return table; 5444 - } 5445 - 5446 - static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) 5447 - { 5448 - struct ctl_table *entry, *table; 5449 - struct sched_domain *sd; 5450 - int domain_num = 0, i; 5451 - char buf[32]; 5452 - 5453 - for_each_domain(cpu, sd) 5454 - domain_num++; 5455 - entry = table = sd_alloc_ctl_entry(domain_num + 1); 5456 - if (table == NULL) 5457 - return NULL; 5458 - 5459 - i = 0; 5460 - for_each_domain(cpu, sd) { 5461 - snprintf(buf, 32, "domain%d", i); 5462 - entry->procname = kstrdup(buf, GFP_KERNEL); 5463 - entry->mode = 0555; 5464 - entry->child = sd_alloc_ctl_domain_table(sd); 5465 - entry++; 5466 - i++; 5467 - } 5468 - return table; 5469 - } 5470 - 5471 - static struct ctl_table_header *sd_sysctl_header; 5472 - static void register_sched_domain_sysctl(void) 5473 - { 5474 - int i, cpu_num = num_possible_cpus(); 5475 - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5476 - char buf[32]; 5477 - 5478 - WARN_ON(sd_ctl_dir[0].child); 5479 - sd_ctl_dir[0].child = entry; 5480 - 5481 - if (entry == NULL) 5482 - return; 5483 - 5484 - for_each_possible_cpu(i) { 5485 - snprintf(buf, 32, "cpu%d", i); 5486 - entry->procname = kstrdup(buf, GFP_KERNEL); 5487 - entry->mode = 0555; 5488 - 
entry->child = sd_alloc_ctl_cpu_table(i); 5489 - entry++; 5490 - } 5491 - 5492 - WARN_ON(sd_sysctl_header); 5493 - sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5494 - } 5495 - 5496 - /* may be called multiple times per register */ 5497 - static void unregister_sched_domain_sysctl(void) 5498 - { 5499 - unregister_sysctl_table(sd_sysctl_header); 5500 - sd_sysctl_header = NULL; 5501 - if (sd_ctl_dir[0].child) 5502 - sd_free_ctl_entry(&sd_ctl_dir[0].child); 5503 - } 5504 - #else 5505 - static void register_sched_domain_sysctl(void) 5506 - { 5507 - } 5508 - static void unregister_sched_domain_sysctl(void) 5509 - { 5510 - } 5511 - #endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ 5512 - 5513 5411 static void set_rq_online(struct rq *rq) 5514 5412 { 5515 5413 if (!rq->online) { ··· 5924 6176 /* Setup the mask of cpus configured for isolated domains */ 5925 6177 static int __init isolated_cpu_setup(char *str) 5926 6178 { 6179 + int ret; 6180 + 5927 6181 alloc_bootmem_cpumask_var(&cpu_isolated_map); 5928 - cpulist_parse(str, cpu_isolated_map); 6182 + ret = cpulist_parse(str, cpu_isolated_map); 6183 + if (ret) { 6184 + pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); 6185 + return 0; 6186 + } 5929 6187 return 1; 5930 6188 } 5931 - 5932 6189 __setup("isolcpus=", isolated_cpu_setup); 5933 6190 5934 6191 struct s_data { ··· 7616 7863 void sched_offline_group(struct task_group *tg) 7617 7864 { 7618 7865 unsigned long flags; 7619 - int i; 7620 7866 7621 7867 /* end participation in shares distribution */ 7622 - for_each_possible_cpu(i) 7623 - unregister_fair_sched_group(tg, i); 7868 + unregister_fair_sched_group(tg); 7624 7869 7625 7870 spin_lock_irqsave(&task_group_lock, flags); 7626 7871 list_del_rcu(&tg->list); ··· 7644 7893 queued = task_on_rq_queued(tsk); 7645 7894 7646 7895 if (queued) 7647 - dequeue_task(rq, tsk, DEQUEUE_SAVE); 7896 + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); 7648 7897 if (unlikely(running)) 7649 7898 
put_prev_task(rq, tsk); 7650 7899 ··· 7668 7917 if (unlikely(running)) 7669 7918 tsk->sched_class->set_curr_task(rq); 7670 7919 if (queued) 7671 - enqueue_task(rq, tsk, ENQUEUE_RESTORE); 7920 + enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); 7672 7921 7673 7922 task_rq_unlock(rq, tsk, &flags); 7674 7923 }
+30 -23
kernel/sched/cputime.c
··· 262 262 #ifdef CONFIG_PARAVIRT 263 263 if (static_key_false(&paravirt_steal_enabled)) { 264 264 u64 steal; 265 - cputime_t steal_ct; 265 + unsigned long steal_jiffies; 266 266 267 267 steal = paravirt_steal_clock(smp_processor_id()); 268 268 steal -= this_rq()->prev_steal_time; 269 269 270 270 /* 271 - * cputime_t may be less precise than nsecs (eg: if it's 272 - * based on jiffies). Lets cast the result to cputime 271 + * steal is in nsecs but our caller is expecting steal 272 + * time in jiffies. Lets cast the result to jiffies 273 273 * granularity and account the rest on the next rounds. 274 274 */ 275 - steal_ct = nsecs_to_cputime(steal); 276 - this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); 275 + steal_jiffies = nsecs_to_jiffies(steal); 276 + this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); 277 277 278 - account_steal_time(steal_ct); 279 - return steal_ct; 278 + account_steal_time(jiffies_to_cputime(steal_jiffies)); 279 + return steal_jiffies; 280 280 } 281 281 #endif 282 282 return false; ··· 668 668 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 669 669 670 670 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 671 - static unsigned long long vtime_delta(struct task_struct *tsk) 671 + static cputime_t vtime_delta(struct task_struct *tsk) 672 672 { 673 - unsigned long long clock; 673 + unsigned long now = READ_ONCE(jiffies); 674 674 675 - clock = local_clock(); 676 - if (clock < tsk->vtime_snap) 675 + if (time_before(now, (unsigned long)tsk->vtime_snap)) 677 676 return 0; 678 677 679 - return clock - tsk->vtime_snap; 678 + return jiffies_to_cputime(now - tsk->vtime_snap); 680 679 } 681 680 682 681 static cputime_t get_vtime_delta(struct task_struct *tsk) 683 682 { 684 - unsigned long long delta = vtime_delta(tsk); 683 + unsigned long now = READ_ONCE(jiffies); 684 + unsigned long delta = now - tsk->vtime_snap; 685 685 686 686 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); 687 - tsk->vtime_snap += delta; 687 + tsk->vtime_snap = now; 
688 688 689 - /* CHECKME: always safe to convert nsecs to cputime? */ 690 - return nsecs_to_cputime(delta); 689 + return jiffies_to_cputime(delta); 691 690 } 692 691 693 692 static void __vtime_account_system(struct task_struct *tsk) ··· 698 699 699 700 void vtime_account_system(struct task_struct *tsk) 700 701 { 702 + if (!vtime_delta(tsk)) 703 + return; 704 + 701 705 write_seqcount_begin(&tsk->vtime_seqcount); 702 706 __vtime_account_system(tsk); 703 707 write_seqcount_end(&tsk->vtime_seqcount); ··· 709 707 void vtime_gen_account_irq_exit(struct task_struct *tsk) 710 708 { 711 709 write_seqcount_begin(&tsk->vtime_seqcount); 712 - __vtime_account_system(tsk); 710 + if (vtime_delta(tsk)) 711 + __vtime_account_system(tsk); 713 712 if (context_tracking_in_user()) 714 713 tsk->vtime_snap_whence = VTIME_USER; 715 714 write_seqcount_end(&tsk->vtime_seqcount); ··· 721 718 cputime_t delta_cpu; 722 719 723 720 write_seqcount_begin(&tsk->vtime_seqcount); 724 - delta_cpu = get_vtime_delta(tsk); 725 721 tsk->vtime_snap_whence = VTIME_SYS; 726 - account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); 722 + if (vtime_delta(tsk)) { 723 + delta_cpu = get_vtime_delta(tsk); 724 + account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); 725 + } 727 726 write_seqcount_end(&tsk->vtime_seqcount); 728 727 } 729 728 730 729 void vtime_user_enter(struct task_struct *tsk) 731 730 { 732 731 write_seqcount_begin(&tsk->vtime_seqcount); 733 - __vtime_account_system(tsk); 732 + if (vtime_delta(tsk)) 733 + __vtime_account_system(tsk); 734 734 tsk->vtime_snap_whence = VTIME_USER; 735 735 write_seqcount_end(&tsk->vtime_seqcount); 736 736 } ··· 748 742 * that can thus safely catch up with a tickless delta. 
749 743 */ 750 744 write_seqcount_begin(&tsk->vtime_seqcount); 751 - __vtime_account_system(tsk); 745 + if (vtime_delta(tsk)) 746 + __vtime_account_system(tsk); 752 747 current->flags |= PF_VCPU; 753 748 write_seqcount_end(&tsk->vtime_seqcount); 754 749 } ··· 779 772 780 773 write_seqcount_begin(&current->vtime_seqcount); 781 774 current->vtime_snap_whence = VTIME_SYS; 782 - current->vtime_snap = sched_clock_cpu(smp_processor_id()); 775 + current->vtime_snap = jiffies; 783 776 write_seqcount_end(&current->vtime_seqcount); 784 777 } 785 778 ··· 790 783 local_irq_save(flags); 791 784 write_seqcount_begin(&t->vtime_seqcount); 792 785 t->vtime_snap_whence = VTIME_SYS; 793 - t->vtime_snap = sched_clock_cpu(cpu); 786 + t->vtime_snap = jiffies; 794 787 write_seqcount_end(&t->vtime_seqcount); 795 788 local_irq_restore(flags); 796 789 }
+27 -33
kernel/sched/deadline.c
··· 352 352 struct dl_rq *dl_rq = dl_rq_of_se(dl_se); 353 353 struct rq *rq = rq_of_dl_rq(dl_rq); 354 354 355 - WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); 355 + WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); 356 + 357 + /* 358 + * We are racing with the deadline timer. So, do nothing because 359 + * the deadline timer handler will take care of properly recharging 360 + * the runtime and postponing the deadline 361 + */ 362 + if (dl_se->dl_throttled) 363 + return; 356 364 357 365 /* 358 366 * We use the regular wall clock time to set deadlines in the ··· 369 361 */ 370 362 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 371 363 dl_se->runtime = pi_se->dl_runtime; 372 - dl_se->dl_new = 0; 373 364 } 374 365 375 366 /* ··· 405 398 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 406 399 dl_se->runtime = pi_se->dl_runtime; 407 400 } 401 + 402 + if (dl_se->dl_yielded && dl_se->runtime > 0) 403 + dl_se->runtime = 0; 408 404 409 405 /* 410 406 * We keep moving the deadline away until we get some ··· 510 500 struct dl_rq *dl_rq = dl_rq_of_se(dl_se); 511 501 struct rq *rq = rq_of_dl_rq(dl_rq); 512 502 513 - /* 514 - * The arrival of a new instance needs special treatment, i.e., 515 - * the actual scheduling parameters have to be "renewed". 516 - */ 517 - if (dl_se->dl_new) { 518 - setup_new_dl_entity(dl_se, pi_se); 519 - return; 520 - } 521 - 522 503 if (dl_time_before(dl_se->deadline, rq_clock(rq)) || 523 504 dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { 524 505 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; ··· 604 603 __dl_clear_params(p); 605 604 goto unlock; 606 605 } 607 - 608 - /* 609 - * This is possible if switched_from_dl() raced against a running 610 - * callback that took the above !dl_task() path and we've since then 611 - * switched back into SCHED_DEADLINE. 612 - * 613 - * There's nothing to do except drop our task reference. 
614 - */ 615 - if (dl_se->dl_new) 616 - goto unlock; 617 606 618 607 /* 619 608 * The task might have been boosted by someone else and might be in the ··· 726 735 * approach need further study. 727 736 */ 728 737 delta_exec = rq_clock_task(rq) - curr->se.exec_start; 729 - if (unlikely((s64)delta_exec <= 0)) 738 + if (unlikely((s64)delta_exec <= 0)) { 739 + if (unlikely(dl_se->dl_yielded)) 740 + goto throttle; 730 741 return; 742 + } 731 743 732 744 schedstat_set(curr->se.statistics.exec_max, 733 745 max(curr->se.statistics.exec_max, delta_exec)); ··· 743 749 744 750 sched_rt_avg_update(rq, delta_exec); 745 751 746 - dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; 747 - if (dl_runtime_exceeded(dl_se)) { 752 + dl_se->runtime -= delta_exec; 753 + 754 + throttle: 755 + if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { 748 756 dl_se->dl_throttled = 1; 749 757 __dequeue_task_dl(rq, curr, 0); 750 758 if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) ··· 913 917 * parameters of the task might need updating. Otherwise, 914 918 * we want a replenishment of its runtime. 915 919 */ 916 - if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) 920 + if (flags & ENQUEUE_WAKEUP) 917 921 update_dl_entity(dl_se, pi_se); 918 922 else if (flags & ENQUEUE_REPLENISH) 919 923 replenish_dl_entity(dl_se, pi_se); ··· 990 994 */ 991 995 static void yield_task_dl(struct rq *rq) 992 996 { 993 - struct task_struct *p = rq->curr; 994 - 995 997 /* 996 998 * We make the task go to sleep until its current deadline by 997 999 * forcing its runtime to zero. This way, update_curr_dl() stops 998 1000 * it and the bandwidth timer will wake it up and will give it 999 1001 * new scheduling parameters (thanks to dl_yielded=1). 
1000 1002 */ 1001 - if (p->dl.runtime > 0) { 1002 - rq->curr->dl.dl_yielded = 1; 1003 - p->dl.runtime = 0; 1004 - } 1003 + rq->curr->dl.dl_yielded = 1; 1004 + 1005 1005 update_rq_clock(rq); 1006 1006 update_curr_dl(rq); 1007 1007 /* ··· 1714 1722 */ 1715 1723 static void switched_to_dl(struct rq *rq, struct task_struct *p) 1716 1724 { 1725 + if (dl_time_before(p->dl.deadline, rq_clock(rq))) 1726 + setup_new_dl_entity(&p->dl, &p->dl); 1727 + 1717 1728 if (task_on_rq_queued(p) && rq->curr != p) { 1718 1729 #ifdef CONFIG_SMP 1719 1730 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) ··· 1763 1768 */ 1764 1769 resched_curr(rq); 1765 1770 #endif /* CONFIG_SMP */ 1766 - } else 1767 - switched_to_dl(rq, p); 1771 + } 1768 1772 } 1769 1773 1770 1774 const struct sched_class dl_sched_class = {
+367 -48
kernel/sched/debug.c
··· 16 16 #include <linux/kallsyms.h> 17 17 #include <linux/utsname.h> 18 18 #include <linux/mempolicy.h> 19 + #include <linux/debugfs.h> 19 20 20 21 #include "sched.h" 21 22 ··· 59 58 60 59 #define SPLIT_NS(x) nsec_high(x), nsec_low(x) 61 60 61 + #define SCHED_FEAT(name, enabled) \ 62 + #name , 63 + 64 + static const char * const sched_feat_names[] = { 65 + #include "features.h" 66 + }; 67 + 68 + #undef SCHED_FEAT 69 + 70 + static int sched_feat_show(struct seq_file *m, void *v) 71 + { 72 + int i; 73 + 74 + for (i = 0; i < __SCHED_FEAT_NR; i++) { 75 + if (!(sysctl_sched_features & (1UL << i))) 76 + seq_puts(m, "NO_"); 77 + seq_printf(m, "%s ", sched_feat_names[i]); 78 + } 79 + seq_puts(m, "\n"); 80 + 81 + return 0; 82 + } 83 + 84 + #ifdef HAVE_JUMP_LABEL 85 + 86 + #define jump_label_key__true STATIC_KEY_INIT_TRUE 87 + #define jump_label_key__false STATIC_KEY_INIT_FALSE 88 + 89 + #define SCHED_FEAT(name, enabled) \ 90 + jump_label_key__##enabled , 91 + 92 + struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { 93 + #include "features.h" 94 + }; 95 + 96 + #undef SCHED_FEAT 97 + 98 + static void sched_feat_disable(int i) 99 + { 100 + static_key_disable(&sched_feat_keys[i]); 101 + } 102 + 103 + static void sched_feat_enable(int i) 104 + { 105 + static_key_enable(&sched_feat_keys[i]); 106 + } 107 + #else 108 + static void sched_feat_disable(int i) { }; 109 + static void sched_feat_enable(int i) { }; 110 + #endif /* HAVE_JUMP_LABEL */ 111 + 112 + static int sched_feat_set(char *cmp) 113 + { 114 + int i; 115 + int neg = 0; 116 + 117 + if (strncmp(cmp, "NO_", 3) == 0) { 118 + neg = 1; 119 + cmp += 3; 120 + } 121 + 122 + for (i = 0; i < __SCHED_FEAT_NR; i++) { 123 + if (strcmp(cmp, sched_feat_names[i]) == 0) { 124 + if (neg) { 125 + sysctl_sched_features &= ~(1UL << i); 126 + sched_feat_disable(i); 127 + } else { 128 + sysctl_sched_features |= (1UL << i); 129 + sched_feat_enable(i); 130 + } 131 + break; 132 + } 133 + } 134 + 135 + return i; 136 + } 137 + 138 + static 
ssize_t 139 + sched_feat_write(struct file *filp, const char __user *ubuf, 140 + size_t cnt, loff_t *ppos) 141 + { 142 + char buf[64]; 143 + char *cmp; 144 + int i; 145 + struct inode *inode; 146 + 147 + if (cnt > 63) 148 + cnt = 63; 149 + 150 + if (copy_from_user(&buf, ubuf, cnt)) 151 + return -EFAULT; 152 + 153 + buf[cnt] = 0; 154 + cmp = strstrip(buf); 155 + 156 + /* Ensure the static_key remains in a consistent state */ 157 + inode = file_inode(filp); 158 + inode_lock(inode); 159 + i = sched_feat_set(cmp); 160 + inode_unlock(inode); 161 + if (i == __SCHED_FEAT_NR) 162 + return -EINVAL; 163 + 164 + *ppos += cnt; 165 + 166 + return cnt; 167 + } 168 + 169 + static int sched_feat_open(struct inode *inode, struct file *filp) 170 + { 171 + return single_open(filp, sched_feat_show, NULL); 172 + } 173 + 174 + static const struct file_operations sched_feat_fops = { 175 + .open = sched_feat_open, 176 + .write = sched_feat_write, 177 + .read = seq_read, 178 + .llseek = seq_lseek, 179 + .release = single_release, 180 + }; 181 + 182 + static __init int sched_init_debug(void) 183 + { 184 + debugfs_create_file("sched_features", 0644, NULL, NULL, 185 + &sched_feat_fops); 186 + 187 + return 0; 188 + } 189 + late_initcall(sched_init_debug); 190 + 191 + #ifdef CONFIG_SMP 192 + 193 + #ifdef CONFIG_SYSCTL 194 + 195 + static struct ctl_table sd_ctl_dir[] = { 196 + { 197 + .procname = "sched_domain", 198 + .mode = 0555, 199 + }, 200 + {} 201 + }; 202 + 203 + static struct ctl_table sd_ctl_root[] = { 204 + { 205 + .procname = "kernel", 206 + .mode = 0555, 207 + .child = sd_ctl_dir, 208 + }, 209 + {} 210 + }; 211 + 212 + static struct ctl_table *sd_alloc_ctl_entry(int n) 213 + { 214 + struct ctl_table *entry = 215 + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 216 + 217 + return entry; 218 + } 219 + 220 + static void sd_free_ctl_entry(struct ctl_table **tablep) 221 + { 222 + struct ctl_table *entry; 223 + 224 + /* 225 + * In the intermediate directories, both the child directory 
and 226 + * procname are dynamically allocated and could fail but the mode 227 + * will always be set. In the lowest directory the names are 228 + * static strings and all have proc handlers. 229 + */ 230 + for (entry = *tablep; entry->mode; entry++) { 231 + if (entry->child) 232 + sd_free_ctl_entry(&entry->child); 233 + if (entry->proc_handler == NULL) 234 + kfree(entry->procname); 235 + } 236 + 237 + kfree(*tablep); 238 + *tablep = NULL; 239 + } 240 + 241 + static int min_load_idx = 0; 242 + static int max_load_idx = CPU_LOAD_IDX_MAX-1; 243 + 244 + static void 245 + set_table_entry(struct ctl_table *entry, 246 + const char *procname, void *data, int maxlen, 247 + umode_t mode, proc_handler *proc_handler, 248 + bool load_idx) 249 + { 250 + entry->procname = procname; 251 + entry->data = data; 252 + entry->maxlen = maxlen; 253 + entry->mode = mode; 254 + entry->proc_handler = proc_handler; 255 + 256 + if (load_idx) { 257 + entry->extra1 = &min_load_idx; 258 + entry->extra2 = &max_load_idx; 259 + } 260 + } 261 + 262 + static struct ctl_table * 263 + sd_alloc_ctl_domain_table(struct sched_domain *sd) 264 + { 265 + struct ctl_table *table = sd_alloc_ctl_entry(14); 266 + 267 + if (table == NULL) 268 + return NULL; 269 + 270 + set_table_entry(&table[0], "min_interval", &sd->min_interval, 271 + sizeof(long), 0644, proc_doulongvec_minmax, false); 272 + set_table_entry(&table[1], "max_interval", &sd->max_interval, 273 + sizeof(long), 0644, proc_doulongvec_minmax, false); 274 + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 275 + sizeof(int), 0644, proc_dointvec_minmax, true); 276 + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 277 + sizeof(int), 0644, proc_dointvec_minmax, true); 278 + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 279 + sizeof(int), 0644, proc_dointvec_minmax, true); 280 + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 281 + sizeof(int), 0644, proc_dointvec_minmax, true); 282 + set_table_entry(&table[6], "forkexec_idx", 
&sd->forkexec_idx, 283 + sizeof(int), 0644, proc_dointvec_minmax, true); 284 + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 285 + sizeof(int), 0644, proc_dointvec_minmax, false); 286 + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 287 + sizeof(int), 0644, proc_dointvec_minmax, false); 288 + set_table_entry(&table[9], "cache_nice_tries", 289 + &sd->cache_nice_tries, 290 + sizeof(int), 0644, proc_dointvec_minmax, false); 291 + set_table_entry(&table[10], "flags", &sd->flags, 292 + sizeof(int), 0644, proc_dointvec_minmax, false); 293 + set_table_entry(&table[11], "max_newidle_lb_cost", 294 + &sd->max_newidle_lb_cost, 295 + sizeof(long), 0644, proc_doulongvec_minmax, false); 296 + set_table_entry(&table[12], "name", sd->name, 297 + CORENAME_MAX_SIZE, 0444, proc_dostring, false); 298 + /* &table[13] is terminator */ 299 + 300 + return table; 301 + } 302 + 303 + static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) 304 + { 305 + struct ctl_table *entry, *table; 306 + struct sched_domain *sd; 307 + int domain_num = 0, i; 308 + char buf[32]; 309 + 310 + for_each_domain(cpu, sd) 311 + domain_num++; 312 + entry = table = sd_alloc_ctl_entry(domain_num + 1); 313 + if (table == NULL) 314 + return NULL; 315 + 316 + i = 0; 317 + for_each_domain(cpu, sd) { 318 + snprintf(buf, 32, "domain%d", i); 319 + entry->procname = kstrdup(buf, GFP_KERNEL); 320 + entry->mode = 0555; 321 + entry->child = sd_alloc_ctl_domain_table(sd); 322 + entry++; 323 + i++; 324 + } 325 + return table; 326 + } 327 + 328 + static struct ctl_table_header *sd_sysctl_header; 329 + void register_sched_domain_sysctl(void) 330 + { 331 + int i, cpu_num = num_possible_cpus(); 332 + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 333 + char buf[32]; 334 + 335 + WARN_ON(sd_ctl_dir[0].child); 336 + sd_ctl_dir[0].child = entry; 337 + 338 + if (entry == NULL) 339 + return; 340 + 341 + for_each_possible_cpu(i) { 342 + snprintf(buf, 32, "cpu%d", i); 343 + entry->procname = 
kstrdup(buf, GFP_KERNEL); 344 + entry->mode = 0555; 345 + entry->child = sd_alloc_ctl_cpu_table(i); 346 + entry++; 347 + } 348 + 349 + WARN_ON(sd_sysctl_header); 350 + sd_sysctl_header = register_sysctl_table(sd_ctl_root); 351 + } 352 + 353 + /* may be called multiple times per register */ 354 + void unregister_sched_domain_sysctl(void) 355 + { 356 + unregister_sysctl_table(sd_sysctl_header); 357 + sd_sysctl_header = NULL; 358 + if (sd_ctl_dir[0].child) 359 + sd_free_ctl_entry(&sd_ctl_dir[0].child); 360 + } 361 + #endif /* CONFIG_SYSCTL */ 362 + #endif /* CONFIG_SMP */ 363 + 62 364 #ifdef CONFIG_FAIR_GROUP_SCHED 63 365 static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) 64 366 { ··· 379 75 PN(se->vruntime); 380 76 PN(se->sum_exec_runtime); 381 77 #ifdef CONFIG_SCHEDSTATS 382 - PN(se->statistics.wait_start); 383 - PN(se->statistics.sleep_start); 384 - PN(se->statistics.block_start); 385 - PN(se->statistics.sleep_max); 386 - PN(se->statistics.block_max); 387 - PN(se->statistics.exec_max); 388 - PN(se->statistics.slice_max); 389 - PN(se->statistics.wait_max); 390 - PN(se->statistics.wait_sum); 391 - P(se->statistics.wait_count); 78 + if (schedstat_enabled()) { 79 + PN(se->statistics.wait_start); 80 + PN(se->statistics.sleep_start); 81 + PN(se->statistics.block_start); 82 + PN(se->statistics.sleep_max); 83 + PN(se->statistics.block_max); 84 + PN(se->statistics.exec_max); 85 + PN(se->statistics.slice_max); 86 + PN(se->statistics.wait_max); 87 + PN(se->statistics.wait_sum); 88 + P(se->statistics.wait_count); 89 + } 392 90 #endif 393 91 P(se->load.weight); 394 92 #ifdef CONFIG_SMP ··· 428 122 (long long)(p->nvcsw + p->nivcsw), 429 123 p->prio); 430 124 #ifdef CONFIG_SCHEDSTATS 431 - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 432 - SPLIT_NS(p->se.statistics.wait_sum), 433 - SPLIT_NS(p->se.sum_exec_runtime), 434 - SPLIT_NS(p->se.statistics.sum_sleep_runtime)); 125 + if (schedstat_enabled()) { 126 + SEQ_printf(m, "%9Ld.%06ld 
%9Ld.%06ld %9Ld.%06ld", 127 + SPLIT_NS(p->se.statistics.wait_sum), 128 + SPLIT_NS(p->se.sum_exec_runtime), 129 + SPLIT_NS(p->se.statistics.sum_sleep_runtime)); 130 + } 435 131 #else 436 132 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 437 133 0LL, 0L, ··· 566 258 567 259 void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) 568 260 { 261 + struct dl_bw *dl_bw; 262 + 569 263 SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); 570 264 SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); 265 + #ifdef CONFIG_SMP 266 + dl_bw = &cpu_rq(cpu)->rd->dl_bw; 267 + #else 268 + dl_bw = &dl_rq->dl_bw; 269 + #endif 270 + SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); 271 + SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); 571 272 } 572 273 573 274 extern __read_mostly int sched_clock_running; ··· 630 313 #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); 631 314 #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); 632 315 633 - P(yld_count); 634 - 635 - P(sched_count); 636 - P(sched_goidle); 637 316 #ifdef CONFIG_SMP 638 317 P64(avg_idle); 639 318 P64(max_idle_balance_cost); 640 319 #endif 641 320 642 - P(ttwu_count); 643 - P(ttwu_local); 321 + if (schedstat_enabled()) { 322 + P(yld_count); 323 + P(sched_count); 324 + P(sched_goidle); 325 + P(ttwu_count); 326 + P(ttwu_local); 327 + } 644 328 645 329 #undef P 646 330 #undef P64 ··· 887 569 nr_switches = p->nvcsw + p->nivcsw; 888 570 889 571 #ifdef CONFIG_SCHEDSTATS 890 - PN(se.statistics.sum_sleep_runtime); 891 - PN(se.statistics.wait_start); 892 - PN(se.statistics.sleep_start); 893 - PN(se.statistics.block_start); 894 - PN(se.statistics.sleep_max); 895 - PN(se.statistics.block_max); 896 - PN(se.statistics.exec_max); 897 - PN(se.statistics.slice_max); 898 - PN(se.statistics.wait_max); 899 - PN(se.statistics.wait_sum); 900 - P(se.statistics.wait_count); 901 - PN(se.statistics.iowait_sum); 902 - P(se.statistics.iowait_count); 903 572 P(se.nr_migrations); 904 - 
P(se.statistics.nr_migrations_cold); 905 - P(se.statistics.nr_failed_migrations_affine); 906 - P(se.statistics.nr_failed_migrations_running); 907 - P(se.statistics.nr_failed_migrations_hot); 908 - P(se.statistics.nr_forced_migrations); 909 - P(se.statistics.nr_wakeups); 910 - P(se.statistics.nr_wakeups_sync); 911 - P(se.statistics.nr_wakeups_migrate); 912 - P(se.statistics.nr_wakeups_local); 913 - P(se.statistics.nr_wakeups_remote); 914 - P(se.statistics.nr_wakeups_affine); 915 - P(se.statistics.nr_wakeups_affine_attempts); 916 - P(se.statistics.nr_wakeups_passive); 917 - P(se.statistics.nr_wakeups_idle); 918 573 919 - { 574 + if (schedstat_enabled()) { 920 575 u64 avg_atom, avg_per_cpu; 576 + 577 + PN(se.statistics.sum_sleep_runtime); 578 + PN(se.statistics.wait_start); 579 + PN(se.statistics.sleep_start); 580 + PN(se.statistics.block_start); 581 + PN(se.statistics.sleep_max); 582 + PN(se.statistics.block_max); 583 + PN(se.statistics.exec_max); 584 + PN(se.statistics.slice_max); 585 + PN(se.statistics.wait_max); 586 + PN(se.statistics.wait_sum); 587 + P(se.statistics.wait_count); 588 + PN(se.statistics.iowait_sum); 589 + P(se.statistics.iowait_count); 590 + P(se.statistics.nr_migrations_cold); 591 + P(se.statistics.nr_failed_migrations_affine); 592 + P(se.statistics.nr_failed_migrations_running); 593 + P(se.statistics.nr_failed_migrations_hot); 594 + P(se.statistics.nr_forced_migrations); 595 + P(se.statistics.nr_wakeups); 596 + P(se.statistics.nr_wakeups_sync); 597 + P(se.statistics.nr_wakeups_migrate); 598 + P(se.statistics.nr_wakeups_local); 599 + P(se.statistics.nr_wakeups_remote); 600 + P(se.statistics.nr_wakeups_affine); 601 + P(se.statistics.nr_wakeups_affine_attempts); 602 + P(se.statistics.nr_wakeups_passive); 603 + P(se.statistics.nr_wakeups_idle); 921 604 922 605 avg_atom = p->se.sum_exec_runtime; 923 606 if (nr_switches)
+181 -116
kernel/sched/fair.c
··· 20 20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra 21 21 */ 22 22 23 - #include <linux/latencytop.h> 24 23 #include <linux/sched.h> 24 + #include <linux/latencytop.h> 25 25 #include <linux/cpumask.h> 26 26 #include <linux/cpuidle.h> 27 27 #include <linux/slab.h> ··· 755 755 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 756 756 { 757 757 struct task_struct *p; 758 - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; 758 + u64 delta; 759 + 760 + delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; 759 761 760 762 if (entity_is_task(se)) { 761 763 p = task_of(se); ··· 778 776 se->statistics.wait_sum += delta; 779 777 se->statistics.wait_start = 0; 780 778 } 781 - #else 782 - static inline void 783 - update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 784 - { 785 - } 786 - 787 - static inline void 788 - update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 789 - { 790 - } 791 - #endif 792 779 793 780 /* 794 781 * Task is being enqueued - update stats: 795 782 */ 796 - static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 783 + static inline void 784 + update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 797 785 { 798 786 /* 799 787 * Are we enqueueing a waiting task? 
(for current tasks ··· 794 802 } 795 803 796 804 static inline void 797 - update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 805 + update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 798 806 { 799 807 /* 800 808 * Mark the end of the wait period if dequeueing a ··· 802 810 */ 803 811 if (se != cfs_rq->curr) 804 812 update_stats_wait_end(cfs_rq, se); 813 + 814 + if (flags & DEQUEUE_SLEEP) { 815 + if (entity_is_task(se)) { 816 + struct task_struct *tsk = task_of(se); 817 + 818 + if (tsk->state & TASK_INTERRUPTIBLE) 819 + se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); 820 + if (tsk->state & TASK_UNINTERRUPTIBLE) 821 + se->statistics.block_start = rq_clock(rq_of(cfs_rq)); 822 + } 823 + } 824 + 805 825 } 826 + #else 827 + static inline void 828 + update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 829 + { 830 + } 831 + 832 + static inline void 833 + update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 834 + { 835 + } 836 + 837 + static inline void 838 + update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 839 + { 840 + } 841 + 842 + static inline void 843 + update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 844 + { 845 + } 846 + #endif 806 847 807 848 /* 808 849 * We are picking a new current task - update its stats: ··· 932 907 spinlock_t lock; /* nr_tasks, tasks */ 933 908 int nr_tasks; 934 909 pid_t gid; 910 + int active_nodes; 935 911 936 912 struct rcu_head rcu; 937 - nodemask_t active_nodes; 938 913 unsigned long total_faults; 914 + unsigned long max_faults_cpu; 939 915 /* 940 916 * Faults_cpu is used to decide whether memory should move 941 917 * towards the CPU. 
As a consequence, these stats are weighted ··· 993 967 { 994 968 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] + 995 969 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; 970 + } 971 + 972 + /* 973 + * A node triggering more than 1/3 as many NUMA faults as the maximum is 974 + * considered part of a numa group's pseudo-interleaving set. Migrations 975 + * between these nodes are slowed down, to allow things to settle down. 976 + */ 977 + #define ACTIVE_NODE_FRACTION 3 978 + 979 + static bool numa_is_active_node(int nid, struct numa_group *ng) 980 + { 981 + return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; 996 982 } 997 983 998 984 /* Handle placement on systems where not all nodes are directly connected. */ ··· 1156 1118 return true; 1157 1119 1158 1120 /* 1159 - * Do not migrate if the destination is not a node that 1160 - * is actively used by this numa group. 1121 + * Destination node is much more heavily used than the source 1122 + * node? Allow migration. 1161 1123 */ 1162 - if (!node_isset(dst_nid, ng->active_nodes)) 1163 - return false; 1164 - 1165 - /* 1166 - * Source is a node that is not actively used by this 1167 - * numa group, while the destination is. Migrate. 1168 - */ 1169 - if (!node_isset(src_nid, ng->active_nodes)) 1124 + if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * 1125 + ACTIVE_NODE_FRACTION) 1170 1126 return true; 1171 1127 1172 1128 /* 1173 - * Both source and destination are nodes in active 1174 - * use by this numa group. Maximize memory bandwidth 1175 - * by migrating from more heavily used groups, to less 1176 - * heavily used ones, spreading the load around. 1177 - * Use a 1/4 hysteresis to avoid spurious page movement. 
1129 + * Distribute memory according to CPU & memory use on each node, 1130 + * with 3/4 hysteresis to avoid unnecessary memory migrations: 1131 + * 1132 + * faults_cpu(dst) 3 faults_cpu(src) 1133 + * --------------- * - > --------------- 1134 + * faults_mem(dst) 4 faults_mem(src) 1178 1135 */ 1179 - return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); 1136 + return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 > 1137 + group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; 1180 1138 } 1181 1139 1182 1140 static unsigned long weighted_cpuload(const int cpu); ··· 1518 1484 1519 1485 .best_task = NULL, 1520 1486 .best_imp = 0, 1521 - .best_cpu = -1 1487 + .best_cpu = -1, 1522 1488 }; 1523 1489 struct sched_domain *sd; 1524 1490 unsigned long taskweight, groupweight; ··· 1570 1536 * multiple NUMA nodes; in order to better consolidate the group, 1571 1537 * we need to check other locations. 1572 1538 */ 1573 - if (env.best_cpu == -1 || (p->numa_group && 1574 - nodes_weight(p->numa_group->active_nodes) > 1)) { 1539 + if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { 1575 1540 for_each_online_node(nid) { 1576 1541 if (nid == env.src_nid || nid == p->numa_preferred_nid) 1577 1542 continue; ··· 1605 1572 * trying for a better one later. Do not set the preferred node here. 1606 1573 */ 1607 1574 if (p->numa_group) { 1575 + struct numa_group *ng = p->numa_group; 1576 + 1608 1577 if (env.best_cpu == -1) 1609 1578 nid = env.src_nid; 1610 1579 else 1611 1580 nid = env.dst_nid; 1612 1581 1613 - if (node_isset(nid, p->numa_group->active_nodes)) 1582 + if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) 1614 1583 sched_setnuma(p, env.dst_nid); 1615 1584 } 1616 1585 ··· 1662 1627 } 1663 1628 1664 1629 /* 1665 - * Find the nodes on which the workload is actively running. We do this by 1630 + * Find out how many nodes on the workload is actively running on. 
Do this by 1666 1631 * tracking the nodes from which NUMA hinting faults are triggered. This can 1667 1632 * be different from the set of nodes where the workload's memory is currently 1668 1633 * located. 1669 - * 1670 - * The bitmask is used to make smarter decisions on when to do NUMA page 1671 - * migrations, To prevent flip-flopping, and excessive page migrations, nodes 1672 - * are added when they cause over 6/16 of the maximum number of faults, but 1673 - * only removed when they drop below 3/16. 1674 1634 */ 1675 - static void update_numa_active_node_mask(struct numa_group *numa_group) 1635 + static void numa_group_count_active_nodes(struct numa_group *numa_group) 1676 1636 { 1677 1637 unsigned long faults, max_faults = 0; 1678 - int nid; 1638 + int nid, active_nodes = 0; 1679 1639 1680 1640 for_each_online_node(nid) { 1681 1641 faults = group_faults_cpu(numa_group, nid); ··· 1680 1650 1681 1651 for_each_online_node(nid) { 1682 1652 faults = group_faults_cpu(numa_group, nid); 1683 - if (!node_isset(nid, numa_group->active_nodes)) { 1684 - if (faults > max_faults * 6 / 16) 1685 - node_set(nid, numa_group->active_nodes); 1686 - } else if (faults < max_faults * 3 / 16) 1687 - node_clear(nid, numa_group->active_nodes); 1653 + if (faults * ACTIVE_NODE_FRACTION > max_faults) 1654 + active_nodes++; 1688 1655 } 1656 + 1657 + numa_group->max_faults_cpu = max_faults; 1658 + numa_group->active_nodes = active_nodes; 1689 1659 } 1690 1660 1691 1661 /* ··· 1976 1946 update_task_scan_period(p, fault_types[0], fault_types[1]); 1977 1947 1978 1948 if (p->numa_group) { 1979 - update_numa_active_node_mask(p->numa_group); 1949 + numa_group_count_active_nodes(p->numa_group); 1980 1950 spin_unlock_irq(group_lock); 1981 1951 max_nid = preferred_group_nid(p, max_group_nid); 1982 1952 } ··· 2020 1990 return; 2021 1991 2022 1992 atomic_set(&grp->refcount, 1); 1993 + grp->active_nodes = 1; 1994 + grp->max_faults_cpu = 0; 2023 1995 spin_lock_init(&grp->lock); 2024 1996 grp->gid = 
p->pid; 2025 1997 /* Second half of the array tracks nids where faults happen */ 2026 1998 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * 2027 1999 nr_node_ids; 2028 - 2029 - node_set(task_node(current), grp->active_nodes); 2030 2000 2031 2001 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 2032 2002 grp->faults[i] = p->numa_faults[i]; ··· 2141 2111 bool migrated = flags & TNF_MIGRATED; 2142 2112 int cpu_node = task_node(current); 2143 2113 int local = !!(flags & TNF_FAULT_LOCAL); 2114 + struct numa_group *ng; 2144 2115 int priv; 2145 2116 2146 2117 if (!static_branch_likely(&sched_numa_balancing)) ··· 2182 2151 * actively using should be counted as local. This allows the 2183 2152 * scan rate to slow down when a workload has settled down. 2184 2153 */ 2185 - if (!priv && !local && p->numa_group && 2186 - node_isset(cpu_node, p->numa_group->active_nodes) && 2187 - node_isset(mem_node, p->numa_group->active_nodes)) 2154 + ng = p->numa_group; 2155 + if (!priv && !local && ng && ng->active_nodes > 1 && 2156 + numa_is_active_node(cpu_node, ng) && 2157 + numa_is_active_node(mem_node, ng)) 2188 2158 local = 1; 2189 2159 2190 2160 task_numa_placement(p); ··· 3134 3102 3135 3103 static void check_enqueue_throttle(struct cfs_rq *cfs_rq); 3136 3104 3105 + static inline void check_schedstat_required(void) 3106 + { 3107 + #ifdef CONFIG_SCHEDSTATS 3108 + if (schedstat_enabled()) 3109 + return; 3110 + 3111 + /* Force schedstat enabled if a dependent tracepoint is active */ 3112 + if (trace_sched_stat_wait_enabled() || 3113 + trace_sched_stat_sleep_enabled() || 3114 + trace_sched_stat_iowait_enabled() || 3115 + trace_sched_stat_blocked_enabled() || 3116 + trace_sched_stat_runtime_enabled()) { 3117 + pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " 3118 + "stat_blocked and stat_runtime require the " 3119 + "kernel parameter schedstats=enabled or " 3120 + "kernel.sched_schedstats=1\n"); 3121 + } 3122 + #endif 3123 + } 3124 + 3137 3125 static 
void 3138 3126 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 3139 3127 { ··· 3174 3122 3175 3123 if (flags & ENQUEUE_WAKEUP) { 3176 3124 place_entity(cfs_rq, se, 0); 3177 - enqueue_sleeper(cfs_rq, se); 3125 + if (schedstat_enabled()) 3126 + enqueue_sleeper(cfs_rq, se); 3178 3127 } 3179 3128 3180 - update_stats_enqueue(cfs_rq, se); 3181 - check_spread(cfs_rq, se); 3129 + check_schedstat_required(); 3130 + if (schedstat_enabled()) { 3131 + update_stats_enqueue(cfs_rq, se); 3132 + check_spread(cfs_rq, se); 3133 + } 3182 3134 if (se != cfs_rq->curr) 3183 3135 __enqueue_entity(cfs_rq, se); 3184 3136 se->on_rq = 1; ··· 3249 3193 update_curr(cfs_rq); 3250 3194 dequeue_entity_load_avg(cfs_rq, se); 3251 3195 3252 - update_stats_dequeue(cfs_rq, se); 3253 - if (flags & DEQUEUE_SLEEP) { 3254 - #ifdef CONFIG_SCHEDSTATS 3255 - if (entity_is_task(se)) { 3256 - struct task_struct *tsk = task_of(se); 3257 - 3258 - if (tsk->state & TASK_INTERRUPTIBLE) 3259 - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); 3260 - if (tsk->state & TASK_UNINTERRUPTIBLE) 3261 - se->statistics.block_start = rq_clock(rq_of(cfs_rq)); 3262 - } 3263 - #endif 3264 - } 3196 + if (schedstat_enabled()) 3197 + update_stats_dequeue(cfs_rq, se, flags); 3265 3198 3266 3199 clear_buddies(cfs_rq, se); 3267 3200 ··· 3324 3279 * a CPU. So account for the time it spent waiting on the 3325 3280 * runqueue. 3326 3281 */ 3327 - update_stats_wait_end(cfs_rq, se); 3282 + if (schedstat_enabled()) 3283 + update_stats_wait_end(cfs_rq, se); 3328 3284 __dequeue_entity(cfs_rq, se); 3329 3285 update_load_avg(se, 1); 3330 3286 } ··· 3338 3292 * least twice that of our own weight (i.e. 
dont track it 3339 3293 * when there are only lesser-weight tasks around): 3340 3294 */ 3341 - if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 3295 + if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 3342 3296 se->statistics.slice_max = max(se->statistics.slice_max, 3343 3297 se->sum_exec_runtime - se->prev_sum_exec_runtime); 3344 3298 } ··· 3421 3375 /* throttle cfs_rqs exceeding runtime */ 3422 3376 check_cfs_rq_runtime(cfs_rq); 3423 3377 3424 - check_spread(cfs_rq, prev); 3378 + if (schedstat_enabled()) { 3379 + check_spread(cfs_rq, prev); 3380 + if (prev->on_rq) 3381 + update_stats_wait_start(cfs_rq, prev); 3382 + } 3383 + 3425 3384 if (prev->on_rq) { 3426 - update_stats_wait_start(cfs_rq, prev); 3427 3385 /* Put 'current' back into the tree. */ 3428 3386 __enqueue_entity(cfs_rq, prev); 3429 3387 /* in !on_rq case, update occurred at dequeue */ ··· 4509 4459 4510 4460 /* scale is effectively 1 << i now, and >> i divides by scale */ 4511 4461 4512 - old_load = this_rq->cpu_load[i] - tickless_load; 4462 + old_load = this_rq->cpu_load[i]; 4513 4463 old_load = decay_load_missed(old_load, pending_updates - 1, i); 4514 - old_load += tickless_load; 4464 + if (tickless_load) { 4465 + old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); 4466 + /* 4467 + * old_load can never be a negative value because a 4468 + * decayed tickless_load cannot be greater than the 4469 + * original tickless_load. 4470 + */ 4471 + old_load += tickless_load; 4472 + } 4515 4473 new_load = this_load; 4516 4474 /* 4517 4475 * Round up the averaging division if load is increasing. 
This ··· 4542 4484 } 4543 4485 4544 4486 #ifdef CONFIG_NO_HZ_COMMON 4487 + static void __update_cpu_load_nohz(struct rq *this_rq, 4488 + unsigned long curr_jiffies, 4489 + unsigned long load, 4490 + int active) 4491 + { 4492 + unsigned long pending_updates; 4493 + 4494 + pending_updates = curr_jiffies - this_rq->last_load_update_tick; 4495 + if (pending_updates) { 4496 + this_rq->last_load_update_tick = curr_jiffies; 4497 + /* 4498 + * In the regular NOHZ case, we were idle, this means load 0. 4499 + * In the NOHZ_FULL case, we were non-idle, we should consider 4500 + * its weighted load. 4501 + */ 4502 + __update_cpu_load(this_rq, load, pending_updates, active); 4503 + } 4504 + } 4505 + 4545 4506 /* 4546 4507 * There is no sane way to deal with nohz on smp when using jiffies because the 4547 4508 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading ··· 4578 4501 * Called from nohz_idle_balance() to update the load ratings before doing the 4579 4502 * idle balance. 4580 4503 */ 4581 - static void update_idle_cpu_load(struct rq *this_rq) 4504 + static void update_cpu_load_idle(struct rq *this_rq) 4582 4505 { 4583 - unsigned long curr_jiffies = READ_ONCE(jiffies); 4584 - unsigned long load = weighted_cpuload(cpu_of(this_rq)); 4585 - unsigned long pending_updates; 4586 - 4587 4506 /* 4588 4507 * bail if there's load or we're actually up-to-date. 4589 4508 */ 4590 - if (load || curr_jiffies == this_rq->last_load_update_tick) 4509 + if (weighted_cpuload(cpu_of(this_rq))) 4591 4510 return; 4592 4511 4593 - pending_updates = curr_jiffies - this_rq->last_load_update_tick; 4594 - this_rq->last_load_update_tick = curr_jiffies; 4595 - 4596 - __update_cpu_load(this_rq, load, pending_updates, 0); 4512 + __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0); 4597 4513 } 4598 4514 4599 4515 /* ··· 4597 4527 struct rq *this_rq = this_rq(); 4598 4528 unsigned long curr_jiffies = READ_ONCE(jiffies); 4599 4529 unsigned long load = active ? 
weighted_cpuload(cpu_of(this_rq)) : 0; 4600 - unsigned long pending_updates; 4601 4530 4602 4531 if (curr_jiffies == this_rq->last_load_update_tick) 4603 4532 return; 4604 4533 4605 4534 raw_spin_lock(&this_rq->lock); 4606 - pending_updates = curr_jiffies - this_rq->last_load_update_tick; 4607 - if (pending_updates) { 4608 - this_rq->last_load_update_tick = curr_jiffies; 4609 - /* 4610 - * In the regular NOHZ case, we were idle, this means load 0. 4611 - * In the NOHZ_FULL case, we were non-idle, we should consider 4612 - * its weighted load. 4613 - */ 4614 - __update_cpu_load(this_rq, load, pending_updates, active); 4615 - } 4535 + __update_cpu_load_nohz(this_rq, curr_jiffies, load, active); 4616 4536 raw_spin_unlock(&this_rq->lock); 4617 4537 } 4618 4538 #endif /* CONFIG_NO_HZ */ ··· 4614 4554 { 4615 4555 unsigned long load = weighted_cpuload(cpu_of(this_rq)); 4616 4556 /* 4617 - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). 4557 + * See the mess around update_cpu_load_idle() / update_cpu_load_nohz(). 
4618 4558 */ 4619 4559 this_rq->last_load_update_tick = jiffies; 4620 4560 __update_cpu_load(this_rq, load, 1, 1); ··· 7908 7848 if (time_after_eq(jiffies, rq->next_balance)) { 7909 7849 raw_spin_lock_irq(&rq->lock); 7910 7850 update_rq_clock(rq); 7911 - update_idle_cpu_load(rq); 7851 + update_cpu_load_idle(rq); 7912 7852 raw_spin_unlock_irq(&rq->lock); 7913 7853 rebalance_domains(rq, CPU_IDLE); 7914 7854 } ··· 8294 8234 for_each_possible_cpu(i) { 8295 8235 if (tg->cfs_rq) 8296 8236 kfree(tg->cfs_rq[i]); 8297 - if (tg->se) { 8298 - if (tg->se[i]) 8299 - remove_entity_load_avg(tg->se[i]); 8237 + if (tg->se) 8300 8238 kfree(tg->se[i]); 8301 - } 8302 8239 } 8303 8240 8304 8241 kfree(tg->cfs_rq); ··· 8343 8286 return 0; 8344 8287 } 8345 8288 8346 - void unregister_fair_sched_group(struct task_group *tg, int cpu) 8289 + void unregister_fair_sched_group(struct task_group *tg) 8347 8290 { 8348 - struct rq *rq = cpu_rq(cpu); 8349 8291 unsigned long flags; 8292 + struct rq *rq; 8293 + int cpu; 8350 8294 8351 - /* 8352 - * Only empty task groups can be destroyed; so we can speculatively 8353 - * check on_list without danger of it being re-added. 8354 - */ 8355 - if (!tg->cfs_rq[cpu]->on_list) 8356 - return; 8295 + for_each_possible_cpu(cpu) { 8296 + if (tg->se[cpu]) 8297 + remove_entity_load_avg(tg->se[cpu]); 8357 8298 8358 - raw_spin_lock_irqsave(&rq->lock, flags); 8359 - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 8360 - raw_spin_unlock_irqrestore(&rq->lock, flags); 8299 + /* 8300 + * Only empty task groups can be destroyed; so we can speculatively 8301 + * check on_list without danger of it being re-added. 
8302 + */ 8303 + if (!tg->cfs_rq[cpu]->on_list) 8304 + continue; 8305 + 8306 + rq = cpu_rq(cpu); 8307 + 8308 + raw_spin_lock_irqsave(&rq->lock, flags); 8309 + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 8310 + raw_spin_unlock_irqrestore(&rq->lock, flags); 8311 + } 8361 8312 } 8362 8313 8363 8314 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, ··· 8447 8382 return 1; 8448 8383 } 8449 8384 8450 - void unregister_fair_sched_group(struct task_group *tg, int cpu) { } 8385 + void unregister_fair_sched_group(struct task_group *tg) { } 8451 8386 8452 8387 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8453 8388
+69 -27
kernel/sched/rt.c
··· 58 58 raw_spin_lock(&rt_b->rt_runtime_lock); 59 59 if (!rt_b->rt_period_active) { 60 60 rt_b->rt_period_active = 1; 61 - hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); 61 + /* 62 + * SCHED_DEADLINE updates the bandwidth, as a run away 63 + * RT task with a DL task could hog a CPU. But DL does 64 + * not reset the period. If a deadline task was running 65 + * without an RT task running, it can cause RT tasks to 66 + * throttle when they start up. Kick the timer right away 67 + * to update the period. 68 + */ 69 + hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); 62 70 hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); 63 71 } 64 72 raw_spin_unlock(&rt_b->rt_runtime_lock); ··· 444 436 445 437 static inline int on_rt_rq(struct sched_rt_entity *rt_se) 446 438 { 447 - return !list_empty(&rt_se->run_list); 439 + return rt_se->on_rq; 448 440 } 449 441 450 442 #ifdef CONFIG_RT_GROUP_SCHED ··· 490 482 return rt_se->my_q; 491 483 } 492 484 493 - static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); 494 - static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 485 + static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); 486 + static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); 495 487 496 488 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 497 489 { ··· 507 499 if (!rt_se) 508 500 enqueue_top_rt_rq(rt_rq); 509 501 else if (!on_rt_rq(rt_se)) 510 - enqueue_rt_entity(rt_se, false); 502 + enqueue_rt_entity(rt_se, 0); 511 503 512 504 if (rt_rq->highest_prio.curr < curr->prio) 513 505 resched_curr(rq); ··· 524 516 if (!rt_se) 525 517 dequeue_top_rt_rq(rt_rq); 526 518 else if (on_rt_rq(rt_se)) 527 - dequeue_rt_entity(rt_se); 519 + dequeue_rt_entity(rt_se, 0); 528 520 } 529 521 530 522 static inline int rt_rq_throttled(struct rt_rq *rt_rq) ··· 1174 1166 dec_rt_group(rt_se, rt_rq); 1175 1167 } 1176 1168 1177 - static void __enqueue_rt_entity(struct 
sched_rt_entity *rt_se, bool head) 1169 + /* 1170 + * Change rt_se->run_list location unless SAVE && !MOVE 1171 + * 1172 + * assumes ENQUEUE/DEQUEUE flags match 1173 + */ 1174 + static inline bool move_entity(unsigned int flags) 1175 + { 1176 + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) 1177 + return false; 1178 + 1179 + return true; 1180 + } 1181 + 1182 + static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) 1183 + { 1184 + list_del_init(&rt_se->run_list); 1185 + 1186 + if (list_empty(array->queue + rt_se_prio(rt_se))) 1187 + __clear_bit(rt_se_prio(rt_se), array->bitmap); 1188 + 1189 + rt_se->on_list = 0; 1190 + } 1191 + 1192 + static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) 1178 1193 { 1179 1194 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 1180 1195 struct rt_prio_array *array = &rt_rq->active; ··· 1210 1179 * get throttled and the current group doesn't have any other 1211 1180 * active members. 1212 1181 */ 1213 - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 1182 + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { 1183 + if (rt_se->on_list) 1184 + __delist_rt_entity(rt_se, array); 1214 1185 return; 1186 + } 1215 1187 1216 - if (head) 1217 - list_add(&rt_se->run_list, queue); 1218 - else 1219 - list_add_tail(&rt_se->run_list, queue); 1220 - __set_bit(rt_se_prio(rt_se), array->bitmap); 1188 + if (move_entity(flags)) { 1189 + WARN_ON_ONCE(rt_se->on_list); 1190 + if (flags & ENQUEUE_HEAD) 1191 + list_add(&rt_se->run_list, queue); 1192 + else 1193 + list_add_tail(&rt_se->run_list, queue); 1194 + 1195 + __set_bit(rt_se_prio(rt_se), array->bitmap); 1196 + rt_se->on_list = 1; 1197 + } 1198 + rt_se->on_rq = 1; 1221 1199 1222 1200 inc_rt_tasks(rt_se, rt_rq); 1223 1201 } 1224 1202 1225 - static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) 1203 + static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int 
flags) 1226 1204 { 1227 1205 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 1228 1206 struct rt_prio_array *array = &rt_rq->active; 1229 1207 1230 - list_del_init(&rt_se->run_list); 1231 - if (list_empty(array->queue + rt_se_prio(rt_se))) 1232 - __clear_bit(rt_se_prio(rt_se), array->bitmap); 1208 + if (move_entity(flags)) { 1209 + WARN_ON_ONCE(!rt_se->on_list); 1210 + __delist_rt_entity(rt_se, array); 1211 + } 1212 + rt_se->on_rq = 0; 1233 1213 1234 1214 dec_rt_tasks(rt_se, rt_rq); 1235 1215 } ··· 1249 1207 * Because the prio of an upper entry depends on the lower 1250 1208 * entries, we must remove entries top - down. 1251 1209 */ 1252 - static void dequeue_rt_stack(struct sched_rt_entity *rt_se) 1210 + static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) 1253 1211 { 1254 1212 struct sched_rt_entity *back = NULL; 1255 1213 ··· 1262 1220 1263 1221 for (rt_se = back; rt_se; rt_se = rt_se->back) { 1264 1222 if (on_rt_rq(rt_se)) 1265 - __dequeue_rt_entity(rt_se); 1223 + __dequeue_rt_entity(rt_se, flags); 1266 1224 } 1267 1225 } 1268 1226 1269 - static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) 1227 + static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) 1270 1228 { 1271 1229 struct rq *rq = rq_of_rt_se(rt_se); 1272 1230 1273 - dequeue_rt_stack(rt_se); 1231 + dequeue_rt_stack(rt_se, flags); 1274 1232 for_each_sched_rt_entity(rt_se) 1275 - __enqueue_rt_entity(rt_se, head); 1233 + __enqueue_rt_entity(rt_se, flags); 1276 1234 enqueue_top_rt_rq(&rq->rt); 1277 1235 } 1278 1236 1279 - static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 1237 + static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) 1280 1238 { 1281 1239 struct rq *rq = rq_of_rt_se(rt_se); 1282 1240 1283 - dequeue_rt_stack(rt_se); 1241 + dequeue_rt_stack(rt_se, flags); 1284 1242 1285 1243 for_each_sched_rt_entity(rt_se) { 1286 1244 struct rt_rq *rt_rq = group_rt_rq(rt_se); 1287 1245 1288 1246 if (rt_rq && 
rt_rq->rt_nr_running) 1289 - __enqueue_rt_entity(rt_se, false); 1247 + __enqueue_rt_entity(rt_se, flags); 1290 1248 } 1291 1249 enqueue_top_rt_rq(&rq->rt); 1292 1250 } ··· 1302 1260 if (flags & ENQUEUE_WAKEUP) 1303 1261 rt_se->timeout = 0; 1304 1262 1305 - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); 1263 + enqueue_rt_entity(rt_se, flags); 1306 1264 1307 1265 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) 1308 1266 enqueue_pushable_task(rq, p); ··· 1313 1271 struct sched_rt_entity *rt_se = &p->rt; 1314 1272 1315 1273 update_curr_rt(rq); 1316 - dequeue_rt_entity(rt_se); 1274 + dequeue_rt_entity(rt_se, flags); 1317 1275 1318 1276 dequeue_pushable_task(rq, p); 1319 1277 }
+44 -9
kernel/sched/sched.h
··· 3 3 #include <linux/sched/sysctl.h> 4 4 #include <linux/sched/rt.h> 5 5 #include <linux/sched/deadline.h> 6 + #include <linux/binfmts.h> 6 7 #include <linux/mutex.h> 7 8 #include <linux/spinlock.h> 8 9 #include <linux/stop_machine.h> ··· 314 313 315 314 extern void free_fair_sched_group(struct task_group *tg); 316 315 extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); 317 - extern void unregister_fair_sched_group(struct task_group *tg, int cpu); 316 + extern void unregister_fair_sched_group(struct task_group *tg); 318 317 extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 319 318 struct sched_entity *se, int cpu, 320 319 struct sched_entity *parent); 321 320 extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); 322 - extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); 323 321 324 322 extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); 325 323 extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); ··· 909 909 910 910 extern int group_balance_cpu(struct sched_group *sg); 911 911 912 + #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 913 + void register_sched_domain_sysctl(void); 914 + void unregister_sched_domain_sysctl(void); 915 + #else 916 + static inline void register_sched_domain_sysctl(void) 917 + { 918 + } 919 + static inline void unregister_sched_domain_sysctl(void) 920 + { 921 + } 922 + #endif 923 + 912 924 #else 913 925 914 926 static inline void sched_ttwu_pending(void) { } ··· 1034 1022 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ 1035 1023 1036 1024 extern struct static_key_false sched_numa_balancing; 1025 + extern struct static_key_false sched_schedstats; 1037 1026 1038 1027 static inline u64 global_rt_period(void) 1039 1028 { ··· 1143 1130 extern const int sched_prio_to_weight[40]; 1144 1131 extern const u32 sched_prio_to_wmult[40]; 1145 1132 1133 + /* 1134 + * {de,en}queue flags: 1135 + * 1136 + * DEQUEUE_SLEEP - task 
is no longer runnable 1137 + * ENQUEUE_WAKEUP - task just became runnable 1138 + * 1139 + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks 1140 + * are in a known state which allows modification. Such pairs 1141 + * should preserve as much state as possible. 1142 + * 1143 + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location 1144 + * in the runqueue. 1145 + * 1146 + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) 1147 + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) 1148 + * ENQUEUE_WAKING - sched_class::task_waking was called 1149 + * 1150 + */ 1151 + 1152 + #define DEQUEUE_SLEEP 0x01 1153 + #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ 1154 + #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ 1155 + 1146 1156 #define ENQUEUE_WAKEUP 0x01 1147 - #define ENQUEUE_HEAD 0x02 1157 + #define ENQUEUE_RESTORE 0x02 1158 + #define ENQUEUE_MOVE 0x04 1159 + 1160 + #define ENQUEUE_HEAD 0x08 1161 + #define ENQUEUE_REPLENISH 0x10 1148 1162 #ifdef CONFIG_SMP 1149 - #define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ 1163 + #define ENQUEUE_WAKING 0x20 1150 1164 #else 1151 1165 #define ENQUEUE_WAKING 0x00 1152 1166 #endif 1153 - #define ENQUEUE_REPLENISH 0x08 1154 - #define ENQUEUE_RESTORE 0x10 1155 - 1156 - #define DEQUEUE_SLEEP 0x01 1157 - #define DEQUEUE_SAVE 0x02 1158 1167 1159 1168 #define RETRY_TASK ((void *)-1UL) 1160 1169
+5 -3
kernel/sched/stats.h
··· 29 29 if (rq) 30 30 rq->rq_sched_info.run_delay += delta; 31 31 } 32 - # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 33 - # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 34 - # define schedstat_set(var, val) do { var = (val); } while (0) 32 + # define schedstat_enabled() static_branch_unlikely(&sched_schedstats) 33 + # define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) 34 + # define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) 35 + # define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) 35 36 #else /* !CONFIG_SCHEDSTATS */ 36 37 static inline void 37 38 rq_sched_info_arrive(struct rq *rq, unsigned long long delta) ··· 43 42 static inline void 44 43 rq_sched_info_depart(struct rq *rq, unsigned long long delta) 45 44 {} 45 + # define schedstat_enabled() 0 46 46 # define schedstat_inc(rq, field) do { } while (0) 47 47 # define schedstat_add(rq, field, amt) do { } while (0) 48 48 # define schedstat_set(var, val) do { } while (0)
+123
kernel/sched/swait.c
··· 1 + #include <linux/sched.h> 2 + #include <linux/swait.h> 3 + 4 + void __init_swait_queue_head(struct swait_queue_head *q, const char *name, 5 + struct lock_class_key *key) 6 + { 7 + raw_spin_lock_init(&q->lock); 8 + lockdep_set_class_and_name(&q->lock, key, name); 9 + INIT_LIST_HEAD(&q->task_list); 10 + } 11 + EXPORT_SYMBOL(__init_swait_queue_head); 12 + 13 + /* 14 + * The thing about the wake_up_state() return value; I think we can ignore it. 15 + * 16 + * If for some reason it would return 0, that means the previously waiting 17 + * task is already running, so it will observe condition true (or has already). 18 + */ 19 + void swake_up_locked(struct swait_queue_head *q) 20 + { 21 + struct swait_queue *curr; 22 + 23 + if (list_empty(&q->task_list)) 24 + return; 25 + 26 + curr = list_first_entry(&q->task_list, typeof(*curr), task_list); 27 + wake_up_process(curr->task); 28 + list_del_init(&curr->task_list); 29 + } 30 + EXPORT_SYMBOL(swake_up_locked); 31 + 32 + void swake_up(struct swait_queue_head *q) 33 + { 34 + unsigned long flags; 35 + 36 + if (!swait_active(q)) 37 + return; 38 + 39 + raw_spin_lock_irqsave(&q->lock, flags); 40 + swake_up_locked(q); 41 + raw_spin_unlock_irqrestore(&q->lock, flags); 42 + } 43 + EXPORT_SYMBOL(swake_up); 44 + 45 + /* 46 + * Does not allow usage from IRQ disabled, since we must be able to 47 + * release IRQs to guarantee bounded hold time. 
48 + */ 49 + void swake_up_all(struct swait_queue_head *q) 50 + { 51 + struct swait_queue *curr; 52 + LIST_HEAD(tmp); 53 + 54 + if (!swait_active(q)) 55 + return; 56 + 57 + raw_spin_lock_irq(&q->lock); 58 + list_splice_init(&q->task_list, &tmp); 59 + while (!list_empty(&tmp)) { 60 + curr = list_first_entry(&tmp, typeof(*curr), task_list); 61 + 62 + wake_up_state(curr->task, TASK_NORMAL); 63 + list_del_init(&curr->task_list); 64 + 65 + if (list_empty(&tmp)) 66 + break; 67 + 68 + raw_spin_unlock_irq(&q->lock); 69 + raw_spin_lock_irq(&q->lock); 70 + } 71 + raw_spin_unlock_irq(&q->lock); 72 + } 73 + EXPORT_SYMBOL(swake_up_all); 74 + 75 + void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) 76 + { 77 + wait->task = current; 78 + if (list_empty(&wait->task_list)) 79 + list_add(&wait->task_list, &q->task_list); 80 + } 81 + 82 + void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) 83 + { 84 + unsigned long flags; 85 + 86 + raw_spin_lock_irqsave(&q->lock, flags); 87 + __prepare_to_swait(q, wait); 88 + set_current_state(state); 89 + raw_spin_unlock_irqrestore(&q->lock, flags); 90 + } 91 + EXPORT_SYMBOL(prepare_to_swait); 92 + 93 + long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) 94 + { 95 + if (signal_pending_state(state, current)) 96 + return -ERESTARTSYS; 97 + 98 + prepare_to_swait(q, wait, state); 99 + 100 + return 0; 101 + } 102 + EXPORT_SYMBOL(prepare_to_swait_event); 103 + 104 + void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait) 105 + { 106 + __set_current_state(TASK_RUNNING); 107 + if (!list_empty(&wait->task_list)) 108 + list_del_init(&wait->task_list); 109 + } 110 + 111 + void finish_swait(struct swait_queue_head *q, struct swait_queue *wait) 112 + { 113 + unsigned long flags; 114 + 115 + __set_current_state(TASK_RUNNING); 116 + 117 + if (!list_empty_careful(&wait->task_list)) { 118 + raw_spin_lock_irqsave(&q->lock, flags); 119 + 
list_del_init(&wait->task_list); 120 + raw_spin_unlock_irqrestore(&q->lock, flags); 121 + } 122 + } 123 + EXPORT_SYMBOL(finish_swait);
+2 -2
kernel/softirq.c
··· 116 116 117 117 if (preempt_count() == cnt) { 118 118 #ifdef CONFIG_DEBUG_PREEMPT 119 - current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); 119 + current->preempt_disable_ip = get_lock_parent_ip(); 120 120 #endif 121 - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 121 + trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip()); 122 122 } 123 123 } 124 124 EXPORT_SYMBOL(__local_bh_disable_ip);
+12 -1
kernel/sysctl.c
··· 350 350 .mode = 0644, 351 351 .proc_handler = proc_dointvec, 352 352 }, 353 + #ifdef CONFIG_SCHEDSTATS 354 + { 355 + .procname = "sched_schedstats", 356 + .data = NULL, 357 + .maxlen = sizeof(unsigned int), 358 + .mode = 0644, 359 + .proc_handler = sysctl_schedstats, 360 + .extra1 = &zero, 361 + .extra2 = &one, 362 + }, 363 + #endif /* CONFIG_SCHEDSTATS */ 353 364 #endif /* CONFIG_SMP */ 354 365 #ifdef CONFIG_NUMA_BALANCING 355 366 { ··· 516 505 .data = &latencytop_enabled, 517 506 .maxlen = sizeof(int), 518 507 .mode = 0644, 519 - .proc_handler = proc_dointvec, 508 + .proc_handler = sysctl_latencytop, 520 509 }, 521 510 #endif 522 511 #ifdef CONFIG_BLK_DEV_INITRD
+28 -22
kernel/tsacct.c
··· 93 93 { 94 94 struct mm_struct *mm; 95 95 96 - /* convert pages-usec to Mbyte-usec */ 97 - stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; 98 - stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; 96 + /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */ 97 + stats->coremem = p->acct_rss_mem1 * PAGE_SIZE; 98 + do_div(stats->coremem, 1000 * KB); 99 + stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE; 100 + do_div(stats->virtmem, 1000 * KB); 99 101 mm = get_task_mm(p); 100 102 if (mm) { 101 103 /* adjust to KB unit */ ··· 125 123 static void __acct_update_integrals(struct task_struct *tsk, 126 124 cputime_t utime, cputime_t stime) 127 125 { 128 - if (likely(tsk->mm)) { 129 - cputime_t time, dtime; 130 - struct timeval value; 131 - unsigned long flags; 132 - u64 delta; 126 + cputime_t time, dtime; 127 + u64 delta; 133 128 134 - local_irq_save(flags); 135 - time = stime + utime; 136 - dtime = time - tsk->acct_timexpd; 137 - jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 138 - delta = value.tv_sec; 139 - delta = delta * USEC_PER_SEC + value.tv_usec; 129 + if (!likely(tsk->mm)) 130 + return; 140 131 141 - if (delta == 0) 142 - goto out; 143 - tsk->acct_timexpd = time; 144 - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); 145 - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; 146 - out: 147 - local_irq_restore(flags); 148 - } 132 + time = stime + utime; 133 + dtime = time - tsk->acct_timexpd; 134 + /* Avoid division: cputime_t is often in nanoseconds already. */ 135 + delta = cputime_to_nsecs(dtime); 136 + 137 + if (delta < TICK_NSEC) 138 + return; 139 + 140 + tsk->acct_timexpd = time; 141 + /* 142 + * Divide by 1024 to avoid overflow, and to avoid division. 143 + * The final unit reported to userspace is Mbyte-usecs, 144 + * the rest of the math is done in xacct_add_tsk. 
145 + */ 146 + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; 147 + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; 149 148 } 150 149 151 150 /** ··· 156 153 void acct_update_integrals(struct task_struct *tsk) 157 154 { 158 155 cputime_t utime, stime; 156 + unsigned long flags; 159 157 158 + local_irq_save(flags); 160 159 task_cputime(tsk, &utime, &stime); 161 160 __acct_update_integrals(tsk, utime, stime); 161 + local_irq_restore(flags); 162 162 } 163 163 164 164 /**
+2 -2
virt/kvm/async_pf.c
··· 97 97 * This memory barrier pairs with prepare_to_wait's set_current_state() 98 98 */ 99 99 smp_mb(); 100 - if (waitqueue_active(&vcpu->wq)) 101 - wake_up_interruptible(&vcpu->wq); 100 + if (swait_active(&vcpu->wq)) 101 + swake_up(&vcpu->wq); 102 102 103 103 mmput(mm); 104 104 kvm_put_kvm(vcpu->kvm);
+8 -9
virt/kvm/kvm_main.c
··· 216 216 vcpu->kvm = kvm; 217 217 vcpu->vcpu_id = id; 218 218 vcpu->pid = NULL; 219 - vcpu->halt_poll_ns = 0; 220 - init_waitqueue_head(&vcpu->wq); 219 + init_swait_queue_head(&vcpu->wq); 221 220 kvm_async_pf_vcpu_init(vcpu); 222 221 223 222 vcpu->pre_pcpu = -1; ··· 1992 1993 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1993 1994 { 1994 1995 ktime_t start, cur; 1995 - DEFINE_WAIT(wait); 1996 + DECLARE_SWAITQUEUE(wait); 1996 1997 bool waited = false; 1997 1998 u64 block_ns; 1998 1999 ··· 2017 2018 kvm_arch_vcpu_blocking(vcpu); 2018 2019 2019 2020 for (;;) { 2020 - prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 2021 + prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 2021 2022 2022 2023 if (kvm_vcpu_check_block(vcpu) < 0) 2023 2024 break; ··· 2026 2027 schedule(); 2027 2028 } 2028 2029 2029 - finish_wait(&vcpu->wq, &wait); 2030 + finish_swait(&vcpu->wq, &wait); 2030 2031 cur = ktime_get(); 2031 2032 2032 2033 kvm_arch_vcpu_unblocking(vcpu); ··· 2058 2059 { 2059 2060 int me; 2060 2061 int cpu = vcpu->cpu; 2061 - wait_queue_head_t *wqp; 2062 + struct swait_queue_head *wqp; 2062 2063 2063 2064 wqp = kvm_arch_vcpu_wq(vcpu); 2064 - if (waitqueue_active(wqp)) { 2065 - wake_up_interruptible(wqp); 2065 + if (swait_active(wqp)) { 2066 + swake_up(wqp); 2066 2067 ++vcpu->stat.halt_wakeup; 2067 2068 } 2068 2069 ··· 2163 2164 continue; 2164 2165 if (vcpu == me) 2165 2166 continue; 2166 - if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) 2167 + if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) 2167 2168 continue; 2168 2169 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 2169 2170 continue;