Merge branches 'sched-core-for-linus' and 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

-1

Documentation/trace/kprobetrace.txt

··· 120 120 field:unsigned char common_flags; offset:2; size:1; signed:0; 121 121 field:unsigned char common_preempt_count; offset:3; size:1;signed:0; 122 122 field:int common_pid; offset:4; size:4; signed:1; 123 - field:int common_lock_depth; offset:8; size:4; signed:1; 124 123 125 124 field:unsigned long __probe_ip; offset:12; size:4; signed:0; 126 125 field:int __probe_nargs; offset:16; size:4; signed:1;

+1 -2

arch/alpha/kernel/smp.c

··· 585 585 586 586 switch (which) { 587 587 case IPI_RESCHEDULE: 588 - /* Reschedule callback. Everything to be done 589 - is done by the interrupt return path. */ 588 + scheduler_ipi(); 590 589 break; 591 590 592 591 case IPI_CALL_FUNC:

+1 -4

arch/arm/kernel/smp.c

··· 560 560 break; 561 561 562 562 case IPI_RESCHEDULE: 563 - /* 564 - * nothing more to do - eveything is 565 - * done on the interrupt return path 566 - */ 563 + scheduler_ipi(); 567 564 break; 568 565 569 566 case IPI_CALL_FUNC:

+3

arch/blackfin/mach-common/smp.c

··· 177 177 while (msg_queue->count) { 178 178 msg = &msg_queue->ipi_message[msg_queue->head]; 179 179 switch (msg->type) { 180 + case BFIN_IPI_RESCHEDULE: 181 + scheduler_ipi(); 182 + break; 180 183 case BFIN_IPI_CALL_FUNC: 181 184 spin_unlock_irqrestore(&msg_queue->lock, flags); 182 185 ipi_call_function(cpu, msg);

+8 -5

arch/cris/arch-v32/kernel/smp.c

··· 342 342 343 343 ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi); 344 344 345 + if (ipi.vector & IPI_SCHEDULE) { 346 + scheduler_ipi(); 347 + } 345 348 if (ipi.vector & IPI_CALL) { 346 - func(info); 349 + func(info); 347 350 } 348 351 if (ipi.vector & IPI_FLUSH_TLB) { 349 - if (flush_mm == FLUSH_ALL) 350 - __flush_tlb_all(); 351 - else if (flush_vma == FLUSH_ALL) 352 + if (flush_mm == FLUSH_ALL) 353 + __flush_tlb_all(); 354 + else if (flush_vma == FLUSH_ALL) 352 355 __flush_tlb_mm(flush_mm); 353 - else 356 + else 354 357 __flush_tlb_page(flush_vma, flush_addr); 355 358 } 356 359

+2

arch/ia64/kernel/irq_ia64.c

··· 31 31 #include <linux/irq.h> 32 32 #include <linux/ratelimit.h> 33 33 #include <linux/acpi.h> 34 + #include <linux/sched.h> 34 35 35 36 #include <asm/delay.h> 36 37 #include <asm/intrinsics.h> ··· 497 496 smp_local_flush_tlb(); 498 497 kstat_incr_irqs_this_cpu(irq, desc); 499 498 } else if (unlikely(IS_RESCHEDULE(vector))) { 499 + scheduler_ipi(); 500 500 kstat_incr_irqs_this_cpu(irq, desc); 501 501 } else { 502 502 ia64_setreg(_IA64_REG_CR_TPR, vector);

+9 -1

arch/ia64/xen/irq_xen.c

··· 92 92 static int xen_slab_ready; 93 93 94 94 #ifdef CONFIG_SMP 95 + #include <linux/sched.h> 96 + 95 97 /* Dummy stub. Though we may check XEN_RESCHEDULE_VECTOR before __do_IRQ, 96 98 * it ends up to issue several memory accesses upon percpu data and 97 99 * thus adds unnecessary traffic to other paths. ··· 101 99 static irqreturn_t 102 100 xen_dummy_handler(int irq, void *dev_id) 103 101 { 102 + return IRQ_HANDLED; 103 + } 104 104 105 + static irqreturn_t 106 + xen_resched_handler(int irq, void *dev_id) 107 + { 108 + scheduler_ipi(); 105 109 return IRQ_HANDLED; 106 110 } 107 111 ··· 118 110 }; 119 111 120 112 static struct irqaction xen_resched_irqaction = { 121 - .handler = xen_dummy_handler, 113 + .handler = xen_resched_handler, 122 114 .flags = IRQF_DISABLED, 123 115 .name = "resched" 124 116 };

+1 -3

arch/m32r/kernel/smp.c

··· 122 122 * 123 123 * Description: This routine executes on CPU which received 124 124 * 'RESCHEDULE_IPI'. 125 - * Rescheduling is processed at the exit of interrupt 126 - * operation. 127 125 * 128 126 * Born on Date: 2002.02.05 129 127 * ··· 136 138 *==========================================================================*/ 137 139 void smp_reschedule_interrupt(void) 138 140 { 139 - /* nothing to do */ 141 + scheduler_ipi(); 140 142 } 141 143 142 144 /*==========================================================================*

+2

arch/mips/cavium-octeon/smp.c

··· 44 44 45 45 if (action & SMP_CALL_FUNCTION) 46 46 smp_call_function_interrupt(); 47 + if (action & SMP_RESCHEDULE_YOURSELF) 48 + scheduler_ipi(); 47 49 48 50 /* Check if we've been told to flush the icache */ 49 51 if (action & SMP_ICACHE_FLUSH)

+1 -1

arch/mips/kernel/smtc.c

··· 929 929 930 930 static void ipi_resched_interrupt(void) 931 931 { 932 - /* Return from interrupt should be enough to cause scheduler check */ 932 + scheduler_ipi(); 933 933 } 934 934 935 935 static void ipi_call_interrupt(void)

+2

arch/mips/mti-malta/malta-int.c

··· 308 308 309 309 static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) 310 310 { 311 + scheduler_ipi(); 312 + 311 313 return IRQ_HANDLED; 312 314 } 313 315

+4

arch/mips/pmc-sierra/yosemite/smp.c

··· 55 55 56 56 if (status & 0x2) 57 57 smp_call_function_interrupt(); 58 + if (status & 0x4) 59 + scheduler_ipi(); 58 60 break; 59 61 60 62 case 1: ··· 65 63 66 64 if (status & 0x2) 67 65 smp_call_function_interrupt(); 66 + if (status & 0x4) 67 + scheduler_ipi(); 68 68 break; 69 69 } 70 70 }

+2

arch/mips/sgi-ip27/ip27-irq.c

··· 147 147 #ifdef CONFIG_SMP 148 148 if (pend0 & (1UL << CPU_RESCHED_A_IRQ)) { 149 149 LOCAL_HUB_CLR_INTR(CPU_RESCHED_A_IRQ); 150 + scheduler_ipi(); 150 151 } else if (pend0 & (1UL << CPU_RESCHED_B_IRQ)) { 151 152 LOCAL_HUB_CLR_INTR(CPU_RESCHED_B_IRQ); 153 + scheduler_ipi(); 152 154 } else if (pend0 & (1UL << CPU_CALL_A_IRQ)) { 153 155 LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ); 154 156 smp_call_function_interrupt();

+3 -4

arch/mips/sibyte/bcm1480/smp.c

··· 20 20 #include <linux/delay.h> 21 21 #include <linux/smp.h> 22 22 #include <linux/kernel_stat.h> 23 + #include <linux/sched.h> 23 24 24 25 #include <asm/mmu_context.h> 25 26 #include <asm/io.h> ··· 190 189 /* Clear the mailbox to clear the interrupt */ 191 190 __raw_writeq(((u64)action)<<48, mailbox_0_clear_regs[cpu]); 192 191 193 - /* 194 - * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the 195 - * interrupt will do the reschedule for us 196 - */ 192 + if (action & SMP_RESCHEDULE_YOURSELF) 193 + scheduler_ipi(); 197 194 198 195 if (action & SMP_CALL_FUNCTION) 199 196 smp_call_function_interrupt();

+3 -4

arch/mips/sibyte/sb1250/smp.c

··· 21 21 #include <linux/interrupt.h> 22 22 #include <linux/smp.h> 23 23 #include <linux/kernel_stat.h> 24 + #include <linux/sched.h> 24 25 25 26 #include <asm/mmu_context.h> 26 27 #include <asm/io.h> ··· 178 177 /* Clear the mailbox to clear the interrupt */ 179 178 ____raw_writeq(((u64)action) << 48, mailbox_clear_regs[cpu]); 180 179 181 - /* 182 - * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the 183 - * interrupt will do the reschedule for us 184 - */ 180 + if (action & SMP_RESCHEDULE_YOURSELF) 181 + scheduler_ipi(); 185 182 186 183 if (action & SMP_CALL_FUNCTION) 187 184 smp_call_function_interrupt();

+1 -4

arch/mn10300/kernel/smp.c

··· 494 494 * @irq: The interrupt number. 495 495 * @dev_id: The device ID. 496 496 * 497 - * We need do nothing here, since the scheduling will be effected on our way 498 - * back through entry.S. 499 - * 500 497 * Returns IRQ_HANDLED to indicate we handled the interrupt successfully. 501 498 */ 502 499 static irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) 503 500 { 504 - /* do nothing */ 501 + scheduler_ipi(); 505 502 return IRQ_HANDLED; 506 503 } 507 504

+1 -4

arch/parisc/kernel/smp.c

··· 155 155 156 156 case IPI_RESCHEDULE: 157 157 smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu); 158 - /* 159 - * Reschedule callback. Everything to be 160 - * done is done by the interrupt return path. 161 - */ 158 + scheduler_ipi(); 162 159 break; 163 160 164 161 case IPI_CALL_FUNC:

+2 -2

arch/powerpc/kernel/smp.c

··· 116 116 generic_smp_call_function_interrupt(); 117 117 break; 118 118 case PPC_MSG_RESCHEDULE: 119 - /* we notice need_resched on exit */ 119 + scheduler_ipi(); 120 120 break; 121 121 case PPC_MSG_CALL_FUNC_SINGLE: 122 122 generic_smp_call_function_single_interrupt(); ··· 146 146 147 147 static irqreturn_t reschedule_action(int irq, void *data) 148 148 { 149 - /* we just need the return path side effect of checking need_resched */ 149 + scheduler_ipi(); 150 150 return IRQ_HANDLED; 151 151 } 152 152

+3 -3

arch/s390/kernel/smp.c

··· 165 165 kstat_cpu(smp_processor_id()).irqs[EXTINT_IPI]++; 166 166 /* 167 167 * handle bit signal external calls 168 - * 169 - * For the ec_schedule signal we have to do nothing. All the work 170 - * is done automatically when we return from the interrupt. 171 168 */ 172 169 bits = xchg(&S390_lowcore.ext_call_fast, 0); 170 + 171 + if (test_bit(ec_schedule, &bits)) 172 + scheduler_ipi(); 173 173 174 174 if (test_bit(ec_call_function, &bits)) 175 175 generic_smp_call_function_interrupt();

+2

arch/sh/kernel/smp.c

··· 20 20 #include <linux/module.h> 21 21 #include <linux/cpu.h> 22 22 #include <linux/interrupt.h> 23 + #include <linux/sched.h> 23 24 #include <asm/atomic.h> 24 25 #include <asm/processor.h> 25 26 #include <asm/system.h> ··· 324 323 generic_smp_call_function_interrupt(); 325 324 break; 326 325 case SMP_MSG_RESCHEDULE: 326 + scheduler_ipi(); 327 327 break; 328 328 case SMP_MSG_FUNCTION_SINGLE: 329 329 generic_smp_call_function_single_interrupt();

+5 -1

arch/sparc/include/asm/topology_64.h

··· 65 65 #define smt_capable() (sparc64_multi_core) 66 66 #endif /* CONFIG_SMP */ 67 67 68 - #define cpu_coregroup_mask(cpu) (&cpu_core_map[cpu]) 68 + extern cpumask_t cpu_core_map[NR_CPUS]; 69 + static inline const struct cpumask *cpu_coregroup_mask(int cpu) 70 + { 71 + return &cpu_core_map[cpu]; 72 + } 69 73 70 74 #endif /* _ASM_SPARC64_TOPOLOGY_H */

+3 -1

arch/sparc/kernel/smp_32.c

··· 129 129 130 130 void smp_send_reschedule(int cpu) 131 131 { 132 - /* See sparc64 */ 132 + /* 133 + * XXX missing reschedule IPI, see scheduler_ipi() 134 + */ 133 135 } 134 136 135 137 void smp_send_stop(void)

+1

arch/sparc/kernel/smp_64.c

··· 1368 1368 void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs) 1369 1369 { 1370 1370 clear_softint(1 << irq); 1371 + scheduler_ipi(); 1371 1372 } 1372 1373 1373 1374 /* This is a nop because we capture all other cpus

+1 -5

arch/tile/kernel/smp.c

··· 189 189 /* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. */ 190 190 static irqreturn_t handle_reschedule_ipi(int irq, void *token) 191 191 { 192 - /* 193 - * Nothing to do here; when we return from interrupt, the 194 - * rescheduling will occur there. But do bump the interrupt 195 - * profiler count in the meantime. 196 - */ 197 192 __get_cpu_var(irq_stat).irq_resched_count++; 193 + scheduler_ipi(); 198 194 199 195 return IRQ_HANDLED; 200 196 }

+1 -1

arch/um/kernel/smp.c

··· 173 173 break; 174 174 175 175 case 'R': 176 - set_tsk_need_resched(current); 176 + scheduler_ipi(); 177 177 break; 178 178 179 179 case 'S':

+2 -3

arch/x86/kernel/smp.c

··· 194 194 } 195 195 196 196 /* 197 - * Reschedule call back. Nothing to do, 198 - * all the work is done automatically when 199 - * we return from the interrupt. 197 + * Reschedule call back. 200 198 */ 201 199 void smp_reschedule_interrupt(struct pt_regs *regs) 202 200 { 203 201 ack_APIC_irq(); 204 202 inc_irq_stat(irq_resched_count); 203 + scheduler_ipi(); 205 204 /* 206 205 * KVM uses this interrupt to force a cpu out of guest mode 207 206 */

+2 -3

arch/x86/xen/smp.c

··· 46 46 static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); 47 47 48 48 /* 49 - * Reschedule call back. Nothing to do, 50 - * all the work is done automatically when 51 - * we return from the interrupt. 49 + * Reschedule call back. 52 50 */ 53 51 static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) 54 52 { 55 53 inc_irq_stat(irq_resched_count); 54 + scheduler_ipi(); 56 55 57 56 return IRQ_HANDLED; 58 57 }

-1

include/linux/init_task.h

··· 134 134 .stack = &init_thread_info, \ 135 135 .usage = ATOMIC_INIT(2), \ 136 136 .flags = PF_KTHREAD, \ 137 - .lock_depth = -1, \ 138 137 .prio = MAX_PRIO-20, \ 139 138 .static_prio = MAX_PRIO-20, \ 140 139 .normal_prio = MAX_PRIO-20, \

+1 -1

include/linux/mutex.h

··· 51 51 spinlock_t wait_lock; 52 52 struct list_head wait_list; 53 53 #if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) 54 - struct thread_info *owner; 54 + struct task_struct *owner; 55 55 #endif 56 56 #ifdef CONFIG_DEBUG_MUTEXES 57 57 const char *name;

+25 -36

include/linux/sched.h

··· 360 360 extern signed long schedule_timeout_killable(signed long timeout); 361 361 extern signed long schedule_timeout_uninterruptible(signed long timeout); 362 362 asmlinkage void schedule(void); 363 - extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); 363 + extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); 364 364 365 365 struct nsproxy; 366 366 struct user_namespace; ··· 731 731 /* timestamps */ 732 732 unsigned long long last_arrival,/* when we last ran on a cpu */ 733 733 last_queued; /* when we were last queued to run */ 734 - #ifdef CONFIG_SCHEDSTATS 735 - /* BKL stats */ 736 - unsigned int bkl_count; 737 - #endif 738 734 }; 739 735 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ 740 736 ··· 864 868 865 869 struct sched_group { 866 870 struct sched_group *next; /* Must be a circular list */ 871 + atomic_t ref; 867 872 868 873 /* 869 874 * CPU power of this group, SCHED_LOAD_SCALE being max power for a ··· 879 882 * NOTE: this field is variable length. (Allocated dynamically 880 883 * by attaching extra space to the end of the structure, 881 884 * depending on how many CPUs the kernel has booted up with) 882 - * 883 - * It is also be embedded into static data structures at build 884 - * time. (See 'struct static_sched_group' in kernel/sched.c) 885 885 */ 886 886 unsigned long cpumask[0]; 887 887 }; ··· 888 894 return to_cpumask(sg->cpumask); 889 895 } 890 896 891 - enum sched_domain_level { 892 - SD_LV_NONE = 0, 893 - SD_LV_SIBLING, 894 - SD_LV_MC, 895 - SD_LV_BOOK, 896 - SD_LV_CPU, 897 - SD_LV_NODE, 898 - SD_LV_ALLNODES, 899 - SD_LV_MAX 900 - }; 901 - 902 897 struct sched_domain_attr { 903 898 int relax_domain_level; 904 899 }; ··· 895 912 #define SD_ATTR_INIT (struct sched_domain_attr) { \ 896 913 .relax_domain_level = -1, \ 897 914 } 915 + 916 + extern int sched_domain_level_max; 898 917 899 918 struct sched_domain { 900 919 /* These fields must be setup */ ··· 915 930 unsigned int forkexec_idx; 916 931 unsigned int smt_gain; 917 932 int flags; /* See SD_* */ 918 - enum sched_domain_level level; 933 + int level; 919 934 920 935 /* Runtime fields. */ 921 936 unsigned long last_balance; /* init to jiffies. units in jiffies */ ··· 958 973 #ifdef CONFIG_SCHED_DEBUG 959 974 char *name; 960 975 #endif 976 + union { 977 + void *private; /* used during construction */ 978 + struct rcu_head rcu; /* used during destruction */ 979 + }; 961 980 962 981 unsigned int span_weight; 963 982 /* ··· 970 981 * NOTE: this field is variable length. (Allocated dynamically 971 982 * by attaching extra space to the end of the structure, 972 983 * depending on how many CPUs the kernel has booted up with) 973 - * 974 - * It is also be embedded into static data structures at build 975 - * time. (See 'struct static_sched_domain' in kernel/sched.c) 976 984 */ 977 985 unsigned long span[0]; 978 986 }; ··· 1034 1048 #define WF_FORK 0x02 /* child wakeup after fork */ 1035 1049 1036 1050 #define ENQUEUE_WAKEUP 1 1037 - #define ENQUEUE_WAKING 2 1038 - #define ENQUEUE_HEAD 4 1051 + #define ENQUEUE_HEAD 2 1052 + #ifdef CONFIG_SMP 1053 + #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ 1054 + #else 1055 + #define ENQUEUE_WAKING 0 1056 + #endif 1039 1057 1040 1058 #define DEQUEUE_SLEEP 1 1041 1059 ··· 1057 1067 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1058 1068 1059 1069 #ifdef CONFIG_SMP 1060 - int (*select_task_rq)(struct rq *rq, struct task_struct *p, 1061 - int sd_flag, int flags); 1070 + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1062 1071 1063 1072 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1064 1073 void (*post_schedule) (struct rq *this_rq); 1065 - void (*task_waking) (struct rq *this_rq, struct task_struct *task); 1074 + void (*task_waking) (struct task_struct *task); 1066 1075 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1067 1076 1068 1077 void (*set_cpus_allowed)(struct task_struct *p, ··· 1186 1197 unsigned int flags; /* per process flags, defined below */ 1187 1198 unsigned int ptrace; 1188 1199 1189 - int lock_depth; /* BKL lock depth */ 1190 - 1191 1200 #ifdef CONFIG_SMP 1192 - #ifdef __ARCH_WANT_UNLOCKED_CTXSW 1193 - int oncpu; 1201 + struct task_struct *wake_entry; 1202 + int on_cpu; 1194 1203 #endif 1195 - #endif 1204 + int on_rq; 1196 1205 1197 1206 int prio, static_prio, normal_prio; 1198 1207 unsigned int rt_priority; ··· 1261 1274 1262 1275 /* Revert to default priority/policy when forking */ 1263 1276 unsigned sched_reset_on_fork:1; 1277 + unsigned sched_contributes_to_load:1; 1264 1278 1265 1279 pid_t pid; 1266 1280 pid_t tgid; ··· 2051 2063 2052 2064 extern int wake_up_state(struct task_struct *tsk, unsigned int state); 2053 2065 extern int wake_up_process(struct task_struct *tsk); 2054 - extern void wake_up_new_task(struct task_struct *tsk, 2055 - unsigned long clone_flags); 2066 + extern void wake_up_new_task(struct task_struct *tsk); 2056 2067 #ifdef CONFIG_SMP 2057 2068 extern void kick_process(struct task_struct *tsk); 2058 2069 #else 2059 2070 static inline void kick_process(struct task_struct *tsk) { } 2060 2071 #endif 2061 - extern void sched_fork(struct task_struct *p, int clone_flags); 2072 + extern void sched_fork(struct task_struct *p); 2062 2073 extern void sched_dead(struct task_struct *p); 2063 2074 2064 2075 extern void proc_caches_init(void); ··· 2182 2195 extern char *get_task_comm(char *to, struct task_struct *tsk); 2183 2196 2184 2197 #ifdef CONFIG_SMP 2198 + void scheduler_ipi(void); 2185 2199 extern unsigned long wait_task_inactive(struct task_struct *, long match_state); 2186 2200 #else 2201 + static inline void scheduler_ipi(void) { } 2187 2202 static inline unsigned long wait_task_inactive(struct task_struct *p, 2188 2203 long match_state) 2189 2204 {

+5

init/Kconfig

··· 827 827 desktop applications. Task group autogeneration is currently based 828 828 upon task session. 829 829 830 + config SCHED_TTWU_QUEUE 831 + bool 832 + depends on !SPARC32 833 + default y 834 + 830 835 config MM_OWNER 831 836 bool 832 837

+1 -1

kernel/cpuset.c

··· 1159 1159 static int update_relax_domain_level(struct cpuset *cs, s64 val) 1160 1160 { 1161 1161 #ifdef CONFIG_SMP 1162 - if (val < -1 || val >= SD_LV_MAX) 1162 + if (val < -1 || val >= sched_domain_level_max) 1163 1163 return -EINVAL; 1164 1164 #endif 1165 1165

+2 -3

kernel/fork.c

··· 1103 1103 1104 1104 posix_cpu_timers_init(p); 1105 1105 1106 - p->lock_depth = -1; /* -1 = no lock */ 1107 1106 do_posix_clock_monotonic_gettime(&p->start_time); 1108 1107 p->real_start_time = p->start_time; 1109 1108 monotonic_to_bootbased(&p->real_start_time); ··· 1152 1153 #endif 1153 1154 1154 1155 /* Perform scheduler related setup. Assign this task to a CPU. */ 1155 - sched_fork(p, clone_flags); 1156 + sched_fork(p); 1156 1157 1157 1158 retval = perf_event_init_task(p); 1158 1159 if (retval) ··· 1463 1464 */ 1464 1465 p->flags &= ~PF_STARTING; 1465 1466 1466 - wake_up_new_task(p, clone_flags); 1467 + wake_up_new_task(p); 1467 1468 1468 1469 tracehook_report_clone_complete(trace, regs, 1469 1470 clone_flags, nr, p);

+1 -1

kernel/mutex-debug.c

··· 75 75 return; 76 76 77 77 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 78 - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 78 + DEBUG_LOCKS_WARN_ON(lock->owner != current); 79 79 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 80 80 mutex_clear_owner(lock); 81 81 }

+1 -1

kernel/mutex-debug.h

··· 29 29 30 30 static inline void mutex_set_owner(struct mutex *lock) 31 31 { 32 - lock->owner = current_thread_info(); 32 + lock->owner = current; 33 33 } 34 34 35 35 static inline void mutex_clear_owner(struct mutex *lock)

+1 -8

kernel/mutex.c

··· 160 160 */ 161 161 162 162 for (;;) { 163 - struct thread_info *owner; 164 - 165 - /* 166 - * If we own the BKL, then don't spin. The owner of 167 - * the mutex might be waiting on us to release the BKL. 168 - */ 169 - if (unlikely(current->lock_depth >= 0)) 170 - break; 163 + struct task_struct *owner; 171 164 172 165 /* 173 166 * If there's an owner, wait for it to either

+1 -1

kernel/mutex.h

··· 19 19 #ifdef CONFIG_SMP 20 20 static inline void mutex_set_owner(struct mutex *lock) 21 21 { 22 - lock->owner = current_thread_info(); 22 + lock->owner = current; 23 23 } 24 24 25 25 static inline void mutex_clear_owner(struct mutex *lock)

+681 -1005

kernel/sched.c

··· 231 231 #endif 232 232 233 233 /* 234 - * sched_domains_mutex serializes calls to arch_init_sched_domains, 234 + * sched_domains_mutex serializes calls to init_sched_domains, 235 235 * detach_destroy_domains and partition_sched_domains. 236 236 */ 237 237 static DEFINE_MUTEX(sched_domains_mutex); ··· 312 312 313 313 u64 exec_clock; 314 314 u64 min_vruntime; 315 + #ifndef CONFIG_64BIT 316 + u64 min_vruntime_copy; 317 + #endif 315 318 316 319 struct rb_root tasks_timeline; 317 320 struct rb_node *rb_leftmost; ··· 328 325 */ 329 326 struct sched_entity *curr, *next, *last, *skip; 330 327 328 + #ifdef CONFIG_SCHED_DEBUG 331 329 unsigned int nr_spread_over; 330 + #endif 332 331 333 332 #ifdef CONFIG_FAIR_GROUP_SCHED 334 333 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ ··· 422 417 */ 423 418 struct root_domain { 424 419 atomic_t refcount; 420 + struct rcu_head rcu; 425 421 cpumask_var_t span; 426 422 cpumask_var_t online; 427 423 ··· 466 460 u64 nohz_stamp; 467 461 unsigned char nohz_balance_kick; 468 462 #endif 469 - unsigned int skip_clock_update; 463 + int skip_clock_update; 470 464 471 465 /* capture load from *all* tasks on this cpu: */ 472 466 struct load_weight load; ··· 559 553 unsigned int ttwu_count; 560 554 unsigned int ttwu_local; 561 555 #endif 556 + 557 + #ifdef CONFIG_SMP 558 + struct task_struct *wake_list; 559 + #endif 562 560 }; 563 561 564 562 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ··· 581 571 582 572 #define rcu_dereference_check_sched_domain(p) \ 583 573 rcu_dereference_check((p), \ 584 - rcu_read_lock_sched_held() || \ 574 + rcu_read_lock_held() || \ 585 575 lockdep_is_held(&sched_domains_mutex)) 586 576 587 577 /* ··· 606 596 * Return the group to which this tasks belongs. 607 597 * 608 598 * We use task_subsys_state_check() and extend the RCU verification 609 - * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() 599 + * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() 610 600 * holds that lock for each task it moves into the cgroup. Therefore 611 601 * by holding that lock, we pin the task to the current cgroup. 612 602 */ ··· 616 606 struct cgroup_subsys_state *css; 617 607 618 608 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 619 - lockdep_is_held(&task_rq(p)->lock)); 609 + lockdep_is_held(&p->pi_lock)); 620 610 tg = container_of(css, struct task_group, css); 621 611 622 612 return autogroup_task_group(p, tg); ··· 652 642 { 653 643 s64 delta; 654 644 655 - if (rq->skip_clock_update) 645 + if (rq->skip_clock_update > 0) 656 646 return; 657 647 658 648 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ··· 848 838 return rq->curr == p; 849 839 } 850 840 851 - #ifndef __ARCH_WANT_UNLOCKED_CTXSW 852 841 static inline int task_running(struct rq *rq, struct task_struct *p) 853 842 { 843 + #ifdef CONFIG_SMP 844 + return p->on_cpu; 845 + #else 854 846 return task_current(rq, p); 847 + #endif 855 848 } 856 849 850 + #ifndef __ARCH_WANT_UNLOCKED_CTXSW 857 851 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 858 852 { 853 + #ifdef CONFIG_SMP 854 + /* 855 + * We can optimise this out completely for !SMP, because the 856 + * SMP rebalancing from interrupt is the only thing that cares 857 + * here. 858 + */ 859 + next->on_cpu = 1; 860 + #endif 859 861 } 860 862 861 863 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 862 864 { 865 + #ifdef CONFIG_SMP 866 + /* 867 + * After ->on_cpu is cleared, the task can be moved to a different CPU. 868 + * We must ensure this doesn't happen until the switch is completely 869 + * finished. 870 + */ 871 + smp_wmb(); 872 + prev->on_cpu = 0; 873 + #endif 863 874 #ifdef CONFIG_DEBUG_SPINLOCK 864 875 /* this is a valid case when another task releases the spinlock */ 865 876 rq->lock.owner = current; ··· 896 865 } 897 866 898 867 #else /* __ARCH_WANT_UNLOCKED_CTXSW */ 899 - static inline int task_running(struct rq *rq, struct task_struct *p) 900 - { 901 - #ifdef CONFIG_SMP 902 - return p->oncpu; 903 - #else 904 - return task_current(rq, p); 905 - #endif 906 - } 907 - 908 868 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 909 869 { 910 870 #ifdef CONFIG_SMP ··· 904 882 * SMP rebalancing from interrupt is the only thing that cares 905 883 * here. 906 884 */ 907 - next->oncpu = 1; 885 + next->on_cpu = 1; 908 886 #endif 909 887 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 910 888 raw_spin_unlock_irq(&rq->lock); ··· 917 895 { 918 896 #ifdef CONFIG_SMP 919 897 /* 920 - * After ->oncpu is cleared, the task can be moved to a different CPU. 898 + * After ->on_cpu is cleared, the task can be moved to a different CPU. 921 899 * We must ensure this doesn't happen until the switch is completely 922 900 * finished. 923 901 */ 924 902 smp_wmb(); 925 - prev->oncpu = 0; 903 + prev->on_cpu = 0; 926 904 #endif 927 905 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 928 906 local_irq_enable(); ··· 931 909 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 932 910 933 911 /* 934 - * Check whether the task is waking, we use this to synchronize ->cpus_allowed 935 - * against ttwu(). 936 - */ 937 - static inline int task_is_waking(struct task_struct *p) 938 - { 939 - return unlikely(p->state == TASK_WAKING); 940 - } 941 - 942 - /* 943 - * __task_rq_lock - lock the runqueue a given task resides on. 944 - * Must be called interrupts disabled. 912 + * __task_rq_lock - lock the rq @p resides on. 945 913 */ 946 914 static inline struct rq *__task_rq_lock(struct task_struct *p) 947 915 __acquires(rq->lock) 948 916 { 949 917 struct rq *rq; 918 + 919 + lockdep_assert_held(&p->pi_lock); 950 920 951 921 for (;;) { 952 922 rq = task_rq(p); ··· 950 936 } 951 937 952 938 /* 953 - * task_rq_lock - lock the runqueue a given task resides on and disable 954 - * interrupts. Note the ordering: we can safely lookup the task_rq without 955 - * explicitly disabling preemption. 939 + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 956 940 */ 957 941 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 942 + __acquires(p->pi_lock) 958 943 __acquires(rq->lock) 959 944 { 960 945 struct rq *rq; 961 946 962 947 for (;;) { 963 - local_irq_save(*flags); 948 + raw_spin_lock_irqsave(&p->pi_lock, *flags); 964 949 rq = task_rq(p); 965 950 raw_spin_lock(&rq->lock); 966 951 if (likely(rq == task_rq(p))) 967 952 return rq; 968 - raw_spin_unlock_irqrestore(&rq->lock, *flags); 953 + raw_spin_unlock(&rq->lock); 954 + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 969 955 } 970 956 } 971 957 ··· 975 961 raw_spin_unlock(&rq->lock); 976 962 } 977 963 978 - static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 964 + static inline void 965 + task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) 979 966 __releases(rq->lock) 967 + __releases(p->pi_lock) 980 968 { 981 - raw_spin_unlock_irqrestore(&rq->lock, *flags); 969 + raw_spin_unlock(&rq->lock); 970 + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 982 971 } 983 972 984 973 /* ··· 1210 1193 int i; 1211 1194 struct sched_domain *sd; 1212 1195 1196 + rcu_read_lock(); 1213 1197 for_each_domain(cpu, sd) { 1214 - for_each_cpu(i, sched_domain_span(sd)) 1215 - if (!idle_cpu(i)) 1216 - return i; 1198 + for_each_cpu(i, sched_domain_span(sd)) { 1199 + if (!idle_cpu(i)) { 1200 + cpu = i; 1201 + goto unlock; 1202 + } 1203 + } 1217 1204 } 1205 + unlock: 1206 + rcu_read_unlock(); 1218 1207 return cpu; 1219 1208 } 1220 1209 /* ··· 1330 1307 { 1331 1308 u64 tmp; 1332 1309 1310 + tmp = (u64)delta_exec * weight; 1311 + 1333 1312 if (!lw->inv_weight) { 1334 1313 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1335 1314 lw->inv_weight = 1; 1336 1315 else 1337 - lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1338 - / (lw->weight+1); 1316 + lw->inv_weight = WMULT_CONST / lw->weight; 1339 1317 } 1340 1318 1341 - tmp = (u64)delta_exec * weight; 1342 1319 /* 1343 1320 * Check whether we'd overflow the 64-bit multiplication: 1344 1321 */ ··· 1796 1773 update_rq_clock(rq); 1797 1774 sched_info_queued(p); 1798 1775 p->sched_class->enqueue_task(rq, p, flags); 1799 - p->se.on_rq = 1; 1800 1776 } 1801 1777 1802 1778 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ··· 1803 1781 update_rq_clock(rq); 1804 1782 sched_info_dequeued(p); 1805 1783 p->sched_class->dequeue_task(rq, p, flags); 1806 - p->se.on_rq = 0; 1807 1784 } 1808 1785 1809 1786 /* ··· 2137 2116 * A queue event has occurred, and we're going to schedule. In 2138 2117 * this case, we can save a useless back to back clock update. 2139 2118 */ 2140 - if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) 2119 + if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 2141 2120 rq->skip_clock_update = 1; 2142 2121 } 2143 2122 ··· 2183 2162 */ 2184 2163 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2185 2164 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2165 + 2166 + #ifdef CONFIG_LOCKDEP 2167 + WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 2168 + lockdep_is_held(&task_rq(p)->lock))); 2169 + #endif 2186 2170 #endif 2187 2171 2188 2172 trace_sched_migrate_task(p, new_cpu); ··· 2206 2180 }; 2207 2181 2208 2182 static int migration_cpu_stop(void *data); 2209 - 2210 - /* 2211 - * The task's runqueue lock must be held. 2212 - * Returns true if you have to wait for migration thread. 2213 - */ 2214 - static bool migrate_task(struct task_struct *p, struct rq *rq) 2215 - { 2216 - /* 2217 - * If the task is not on a runqueue (and not running), then 2218 - * the next wake-up will properly place the task. 2219 - */ 2220 - return p->se.on_rq || task_running(rq, p); 2221 - } 2222 2183 2223 2184 /* 2224 2185 * wait_task_inactive - wait for a thread to unschedule. ··· 2264 2251 rq = task_rq_lock(p, &flags); 2265 2252 trace_sched_wait_task(p); 2266 2253 running = task_running(rq, p); 2267 - on_rq = p->se.on_rq; 2254 + on_rq = p->on_rq; 2268 2255 ncsw = 0; 2269 2256 if (!match_state || p->state == match_state) 2270 2257 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2271 - task_rq_unlock(rq, &flags); 2258 + task_rq_unlock(rq, p, &flags); 2272 2259 2273 2260 /* 2274 2261 * If it changed from the expected state, bail out now. ··· 2343 2330 2344 2331 #ifdef CONFIG_SMP 2345 2332 /* 2346 - * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2333 + * ->cpus_allowed is protected by both rq->lock and p->pi_lock 2347 2334 */ 2348 2335 static int select_fallback_rq(int cpu, struct task_struct *p) 2349 2336 { ··· 2376 2363 } 2377 2364 2378 2365 /* 2379 - * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2366 + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 2380 2367 */ 2381 2368 static inline 2382 - int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2369 + int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2383 2370 { 2384 - int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2371 + int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2385 2372 2386 2373 /* 2387 2374 * In order not to call set_task_cpu() on a blocking task we need ··· 2407 2394 } 2408 2395 #endif 2409 2396 2410 - static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2411 - bool is_sync, bool is_migrate, bool is_local, 2412 - unsigned long en_flags) 2397 + static void 2398 + ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 2413 2399 { 2414 - schedstat_inc(p, se.statistics.nr_wakeups); 2415 - if (is_sync) 2416 - schedstat_inc(p, se.statistics.nr_wakeups_sync); 2417 - if (is_migrate) 2418 - schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2419 - if (is_local) 2420 - schedstat_inc(p, se.statistics.nr_wakeups_local); 2421 - else 2422 - schedstat_inc(p, se.statistics.nr_wakeups_remote); 2400 + #ifdef CONFIG_SCHEDSTATS 2401 + struct rq *rq = this_rq(); 2423 2402 2424 - activate_task(rq, p, en_flags); 2403 + #ifdef CONFIG_SMP 2404 + int this_cpu = smp_processor_id(); 2405 + 2406 + if (cpu == this_cpu) { 2407 + schedstat_inc(rq, ttwu_local); 2408 + schedstat_inc(p, se.statistics.nr_wakeups_local); 2409 + } else { 2410 + struct sched_domain *sd; 2411 + 2412 + schedstat_inc(p, se.statistics.nr_wakeups_remote); 2413 + rcu_read_lock(); 2414 + for_each_domain(this_cpu, sd) { 2415 + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 2416 + schedstat_inc(sd, ttwu_wake_remote); 2417 + break; 2418 + } 2419 + } 2420 + rcu_read_unlock(); 2421 + } 2422 + #endif /* CONFIG_SMP */ 2423 + 2424 + schedstat_inc(rq, ttwu_count); 2425 + schedstat_inc(p, se.statistics.nr_wakeups); 2426 + 2427 + if (wake_flags & WF_SYNC) 2428 + schedstat_inc(p, se.statistics.nr_wakeups_sync); 2429 + 2430 + if (cpu != task_cpu(p)) 2431 + schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2432 + 2433 + #endif /* CONFIG_SCHEDSTATS */ 2425 2434 } 2426 2435 2427 - static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2428 - int wake_flags, bool success) 2436 + static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 2429 2437 { 2430 - trace_sched_wakeup(p, success); 2438 + activate_task(rq, p, en_flags); 2439 + p->on_rq = 1; 2440 + 2441 + /* if a worker is waking up, notify workqueue */ 2442 + if (p->flags & PF_WQ_WORKER) 2443 + wq_worker_waking_up(p, cpu_of(rq)); 2444 + } 2445 + 2446 + /* 2447 + * Mark the task runnable and perform wakeup-preemption. 2448 + */ 2449 + static void 2450 + ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 2451 + { 2452 + trace_sched_wakeup(p, true); 2431 2453 check_preempt_curr(rq, p, wake_flags); 2432 2454 2433 2455 p->state = TASK_RUNNING; ··· 2481 2433 rq->idle_stamp = 0; 2482 2434 } 2483 2435 #endif 2484 - /* if a worker is waking up, notify workqueue */ 2485 - if ((p->flags & PF_WQ_WORKER) && success) 2486 - wq_worker_waking_up(p, cpu_of(rq)); 2436 + } 2437 + 2438 + static void 2439 + ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) 2440 + { 2441 + #ifdef CONFIG_SMP 2442 + if (p->sched_contributes_to_load) 2443 + rq->nr_uninterruptible--; 2444 + #endif 2445 + 2446 + ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); 2447 + ttwu_do_wakeup(rq, p, wake_flags); 2448 + } 2449 + 2450 + /* 2451 + * Called in case the task @p isn't fully descheduled from its runqueue, 2452 + * in this case we must do a remote wakeup. Its a 'light' wakeup though, 2453 + * since all we need to do is flip p->state to TASK_RUNNING, since 2454 + * the task is still ->on_rq. 2455 + */ 2456 + static int ttwu_remote(struct task_struct *p, int wake_flags) 2457 + { 2458 + struct rq *rq; 2459 + int ret = 0; 2460 + 2461 + rq = __task_rq_lock(p); 2462 + if (p->on_rq) { 2463 + ttwu_do_wakeup(rq, p, wake_flags); 2464 + ret = 1; 2465 + } 2466 + __task_rq_unlock(rq); 2467 + 2468 + return ret; 2469 + } 2470 + 2471 + #ifdef CONFIG_SMP 2472 + static void sched_ttwu_pending(void) 2473 + { 2474 + struct rq *rq = this_rq(); 2475 + struct task_struct *list = xchg(&rq->wake_list, NULL); 2476 + 2477 + if (!list) 2478 + return; 2479 + 2480 + raw_spin_lock(&rq->lock); 2481 + 2482 + while (list) { 2483 + struct task_struct *p = list; 2484 + list = list->wake_entry; 2485 + ttwu_do_activate(rq, p, 0); 2486 + } 2487 + 2488 + raw_spin_unlock(&rq->lock); 2489 + } 2490 + 2491 + void scheduler_ipi(void) 2492 + { 2493 + sched_ttwu_pending(); 2494 + } 2495 + 2496 + static void ttwu_queue_remote(struct task_struct *p, int cpu) 2497 + { 2498 + struct rq *rq = cpu_rq(cpu); 2499 + struct task_struct *next = rq->wake_list; 2500 + 2501 + for (;;) { 2502 + struct task_struct *old = next; 2503 + 2504 + p->wake_entry = next; 2505 + next = cmpxchg(&rq->wake_list, old, p); 2506 + if (next == old) 2507 + break; 2508 + } 2509 + 2510 + if (!next) 2511 + smp_send_reschedule(cpu); 2512 + } 2513 + #endif 2514 + 2515 + static void ttwu_queue(struct task_struct *p, int cpu) 2516 + { 2517 + struct rq *rq = cpu_rq(cpu); 2518 + 2519 + #if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE) 2520 + if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { 2521 + ttwu_queue_remote(p, cpu); 2522 + return; 2523 + } 2524 + #endif 2525 + 2526 + raw_spin_lock(&rq->lock); 2527 + ttwu_do_activate(rq, p, 0); 2528 + raw_spin_unlock(&rq->lock); 2487 2529 } 2488 2530 2489 2531 /** ··· 2591 2453 * Returns %true if @p was woken up, %false if it was already running 2592 2454 * or @state didn't match @p's state. 2593 2455 */ 2594 - static int try_to_wake_up(struct task_struct *p, unsigned int state, 2595 - int wake_flags) 2456 + static int 2457 + try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 2596 2458 { 2597 - int cpu, orig_cpu, this_cpu, success = 0; 2598 2459 unsigned long flags; 2599 - unsigned long en_flags = ENQUEUE_WAKEUP; 2600 - struct rq *rq; 2601 - 2602 - this_cpu = get_cpu(); 2460 + int cpu, success = 0; 2603 2461 2604 2462 smp_wmb(); 2605 - rq = task_rq_lock(p, &flags); 2463 + raw_spin_lock_irqsave(&p->pi_lock, flags); 2606 2464 if (!(p->state & state)) 2607 2465 goto out; 2608 2466 2609 - if (p->se.on_rq) 2610 - goto out_running; 2611 - 2467 + success = 1; /* we're going to change ->state */ 2612 2468 cpu = task_cpu(p); 2613 - orig_cpu = cpu; 2469 + 2470 + if (p->on_rq && ttwu_remote(p, wake_flags)) 2471 + goto stat; 2614 2472 2615 2473 #ifdef CONFIG_SMP 2616 - if (unlikely(task_running(rq, p))) 2617 - goto out_activate; 2618 - 2619 2474 /* 2620 - * In order to handle concurrent wakeups and release the rq->lock 2621 - * we put the task in TASK_WAKING state. 2622 - * 2623 - * First fix up the nr_uninterruptible count: 2475 + * If the owning (remote) cpu is still in the middle of schedule() with 2476 + * this task as prev, wait until its done referencing the task. 2624 2477 */ 2625 - if (task_contributes_to_load(p)) { 2626 - if (likely(cpu_online(orig_cpu))) 2627 - rq->nr_uninterruptible--; 2628 - else 2629 - this_rq()->nr_uninterruptible--; 2478 + while (p->on_cpu) { 2479 + #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2480 + /* 2481 + * If called from interrupt context we could have landed in the 2482 + * middle of schedule(), in this case we should take care not 2483 + * to spin on ->on_cpu if p is current, since that would 2484 + * deadlock. 2485 + */ 2486 + if (p == current) { 2487 + ttwu_queue(p, cpu); 2488 + goto stat; 2489 + } 2490 + #endif 2491 + cpu_relax(); 2630 2492 } 2493 + /* 2494 + * Pairs with the smp_wmb() in finish_lock_switch(). 2495 + */ 2496 + smp_rmb(); 2497 + 2498 + p->sched_contributes_to_load = !!task_contributes_to_load(p); 2631 2499 p->state = TASK_WAKING; 2632 2500 2633 - if (p->sched_class->task_waking) { 2634 - p->sched_class->task_waking(rq, p); 2635 - en_flags |= ENQUEUE_WAKING; 2636 - } 2501 + if (p->sched_class->task_waking) 2502 + p->sched_class->task_waking(p); 2637 2503 2638 - cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); 2639 - if (cpu != orig_cpu) 2504 + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2505 + if (task_cpu(p) != cpu) 2640 2506 set_task_cpu(p, cpu); 2641 - __task_rq_unlock(rq); 2642 - 2643 - rq = cpu_rq(cpu); 2644 - raw_spin_lock(&rq->lock); 2645 - 2646 - /* 2647 - * We migrated the task without holding either rq->lock, however 2648 - * since the task is not on the task list itself, nobody else 2649 - * will try and migrate the task, hence the rq should match the 2650 - * cpu we just moved it to. 2651 - */ 2652 - WARN_ON(task_cpu(p) != cpu); 2653 - WARN_ON(p->state != TASK_WAKING); 2654 - 2655 - #ifdef CONFIG_SCHEDSTATS 2656 - schedstat_inc(rq, ttwu_count); 2657 - if (cpu == this_cpu) 2658 - schedstat_inc(rq, ttwu_local); 2659 - else { 2660 - struct sched_domain *sd; 2661 - for_each_domain(this_cpu, sd) { 2662 - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 2663 - schedstat_inc(sd, ttwu_wake_remote); 2664 - break; 2665 - } 2666 - } 2667 - } 2668 - #endif /* CONFIG_SCHEDSTATS */ 2669 - 2670 - out_activate: 2671 2507 #endif /* CONFIG_SMP */ 2672 - ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2673 - cpu == this_cpu, en_flags); 2674 - success = 1; 2675 - out_running: 2676 - ttwu_post_activation(p, rq, wake_flags, success); 2508 + 2509 + ttwu_queue(p, cpu); 2510 + stat: 2511 + ttwu_stat(p, cpu, wake_flags); 2677 2512 out: 2678 - task_rq_unlock(rq, &flags); 2679 - put_cpu(); 2513 + raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2680 2514 2681 2515 return success; 2682 2516 } ··· 2657 2547 * try_to_wake_up_local - try to wake up a local task with rq lock held 2658 2548 * @p: the thread to be awakened 2659 2549 * 2660 - * Put @p on the run-queue if it's not already there. The caller must 2550 + * Put @p on the run-queue if it's not already there. The caller must 2661 2551 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2662 - * the current task. this_rq() stays locked over invocation. 2552 + * the current task. 2663 2553 */ 2664 2554 static void try_to_wake_up_local(struct task_struct *p) 2665 2555 { 2666 2556 struct rq *rq = task_rq(p); 2667 - bool success = false; 2668 2557 2669 2558 BUG_ON(rq != this_rq()); 2670 2559 BUG_ON(p == current); 2671 2560 lockdep_assert_held(&rq->lock); 2672 2561 2673 - if (!(p->state & TASK_NORMAL)) 2674 - return; 2675 - 2676 - if (!p->se.on_rq) { 2677 - if (likely(!task_running(rq, p))) { 2678 - schedstat_inc(rq, ttwu_count); 2679 - schedstat_inc(rq, ttwu_local); 2680 - } 2681 - ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2682 - success = true; 2562 + if (!raw_spin_trylock(&p->pi_lock)) { 2563 + raw_spin_unlock(&rq->lock); 2564 + raw_spin_lock(&p->pi_lock); 2565 + raw_spin_lock(&rq->lock); 2683 2566 } 2684 - ttwu_post_activation(p, rq, 0, success); 2567 + 2568 + if (!(p->state & TASK_NORMAL)) 2569 + goto out; 2570 + 2571 + if (!p->on_rq) 2572 + ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2573 + 2574 + ttwu_do_wakeup(rq, p, 0); 2575 + ttwu_stat(p, smp_processor_id(), 0); 2576 + out: 2577 + raw_spin_unlock(&p->pi_lock); 2685 2578 } 2686 2579 2687 2580 /** ··· 2717 2604 */ 2718 2605 static void __sched_fork(struct task_struct *p) 2719 2606 { 2607 + p->on_rq = 0; 2608 + 2609 + p->se.on_rq = 0; 2720 2610 p->se.exec_start = 0; 2721 2611 p->se.sum_exec_runtime = 0; 2722 2612 p->se.prev_sum_exec_runtime = 0; 2723 2613 p->se.nr_migrations = 0; 2724 2614 p->se.vruntime = 0; 2615 + INIT_LIST_HEAD(&p->se.group_node); 2725 2616 2726 2617 #ifdef CONFIG_SCHEDSTATS 2727 2618 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2728 2619 #endif 2729 2620 2730 2621 INIT_LIST_HEAD(&p->rt.run_list); 2731 - p->se.on_rq = 0; 2732 - INIT_LIST_HEAD(&p->se.group_node); 2733 2622 2734 2623 #ifdef CONFIG_PREEMPT_NOTIFIERS 2735 2624 INIT_HLIST_HEAD(&p->preempt_notifiers); ··· 2741 2626 /* 2742 2627 * fork()/clone()-time setup: 2743 2628 */ 2744 - void sched_fork(struct task_struct *p, int clone_flags) 2629 + void sched_fork(struct task_struct *p) 2745 2630 { 2631 + unsigned long flags; 2746 2632 int cpu = get_cpu(); 2747 2633 2748 2634 __sched_fork(p); ··· 2794 2678 * 2795 2679 * Silence PROVE_RCU. 2796 2680 */ 2797 - rcu_read_lock(); 2681 + raw_spin_lock_irqsave(&p->pi_lock, flags); 2798 2682 set_task_cpu(p, cpu); 2799 - rcu_read_unlock(); 2683 + raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2800 2684 2801 2685 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2802 2686 if (likely(sched_info_on())) 2803 2687 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2804 2688 #endif 2805 - #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2806 - p->oncpu = 0; 2689 + #if defined(CONFIG_SMP) 2690 + p->on_cpu = 0; 2807 2691 #endif 2808 2692 #ifdef CONFIG_PREEMPT 2809 2693 /* Want to start with kernel preemption disabled. */ ··· 2823 2707 * that must be done for every newly created context, then puts the task 2824 2708 * on the runqueue and wakes it. 2825 2709 */ 2826 - void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2710 + void wake_up_new_task(struct task_struct *p) 2827 2711 { 2828 2712 unsigned long flags; 2829 2713 struct rq *rq; 2830 - int cpu __maybe_unused = get_cpu(); 2831 2714 2715 + raw_spin_lock_irqsave(&p->pi_lock, flags); 2832 2716 #ifdef CONFIG_SMP 2833 - rq = task_rq_lock(p, &flags); 2834 - p->state = TASK_WAKING; 2835 - 2836 2717 /* 2837 2718 * Fork balancing, do it here and not earlier because: 2838 2719 * - cpus_allowed can change in the fork path 2839 2720 * - any previously selected cpu might disappear through hotplug 2840 - * 2841 - * We set TASK_WAKING so that select_task_rq() can drop rq->lock 2842 - * without people poking at ->cpus_allowed. 2843 2721 */ 2844 - cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2845 - set_task_cpu(p, cpu); 2846 - 2847 - p->state = TASK_RUNNING; 2848 - task_rq_unlock(rq, &flags); 2722 + set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 2849 2723 #endif 2850 2724 2851 - rq = task_rq_lock(p, &flags); 2725 + rq = __task_rq_lock(p); 2852 2726 activate_task(rq, p, 0); 2853 - trace_sched_wakeup_new(p, 1); 2727 + p->on_rq = 1; 2728 + trace_sched_wakeup_new(p, true); 2854 2729 check_preempt_curr(rq, p, WF_FORK); 2855 2730 #ifdef CONFIG_SMP 2856 2731 if (p->sched_class->task_woken) 2857 2732 p->sched_class->task_woken(rq, p); 2858 2733 #endif 2859 - task_rq_unlock(rq, &flags); 2860 - put_cpu(); 2734 + task_rq_unlock(rq, p, &flags); 2861 2735 } 2862 2736 2863 2737 #ifdef CONFIG_PREEMPT_NOTIFIERS ··· 3556 3450 { 3557 3451 struct task_struct *p = current; 3558 3452 unsigned long flags; 3559 - struct rq *rq; 3560 3453 int dest_cpu; 3561 3454 3562 - rq = task_rq_lock(p, &flags); 3563 - dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3455 + raw_spin_lock_irqsave(&p->pi_lock, flags); 3456 + dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 3564 3457 if (dest_cpu == smp_processor_id()) 3565 3458 goto unlock; 3566 3459 3567 - /* 3568 - * select_task_rq() can race against ->cpus_allowed 3569 - */ 3570 - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3571 - likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { 3460 + if (likely(cpu_active(dest_cpu))) { 3572 3461 struct migration_arg arg = { p, dest_cpu }; 3573 3462 3574 - task_rq_unlock(rq, &flags); 3575 - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3463 + raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3464 + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 3576 3465 return; 3577 3466 } 3578 3467 unlock: 3579 - task_rq_unlock(rq, &flags); 3468 + raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3580 3469 } 3581 3470 3582 3471 #endif ··· 3608 3507 3609 3508 rq = task_rq_lock(p, &flags); 3610 3509 ns = do_task_delta_exec(p, rq); 3611 - task_rq_unlock(rq, &flags); 3510 + task_rq_unlock(rq, p, &flags); 3612 3511 3613 3512 return ns; 3614 3513 } ··· 3626 3525 3627 3526 rq = task_rq_lock(p, &flags); 3628 3527 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3629 - task_rq_unlock(rq, &flags); 3528 + task_rq_unlock(rq, p, &flags); 3630 3529 3631 3530 return ns; 3632 3531 } ··· 3650 3549 rq = task_rq_lock(p, &flags); 3651 3550 thread_group_cputime(p, &totals); 3652 3551 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3653 - task_rq_unlock(rq, &flags); 3552 + task_rq_unlock(rq, p, &flags); 3654 3553 3655 3554 return ns; 3656 3555 } ··· 4004 3903 /* 4005 3904 * This function gets called by the timer code, with HZ frequency. 4006 3905 * We call it with interrupts disabled. 4007 - * 4008 - * It also gets called by the fork code, when changing the parent's 4009 - * timeslices. 4010 3906 */ 4011 3907 void scheduler_tick(void) 4012 3908 { ··· 4123 4025 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4124 4026 4125 4027 schedstat_inc(this_rq(), sched_count); 4126 - #ifdef CONFIG_SCHEDSTATS 4127 - if (unlikely(prev->lock_depth >= 0)) { 4128 - schedstat_inc(this_rq(), rq_sched_info.bkl_count); 4129 - schedstat_inc(prev, sched_info.bkl_count); 4130 - } 4131 - #endif 4132 4028 } 4133 4029 4134 4030 static void put_prev_task(struct rq *rq, struct task_struct *prev) 4135 4031 { 4136 - if (prev->se.on_rq) 4032 + if (prev->on_rq || rq->skip_clock_update < 0) 4137 4033 update_rq_clock(rq); 4138 4034 prev->sched_class->put_prev_task(rq, prev); 4139 4035 } ··· 4189 4097 if (unlikely(signal_pending_state(prev->state, prev))) { 4190 4098 prev->state = TASK_RUNNING; 4191 4099 } else { 4100 + deactivate_task(rq, prev, DEQUEUE_SLEEP); 4101 + prev->on_rq = 0; 4102 + 4192 4103 /* 4193 - * If a worker is going to sleep, notify and 4194 - * ask workqueue whether it wants to wake up a 4195 - * task to maintain concurrency. If so, wake 4196 - * up the task. 4104 + * If a worker went to sleep, notify and ask workqueue 4105 + * whether it wants to wake up a task to maintain 4106 + * concurrency. 4197 4107 */ 4198 4108 if (prev->flags & PF_WQ_WORKER) { 4199 4109 struct task_struct *to_wakeup; ··· 4204 4110 if (to_wakeup) 4205 4111 try_to_wake_up_local(to_wakeup); 4206 4112 } 4207 - deactivate_task(rq, prev, DEQUEUE_SLEEP); 4208 4113 4209 4114 /* 4210 - * If we are going to sleep and we have plugged IO queued, make 4211 - * sure to submit it to avoid deadlocks. 4115 + * If we are going to sleep and we have plugged IO 4116 + * queued, make sure to submit it to avoid deadlocks. 4212 4117 */ 4213 4118 if (blk_needs_flush_plug(prev)) { 4214 4119 raw_spin_unlock(&rq->lock); ··· 4254 4161 EXPORT_SYMBOL(schedule); 4255 4162 4256 4163 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4164 + 4165 + static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 4166 + { 4167 + bool ret = false; 4168 + 4169 + rcu_read_lock(); 4170 + if (lock->owner != owner) 4171 + goto fail; 4172 + 4173 + /* 4174 + * Ensure we emit the owner->on_cpu, dereference _after_ checking 4175 + * lock->owner still matches owner, if that fails, owner might 4176 + * point to free()d memory, if it still matches, the rcu_read_lock() 4177 + * ensures the memory stays valid. 4178 + */ 4179 + barrier(); 4180 + 4181 + ret = owner->on_cpu; 4182 + fail: 4183 + rcu_read_unlock(); 4184 + 4185 + return ret; 4186 + } 4187 + 4257 4188 /* 4258 4189 * Look out! "owner" is an entirely speculative pointer 4259 4190 * access and not reliable. 4260 4191 */ 4261 - int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) 4192 + int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) 4262 4193 { 4263 - unsigned int cpu; 4264 - struct rq *rq; 4265 - 4266 4194 if (!sched_feat(OWNER_SPIN)) 4267 4195 return 0; 4268 4196 4269 - #ifdef CONFIG_DEBUG_PAGEALLOC 4270 - /* 4271 - * Need to access the cpu field knowing that 4272 - * DEBUG_PAGEALLOC could have unmapped it if 4273 - * the mutex owner just released it and exited. 4274 - */ 4275 - if (probe_kernel_address(&owner->cpu, cpu)) 4276 - return 0; 4277 - #else 4278 - cpu = owner->cpu; 4279 - #endif 4280 - 4281 - /* 4282 - * Even if the access succeeded (likely case), 4283 - * the cpu field may no longer be valid. 4284 - */ 4285 - if (cpu >= nr_cpumask_bits) 4286 - return 0; 4287 - 4288 - /* 4289 - * We need to validate that we can do a 4290 - * get_cpu() and that we have the percpu area. 4291 - */ 4292 - if (!cpu_online(cpu)) 4293 - return 0; 4294 - 4295 - rq = cpu_rq(cpu); 4296 - 4297 - for (;;) { 4298 - /* 4299 - * Owner changed, break to re-assess state. 4300 - */ 4301 - if (lock->owner != owner) { 4302 - /* 4303 - * If the lock has switched to a different owner, 4304 - * we likely have heavy contention. Return 0 to quit 4305 - * optimistic spinning and not contend further: 4306 - */ 4307 - if (lock->owner) 4308 - return 0; 4309 - break; 4310 - } 4311 - 4312 - /* 4313 - * Is that owner really running on that cpu? 4314 - */ 4315 - if (task_thread_info(rq->curr) != owner || need_resched()) 4197 + while (owner_running(lock, owner)) { 4198 + if (need_resched()) 4316 4199 return 0; 4317 4200 4318 4201 arch_mutex_cpu_relax(); 4319 4202 } 4203 + 4204 + /* 4205 + * If the owner changed to another task there is likely 4206 + * heavy contention, stop spinning. 4207 + */ 4208 + if (lock->owner) 4209 + return 0; 4320 4210 4321 4211 return 1; 4322 4212 } ··· 4760 4684 */ 4761 4685 void rt_mutex_setprio(struct task_struct *p, int prio) 4762 4686 { 4763 - unsigned long flags; 4764 4687 int oldprio, on_rq, running; 4765 4688 struct rq *rq; 4766 4689 const struct sched_class *prev_class; 4767 4690 4768 4691 BUG_ON(prio < 0 || prio > MAX_PRIO); 4769 4692 4770 - rq = task_rq_lock(p, &flags); 4693 + rq = __task_rq_lock(p); 4771 4694 4772 4695 trace_sched_pi_setprio(p, prio); 4773 4696 oldprio = p->prio; 4774 4697 prev_class = p->sched_class; 4775 - on_rq = p->se.on_rq; 4698 + on_rq = p->on_rq; 4776 4699 running = task_current(rq, p); 4777 4700 if (on_rq) 4778 4701 dequeue_task(rq, p, 0); ··· 4791 4716 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4792 4717 4793 4718 check_class_changed(rq, p, prev_class, oldprio); 4794 - task_rq_unlock(rq, &flags); 4719 + __task_rq_unlock(rq); 4795 4720 } 4796 4721 4797 4722 #endif ··· 4819 4744 p->static_prio = NICE_TO_PRIO(nice); 4820 4745 goto out_unlock; 4821 4746 } 4822 - on_rq = p->se.on_rq; 4747 + on_rq = p->on_rq; 4823 4748 if (on_rq) 4824 4749 dequeue_task(rq, p, 0); 4825 4750 ··· 4839 4764 resched_task(rq->curr); 4840 4765 } 4841 4766 out_unlock: 4842 - task_rq_unlock(rq, &flags); 4767 + task_rq_unlock(rq, p, &flags); 4843 4768 } 4844 4769 EXPORT_SYMBOL(set_user_nice); 4845 4770 ··· 4953 4878 static void 4954 4879 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4955 4880 { 4956 - BUG_ON(p->se.on_rq); 4957 - 4958 4881 p->policy = policy; 4959 4882 p->rt_priority = prio; 4960 4883 p->normal_prio = normal_prio(p); ··· 5067 4994 /* 5068 4995 * make sure no PI-waiters arrive (or leave) while we are 5069 4996 * changing the priority of the task: 5070 - */ 5071 - raw_spin_lock_irqsave(&p->pi_lock, flags); 5072 - /* 4997 + * 5073 4998 * To be able to change p->policy safely, the appropriate 5074 4999 * runqueue lock must be held. 5075 5000 */ 5076 - rq = __task_rq_lock(p); 5001 + rq = task_rq_lock(p, &flags); 5077 5002 5078 5003 /* 5079 5004 * Changing the policy of the stop threads its a very bad idea 5080 5005 */ 5081 5006 if (p == rq->stop) { 5082 - __task_rq_unlock(rq); 5083 - raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5007 + task_rq_unlock(rq, p, &flags); 5084 5008 return -EINVAL; 5085 5009 } 5086 5010 ··· 5101 5031 if (rt_bandwidth_enabled() && rt_policy(policy) && 5102 5032 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5103 5033 !task_group_is_autogroup(task_group(p))) { 5104 - __task_rq_unlock(rq); 5105 - raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5034 + task_rq_unlock(rq, p, &flags); 5106 5035 return -EPERM; 5107 5036 } 5108 5037 } ··· 5110 5041 /* recheck policy now with rq lock held */ 5111 5042 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5112 5043 policy = oldpolicy = -1; 5113 - __task_rq_unlock(rq); 5114 - raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5044 + task_rq_unlock(rq, p, &flags); 5115 5045 goto recheck; 5116 5046 } 5117 - on_rq = p->se.on_rq; 5047 + on_rq = p->on_rq; 5118 5048 running = task_current(rq, p); 5119 5049 if (on_rq) 5120 5050 deactivate_task(rq, p, 0); ··· 5132 5064 activate_task(rq, p, 0); 5133 5065 5134 5066 check_class_changed(rq, p, prev_class, oldprio); 5135 - __task_rq_unlock(rq); 5136 - raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5067 + task_rq_unlock(rq, p, &flags); 5137 5068 5138 5069 rt_mutex_adjust_pi(p); 5139 5070 ··· 5383 5316 { 5384 5317 struct task_struct *p; 5385 5318 unsigned long flags; 5386 - struct rq *rq; 5387 5319 int retval; 5388 5320 5389 5321 get_online_cpus(); ··· 5397 5331 if (retval) 5398 5332 goto out_unlock; 5399 5333 5400 - rq = task_rq_lock(p, &flags); 5334 + raw_spin_lock_irqsave(&p->pi_lock, flags); 5401 5335 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5402 - task_rq_unlock(rq, &flags); 5336 + raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5403 5337 5404 5338 out_unlock: 5405 5339 rcu_read_unlock(); ··· 5724 5658 5725 5659 rq = task_rq_lock(p, &flags); 5726 5660 time_slice = p->sched_class->get_rr_interval(rq, p); 5727 - task_rq_unlock(rq, &flags); 5661 + task_rq_unlock(rq, p, &flags); 5728 5662 5729 5663 rcu_read_unlock(); 5730 5664 jiffies_to_timespec(time_slice, &t); ··· 5842 5776 rcu_read_unlock(); 5843 5777 5844 5778 rq->curr = rq->idle = idle; 5845 - #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5846 - idle->oncpu = 1; 5779 + #if defined(CONFIG_SMP) 5780 + idle->on_cpu = 1; 5847 5781 #endif 5848 5782 raw_spin_unlock_irqrestore(&rq->lock, flags); 5849 5783 5850 5784 /* Set the preempt count _outside_ the spinlocks! */ 5851 - #if defined(CONFIG_PREEMPT) 5852 - task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 5853 - #else 5854 5785 task_thread_info(idle)->preempt_count = 0; 5855 - #endif 5786 + 5856 5787 /* 5857 5788 * The idle tasks have their own, simple scheduling class: 5858 5789 */ ··· 5944 5881 unsigned int dest_cpu; 5945 5882 int ret = 0; 5946 5883 5947 - /* 5948 - * Serialize against TASK_WAKING so that ttwu() and wunt() can 5949 - * drop the rq->lock and still rely on ->cpus_allowed. 5950 - */ 5951 - again: 5952 - while (task_is_waking(p)) 5953 - cpu_relax(); 5954 5884 rq = task_rq_lock(p, &flags); 5955 - if (task_is_waking(p)) { 5956 - task_rq_unlock(rq, &flags); 5957 - goto again; 5958 - } 5885 + 5886 + if (cpumask_equal(&p->cpus_allowed, new_mask)) 5887 + goto out; 5959 5888 5960 5889 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5961 5890 ret = -EINVAL; 5962 5891 goto out; 5963 5892 } 5964 5893 5965 - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5966 - !cpumask_equal(&p->cpus_allowed, new_mask))) { 5894 + if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { 5967 5895 ret = -EINVAL; 5968 5896 goto out; 5969 5897 } ··· 5971 5917 goto out; 5972 5918 5973 5919 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5974 - if (migrate_task(p, rq)) { 5920 + if (p->on_rq) { 5975 5921 struct migration_arg arg = { p, dest_cpu }; 5976 5922 /* Need help from migration thread: drop lock and wait. */ 5977 - task_rq_unlock(rq, &flags); 5923 + task_rq_unlock(rq, p, &flags); 5978 5924 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 5979 5925 tlb_migrate_finish(p->mm); 5980 5926 return 0; 5981 5927 } 5982 5928 out: 5983 - task_rq_unlock(rq, &flags); 5929 + task_rq_unlock(rq, p, &flags); 5984 5930 5985 5931 return ret; 5986 5932 } ··· 6008 5954 rq_src = cpu_rq(src_cpu); 6009 5955 rq_dest = cpu_rq(dest_cpu); 6010 5956 5957 + raw_spin_lock(&p->pi_lock); 6011 5958 double_rq_lock(rq_src, rq_dest); 6012 5959 /* Already moved. */ 6013 5960 if (task_cpu(p) != src_cpu) ··· 6021 5966 * If we're not on a rq, the next wake-up will ensure we're 6022 5967 * placed properly. 6023 5968 */ 6024 - if (p->se.on_rq) { 5969 + if (p->on_rq) { 6025 5970 deactivate_task(rq_src, p, 0); 6026 5971 set_task_cpu(p, dest_cpu); 6027 5972 activate_task(rq_dest, p, 0); ··· 6031 5976 ret = 1; 6032 5977 fail: 6033 5978 double_rq_unlock(rq_src, rq_dest); 5979 + raw_spin_unlock(&p->pi_lock); 6034 5980 return ret; 6035 5981 } 6036 5982 ··· 6372 6316 6373 6317 #ifdef CONFIG_HOTPLUG_CPU 6374 6318 case CPU_DYING: 6319 + sched_ttwu_pending(); 6375 6320 /* Update our root-domain */ 6376 6321 raw_spin_lock_irqsave(&rq->lock, flags); 6377 6322 if (rq->rd) { ··· 6450 6393 #endif 6451 6394 6452 6395 #ifdef CONFIG_SMP 6396 + 6397 + static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ 6453 6398 6454 6399 #ifdef CONFIG_SCHED_DEBUG 6455 6400 ··· 6548 6489 6549 6490 static void sched_domain_debug(struct sched_domain *sd, int cpu) 6550 6491 { 6551 - cpumask_var_t groupmask; 6552 6492 int level = 0; 6553 6493 6554 6494 if (!sched_domain_debug_enabled) ··· 6560 6502 6561 6503 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6562 6504 6563 - if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { 6564 - printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 6565 - return; 6566 - } 6567 - 6568 6505 for (;;) { 6569 - if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6506 + if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 6570 6507 break; 6571 6508 level++; 6572 6509 sd = sd->parent; 6573 6510 if (!sd) 6574 6511 break; 6575 6512 } 6576 - free_cpumask_var(groupmask); 6577 6513 } 6578 6514 #else /* !CONFIG_SCHED_DEBUG */ 6579 6515 # define sched_domain_debug(sd, cpu) do { } while (0) ··· 6624 6572 return 1; 6625 6573 } 6626 6574 6627 - static void free_rootdomain(struct root_domain *rd) 6575 + static void free_rootdomain(struct rcu_head *rcu) 6628 6576 { 6629 - synchronize_sched(); 6577 + struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 6630 6578 6631 6579 cpupri_cleanup(&rd->cpupri); 6632 - 6633 6580 free_cpumask_var(rd->rto_mask); 6634 6581 free_cpumask_var(rd->online); 6635 6582 free_cpumask_var(rd->span); ··· 6669 6618 raw_spin_unlock_irqrestore(&rq->lock, flags); 6670 6619 6671 6620 if (old_rd) 6672 - free_rootdomain(old_rd); 6621 + call_rcu_sched(&old_rd->rcu, free_rootdomain); 6673 6622 } 6674 6623 6675 6624 static int init_rootdomain(struct root_domain *rd) ··· 6720 6669 return rd; 6721 6670 } 6722 6671 6672 + static void free_sched_domain(struct rcu_head *rcu) 6673 + { 6674 + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 6675 + if (atomic_dec_and_test(&sd->groups->ref)) 6676 + kfree(sd->groups); 6677 + kfree(sd); 6678 + } 6679 + 6680 + static void destroy_sched_domain(struct sched_domain *sd, int cpu) 6681 + { 6682 + call_rcu(&sd->rcu, free_sched_domain); 6683 + } 6684 + 6685 + static void destroy_sched_domains(struct sched_domain *sd, int cpu) 6686 + { 6687 + for (; sd; sd = sd->parent) 6688 + destroy_sched_domain(sd, cpu); 6689 + } 6690 + 6723 6691 /* 6724 6692 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6725 6693 * hold the hotplug lock. ··· 6748 6678 { 6749 6679 struct rq *rq = cpu_rq(cpu); 6750 6680 struct sched_domain *tmp; 6751 - 6752 - for (tmp = sd; tmp; tmp = tmp->parent) 6753 - tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); 6754 6681 6755 6682 /* Remove the sched domains which do not contribute to scheduling. */ 6756 6683 for (tmp = sd; tmp; ) { ··· 6759 6692 tmp->parent = parent->parent; 6760 6693 if (parent->parent) 6761 6694 parent->parent->child = tmp; 6695 + destroy_sched_domain(parent, cpu); 6762 6696 } else 6763 6697 tmp = tmp->parent; 6764 6698 } 6765 6699 6766 6700 if (sd && sd_degenerate(sd)) { 6701 + tmp = sd; 6767 6702 sd = sd->parent; 6703 + destroy_sched_domain(tmp, cpu); 6768 6704 if (sd) 6769 6705 sd->child = NULL; 6770 6706 } ··· 6775 6705 sched_domain_debug(sd, cpu); 6776 6706 6777 6707 rq_attach_root(rq, rd); 6708 + tmp = rq->sd; 6778 6709 rcu_assign_pointer(rq->sd, sd); 6710 + destroy_sched_domains(tmp, cpu); 6779 6711 } 6780 6712 6781 6713 /* cpus with isolated domains */ ··· 6792 6720 } 6793 6721 6794 6722 __setup("isolcpus=", isolated_cpu_setup); 6795 - 6796 - /* 6797 - * init_sched_build_groups takes the cpumask we wish to span, and a pointer 6798 - * to a function which identifies what group(along with sched group) a CPU 6799 - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids 6800 - * (due to the fact that we keep track of groups covered with a struct cpumask). 6801 - * 6802 - * init_sched_build_groups will build a circular linked list of the groups 6803 - * covered by the given span, and will set each group's ->cpumask correctly, 6804 - * and ->cpu_power to 0. 6805 - */ 6806 - static void 6807 - init_sched_build_groups(const struct cpumask *span, 6808 - const struct cpumask *cpu_map, 6809 - int (*group_fn)(int cpu, const struct cpumask *cpu_map, 6810 - struct sched_group **sg, 6811 - struct cpumask *tmpmask), 6812 - struct cpumask *covered, struct cpumask *tmpmask) 6813 - { 6814 - struct sched_group *first = NULL, *last = NULL; 6815 - int i; 6816 - 6817 - cpumask_clear(covered); 6818 - 6819 - for_each_cpu(i, span) { 6820 - struct sched_group *sg; 6821 - int group = group_fn(i, cpu_map, &sg, tmpmask); 6822 - int j; 6823 - 6824 - if (cpumask_test_cpu(i, covered)) 6825 - continue; 6826 - 6827 - cpumask_clear(sched_group_cpus(sg)); 6828 - sg->cpu_power = 0; 6829 - 6830 - for_each_cpu(j, span) { 6831 - if (group_fn(j, cpu_map, NULL, tmpmask) != group) 6832 - continue; 6833 - 6834 - cpumask_set_cpu(j, covered); 6835 - cpumask_set_cpu(j, sched_group_cpus(sg)); 6836 - } 6837 - if (!first) 6838 - first = sg; 6839 - if (last) 6840 - last->next = sg; 6841 - last = sg; 6842 - } 6843 - last->next = first; 6844 - } 6845 6723 6846 6724 #define SD_NODES_PER_DOMAIN 16 6847 6725 ··· 6809 6787 */ 6810 6788 static int find_next_best_node(int node, nodemask_t *used_nodes) 6811 6789 { 6812 - int i, n, val, min_val, best_node = 0; 6790 + int i, n, val, min_val, best_node = -1; 6813 6791 6814 6792 min_val = INT_MAX; 6815 6793 ··· 6833 6811 } 6834 6812 } 6835 6813 6836 - node_set(best_node, *used_nodes); 6814 + if (best_node != -1) 6815 + node_set(best_node, *used_nodes); 6837 6816 return best_node; 6838 6817 } 6839 6818 ··· 6860 6837 6861 6838 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6862 6839 int next_node = find_next_best_node(node, &used_nodes); 6863 - 6840 + if (next_node < 0) 6841 + break; 6864 6842 cpumask_or(span, span, cpumask_of_node(next_node)); 6865 6843 } 6866 6844 } 6845 + 6846 + static const struct cpumask *cpu_node_mask(int cpu) 6847 + { 6848 + lockdep_assert_held(&sched_domains_mutex); 6849 + 6850 + sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); 6851 + 6852 + return sched_domains_tmpmask; 6853 + } 6854 + 6855 + static const struct cpumask *cpu_allnodes_mask(int cpu) 6856 + { 6857 + return cpu_possible_mask; 6858 + } 6867 6859 #endif /* CONFIG_NUMA */ 6860 + 6861 + static const struct cpumask *cpu_cpu_mask(int cpu) 6862 + { 6863 + return cpumask_of_node(cpu_to_node(cpu)); 6864 + } 6868 6865 6869 6866 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6870 6867 6871 - /* 6872 - * The cpus mask in sched_group and sched_domain hangs off the end. 6873 - * 6874 - * ( See the the comments in include/linux/sched.h:struct sched_group 6875 - * and struct sched_domain. ) 6876 - */ 6877 - struct static_sched_group { 6878 - struct sched_group sg; 6879 - DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); 6880 - }; 6881 - 6882 - struct static_sched_domain { 6883 - struct sched_domain sd; 6884 - DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6868 + struct sd_data { 6869 + struct sched_domain **__percpu sd; 6870 + struct sched_group **__percpu sg; 6885 6871 }; 6886 6872 6887 6873 struct s_data { 6888 - #ifdef CONFIG_NUMA 6889 - int sd_allnodes; 6890 - cpumask_var_t domainspan; 6891 - cpumask_var_t covered; 6892 - cpumask_var_t notcovered; 6893 - #endif 6894 - cpumask_var_t nodemask; 6895 - cpumask_var_t this_sibling_map; 6896 - cpumask_var_t this_core_map; 6897 - cpumask_var_t this_book_map; 6898 - cpumask_var_t send_covered; 6899 - cpumask_var_t tmpmask; 6900 - struct sched_group **sched_group_nodes; 6874 + struct sched_domain ** __percpu sd; 6901 6875 struct root_domain *rd; 6902 6876 }; 6903 6877 6904 6878 enum s_alloc { 6905 - sa_sched_groups = 0, 6906 6879 sa_rootdomain, 6907 - sa_tmpmask, 6908 - sa_send_covered, 6909 - sa_this_book_map, 6910 - sa_this_core_map, 6911 - sa_this_sibling_map, 6912 - sa_nodemask, 6913 - sa_sched_group_nodes, 6914 - #ifdef CONFIG_NUMA 6915 - sa_notcovered, 6916 - sa_covered, 6917 - sa_domainspan, 6918 - #endif 6880 + sa_sd, 6881 + sa_sd_storage, 6919 6882 sa_none, 6920 6883 }; 6921 6884 6922 - /* 6923 - * SMT sched-domains: 6924 - */ 6925 - #ifdef CONFIG_SCHED_SMT 6926 - static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 6927 - static DEFINE_PER_CPU(struct static_sched_group, sched_groups); 6885 + struct sched_domain_topology_level; 6928 6886 6929 - static int 6930 - cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6931 - struct sched_group **sg, struct cpumask *unused) 6887 + typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); 6888 + typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 6889 + 6890 + struct sched_domain_topology_level { 6891 + sched_domain_init_f init; 6892 + sched_domain_mask_f mask; 6893 + struct sd_data data; 6894 + }; 6895 + 6896 + /* 6897 + * Assumes the sched_domain tree is fully constructed 6898 + */ 6899 + static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 6932 6900 { 6901 + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 6902 + struct sched_domain *child = sd->child; 6903 + 6904 + if (child) 6905 + cpu = cpumask_first(sched_domain_span(child)); 6906 + 6933 6907 if (sg) 6934 - *sg = &per_cpu(sched_groups, cpu).sg; 6908 + *sg = *per_cpu_ptr(sdd->sg, cpu); 6909 + 6935 6910 return cpu; 6936 6911 } 6937 - #endif /* CONFIG_SCHED_SMT */ 6938 6912 6939 6913 /* 6940 - * multi-core sched-domains: 6914 + * build_sched_groups takes the cpumask we wish to span, and a pointer 6915 + * to a function which identifies what group(along with sched group) a CPU 6916 + * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids 6917 + * (due to the fact that we keep track of groups covered with a struct cpumask). 6918 + * 6919 + * build_sched_groups will build a circular linked list of the groups 6920 + * covered by the given span, and will set each group's ->cpumask correctly, 6921 + * and ->cpu_power to 0. 6941 6922 */ 6942 - #ifdef CONFIG_SCHED_MC 6943 - static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6944 - static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6945 - 6946 - static int 6947 - cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6948 - struct sched_group **sg, struct cpumask *mask) 6923 + static void 6924 + build_sched_groups(struct sched_domain *sd) 6949 6925 { 6950 - int group; 6951 - #ifdef CONFIG_SCHED_SMT 6952 - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6953 - group = cpumask_first(mask); 6954 - #else 6955 - group = cpu; 6956 - #endif 6957 - if (sg) 6958 - *sg = &per_cpu(sched_group_core, group).sg; 6959 - return group; 6960 - } 6961 - #endif /* CONFIG_SCHED_MC */ 6926 + struct sched_group *first = NULL, *last = NULL; 6927 + struct sd_data *sdd = sd->private; 6928 + const struct cpumask *span = sched_domain_span(sd); 6929 + struct cpumask *covered; 6930 + int i; 6962 6931 6963 - /* 6964 - * book sched-domains: 6965 - */ 6966 - #ifdef CONFIG_SCHED_BOOK 6967 - static DEFINE_PER_CPU(struct static_sched_domain, book_domains); 6968 - static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); 6932 + lockdep_assert_held(&sched_domains_mutex); 6933 + covered = sched_domains_tmpmask; 6969 6934 6970 - static int 6971 - cpu_to_book_group(int cpu, const struct cpumask *cpu_map, 6972 - struct sched_group **sg, struct cpumask *mask) 6973 - { 6974 - int group = cpu; 6975 - #ifdef CONFIG_SCHED_MC 6976 - cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 6977 - group = cpumask_first(mask); 6978 - #elif defined(CONFIG_SCHED_SMT) 6979 - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6980 - group = cpumask_first(mask); 6981 - #endif 6982 - if (sg) 6983 - *sg = &per_cpu(sched_group_book, group).sg; 6984 - return group; 6985 - } 6986 - #endif /* CONFIG_SCHED_BOOK */ 6935 + cpumask_clear(covered); 6987 6936 6988 - static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6989 - static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6937 + for_each_cpu(i, span) { 6938 + struct sched_group *sg; 6939 + int group = get_group(i, sdd, &sg); 6940 + int j; 6990 6941 6991 - static int 6992 - cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, 6993 - struct sched_group **sg, struct cpumask *mask) 6994 - { 6995 - int group; 6996 - #ifdef CONFIG_SCHED_BOOK 6997 - cpumask_and(mask, cpu_book_mask(cpu), cpu_map); 6998 - group = cpumask_first(mask); 6999 - #elif defined(CONFIG_SCHED_MC) 7000 - cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 7001 - group = cpumask_first(mask); 7002 - #elif defined(CONFIG_SCHED_SMT) 7003 - cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 7004 - group = cpumask_first(mask); 7005 - #else 7006 - group = cpu; 7007 - #endif 7008 - if (sg) 7009 - *sg = &per_cpu(sched_group_phys, group).sg; 7010 - return group; 7011 - } 7012 - 7013 - #ifdef CONFIG_NUMA 7014 - /* 7015 - * The init_sched_build_groups can't handle what we want to do with node 7016 - * groups, so roll our own. Now each node has its own list of groups which 7017 - * gets dynamically allocated. 7018 - */ 7019 - static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 7020 - static struct sched_group ***sched_group_nodes_bycpu; 7021 - 7022 - static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); 7023 - static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); 7024 - 7025 - static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, 7026 - struct sched_group **sg, 7027 - struct cpumask *nodemask) 7028 - { 7029 - int group; 7030 - 7031 - cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); 7032 - group = cpumask_first(nodemask); 7033 - 7034 - if (sg) 7035 - *sg = &per_cpu(sched_group_allnodes, group).sg; 7036 - return group; 7037 - } 7038 - 7039 - static void init_numa_sched_groups_power(struct sched_group *group_head) 7040 - { 7041 - struct sched_group *sg = group_head; 7042 - int j; 7043 - 7044 - if (!sg) 7045 - return; 7046 - do { 7047 - for_each_cpu(j, sched_group_cpus(sg)) { 7048 - struct sched_domain *sd; 7049 - 7050 - sd = &per_cpu(phys_domains, j).sd; 7051 - if (j != group_first_cpu(sd->groups)) { 7052 - /* 7053 - * Only add "power" once for each 7054 - * physical package. 7055 - */ 7056 - continue; 7057 - } 7058 - 7059 - sg->cpu_power += sd->groups->cpu_power; 7060 - } 7061 - sg = sg->next; 7062 - } while (sg != group_head); 7063 - } 7064 - 7065 - static int build_numa_sched_groups(struct s_data *d, 7066 - const struct cpumask *cpu_map, int num) 7067 - { 7068 - struct sched_domain *sd; 7069 - struct sched_group *sg, *prev; 7070 - int n, j; 7071 - 7072 - cpumask_clear(d->covered); 7073 - cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); 7074 - if (cpumask_empty(d->nodemask)) { 7075 - d->sched_group_nodes[num] = NULL; 7076 - goto out; 7077 - } 7078 - 7079 - sched_domain_node_span(num, d->domainspan); 7080 - cpumask_and(d->domainspan, d->domainspan, cpu_map); 7081 - 7082 - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 7083 - GFP_KERNEL, num); 7084 - if (!sg) { 7085 - printk(KERN_WARNING "Can not alloc domain group for node %d\n", 7086 - num); 7087 - return -ENOMEM; 7088 - } 7089 - d->sched_group_nodes[num] = sg; 7090 - 7091 - for_each_cpu(j, d->nodemask) { 7092 - sd = &per_cpu(node_domains, j).sd; 7093 - sd->groups = sg; 7094 - } 7095 - 7096 - sg->cpu_power = 0; 7097 - cpumask_copy(sched_group_cpus(sg), d->nodemask); 7098 - sg->next = sg; 7099 - cpumask_or(d->covered, d->covered, d->nodemask); 7100 - 7101 - prev = sg; 7102 - for (j = 0; j < nr_node_ids; j++) { 7103 - n = (num + j) % nr_node_ids; 7104 - cpumask_complement(d->notcovered, d->covered); 7105 - cpumask_and(d->tmpmask, d->notcovered, cpu_map); 7106 - cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); 7107 - if (cpumask_empty(d->tmpmask)) 7108 - break; 7109 - cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); 7110 - if (cpumask_empty(d->tmpmask)) 6942 + if (cpumask_test_cpu(i, covered)) 7111 6943 continue; 7112 - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 7113 - GFP_KERNEL, num); 7114 - if (!sg) { 7115 - printk(KERN_WARNING 7116 - "Can not alloc domain group for node %d\n", j); 7117 - return -ENOMEM; 7118 - } 6944 + 6945 + cpumask_clear(sched_group_cpus(sg)); 7119 6946 sg->cpu_power = 0; 7120 - cpumask_copy(sched_group_cpus(sg), d->tmpmask); 7121 - sg->next = prev->next; 7122 - cpumask_or(d->covered, d->covered, d->tmpmask); 7123 - prev->next = sg; 7124 - prev = sg; 7125 - } 7126 - out: 7127 - return 0; 7128 - } 7129 - #endif /* CONFIG_NUMA */ 7130 6947 7131 - #ifdef CONFIG_NUMA 7132 - /* Free memory allocated for various sched_group structures */ 7133 - static void free_sched_groups(const struct cpumask *cpu_map, 7134 - struct cpumask *nodemask) 7135 - { 7136 - int cpu, i; 7137 - 7138 - for_each_cpu(cpu, cpu_map) { 7139 - struct sched_group **sched_group_nodes 7140 - = sched_group_nodes_bycpu[cpu]; 7141 - 7142 - if (!sched_group_nodes) 7143 - continue; 7144 - 7145 - for (i = 0; i < nr_node_ids; i++) { 7146 - struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7147 - 7148 - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 7149 - if (cpumask_empty(nodemask)) 6948 + for_each_cpu(j, span) { 6949 + if (get_group(j, sdd, NULL) != group) 7150 6950 continue; 7151 6951 7152 - if (sg == NULL) 7153 - continue; 7154 - sg = sg->next; 7155 - next_sg: 7156 - oldsg = sg; 7157 - sg = sg->next; 7158 - kfree(oldsg); 7159 - if (oldsg != sched_group_nodes[i]) 7160 - goto next_sg; 6952 + cpumask_set_cpu(j, covered); 6953 + cpumask_set_cpu(j, sched_group_cpus(sg)); 7161 6954 } 7162 - kfree(sched_group_nodes); 7163 - sched_group_nodes_bycpu[cpu] = NULL; 6955 + 6956 + if (!first) 6957 + first = sg; 6958 + if (last) 6959 + last->next = sg; 6960 + last = sg; 7164 6961 } 6962 + last->next = first; 7165 6963 } 7166 - #else /* !CONFIG_NUMA */ 7167 - static void free_sched_groups(const struct cpumask *cpu_map, 7168 - struct cpumask *nodemask) 7169 - { 7170 - } 7171 - #endif /* CONFIG_NUMA */ 7172 6964 7173 6965 /* 7174 6966 * Initialize sched groups cpu_power. ··· 6997 7159 */ 6998 7160 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 6999 7161 { 7000 - struct sched_domain *child; 7001 - struct sched_group *group; 7002 - long power; 7003 - int weight; 7004 - 7005 7162 WARN_ON(!sd || !sd->groups); 7006 7163 7007 7164 if (cpu != group_first_cpu(sd->groups)) ··· 7004 7171 7005 7172 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7006 7173 7007 - child = sd->child; 7008 - 7009 - sd->groups->cpu_power = 0; 7010 - 7011 - if (!child) { 7012 - power = SCHED_LOAD_SCALE; 7013 - weight = cpumask_weight(sched_domain_span(sd)); 7014 - /* 7015 - * SMT siblings share the power of a single core. 7016 - * Usually multiple threads get a better yield out of 7017 - * that one core than a single thread would have, 7018 - * reflect that in sd->smt_gain. 7019 - */ 7020 - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 7021 - power *= sd->smt_gain; 7022 - power /= weight; 7023 - power >>= SCHED_LOAD_SHIFT; 7024 - } 7025 - sd->groups->cpu_power += power; 7026 - return; 7027 - } 7028 - 7029 - /* 7030 - * Add cpu_power of each child group to this groups cpu_power. 7031 - */ 7032 - group = child->groups; 7033 - do { 7034 - sd->groups->cpu_power += group->cpu_power; 7035 - group = group->next; 7036 - } while (group != child->groups); 7174 + update_group_power(sd, cpu); 7037 7175 } 7038 7176 7039 7177 /* ··· 7018 7214 # define SD_INIT_NAME(sd, type) do { } while (0) 7019 7215 #endif 7020 7216 7021 - #define SD_INIT(sd, type) sd_init_##type(sd) 7022 - 7023 - #define SD_INIT_FUNC(type) \ 7024 - static noinline void sd_init_##type(struct sched_domain *sd) \ 7025 - { \ 7026 - memset(sd, 0, sizeof(*sd)); \ 7027 - *sd = SD_##type##_INIT; \ 7028 - sd->level = SD_LV_##type; \ 7029 - SD_INIT_NAME(sd, type); \ 7217 + #define SD_INIT_FUNC(type) \ 7218 + static noinline struct sched_domain * \ 7219 + sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ 7220 + { \ 7221 + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ 7222 + *sd = SD_##type##_INIT; \ 7223 + SD_INIT_NAME(sd, type); \ 7224 + sd->private = &tl->data; \ 7225 + return sd; \ 7030 7226 } 7031 7227 7032 7228 SD_INIT_FUNC(CPU) ··· 7045 7241 #endif 7046 7242 7047 7243 static int default_relax_domain_level = -1; 7244 + int sched_domain_level_max; 7048 7245 7049 7246 static int __init setup_relax_domain_level(char *str) 7050 7247 { 7051 7248 unsigned long val; 7052 7249 7053 7250 val = simple_strtoul(str, NULL, 0); 7054 - if (val < SD_LV_MAX) 7251 + if (val < sched_domain_level_max) 7055 7252 default_relax_domain_level = val; 7056 7253 7057 7254 return 1; ··· 7080 7275 } 7081 7276 } 7082 7277 7278 + static void __sdt_free(const struct cpumask *cpu_map); 7279 + static int __sdt_alloc(const struct cpumask *cpu_map); 7280 + 7083 7281 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7084 7282 const struct cpumask *cpu_map) 7085 7283 { 7086 7284 switch (what) { 7087 - case sa_sched_groups: 7088 - free_sched_groups(cpu_map, d->tmpmask); /* fall through */ 7089 - d->sched_group_nodes = NULL; 7090 7285 case sa_rootdomain: 7091 - free_rootdomain(d->rd); /* fall through */ 7092 - case sa_tmpmask: 7093 - free_cpumask_var(d->tmpmask); /* fall through */ 7094 - case sa_send_covered: 7095 - free_cpumask_var(d->send_covered); /* fall through */ 7096 - case sa_this_book_map: 7097 - free_cpumask_var(d->this_book_map); /* fall through */ 7098 - case sa_this_core_map: 7099 - free_cpumask_var(d->this_core_map); /* fall through */ 7100 - case sa_this_sibling_map: 7101 - free_cpumask_var(d->this_sibling_map); /* fall through */ 7102 - case sa_nodemask: 7103 - free_cpumask_var(d->nodemask); /* fall through */ 7104 - case sa_sched_group_nodes: 7105 - #ifdef CONFIG_NUMA 7106 - kfree(d->sched_group_nodes); /* fall through */ 7107 - case sa_notcovered: 7108 - free_cpumask_var(d->notcovered); /* fall through */ 7109 - case sa_covered: 7110 - free_cpumask_var(d->covered); /* fall through */ 7111 - case sa_domainspan: 7112 - free_cpumask_var(d->domainspan); /* fall through */ 7113 - #endif 7286 + if (!atomic_read(&d->rd->refcount)) 7287 + free_rootdomain(&d->rd->rcu); /* fall through */ 7288 + case sa_sd: 7289 + free_percpu(d->sd); /* fall through */ 7290 + case sa_sd_storage: 7291 + __sdt_free(cpu_map); /* fall through */ 7114 7292 case sa_none: 7115 7293 break; 7116 7294 } ··· 7102 7314 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7103 7315 const struct cpumask *cpu_map) 7104 7316 { 7105 - #ifdef CONFIG_NUMA 7106 - if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7107 - return sa_none; 7108 - if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7109 - return sa_domainspan; 7110 - if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7111 - return sa_covered; 7112 - /* Allocate the per-node list of sched groups */ 7113 - d->sched_group_nodes = kcalloc(nr_node_ids, 7114 - sizeof(struct sched_group *), GFP_KERNEL); 7115 - if (!d->sched_group_nodes) { 7116 - printk(KERN_WARNING "Can not alloc sched group node list\n"); 7117 - return sa_notcovered; 7118 - } 7119 - sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; 7120 - #endif 7121 - if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) 7122 - return sa_sched_group_nodes; 7123 - if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) 7124 - return sa_nodemask; 7125 - if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 7126 - return sa_this_sibling_map; 7127 - if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) 7128 - return sa_this_core_map; 7129 - if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 7130 - return sa_this_book_map; 7131 - if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 7132 - return sa_send_covered; 7317 + memset(d, 0, sizeof(*d)); 7318 + 7319 + if (__sdt_alloc(cpu_map)) 7320 + return sa_sd_storage; 7321 + d->sd = alloc_percpu(struct sched_domain *); 7322 + if (!d->sd) 7323 + return sa_sd_storage; 7133 7324 d->rd = alloc_rootdomain(); 7134 - if (!d->rd) { 7135 - printk(KERN_WARNING "Cannot alloc root domain\n"); 7136 - return sa_tmpmask; 7137 - } 7325 + if (!d->rd) 7326 + return sa_sd; 7138 7327 return sa_rootdomain; 7139 7328 } 7140 7329 7141 - static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7142 - const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7330 + /* 7331 + * NULL the sd_data elements we've used to build the sched_domain and 7332 + * sched_group structure so that the subsequent __free_domain_allocs() 7333 + * will not free the data we're using. 7334 + */ 7335 + static void claim_allocations(int cpu, struct sched_domain *sd) 7143 7336 { 7144 - struct sched_domain *sd = NULL; 7145 - #ifdef CONFIG_NUMA 7146 - struct sched_domain *parent; 7337 + struct sd_data *sdd = sd->private; 7338 + struct sched_group *sg = sd->groups; 7147 7339 7148 - d->sd_allnodes = 0; 7149 - if (cpumask_weight(cpu_map) > 7150 - SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { 7151 - sd = &per_cpu(allnodes_domains, i).sd; 7152 - SD_INIT(sd, ALLNODES); 7153 - set_domain_attribute(sd, attr); 7154 - cpumask_copy(sched_domain_span(sd), cpu_map); 7155 - cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); 7156 - d->sd_allnodes = 1; 7340 + WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 7341 + *per_cpu_ptr(sdd->sd, cpu) = NULL; 7342 + 7343 + if (cpu == cpumask_first(sched_group_cpus(sg))) { 7344 + WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); 7345 + *per_cpu_ptr(sdd->sg, cpu) = NULL; 7157 7346 } 7158 - parent = sd; 7159 - 7160 - sd = &per_cpu(node_domains, i).sd; 7161 - SD_INIT(sd, NODE); 7162 - set_domain_attribute(sd, attr); 7163 - sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 7164 - sd->parent = parent; 7165 - if (parent) 7166 - parent->child = sd; 7167 - cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); 7168 - #endif 7169 - return sd; 7170 7347 } 7171 7348 7172 - static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7173 - const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7174 - struct sched_domain *parent, int i) 7175 - { 7176 - struct sched_domain *sd; 7177 - sd = &per_cpu(phys_domains, i).sd; 7178 - SD_INIT(sd, CPU); 7179 - set_domain_attribute(sd, attr); 7180 - cpumask_copy(sched_domain_span(sd), d->nodemask); 7181 - sd->parent = parent; 7182 - if (parent) 7183 - parent->child = sd; 7184 - cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); 7185 - return sd; 7186 - } 7187 - 7188 - static struct sched_domain *__build_book_sched_domain(struct s_data *d, 7189 - const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7190 - struct sched_domain *parent, int i) 7191 - { 7192 - struct sched_domain *sd = parent; 7193 - #ifdef CONFIG_SCHED_BOOK 7194 - sd = &per_cpu(book_domains, i).sd; 7195 - SD_INIT(sd, BOOK); 7196 - set_domain_attribute(sd, attr); 7197 - cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); 7198 - sd->parent = parent; 7199 - parent->child = sd; 7200 - cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); 7201 - #endif 7202 - return sd; 7203 - } 7204 - 7205 - static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7206 - const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7207 - struct sched_domain *parent, int i) 7208 - { 7209 - struct sched_domain *sd = parent; 7210 - #ifdef CONFIG_SCHED_MC 7211 - sd = &per_cpu(core_domains, i).sd; 7212 - SD_INIT(sd, MC); 7213 - set_domain_attribute(sd, attr); 7214 - cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); 7215 - sd->parent = parent; 7216 - parent->child = sd; 7217 - cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); 7218 - #endif 7219 - return sd; 7220 - } 7221 - 7222 - static struct sched_domain *__build_smt_sched_domain(struct s_data *d, 7223 - const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7224 - struct sched_domain *parent, int i) 7225 - { 7226 - struct sched_domain *sd = parent; 7227 7349 #ifdef CONFIG_SCHED_SMT 7228 - sd = &per_cpu(cpu_domains, i).sd; 7229 - SD_INIT(sd, SIBLING); 7230 - set_domain_attribute(sd, attr); 7231 - cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); 7232 - sd->parent = parent; 7233 - parent->child = sd; 7234 - cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); 7235 - #endif 7236 - return sd; 7237 - } 7238 - 7239 - static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7240 - const struct cpumask *cpu_map, int cpu) 7350 + static const struct cpumask *cpu_smt_mask(int cpu) 7241 7351 { 7242 - switch (l) { 7352 + return topology_thread_cpumask(cpu); 7353 + } 7354 + #endif 7355 + 7356 + /* 7357 + * Topology list, bottom-up. 7358 + */ 7359 + static struct sched_domain_topology_level default_topology[] = { 7243 7360 #ifdef CONFIG_SCHED_SMT 7244 - case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7245 - cpumask_and(d->this_sibling_map, cpu_map, 7246 - topology_thread_cpumask(cpu)); 7247 - if (cpu == cpumask_first(d->this_sibling_map)) 7248 - init_sched_build_groups(d->this_sibling_map, cpu_map, 7249 - &cpu_to_cpu_group, 7250 - d->send_covered, d->tmpmask); 7251 - break; 7361 + { sd_init_SIBLING, cpu_smt_mask, }, 7252 7362 #endif 7253 7363 #ifdef CONFIG_SCHED_MC 7254 - case SD_LV_MC: /* set up multi-core groups */ 7255 - cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); 7256 - if (cpu == cpumask_first(d->this_core_map)) 7257 - init_sched_build_groups(d->this_core_map, cpu_map, 7258 - &cpu_to_core_group, 7259 - d->send_covered, d->tmpmask); 7260 - break; 7364 + { sd_init_MC, cpu_coregroup_mask, }, 7261 7365 #endif 7262 7366 #ifdef CONFIG_SCHED_BOOK 7263 - case SD_LV_BOOK: /* set up book groups */ 7264 - cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); 7265 - if (cpu == cpumask_first(d->this_book_map)) 7266 - init_sched_build_groups(d->this_book_map, cpu_map, 7267 - &cpu_to_book_group, 7268 - d->send_covered, d->tmpmask); 7269 - break; 7367 + { sd_init_BOOK, cpu_book_mask, }, 7270 7368 #endif 7271 - case SD_LV_CPU: /* set up physical groups */ 7272 - cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7273 - if (!cpumask_empty(d->nodemask)) 7274 - init_sched_build_groups(d->nodemask, cpu_map, 7275 - &cpu_to_phys_group, 7276 - d->send_covered, d->tmpmask); 7277 - break; 7369 + { sd_init_CPU, cpu_cpu_mask, }, 7278 7370 #ifdef CONFIG_NUMA 7279 - case SD_LV_ALLNODES: 7280 - init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7281 - d->send_covered, d->tmpmask); 7282 - break; 7371 + { sd_init_NODE, cpu_node_mask, }, 7372 + { sd_init_ALLNODES, cpu_allnodes_mask, }, 7283 7373 #endif 7284 - default: 7285 - break; 7374 + { NULL, }, 7375 + }; 7376 + 7377 + static struct sched_domain_topology_level *sched_domain_topology = default_topology; 7378 + 7379 + static int __sdt_alloc(const struct cpumask *cpu_map) 7380 + { 7381 + struct sched_domain_topology_level *tl; 7382 + int j; 7383 + 7384 + for (tl = sched_domain_topology; tl->init; tl++) { 7385 + struct sd_data *sdd = &tl->data; 7386 + 7387 + sdd->sd = alloc_percpu(struct sched_domain *); 7388 + if (!sdd->sd) 7389 + return -ENOMEM; 7390 + 7391 + sdd->sg = alloc_percpu(struct sched_group *); 7392 + if (!sdd->sg) 7393 + return -ENOMEM; 7394 + 7395 + for_each_cpu(j, cpu_map) { 7396 + struct sched_domain *sd; 7397 + struct sched_group *sg; 7398 + 7399 + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 7400 + GFP_KERNEL, cpu_to_node(j)); 7401 + if (!sd) 7402 + return -ENOMEM; 7403 + 7404 + *per_cpu_ptr(sdd->sd, j) = sd; 7405 + 7406 + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 7407 + GFP_KERNEL, cpu_to_node(j)); 7408 + if (!sg) 7409 + return -ENOMEM; 7410 + 7411 + *per_cpu_ptr(sdd->sg, j) = sg; 7412 + } 7286 7413 } 7414 + 7415 + return 0; 7416 + } 7417 + 7418 + static void __sdt_free(const struct cpumask *cpu_map) 7419 + { 7420 + struct sched_domain_topology_level *tl; 7421 + int j; 7422 + 7423 + for (tl = sched_domain_topology; tl->init; tl++) { 7424 + struct sd_data *sdd = &tl->data; 7425 + 7426 + for_each_cpu(j, cpu_map) { 7427 + kfree(*per_cpu_ptr(sdd->sd, j)); 7428 + kfree(*per_cpu_ptr(sdd->sg, j)); 7429 + } 7430 + free_percpu(sdd->sd); 7431 + free_percpu(sdd->sg); 7432 + } 7433 + } 7434 + 7435 + struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 7436 + struct s_data *d, const struct cpumask *cpu_map, 7437 + struct sched_domain_attr *attr, struct sched_domain *child, 7438 + int cpu) 7439 + { 7440 + struct sched_domain *sd = tl->init(tl, cpu); 7441 + if (!sd) 7442 + return child; 7443 + 7444 + set_domain_attribute(sd, attr); 7445 + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 7446 + if (child) { 7447 + sd->level = child->level + 1; 7448 + sched_domain_level_max = max(sched_domain_level_max, sd->level); 7449 + child->parent = sd; 7450 + } 7451 + sd->child = child; 7452 + 7453 + return sd; 7287 7454 } 7288 7455 7289 7456 /* 7290 7457 * Build sched domains for a given set of cpus and attach the sched domains 7291 7458 * to the individual cpus 7292 7459 */ 7293 - static int __build_sched_domains(const struct cpumask *cpu_map, 7294 - struct sched_domain_attr *attr) 7460 + static int build_sched_domains(const struct cpumask *cpu_map, 7461 + struct sched_domain_attr *attr) 7295 7462 { 7296 7463 enum s_alloc alloc_state = sa_none; 7297 - struct s_data d; 7298 7464 struct sched_domain *sd; 7299 - int i; 7300 - #ifdef CONFIG_NUMA 7301 - d.sd_allnodes = 0; 7302 - #endif 7465 + struct s_data d; 7466 + int i, ret = -ENOMEM; 7303 7467 7304 7468 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7305 7469 if (alloc_state != sa_rootdomain) 7306 7470 goto error; 7307 - alloc_state = sa_sched_groups; 7308 7471 7309 - /* 7310 - * Set up domains for cpus specified by the cpu_map. 7311 - */ 7472 + /* Set up domains for cpus specified by the cpu_map. */ 7312 7473 for_each_cpu(i, cpu_map) { 7313 - cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7314 - cpu_map); 7474 + struct sched_domain_topology_level *tl; 7315 7475 7316 - sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7317 - sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7318 - sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); 7319 - sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7320 - sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7476 + sd = NULL; 7477 + for (tl = sched_domain_topology; tl->init; tl++) 7478 + sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); 7479 + 7480 + while (sd->child) 7481 + sd = sd->child; 7482 + 7483 + *per_cpu_ptr(d.sd, i) = sd; 7321 7484 } 7322 7485 7486 + /* Build the groups for the domains */ 7323 7487 for_each_cpu(i, cpu_map) { 7324 - build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7325 - build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); 7326 - build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7488 + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 7489 + sd->span_weight = cpumask_weight(sched_domain_span(sd)); 7490 + get_group(i, sd->private, &sd->groups); 7491 + atomic_inc(&sd->groups->ref); 7492 + 7493 + if (i != cpumask_first(sched_domain_span(sd))) 7494 + continue; 7495 + 7496 + build_sched_groups(sd); 7497 + } 7327 7498 } 7328 - 7329 - /* Set up physical groups */ 7330 - for (i = 0; i < nr_node_ids; i++) 7331 - build_sched_groups(&d, SD_LV_CPU, cpu_map, i); 7332 - 7333 - #ifdef CONFIG_NUMA 7334 - /* Set up node groups */ 7335 - if (d.sd_allnodes) 7336 - build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); 7337 - 7338 - for (i = 0; i < nr_node_ids; i++) 7339 - if (build_numa_sched_groups(&d, cpu_map, i)) 7340 - goto error; 7341 - #endif 7342 7499 7343 7500 /* Calculate CPU power for physical packages and nodes */ 7344 - #ifdef CONFIG_SCHED_SMT 7345 - for_each_cpu(i, cpu_map) { 7346 - sd = &per_cpu(cpu_domains, i).sd; 7347 - init_sched_groups_power(i, sd); 7348 - } 7349 - #endif 7350 - #ifdef CONFIG_SCHED_MC 7351 - for_each_cpu(i, cpu_map) { 7352 - sd = &per_cpu(core_domains, i).sd; 7353 - init_sched_groups_power(i, sd); 7354 - } 7355 - #endif 7356 - #ifdef CONFIG_SCHED_BOOK 7357 - for_each_cpu(i, cpu_map) { 7358 - sd = &per_cpu(book_domains, i).sd; 7359 - init_sched_groups_power(i, sd); 7360 - } 7361 - #endif 7501 + for (i = nr_cpumask_bits-1; i >= 0; i--) { 7502 + if (!cpumask_test_cpu(i, cpu_map)) 7503 + continue; 7362 7504 7363 - for_each_cpu(i, cpu_map) { 7364 - sd = &per_cpu(phys_domains, i).sd; 7365 - init_sched_groups_power(i, sd); 7505 + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 7506 + claim_allocations(i, sd); 7507 + init_sched_groups_power(i, sd); 7508 + } 7366 7509 } 7367 - 7368 - #ifdef CONFIG_NUMA 7369 - for (i = 0; i < nr_node_ids; i++) 7370 - init_numa_sched_groups_power(d.sched_group_nodes[i]); 7371 - 7372 - if (d.sd_allnodes) { 7373 - struct sched_group *sg; 7374 - 7375 - cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7376 - d.tmpmask); 7377 - init_numa_sched_groups_power(sg); 7378 - } 7379 - #endif 7380 7510 7381 7511 /* Attach the domains */ 7512 + rcu_read_lock(); 7382 7513 for_each_cpu(i, cpu_map) { 7383 - #ifdef CONFIG_SCHED_SMT 7384 - sd = &per_cpu(cpu_domains, i).sd; 7385 - #elif defined(CONFIG_SCHED_MC) 7386 - sd = &per_cpu(core_domains, i).sd; 7387 - #elif defined(CONFIG_SCHED_BOOK) 7388 - sd = &per_cpu(book_domains, i).sd; 7389 - #else 7390 - sd = &per_cpu(phys_domains, i).sd; 7391 - #endif 7514 + sd = *per_cpu_ptr(d.sd, i); 7392 7515 cpu_attach_domain(sd, d.rd, i); 7393 7516 } 7517 + rcu_read_unlock(); 7394 7518 7395 - d.sched_group_nodes = NULL; /* don't free this we still need it */ 7396 - __free_domain_allocs(&d, sa_tmpmask, cpu_map); 7397 - return 0; 7398 - 7519 + ret = 0; 7399 7520 error: 7400 7521 __free_domain_allocs(&d, alloc_state, cpu_map); 7401 - return -ENOMEM; 7402 - } 7403 - 7404 - static int build_sched_domains(const struct cpumask *cpu_map) 7405 - { 7406 - return __build_sched_domains(cpu_map, NULL); 7522 + return ret; 7407 7523 } 7408 7524 7409 7525 static cpumask_var_t *doms_cur; /* current sched domains */ ··· 7362 7670 * For now this just excludes isolated cpus, but could be used to 7363 7671 * exclude other special cases in the future. 7364 7672 */ 7365 - static int arch_init_sched_domains(const struct cpumask *cpu_map) 7673 + static int init_sched_domains(const struct cpumask *cpu_map) 7366 7674 { 7367 7675 int err; 7368 7676 ··· 7373 7681 doms_cur = &fallback_doms; 7374 7682 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7375 7683 dattr_cur = NULL; 7376 - err = build_sched_domains(doms_cur[0]); 7684 + err = build_sched_domains(doms_cur[0], NULL); 7377 7685 register_sched_domain_sysctl(); 7378 7686 7379 7687 return err; 7380 - } 7381 - 7382 - static void arch_destroy_sched_domains(const struct cpumask *cpu_map, 7383 - struct cpumask *tmpmask) 7384 - { 7385 - free_sched_groups(cpu_map, tmpmask); 7386 7688 } 7387 7689 7388 7690 /* ··· 7385 7699 */ 7386 7700 static void detach_destroy_domains(const struct cpumask *cpu_map) 7387 7701 { 7388 - /* Save because hotplug lock held. */ 7389 - static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); 7390 7702 int i; 7391 7703 7704 + rcu_read_lock(); 7392 7705 for_each_cpu(i, cpu_map) 7393 7706 cpu_attach_domain(NULL, &def_root_domain, i); 7394 - synchronize_sched(); 7395 - arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); 7707 + rcu_read_unlock(); 7396 7708 } 7397 7709 7398 7710 /* handle null as "default" */ ··· 7479 7795 goto match2; 7480 7796 } 7481 7797 /* no match - add a new doms_new */ 7482 - __build_sched_domains(doms_new[i], 7483 - dattr_new ? dattr_new + i : NULL); 7798 + build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); 7484 7799 match2: 7485 7800 ; 7486 7801 } ··· 7498 7815 } 7499 7816 7500 7817 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7501 - static void arch_reinit_sched_domains(void) 7818 + static void reinit_sched_domains(void) 7502 7819 { 7503 7820 get_online_cpus(); 7504 7821 ··· 7531 7848 else 7532 7849 sched_mc_power_savings = level; 7533 7850 7534 - arch_reinit_sched_domains(); 7851 + reinit_sched_domains(); 7535 7852 7536 7853 return count; 7537 7854 } ··· 7650 7967 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7651 7968 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7652 7969 7653 - #if defined(CONFIG_NUMA) 7654 - sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 7655 - GFP_KERNEL); 7656 - BUG_ON(sched_group_nodes_bycpu == NULL); 7657 - #endif 7658 7970 get_online_cpus(); 7659 7971 mutex_lock(&sched_domains_mutex); 7660 - arch_init_sched_domains(cpu_active_mask); 7972 + init_sched_domains(cpu_active_mask); 7661 7973 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7662 7974 if (cpumask_empty(non_isolated_cpus)) 7663 7975 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); ··· 7959 8281 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 7960 8282 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7961 8283 #ifdef CONFIG_SMP 8284 + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 7962 8285 #ifdef CONFIG_NO_HZ 7963 8286 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 7964 8287 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); ··· 8019 8340 int old_prio = p->prio; 8020 8341 int on_rq; 8021 8342 8022 - on_rq = p->se.on_rq; 8343 + on_rq = p->on_rq; 8023 8344 if (on_rq) 8024 8345 deactivate_task(rq, p, 0); 8025 8346 __setscheduler(rq, p, SCHED_NORMAL, 0); ··· 8232 8553 { 8233 8554 struct rt_rq *rt_rq; 8234 8555 struct sched_rt_entity *rt_se; 8235 - struct rq *rq; 8236 8556 int i; 8237 8557 8238 8558 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); ··· 8245 8567 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8246 8568 8247 8569 for_each_possible_cpu(i) { 8248 - rq = cpu_rq(i); 8249 - 8250 8570 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8251 8571 GFP_KERNEL, cpu_to_node(i)); 8252 8572 if (!rt_rq) ··· 8359 8683 rq = task_rq_lock(tsk, &flags); 8360 8684 8361 8685 running = task_current(rq, tsk); 8362 - on_rq = tsk->se.on_rq; 8686 + on_rq = tsk->on_rq; 8363 8687 8364 8688 if (on_rq) 8365 8689 dequeue_task(rq, tsk, 0); ··· 8378 8702 if (on_rq) 8379 8703 enqueue_task(rq, tsk, 0); 8380 8704 8381 - task_rq_unlock(rq, &flags); 8705 + task_rq_unlock(rq, tsk, &flags); 8382 8706 } 8383 8707 #endif /* CONFIG_CGROUP_SCHED */ 8384 8708

+1 -5

kernel/sched_debug.c

··· 152 152 read_lock_irqsave(&tasklist_lock, flags); 153 153 154 154 do_each_thread(g, p) { 155 - if (!p->se.on_rq || task_cpu(p) != rq_cpu) 155 + if (!p->on_rq || task_cpu(p) != rq_cpu) 156 156 continue; 157 157 158 158 print_task(m, rq, p); ··· 296 296 P(ttwu_count); 297 297 P(ttwu_local); 298 298 299 - SEQ_printf(m, " .%-30s: %d\n", "bkl_count", 300 - rq->rq_sched_info.bkl_count); 301 - 302 299 #undef P 303 300 #undef P64 304 301 #endif ··· 438 441 P(se.statistics.wait_count); 439 442 PN(se.statistics.iowait_sum); 440 443 P(se.statistics.iowait_count); 441 - P(sched_info.bkl_count); 442 444 P(se.nr_migrations); 443 445 P(se.statistics.nr_migrations_cold); 444 446 P(se.statistics.nr_failed_migrations_affine);

+87 -39

kernel/sched_fair.c

··· 358 358 } 359 359 360 360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 361 + #ifndef CONFIG_64BIT 362 + smp_wmb(); 363 + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 364 + #endif 361 365 } 362 366 363 367 /* ··· 1344 1340 hrtick_update(rq); 1345 1341 } 1346 1342 1343 + static void set_next_buddy(struct sched_entity *se); 1344 + 1347 1345 /* 1348 1346 * The dequeue_task method is called before nr_running is 1349 1347 * decreased. We remove the task from the rbtree and ··· 1355 1349 { 1356 1350 struct cfs_rq *cfs_rq; 1357 1351 struct sched_entity *se = &p->se; 1352 + int task_sleep = flags & DEQUEUE_SLEEP; 1358 1353 1359 1354 for_each_sched_entity(se) { 1360 1355 cfs_rq = cfs_rq_of(se); 1361 1356 dequeue_entity(cfs_rq, se, flags); 1362 1357 1363 1358 /* Don't dequeue parent if it has other entities besides us */ 1364 - if (cfs_rq->load.weight) 1359 + if (cfs_rq->load.weight) { 1360 + /* 1361 + * Bias pick_next to pick a task from this cfs_rq, as 1362 + * p is sleeping when it is within its sched_slice. 1363 + */ 1364 + if (task_sleep && parent_entity(se)) 1365 + set_next_buddy(parent_entity(se)); 1365 1366 break; 1367 + } 1366 1368 flags |= DEQUEUE_SLEEP; 1367 1369 } 1368 1370 ··· 1386 1372 1387 1373 #ifdef CONFIG_SMP 1388 1374 1389 - static void task_waking_fair(struct rq *rq, struct task_struct *p) 1375 + static void task_waking_fair(struct task_struct *p) 1390 1376 { 1391 1377 struct sched_entity *se = &p->se; 1392 1378 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1379 + u64 min_vruntime; 1393 1380 1394 - se->vruntime -= cfs_rq->min_vruntime; 1381 + #ifndef CONFIG_64BIT 1382 + u64 min_vruntime_copy; 1383 + 1384 + do { 1385 + min_vruntime_copy = cfs_rq->min_vruntime_copy; 1386 + smp_rmb(); 1387 + min_vruntime = cfs_rq->min_vruntime; 1388 + } while (min_vruntime != min_vruntime_copy); 1389 + #else 1390 + min_vruntime = cfs_rq->min_vruntime; 1391 + #endif 1392 + 1393 + se->vruntime -= min_vruntime; 1395 1394 } 1396 1395 1397 1396 #ifdef CONFIG_FAIR_GROUP_SCHED ··· 1649 1622 /* 1650 1623 * Otherwise, iterate the domains and find an elegible idle cpu. 1651 1624 */ 1625 + rcu_read_lock(); 1652 1626 for_each_domain(target, sd) { 1653 1627 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 1654 1628 break; ··· 1669 1641 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 1670 1642 break; 1671 1643 } 1644 + rcu_read_unlock(); 1672 1645 1673 1646 return target; 1674 1647 } ··· 1686 1657 * preempt must be disabled. 1687 1658 */ 1688 1659 static int 1689 - select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) 1660 + select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 1690 1661 { 1691 1662 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1692 1663 int cpu = smp_processor_id(); ··· 1702 1673 new_cpu = prev_cpu; 1703 1674 } 1704 1675 1676 + rcu_read_lock(); 1705 1677 for_each_domain(cpu, tmp) { 1706 1678 if (!(tmp->flags & SD_LOAD_BALANCE)) 1707 1679 continue; ··· 1753 1723 1754 1724 if (affine_sd) { 1755 1725 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1756 - return select_idle_sibling(p, cpu); 1757 - else 1758 - return select_idle_sibling(p, prev_cpu); 1726 + prev_cpu = cpu; 1727 + 1728 + new_cpu = select_idle_sibling(p, prev_cpu); 1729 + goto unlock; 1759 1730 } 1760 1731 1761 1732 while (sd) { ··· 1797 1766 } 1798 1767 /* while loop will break here if sd == NULL */ 1799 1768 } 1769 + unlock: 1770 + rcu_read_unlock(); 1800 1771 1801 1772 return new_cpu; 1802 1773 } ··· 1822 1789 * This is especially important for buddies when the leftmost 1823 1790 * task is higher priority than the buddy. 1824 1791 */ 1825 - if (unlikely(se->load.weight != NICE_0_LOAD)) 1826 - gran = calc_delta_fair(gran, se); 1827 - 1828 - return gran; 1792 + return calc_delta_fair(gran, se); 1829 1793 } 1830 1794 1831 1795 /* ··· 1856 1826 1857 1827 static void set_last_buddy(struct sched_entity *se) 1858 1828 { 1859 - if (likely(task_of(se)->policy != SCHED_IDLE)) { 1860 - for_each_sched_entity(se) 1861 - cfs_rq_of(se)->last = se; 1862 - } 1829 + if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) 1830 + return; 1831 + 1832 + for_each_sched_entity(se) 1833 + cfs_rq_of(se)->last = se; 1863 1834 } 1864 1835 1865 1836 static void set_next_buddy(struct sched_entity *se) 1866 1837 { 1867 - if (likely(task_of(se)->policy != SCHED_IDLE)) { 1868 - for_each_sched_entity(se) 1869 - cfs_rq_of(se)->next = se; 1870 - } 1838 + if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) 1839 + return; 1840 + 1841 + for_each_sched_entity(se) 1842 + cfs_rq_of(se)->next = se; 1871 1843 } 1872 1844 1873 1845 static void set_skip_buddy(struct sched_entity *se) 1874 1846 { 1875 - if (likely(task_of(se)->policy != SCHED_IDLE)) { 1876 - for_each_sched_entity(se) 1877 - cfs_rq_of(se)->skip = se; 1878 - } 1847 + for_each_sched_entity(se) 1848 + cfs_rq_of(se)->skip = se; 1879 1849 } 1880 1850 1881 1851 /* ··· 1887 1857 struct sched_entity *se = &curr->se, *pse = &p->se; 1888 1858 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1889 1859 int scale = cfs_rq->nr_running >= sched_nr_latency; 1860 + int next_buddy_marked = 0; 1890 1861 1891 1862 if (unlikely(se == pse)) 1892 1863 return; 1893 1864 1894 - if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) 1865 + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { 1895 1866 set_next_buddy(pse); 1867 + next_buddy_marked = 1; 1868 + } 1896 1869 1897 1870 /* 1898 1871 * We can come here with TIF_NEED_RESCHED already set from new task ··· 1923 1890 update_curr(cfs_rq); 1924 1891 find_matching_se(&se, &pse); 1925 1892 BUG_ON(!pse); 1926 - if (wakeup_preempt_entity(se, pse) == 1) 1893 + if (wakeup_preempt_entity(se, pse) == 1) { 1894 + /* 1895 + * Bias pick_next to pick the sched entity that is 1896 + * triggering this preemption. 1897 + */ 1898 + if (!next_buddy_marked) 1899 + set_next_buddy(pse); 1927 1900 goto preempt; 1901 + } 1928 1902 1929 1903 return; 1930 1904 ··· 2142 2102 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2143 2103 unsigned long max_load_move, struct sched_domain *sd, 2144 2104 enum cpu_idle_type idle, int *all_pinned, 2145 - int *this_best_prio, struct cfs_rq *busiest_cfs_rq) 2105 + struct cfs_rq *busiest_cfs_rq) 2146 2106 { 2147 2107 int loops = 0, pulled = 0; 2148 2108 long rem_load_move = max_load_move; ··· 2180 2140 */ 2181 2141 if (rem_load_move <= 0) 2182 2142 break; 2183 - 2184 - if (p->prio < *this_best_prio) 2185 - *this_best_prio = p->prio; 2186 2143 } 2187 2144 out: 2188 2145 /* ··· 2239 2202 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2240 2203 unsigned long max_load_move, 2241 2204 struct sched_domain *sd, enum cpu_idle_type idle, 2242 - int *all_pinned, int *this_best_prio) 2205 + int *all_pinned) 2243 2206 { 2244 2207 long rem_load_move = max_load_move; 2245 2208 int busiest_cpu = cpu_of(busiest); ··· 2264 2227 rem_load = div_u64(rem_load, busiest_h_load + 1); 2265 2228 2266 2229 moved_load = balance_tasks(this_rq, this_cpu, busiest, 2267 - rem_load, sd, idle, all_pinned, this_best_prio, 2230 + rem_load, sd, idle, all_pinned, 2268 2231 busiest_cfs_rq); 2269 2232 2270 2233 if (!moved_load) ··· 2290 2253 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2291 2254 unsigned long max_load_move, 2292 2255 struct sched_domain *sd, enum cpu_idle_type idle, 2293 - int *all_pinned, int *this_best_prio) 2256 + int *all_pinned) 2294 2257 { 2295 2258 return balance_tasks(this_rq, this_cpu, busiest, 2296 2259 max_load_move, sd, idle, all_pinned, 2297 - this_best_prio, &busiest->cfs); 2260 + &busiest->cfs); 2298 2261 } 2299 2262 #endif 2300 2263 ··· 2311 2274 int *all_pinned) 2312 2275 { 2313 2276 unsigned long total_load_moved = 0, load_moved; 2314 - int this_best_prio = this_rq->curr->prio; 2315 2277 2316 2278 do { 2317 2279 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 2318 2280 max_load_move - total_load_moved, 2319 - sd, idle, all_pinned, &this_best_prio); 2281 + sd, idle, all_pinned); 2320 2282 2321 2283 total_load_moved += load_moved; 2322 2284 ··· 2684 2648 /* 2685 2649 * Only siblings can have significantly less than SCHED_LOAD_SCALE 2686 2650 */ 2687 - if (sd->level != SD_LV_SIBLING) 2651 + if (!(sd->flags & SD_SHARE_CPUPOWER)) 2688 2652 return 0; 2689 2653 2690 2654 /* ··· 3501 3465 raw_spin_unlock(&this_rq->lock); 3502 3466 3503 3467 update_shares(this_cpu); 3468 + rcu_read_lock(); 3504 3469 for_each_domain(this_cpu, sd) { 3505 3470 unsigned long interval; 3506 3471 int balance = 1; ··· 3523 3486 break; 3524 3487 } 3525 3488 } 3489 + rcu_read_unlock(); 3526 3490 3527 3491 raw_spin_lock(&this_rq->lock); 3528 3492 ··· 3572 3534 double_lock_balance(busiest_rq, target_rq); 3573 3535 3574 3536 /* Search for an sd spanning us and the target CPU. */ 3537 + rcu_read_lock(); 3575 3538 for_each_domain(target_cpu, sd) { 3576 3539 if ((sd->flags & SD_LOAD_BALANCE) && 3577 3540 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) ··· 3588 3549 else 3589 3550 schedstat_inc(sd, alb_failed); 3590 3551 } 3552 + rcu_read_unlock(); 3591 3553 double_unlock_balance(busiest_rq, target_rq); 3592 3554 out_unlock: 3593 3555 busiest_rq->active_balance = 0; ··· 3715 3675 { 3716 3676 struct sched_domain *sd; 3717 3677 struct sched_group *ilb_group; 3678 + int ilb = nr_cpu_ids; 3718 3679 3719 3680 /* 3720 3681 * Have idle load balancer selection from semi-idle packages only ··· 3731 3690 if (cpumask_weight(nohz.idle_cpus_mask) < 2) 3732 3691 goto out_done; 3733 3692 3693 + rcu_read_lock(); 3734 3694 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3735 3695 ilb_group = sd->groups; 3736 3696 3737 3697 do { 3738 - if (is_semi_idle_group(ilb_group)) 3739 - return cpumask_first(nohz.grp_idle_mask); 3698 + if (is_semi_idle_group(ilb_group)) { 3699 + ilb = cpumask_first(nohz.grp_idle_mask); 3700 + goto unlock; 3701 + } 3740 3702 3741 3703 ilb_group = ilb_group->next; 3742 3704 3743 3705 } while (ilb_group != sd->groups); 3744 3706 } 3707 + unlock: 3708 + rcu_read_unlock(); 3745 3709 3746 3710 out_done: 3747 - return nr_cpu_ids; 3711 + return ilb; 3748 3712 } 3749 3713 #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3750 3714 static inline int find_new_ilb(int call_cpu) ··· 3894 3848 3895 3849 update_shares(cpu); 3896 3850 3851 + rcu_read_lock(); 3897 3852 for_each_domain(cpu, sd) { 3898 3853 if (!(sd->flags & SD_LOAD_BALANCE)) 3899 3854 continue; ··· 3940 3893 if (!balance) 3941 3894 break; 3942 3895 } 3896 + rcu_read_unlock(); 3943 3897 3944 3898 /* 3945 3899 * next_balance will be updated only when there is a need.

+6

kernel/sched_features.h

··· 64 64 * Decrement CPU power based on irq activity 65 65 */ 66 66 SCHED_FEAT(NONIRQ_POWER, 1) 67 + 68 + /* 69 + * Queue remote wakeups on the target CPU and process them 70 + * using the scheduler IPI. Reduces rq->lock contention/bounces. 71 + */ 72 + SCHED_FEAT(TTWU_QUEUE, 1)

+1 -1

kernel/sched_idletask.c

··· 7 7 8 8 #ifdef CONFIG_SMP 9 9 static int 10 - select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 10 + select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 11 11 { 12 12 return task_cpu(p); /* IDLE tasks as never migrated */ 13 13 }

+62 -25

kernel/sched_rt.c

··· 183 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 184 184 } 185 185 186 + typedef struct task_group *rt_rq_iter_t; 187 + 188 + #define for_each_rt_rq(rt_rq, iter, rq) \ 189 + for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \ 190 + (&iter->list != &task_groups) && \ 191 + (rt_rq = iter->rt_rq[cpu_of(rq)]); \ 192 + iter = list_entry_rcu(iter->list.next, typeof(*iter), list)) 193 + 186 194 static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 187 195 { 188 196 list_add_rcu(&rt_rq->leaf_rt_rq_list, ··· 295 287 { 296 288 return ktime_to_ns(def_rt_bandwidth.rt_period); 297 289 } 290 + 291 + typedef struct rt_rq *rt_rq_iter_t; 292 + 293 + #define for_each_rt_rq(rt_rq, iter, rq) \ 294 + for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 298 295 299 296 static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 300 297 { ··· 415 402 static void __disable_runtime(struct rq *rq) 416 403 { 417 404 struct root_domain *rd = rq->rd; 405 + rt_rq_iter_t iter; 418 406 struct rt_rq *rt_rq; 419 407 420 408 if (unlikely(!scheduler_running)) 421 409 return; 422 410 423 - for_each_leaf_rt_rq(rt_rq, rq) { 411 + for_each_rt_rq(rt_rq, iter, rq) { 424 412 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 425 413 s64 want; 426 414 int i; ··· 501 487 502 488 static void __enable_runtime(struct rq *rq) 503 489 { 490 + rt_rq_iter_t iter; 504 491 struct rt_rq *rt_rq; 505 492 506 493 if (unlikely(!scheduler_running)) ··· 510 495 /* 511 496 * Reset each runqueue's bandwidth settings 512 497 */ 513 - for_each_leaf_rt_rq(rt_rq, rq) { 498 + for_each_rt_rq(rt_rq, iter, rq) { 514 499 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 515 500 516 501 raw_spin_lock(&rt_b->rt_runtime_lock); ··· 577 562 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { 578 563 rt_rq->rt_throttled = 0; 579 564 enqueue = 1; 565 + 566 + /* 567 + * Force a clock update if the CPU was idle, 568 + * lest wakeup -> unthrottle time accumulate. 569 + */ 570 + if (rt_rq->rt_nr_running && rq->curr == rq->idle) 571 + rq->skip_clock_update = -1; 580 572 } 581 573 if (rt_rq->rt_time || rt_rq->rt_nr_running) 582 574 idle = 0; ··· 999 977 static int find_lowest_rq(struct task_struct *task); 1000 978 1001 979 static int 1002 - select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 980 + select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 1003 981 { 982 + struct task_struct *curr; 983 + struct rq *rq; 984 + int cpu; 985 + 1004 986 if (sd_flag != SD_BALANCE_WAKE) 1005 987 return smp_processor_id(); 1006 988 989 + cpu = task_cpu(p); 990 + rq = cpu_rq(cpu); 991 + 992 + rcu_read_lock(); 993 + curr = ACCESS_ONCE(rq->curr); /* unlocked access */ 994 + 1007 995 /* 1008 - * If the current task is an RT task, then 996 + * If the current task on @p's runqueue is an RT task, then 1009 997 * try to see if we can wake this RT task up on another 1010 998 * runqueue. Otherwise simply start this RT task 1011 999 * on its current runqueue. ··· 1029 997 * lock? 1030 998 * 1031 999 * For equal prio tasks, we just let the scheduler sort it out. 1032 - */ 1033 - if (unlikely(rt_task(rq->curr)) && 1034 - (rq->curr->rt.nr_cpus_allowed < 2 || 1035 - rq->curr->prio < p->prio) && 1036 - (p->rt.nr_cpus_allowed > 1)) { 1037 - int cpu = find_lowest_rq(p); 1038 - 1039 - return (cpu == -1) ? task_cpu(p) : cpu; 1040 - } 1041 - 1042 - /* 1000 + * 1043 1001 * Otherwise, just let it ride on the affined RQ and the 1044 1002 * post-schedule router will push the preempted task away 1003 + * 1004 + * This test is optimistic, if we get it wrong the load-balancer 1005 + * will have to sort it out. 1045 1006 */ 1046 - return task_cpu(p); 1007 + if (curr && unlikely(rt_task(curr)) && 1008 + (curr->rt.nr_cpus_allowed < 2 || 1009 + curr->prio < p->prio) && 1010 + (p->rt.nr_cpus_allowed > 1)) { 1011 + int target = find_lowest_rq(p); 1012 + 1013 + if (target != -1) 1014 + cpu = target; 1015 + } 1016 + rcu_read_unlock(); 1017 + 1018 + return cpu; 1047 1019 } 1048 1020 1049 1021 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) ··· 1172 1136 * The previous task needs to be made eligible for pushing 1173 1137 * if it is still active 1174 1138 */ 1175 - if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) 1139 + if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) 1176 1140 enqueue_pushable_task(rq, p); 1177 1141 } 1178 1142 ··· 1323 1287 !cpumask_test_cpu(lowest_rq->cpu, 1324 1288 &task->cpus_allowed) || 1325 1289 task_running(rq, task) || 1326 - !task->se.on_rq)) { 1290 + !task->on_rq)) { 1327 1291 1328 1292 raw_spin_unlock(&lowest_rq->lock); 1329 1293 lowest_rq = NULL; ··· 1357 1321 BUG_ON(task_current(rq, p)); 1358 1322 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1359 1323 1360 - BUG_ON(!p->se.on_rq); 1324 + BUG_ON(!p->on_rq); 1361 1325 BUG_ON(!rt_task(p)); 1362 1326 1363 1327 return p; ··· 1503 1467 */ 1504 1468 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1505 1469 WARN_ON(p == src_rq->curr); 1506 - WARN_ON(!p->se.on_rq); 1470 + WARN_ON(!p->on_rq); 1507 1471 1508 1472 /* 1509 1473 * There's a chance that p is higher in priority ··· 1574 1538 * Update the migration status of the RQ if we have an RT task 1575 1539 * which is running AND changing its weight value. 1576 1540 */ 1577 - if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { 1541 + if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { 1578 1542 struct rq *rq = task_rq(p); 1579 1543 1580 1544 if (!task_current(rq, p)) { ··· 1644 1608 * we may need to handle the pulling of RT tasks 1645 1609 * now. 1646 1610 */ 1647 - if (p->se.on_rq && !rq->rt.rt_nr_running) 1611 + if (p->on_rq && !rq->rt.rt_nr_running) 1648 1612 pull_rt_task(rq); 1649 1613 } 1650 1614 ··· 1674 1638 * If that current running task is also an RT task 1675 1639 * then see if we can move to another run queue. 1676 1640 */ 1677 - if (p->se.on_rq && rq->curr != p) { 1641 + if (p->on_rq && rq->curr != p) { 1678 1642 #ifdef CONFIG_SMP 1679 1643 if (rq->rt.overloaded && push_rt_task(rq) && 1680 1644 /* Don't resched if we changed runqueues */ ··· 1693 1657 static void 1694 1658 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 1695 1659 { 1696 - if (!p->se.on_rq) 1660 + if (!p->on_rq) 1697 1661 return; 1698 1662 1699 1663 if (rq->curr == p) { ··· 1832 1796 1833 1797 static void print_rt_stats(struct seq_file *m, int cpu) 1834 1798 { 1799 + rt_rq_iter_t iter; 1835 1800 struct rt_rq *rt_rq; 1836 1801 1837 1802 rcu_read_lock(); 1838 - for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) 1803 + for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) 1839 1804 print_rt_rq(m, cpu, rt_rq); 1840 1805 rcu_read_unlock(); 1841 1806 }

+2 -3

kernel/sched_stoptask.c

··· 9 9 10 10 #ifdef CONFIG_SMP 11 11 static int 12 - select_task_rq_stop(struct rq *rq, struct task_struct *p, 13 - int sd_flag, int flags) 12 + select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) 14 13 { 15 14 return task_cpu(p); /* stop tasks as never migrate */ 16 15 } ··· 25 26 { 26 27 struct task_struct *stop = rq->stop; 27 28 28 - if (stop && stop->se.on_rq) 29 + if (stop && stop->on_rq) 29 30 return stop; 30 31 31 32 return NULL;

-1

kernel/trace/trace_kprobe.c

··· 53 53 "common_preempt_count", 54 54 "common_pid", 55 55 "common_tgid", 56 - "common_lock_depth", 57 56 FIELD_STRING_IP, 58 57 FIELD_STRING_RETIP, 59 58 FIELD_STRING_FUNC,

-1

tools/perf/Documentation/perf-script-perl.txt

··· 63 63 field:unsigned char common_flags; 64 64 field:unsigned char common_preempt_count; 65 65 field:int common_pid; 66 - field:int common_lock_depth; 67 66 68 67 field:char comm[TASK_COMM_LEN]; 69 68 field:pid_t pid;

-1

tools/perf/Documentation/perf-script-python.txt

··· 463 463 field:unsigned char common_flags; 464 464 field:unsigned char common_preempt_count; 465 465 field:int common_pid; 466 - field:int common_lock_depth; 467 466 468 467 field:char comm[TASK_COMM_LEN]; 469 468 field:pid_t pid;