Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched: adjust when cpu_active and cpuset configurations are updated during cpu on/offlining

Currently, when a cpu goes down, cpu_active is cleared before
CPU_DOWN_PREPARE starts and cpuset configuration is updated from a
default priority cpu notifier. When a cpu is coming up, cpu_active
is set before CPU_ONLINE but cpuset configuration is again updated
from the same cpu notifier.

For cpu notifiers, this presents an inconsistent state. Threads which
a CPU_DOWN_PREPARE notifier expects to be bound to the CPU can be
migrated to other cpus because the cpu is no longer active.

Fix it by updating cpu_active in the highest priority cpu notifier and
cpuset configuration in the second highest when a cpu is coming up.
Down path is updated similarly. This guarantees that all other cpu
notifiers see consistent cpu_active and cpuset configuration.

cpuset_track_online_cpus() notifier is converted to
cpuset_update_active_cpus() which just updates the configuration and
is now called from cpuset_cpu_[in]active() notifiers registered from
sched_init_smp(). If cpuset is disabled, cpuset_update_active_cpus()
degenerates into partition_sched_domains() making separate notifier
for !CONFIG_CPUSETS unnecessary.

This problem is triggered by cmwq. During CPU_DOWN_PREPARE, hotplug
callback creates a kthread and kthread_bind()s it to the target cpu,
and the thread is expected to run on that cpu.

* Ingo's test discovered __cpuinit/exit markups were incorrect.
Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Menage <menage@google.com>

Tejun Heo 3a101d05 50a323b7

+75 -43
+16
include/linux/cpu.h
··· 52 52 * CPU notifier priorities. 53 53 */ 54 54 enum { 55 + /* 56 + * SCHED_ACTIVE marks a cpu which is coming up active during 57 + * CPU_ONLINE and CPU_DOWN_FAILED and must be the first 58 + * notifier. CPUSET_ACTIVE adjusts cpuset according to 59 + * cpu_active mask right after SCHED_ACTIVE. During 60 + * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are 61 + * ordered in the similar way. 62 + * 63 + * This ordering guarantees consistent cpu_active mask and 64 + * migration behavior to all cpu notifiers. 65 + */ 66 + CPU_PRI_SCHED_ACTIVE = INT_MAX, 67 + CPU_PRI_CPUSET_ACTIVE = INT_MAX - 1, 68 + CPU_PRI_SCHED_INACTIVE = INT_MIN + 1, 69 + CPU_PRI_CPUSET_INACTIVE = INT_MIN, 70 + 55 71 /* migration should happen before other stuff but after perf */ 56 72 CPU_PRI_PERF = 20, 57 73 CPU_PRI_MIGRATION = 10,
+6
include/linux/cpuset.h
··· 20 20 21 21 extern int cpuset_init(void); 22 22 extern void cpuset_init_smp(void); 23 + extern void cpuset_update_active_cpus(void); 23 24 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); 24 25 extern int cpuset_cpus_allowed_fallback(struct task_struct *p); 25 26 extern nodemask_t cpuset_mems_allowed(struct task_struct *p); ··· 132 131 133 132 static inline int cpuset_init(void) { return 0; } 134 133 static inline void cpuset_init_smp(void) {} 134 + 135 + static inline void cpuset_update_active_cpus(void) 136 + { 137 + partition_sched_domains(1, NULL, NULL); 138 + } 135 139 136 140 static inline void cpuset_cpus_allowed(struct task_struct *p, 137 141 struct cpumask *mask)
-6
kernel/cpu.c
··· 235 235 return -EINVAL; 236 236 237 237 cpu_hotplug_begin(); 238 - set_cpu_active(cpu, false); 239 238 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); 240 239 if (err) { 241 - set_cpu_active(cpu, true); 242 - 243 240 nr_calls--; 244 241 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); 245 242 printk("%s: attempt to take down CPU %u failed\n", ··· 246 249 247 250 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 248 251 if (err) { 249 - set_cpu_active(cpu, true); 250 252 /* CPU didn't die: tell everyone. Can't complain. */ 251 253 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); 252 254 ··· 316 320 if (ret != 0) 317 321 goto out_notify; 318 322 BUG_ON(!cpu_online(cpu)); 319 - 320 - set_cpu_active(cpu, true); 321 323 322 324 /* Now call notifier in preparation. */ 323 325 cpu_notify(CPU_ONLINE | mod, hcpu);
+2 -19
kernel/cpuset.c
··· 2113 2113 * but making no active use of cpusets. 2114 2114 * 2115 2115 * This routine ensures that top_cpuset.cpus_allowed tracks 2116 - * cpu_online_map on each CPU hotplug (cpuhp) event. 2116 + * cpu_active_mask on each CPU hotplug (cpuhp) event. 2117 2117 * 2118 2118 * Called within get_online_cpus(). Needs to call cgroup_lock() 2119 2119 * before calling generate_sched_domains(). 2120 2120 */ 2121 - static int cpuset_track_online_cpus(struct notifier_block *unused_nb, 2122 - unsigned long phase, void *unused_cpu) 2121 + void __cpuexit cpuset_update_active_cpus(void) 2123 2122 { 2124 2123 struct sched_domain_attr *attr; 2125 2124 cpumask_var_t *doms; 2126 2125 int ndoms; 2127 - 2128 - switch (phase) { 2129 - case CPU_ONLINE: 2130 - case CPU_ONLINE_FROZEN: 2131 - case CPU_DOWN_PREPARE: 2132 - case CPU_DOWN_PREPARE_FROZEN: 2133 - case CPU_DOWN_FAILED: 2134 - case CPU_DOWN_FAILED_FROZEN: 2135 - break; 2136 - 2137 - default: 2138 - return NOTIFY_DONE; 2139 - } 2140 2126 2141 2127 cgroup_lock(); 2142 2128 mutex_lock(&callback_mutex); ··· 2134 2148 2135 2149 /* Have scheduler rebuild the domains */ 2136 2150 partition_sched_domains(ndoms, doms, attr); 2137 - 2138 - return NOTIFY_OK; 2139 2151 } 2140 2152 2141 2153 #ifdef CONFIG_MEMORY_HOTPLUG ··· 2187 2203 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2188 2204 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2189 2205 2190 - hotcpu_notifier(cpuset_track_online_cpus, 0); 2191 2206 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2192 2207 2193 2208 cpuset_wq = create_singlethread_workqueue("cpuset");
+51 -18
kernel/sched.c
··· 5804 5804 .priority = CPU_PRI_MIGRATION, 5805 5805 }; 5806 5806 5807 + static int __cpuinit sched_cpu_active(struct notifier_block *nfb, 5808 + unsigned long action, void *hcpu) 5809 + { 5810 + switch (action & ~CPU_TASKS_FROZEN) { 5811 + case CPU_ONLINE: 5812 + case CPU_DOWN_FAILED: 5813 + set_cpu_active((long)hcpu, true); 5814 + return NOTIFY_OK; 5815 + default: 5816 + return NOTIFY_DONE; 5817 + } 5818 + } 5819 + 5820 + static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, 5821 + unsigned long action, void *hcpu) 5822 + { 5823 + switch (action & ~CPU_TASKS_FROZEN) { 5824 + case CPU_DOWN_PREPARE: 5825 + set_cpu_active((long)hcpu, false); 5826 + return NOTIFY_OK; 5827 + default: 5828 + return NOTIFY_DONE; 5829 + } 5830 + } 5831 + 5807 5832 static int __init migration_init(void) 5808 5833 { 5809 5834 void *cpu = (void *)(long)smp_processor_id(); 5810 5835 int err; 5811 5836 5812 - /* Start one for the boot CPU: */ 5837 + /* Initialize migration for the boot CPU */ 5813 5838 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5814 5839 BUG_ON(err == NOTIFY_BAD); 5815 5840 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5816 5841 register_cpu_notifier(&migration_notifier); 5842 + 5843 + /* Register cpu active notifiers */ 5844 + cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 5845 + cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 5817 5846 5818 5847 return 0; 5819 5848 } ··· 7302 7273 } 7303 7274 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7304 7275 7305 - #ifndef CONFIG_CPUSETS 7306 7276 /* 7307 - * Add online and remove offline CPUs from the scheduler domains. 7308 - * When cpusets are enabled they take over this function. 7277 + * Update cpusets according to cpu_active mask. If cpusets are 7278 + * disabled, cpuset_update_active_cpus() becomes a simple wrapper 7279 + * around partition_sched_domains(). 
7309 7280 */ 7310 - static int update_sched_domains(struct notifier_block *nfb, 7311 - unsigned long action, void *hcpu) 7281 + static int __cpuexit cpuset_cpu_active(struct notifier_block *nfb, 7282 + unsigned long action, void *hcpu) 7312 7283 { 7313 - switch (action) { 7284 + switch (action & ~CPU_TASKS_FROZEN) { 7314 7285 case CPU_ONLINE: 7315 - case CPU_ONLINE_FROZEN: 7316 - case CPU_DOWN_PREPARE: 7317 - case CPU_DOWN_PREPARE_FROZEN: 7318 7286 case CPU_DOWN_FAILED: 7319 - case CPU_DOWN_FAILED_FROZEN: 7320 - partition_sched_domains(1, NULL, NULL); 7287 + cpuset_update_active_cpus(); 7321 7288 return NOTIFY_OK; 7322 - 7323 7289 default: 7324 7290 return NOTIFY_DONE; 7325 7291 } 7326 7292 } 7327 - #endif 7293 + 7294 + static int __cpuexit cpuset_cpu_inactive(struct notifier_block *nfb, 7295 + unsigned long action, void *hcpu) 7296 + { 7297 + switch (action & ~CPU_TASKS_FROZEN) { 7298 + case CPU_DOWN_PREPARE: 7299 + cpuset_update_active_cpus(); 7300 + return NOTIFY_OK; 7301 + default: 7302 + return NOTIFY_DONE; 7303 + } 7304 + } 7328 7305 7329 7306 static int update_runtime(struct notifier_block *nfb, 7330 7307 unsigned long action, void *hcpu) ··· 7376 7341 mutex_unlock(&sched_domains_mutex); 7377 7342 put_online_cpus(); 7378 7343 7379 - #ifndef CONFIG_CPUSETS 7380 - /* XXX: Theoretical race here - CPU may be hotplugged now */ 7381 - hotcpu_notifier(update_sched_domains, 0); 7382 - #endif 7344 + hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 7345 + hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 7383 7346 7384 7347 /* RT runtime code needs to handle some hotplug events */ 7385 7348 hotcpu_notifier(update_runtime, 0);