Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

cgroup/cpuset: Make cpuset hotplug processing synchronous

Since commit 3a5a6d0c2b03 ("cpuset: don't nest cgroup_mutex inside
get_online_cpus()"), cpuset hotplug was done asynchronously via a work
function. This is to avoid recursive locking of cgroup_mutex.

Since then, the cgroup locking scheme has changed quite a bit. A
cpuset_mutex was introduced to protect cpuset specific operations.
The cpuset_mutex is then replaced by a cpuset_rwsem. With commit
d74b27d63a8b ("cgroup/cpuset: Change cpuset_rwsem and hotplug lock
order"), cpu_hotplug_lock is acquired before cpuset_rwsem. Later on,
cpuset_rwsem is reverted back to cpuset_mutex. All these locking changes
allow the hotplug code to call into cpuset core directly.

The following commits were also merged due to the asynchronous nature
of cpuset hotplug processing.

- commit b22afcdf04c9 ("cpu/hotplug: Cure the cpusets trainwreck")
- commit 50e76632339d ("sched/cpuset/pm: Fix cpuset vs. suspend-resume
bugs")
- commit 28b89b9e6f7b ("cpuset: handle race between CPU hotplug and
cpuset_hotplug_work")

Clean up all these bandages by making cpuset hotplug
processing synchronous again with the exception that the call to
cgroup_transfer_tasks() to transfer tasks out of an empty cgroup v1
cpuset, if necessary, will still be done via a work function due to the
existing cgroup_mutex -> cpu_hotplug_lock dependency. It is possible
to reverse that dependency, but that will require updating a number of
different cgroup controllers. This special hotplug code path should be
rarely taken anyway.

As all the cpuset states will be updated by the end of the hotplug
operation, we can revert most of the above commits except commit
50e76632339d ("sched/cpuset/pm: Fix cpuset vs. suspend-resume bugs")
which is partially reverted. Also remove some cpus_read_lock trylock
attempts in the cpuset partition code, as they are no longer necessary
since the cpu_hotplug_lock is now held for the whole duration of the
cpuset hotplug code path.

Signed-off-by: Waiman Long <longman@redhat.com>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

authored by

Waiman Long and committed by
Tejun Heo
2125c003 4793cb59

+56 -138
-3
include/linux/cpuset.h
··· 70 70 extern void cpuset_init_smp(void); 71 71 extern void cpuset_force_rebuild(void); 72 72 extern void cpuset_update_active_cpus(void); 73 - extern void cpuset_wait_for_hotplug(void); 74 73 extern void inc_dl_tasks_cs(struct task_struct *task); 75 74 extern void dec_dl_tasks_cs(struct task_struct *task); 76 75 extern void cpuset_lock(void); ··· 183 184 { 184 185 partition_sched_domains(1, NULL, NULL); 185 186 } 186 - 187 - static inline void cpuset_wait_for_hotplug(void) { } 188 187 189 188 static inline void inc_dl_tasks_cs(struct task_struct *task) { } 190 189 static inline void dec_dl_tasks_cs(struct task_struct *task) { }
+56 -85
kernel/cgroup/cpuset.c
··· 202 202 }; 203 203 204 204 /* 205 + * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously 206 + */ 207 + struct cpuset_remove_tasks_struct { 208 + struct work_struct work; 209 + struct cpuset *cs; 210 + }; 211 + 212 + /* 205 213 * Exclusive CPUs distributed out to sub-partitions of top_cpuset 206 214 */ 207 215 static cpumask_var_t subpartitions_cpus; ··· 457 449 458 450 static struct workqueue_struct *cpuset_migrate_mm_wq; 459 451 460 - /* 461 - * CPU / memory hotplug is handled asynchronously. 462 - */ 463 - static void cpuset_hotplug_workfn(struct work_struct *work); 464 - static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); 465 - 466 452 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); 467 453 468 454 static inline void check_insane_mems_config(nodemask_t *nodes) ··· 542 540 rcu_read_lock(); 543 541 cs = task_cs(tsk); 544 542 545 - while (!cpumask_intersects(cs->effective_cpus, pmask)) { 543 + while (!cpumask_intersects(cs->effective_cpus, pmask)) 546 544 cs = parent_cs(cs); 547 - if (unlikely(!cs)) { 548 - /* 549 - * The top cpuset doesn't have any online cpu as a 550 - * consequence of a race between cpuset_hotplug_work 551 - * and cpu hotplug notifier. But we know the top 552 - * cpuset's effective_cpus is on its way to be 553 - * identical to cpu_online_mask. 554 - */ 555 - goto out_unlock; 556 - } 557 - } 558 - cpumask_and(pmask, pmask, cs->effective_cpus); 559 545 560 - out_unlock: 546 + cpumask_and(pmask, pmask, cs->effective_cpus); 561 547 rcu_read_unlock(); 562 548 } 563 549 ··· 1207 1217 /* 1208 1218 * If we have raced with CPU hotplug, return early to avoid 1209 1219 * passing doms with offlined cpu to partition_sched_domains(). 1210 - * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. 1220 + * Anyways, cpuset_handle_hotplug() will rebuild sched domains. 
1211 1221 * 1212 1222 * With no CPUs in any subpartitions, top_cpuset's effective CPUs 1213 1223 * should be the same as the active CPUs, so checking only top_cpuset ··· 1250 1260 } 1251 1261 #endif /* CONFIG_SMP */ 1252 1262 1253 - void rebuild_sched_domains(void) 1263 + static void rebuild_sched_domains_cpuslocked(void) 1254 1264 { 1255 - cpus_read_lock(); 1256 1265 mutex_lock(&cpuset_mutex); 1257 1266 rebuild_sched_domains_locked(); 1258 1267 mutex_unlock(&cpuset_mutex); 1268 + } 1269 + 1270 + void rebuild_sched_domains(void) 1271 + { 1272 + cpus_read_lock(); 1273 + rebuild_sched_domains_cpuslocked(); 1259 1274 cpus_read_unlock(); 1260 1275 } 1261 1276 ··· 2074 2079 2075 2080 /* 2076 2081 * For partcmd_update without newmask, it is being called from 2077 - * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken. 2078 - * Update the load balance flag and scheduling domain if 2079 - * cpus_read_trylock() is successful. 2082 + * cpuset_handle_hotplug(). Update the load balance flag and 2083 + * scheduling domain accordingly. 2080 2084 */ 2081 - if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) { 2085 + if ((cmd == partcmd_update) && !newmask) 2082 2086 update_partition_sd_lb(cs, old_prs); 2083 - cpus_read_unlock(); 2084 - } 2085 2087 2086 2088 notify_partition_change(cs, old_prs); 2087 2089 return 0; ··· 3591 3599 * proceeding, so that we don't end up keep removing tasks added 3592 3600 * after execution capability is restored. 3593 3601 * 3594 - * cpuset_hotplug_work calls back into cgroup core via 3595 - * cgroup_transfer_tasks() and waiting for it from a cgroupfs 3602 + * cpuset_handle_hotplug may call back into cgroup core asynchronously 3603 + * via cgroup_transfer_tasks() and waiting for it from a cgroupfs 3596 3604 * operation like this one can lead to a deadlock through kernfs 3597 3605 * active_ref protection. Let's break the protection. 
Losing the 3598 3606 * protection is okay as we check whether @cs is online after ··· 3601 3609 */ 3602 3610 css_get(&cs->css); 3603 3611 kernfs_break_active_protection(of->kn); 3604 - flush_work(&cpuset_hotplug_work); 3605 3612 3606 3613 cpus_read_lock(); 3607 3614 mutex_lock(&cpuset_mutex); ··· 4345 4354 } 4346 4355 } 4347 4356 4357 + static void cpuset_migrate_tasks_workfn(struct work_struct *work) 4358 + { 4359 + struct cpuset_remove_tasks_struct *s; 4360 + 4361 + s = container_of(work, struct cpuset_remove_tasks_struct, work); 4362 + remove_tasks_in_empty_cpuset(s->cs); 4363 + css_put(&s->cs->css); 4364 + kfree(s); 4365 + } 4366 + 4348 4367 static void 4349 4368 hotplug_update_tasks_legacy(struct cpuset *cs, 4350 4369 struct cpumask *new_cpus, nodemask_t *new_mems, ··· 4384 4383 /* 4385 4384 * Move tasks to the nearest ancestor with execution resources, 4386 4385 * This is full cgroup operation which will also call back into 4387 - * cpuset. Should be done outside any lock. 4386 + * cpuset. Execute it asynchronously using workqueue. 4388 4387 */ 4389 - if (is_empty) { 4390 - mutex_unlock(&cpuset_mutex); 4391 - remove_tasks_in_empty_cpuset(cs); 4392 - mutex_lock(&cpuset_mutex); 4388 + if (is_empty && cs->css.cgroup->nr_populated_csets && 4389 + css_tryget_online(&cs->css)) { 4390 + struct cpuset_remove_tasks_struct *s; 4391 + 4392 + s = kzalloc(sizeof(*s), GFP_KERNEL); 4393 + if (WARN_ON_ONCE(!s)) { 4394 + css_put(&cs->css); 4395 + return; 4396 + } 4397 + 4398 + s->cs = cs; 4399 + INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); 4400 + schedule_work(&s->work); 4393 4401 } 4394 4402 } 4395 4403 ··· 4429 4419 void cpuset_force_rebuild(void) 4430 4420 { 4431 4421 force_rebuild = true; 4432 - } 4433 - 4434 - /* 4435 - * Attempt to acquire a cpus_read_lock while a hotplug operation may be in 4436 - * progress. 
4437 - * Return: true if successful, false otherwise 4438 - * 4439 - * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock, 4440 - * cpus_read_trylock() is used here to acquire the lock. 4441 - */ 4442 - static bool cpuset_hotplug_cpus_read_trylock(void) 4443 - { 4444 - int retries = 0; 4445 - 4446 - while (!cpus_read_trylock()) { 4447 - /* 4448 - * CPU hotplug still in progress. Retry 5 times 4449 - * with a 10ms wait before bailing out. 4450 - */ 4451 - if (++retries > 5) 4452 - return false; 4453 - msleep(10); 4454 - } 4455 - return true; 4456 4422 } 4457 4423 4458 4424 /** ··· 4479 4493 compute_partition_effective_cpumask(cs, &new_cpus); 4480 4494 4481 4495 if (remote && cpumask_empty(&new_cpus) && 4482 - partition_is_populated(cs, NULL) && 4483 - cpuset_hotplug_cpus_read_trylock()) { 4496 + partition_is_populated(cs, NULL)) { 4484 4497 remote_partition_disable(cs, tmp); 4485 4498 compute_effective_cpumask(&new_cpus, cs, parent); 4486 4499 remote = false; 4487 4500 cpuset_force_rebuild(); 4488 - cpus_read_unlock(); 4489 4501 } 4490 4502 4491 4503 /* ··· 4503 4519 else if (is_partition_valid(parent) && is_partition_invalid(cs)) 4504 4520 partcmd = partcmd_update; 4505 4521 4506 - /* 4507 - * cpus_read_lock needs to be held before calling 4508 - * update_parent_effective_cpumask(). To avoid circular lock 4509 - * dependency between cpuset_mutex and cpus_read_lock, 4510 - * cpus_read_trylock() is used here to acquire the lock. 
4511 - */ 4512 4522 if (partcmd >= 0) { 4513 - if (!cpuset_hotplug_cpus_read_trylock()) 4514 - goto update_tasks; 4515 - 4516 4523 update_parent_effective_cpumask(cs, partcmd, NULL, tmp); 4517 - cpus_read_unlock(); 4518 4524 if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { 4519 4525 compute_partition_effective_cpumask(cs, &new_cpus); 4520 4526 cpuset_force_rebuild(); ··· 4532 4558 } 4533 4559 4534 4560 /** 4535 - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset 4536 - * @work: unused 4561 + * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset 4537 4562 * 4538 4563 * This function is called after either CPU or memory configuration has 4539 4564 * changed and updates cpuset accordingly. The top_cpuset is always ··· 4546 4573 * 4547 4574 * Note that CPU offlining during suspend is ignored. We don't modify 4548 4575 * cpusets across suspend/resume cycles at all. 4576 + * 4577 + * CPU / memory hotplug is handled synchronously. 4549 4578 */ 4550 - static void cpuset_hotplug_workfn(struct work_struct *work) 4579 + static void cpuset_handle_hotplug(void) 4551 4580 { 4552 4581 static cpumask_t new_cpus; 4553 4582 static nodemask_t new_mems; ··· 4560 4585 if (on_dfl && !alloc_cpumasks(NULL, &tmp)) 4561 4586 ptmp = &tmp; 4562 4587 4588 + lockdep_assert_cpus_held(); 4563 4589 mutex_lock(&cpuset_mutex); 4564 4590 4565 4591 /* fetch the available cpus/mems and find out which changed how */ ··· 4642 4666 /* rebuild sched domains if cpus_allowed has changed */ 4643 4667 if (cpus_updated || force_rebuild) { 4644 4668 force_rebuild = false; 4645 - rebuild_sched_domains(); 4669 + rebuild_sched_domains_cpuslocked(); 4646 4670 } 4647 4671 4648 4672 free_cpumasks(NULL, ptmp); ··· 4655 4679 * inside cgroup synchronization. Bounce actual hotplug processing 4656 4680 * to a work item to avoid reverse locking order. 
4657 4681 */ 4658 - schedule_work(&cpuset_hotplug_work); 4659 - } 4660 - 4661 - void cpuset_wait_for_hotplug(void) 4662 - { 4663 - flush_work(&cpuset_hotplug_work); 4682 + cpuset_handle_hotplug(); 4664 4683 } 4665 4684 4666 4685 /* ··· 4666 4695 static int cpuset_track_online_nodes(struct notifier_block *self, 4667 4696 unsigned long action, void *arg) 4668 4697 { 4669 - schedule_work(&cpuset_hotplug_work); 4698 + cpuset_handle_hotplug(); 4670 4699 return NOTIFY_OK; 4671 4700 } 4672 4701
-48
kernel/cpu.c
··· 1208 1208 kthread_unpark(this_cpu_read(cpuhp_state.thread)); 1209 1209 } 1210 1210 1211 - /* 1212 - * 1213 - * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock 1214 - * protected region. 1215 - * 1216 - * The operation is still serialized against concurrent CPU hotplug via 1217 - * cpu_add_remove_lock, i.e. CPU map protection. But it is _not_ 1218 - * serialized against other hotplug related activity like adding or 1219 - * removing of state callbacks and state instances, which invoke either the 1220 - * startup or the teardown callback of the affected state. 1221 - * 1222 - * This is required for subsystems which are unfixable vs. CPU hotplug and 1223 - * evade lock inversion problems by scheduling work which has to be 1224 - * completed _before_ cpu_up()/_cpu_down() returns. 1225 - * 1226 - * Don't even think about adding anything to this for any new code or even 1227 - * drivers. It's only purpose is to keep existing lock order trainwrecks 1228 - * working. 1229 - * 1230 - * For cpu_down() there might be valid reasons to finish cleanups which are 1231 - * not required to be done under cpu_hotplug_lock, but that's a different 1232 - * story and would be not invoked via this. 1233 - */ 1234 - static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen) 1235 - { 1236 - /* 1237 - * cpusets delegate hotplug operations to a worker to "solve" the 1238 - * lock order problems. Wait for the worker, but only if tasks are 1239 - * _not_ frozen (suspend, hibernate) as that would wait forever. 1240 - * 1241 - * The wait is required because otherwise the hotplug operation 1242 - * returns with inconsistent state, which could even be observed in 1243 - * user space when a new CPU is brought up. 
The CPU plug uevent 1244 - * would be delivered and user space reacting on it would fail to 1245 - * move tasks to the newly plugged CPU up to the point where the 1246 - * work has finished because up to that point the newly plugged CPU 1247 - * is not assignable in cpusets/cgroups. On unplug that's not 1248 - * necessarily a visible issue, but it is still inconsistent state, 1249 - * which is the real problem which needs to be "fixed". This can't 1250 - * prevent the transient state between scheduling the work and 1251 - * returning from waiting for it. 1252 - */ 1253 - if (!tasks_frozen) 1254 - cpuset_wait_for_hotplug(); 1255 - } 1256 - 1257 1211 #ifdef CONFIG_HOTPLUG_CPU 1258 1212 #ifndef arch_clear_mm_cpumask_cpu 1259 1213 #define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm)) ··· 1448 1494 */ 1449 1495 lockup_detector_cleanup(); 1450 1496 arch_smt_update(); 1451 - cpu_up_down_serialize_trainwrecks(tasks_frozen); 1452 1497 return ret; 1453 1498 } 1454 1499 ··· 1681 1728 out: 1682 1729 cpus_write_unlock(); 1683 1730 arch_smt_update(); 1684 - cpu_up_down_serialize_trainwrecks(tasks_frozen); 1685 1731 return ret; 1686 1732 } 1687 1733
-2
kernel/power/process.c
··· 194 194 __usermodehelper_set_disable_depth(UMH_FREEZING); 195 195 thaw_workqueues(); 196 196 197 - cpuset_wait_for_hotplug(); 198 - 199 197 read_lock(&tasklist_lock); 200 198 for_each_process_thread(g, p) { 201 199 /* No other threads should have PF_SUSPEND_TASK set */