Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

cpuset: make mm migration asynchronous

If "cpuset.memory_migrate" is set, when a process is moved from one
cpuset to another with a different memory node mask, pages in use by
the process are migrated to the new set of nodes. This was performed
synchronously in the ->attach() callback, which is synchronized
against process management. Recently, the synchronization was changed
from per-process rwsem to global percpu rwsem for simplicity and
optimization.

Combined with the synchronous mm migration, this led to deadlocks
because mm migration could schedule a work item which may in turn try
to create a new worker blocking on the process management lock held
from cgroup process migration path.

This heavy an operation shouldn't be performed synchronously from that
deep inside cgroup migration in the first place. This patch punts the
actual migration to an ordered workqueue and updates cgroup process
migration and cpuset config update paths to flush the workqueue after
all locks are released. This way, the operations still seem
synchronous to userland without entangling mm migration with process
management synchronization. CPU hotplug can also invoke mm migration
but there's no reason for it to wait for mm migrations and thus
doesn't synchronize against their completions.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: stable@vger.kernel.org # v4.4+

Tejun Heo e93ad19d 3e1e21c7

+57 -22
+6
include/linux/cpuset.h
··· 137 137 task_unlock(current); 138 138 } 139 139 140 + extern void cpuset_post_attach_flush(void); 141 + 140 142 #else /* !CONFIG_CPUSETS */ 141 143 142 144 static inline bool cpusets_enabled(void) { return false; } ··· 243 241 static inline bool read_mems_allowed_retry(unsigned int seq) 244 242 { 245 243 return false; 244 + } 245 + 246 + static inline void cpuset_post_attach_flush(void) 247 + { 246 248 } 247 249 248 250 #endif /* !CONFIG_CPUSETS */
+2
kernel/cgroup.c
··· 58 58 #include <linux/kthread.h> 59 59 #include <linux/delay.h> 60 60 #include <linux/atomic.h> 61 + #include <linux/cpuset.h> 61 62 #include <net/sock.h> 62 63 63 64 /* ··· 2740 2739 out_unlock_threadgroup: 2741 2740 percpu_up_write(&cgroup_threadgroup_rwsem); 2742 2741 cgroup_kn_unlock(of->kn); 2742 + cpuset_post_attach_flush(); 2743 2743 return ret ?: nbytes; 2744 2744 } 2745 2745
+49 -22
kernel/cpuset.c
··· 287 287 static DEFINE_MUTEX(cpuset_mutex); 288 288 static DEFINE_SPINLOCK(callback_lock); 289 289 290 + static struct workqueue_struct *cpuset_migrate_mm_wq; 291 + 290 292 /* 291 293 * CPU / memory hotplug is handled asynchronously. 292 294 */ ··· 974 972 } 975 973 976 974 /* 977 - * cpuset_migrate_mm 978 - * 979 - * Migrate memory region from one set of nodes to another. 980 - * 981 - * Temporarilly set tasks mems_allowed to target nodes of migration, 982 - * so that the migration code can allocate pages on these nodes. 983 - * 984 - * While the mm_struct we are migrating is typically from some 985 - * other task, the task_struct mems_allowed that we are hacking 986 - * is for our current task, which must allocate new pages for that 987 - * migrating memory region. 975 + * Migrate memory region from one set of nodes to another. This is 976 + * performed asynchronously as it can be called from process migration path 977 + * holding locks involved in process management. All mm migrations are 978 + * performed in the queued order and can be waited for by flushing 979 + * cpuset_migrate_mm_wq. 
988 980 */ 981 + 982 + struct cpuset_migrate_mm_work { 983 + struct work_struct work; 984 + struct mm_struct *mm; 985 + nodemask_t from; 986 + nodemask_t to; 987 + }; 988 + 989 + static void cpuset_migrate_mm_workfn(struct work_struct *work) 990 + { 991 + struct cpuset_migrate_mm_work *mwork = 992 + container_of(work, struct cpuset_migrate_mm_work, work); 993 + 994 + /* on a wq worker, no need to worry about %current's mems_allowed */ 995 + do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); 996 + mmput(mwork->mm); 997 + kfree(mwork); 998 + } 989 999 990 1000 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, 991 1001 const nodemask_t *to) 992 1002 { 993 - struct task_struct *tsk = current; 1003 + struct cpuset_migrate_mm_work *mwork; 994 1004 995 - tsk->mems_allowed = *to; 1005 + mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); 1006 + if (mwork) { 1007 + mwork->mm = mm; 1008 + mwork->from = *from; 1009 + mwork->to = *to; 1010 + INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); 1011 + queue_work(cpuset_migrate_mm_wq, &mwork->work); 1012 + } else { 1013 + mmput(mm); 1014 + } 1015 + } 996 1016 997 - do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 998 - 999 - rcu_read_lock(); 1000 - guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); 1001 - rcu_read_unlock(); 1017 + void cpuset_post_attach_flush(void) 1018 + { 1019 + flush_workqueue(cpuset_migrate_mm_wq); 1002 1020 } 1003 1021 1004 1022 /* ··· 1119 1097 mpol_rebind_mm(mm, &cs->mems_allowed); 1120 1098 if (migrate) 1121 1099 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); 1122 - mmput(mm); 1100 + else 1101 + mmput(mm); 1123 1102 } 1124 1103 css_task_iter_end(&it); 1125 1104 ··· 1568 1545 * @old_mems_allowed is the right nodesets that we 1569 1546 * migrate mm from. 
1570 1547 */ 1571 - if (is_memory_migrate(cs)) { 1548 + if (is_memory_migrate(cs)) 1572 1549 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, 1573 1550 &cpuset_attach_nodemask_to); 1574 - } 1575 - mmput(mm); 1551 + else 1552 + mmput(mm); 1576 1553 } 1577 1554 } 1578 1555 ··· 1737 1714 mutex_unlock(&cpuset_mutex); 1738 1715 kernfs_unbreak_active_protection(of->kn); 1739 1716 css_put(&cs->css); 1717 + flush_workqueue(cpuset_migrate_mm_wq); 1740 1718 return retval ?: nbytes; 1741 1719 } 1742 1720 ··· 2383 2359 top_cpuset.effective_mems = node_states[N_MEMORY]; 2384 2360 2385 2361 register_hotmemory_notifier(&cpuset_track_online_nodes_nb); 2362 + 2363 + cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); 2364 + BUG_ON(!cpuset_migrate_mm_wq); 2386 2365 } 2387 2366 2388 2367 /**