Merge branch 'for-4.5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:

- The destruction path of cgroup objects is asynchronous and
  multi-staged, and some stages ended up destroying parents before
  their children, leading to failures in the cpu and memory
  controllers. Ensure that parents are always destroyed after their
  children (a simplified sketch of the counting scheme follows the
  kernel/cgroup.c diff below).

- cpuset mm node migration was performed synchronously while holding
  threadgroup and cgroup mutexes, and the recent threadgroup locking
  update made a deadlock possible. The migration is best effort and
  shouldn't have been performed under those locks to begin with; it is
  now done asynchronously (a sketch of the queue-and-flush pattern
  follows the kernel/cpuset.c diff below).

- Minor documentation fix.

* 'for-4.5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
Documentation: cgroup: Fix 'cgroup-legacy' -> 'cgroup-v1'
cgroup: make sure a parent css isn't freed before its children
cgroup: make sure a parent css isn't offlined before its children
cpuset: make mm migration asynchronous

5 files changed, +85 -31

Documentation/cgroup-v2.txt (+1 -1)
···
 conventions of cgroup v2. It describes all userland-visible aspects
 of cgroup including core and specific controller behaviors. All
 future changes must be reflected in this document. Documentation for
-v1 is available under Documentation/cgroup-legacy/.
+v1 is available under Documentation/cgroup-v1/.
 
 CONTENTS
 

include/linux/cgroup-defs.h (+6)
···
          */
         u64 serial_nr;
 
+        /*
+         * Incremented by online self and children. Used to guarantee that
+         * parents are not offlined before their children.
+         */
+        atomic_t online_cnt;
+
         /* percpu_ref killing and RCU release */
         struct rcu_head rcu_head;
         struct work_struct destroy_work;

include/linux/cpuset.h (+6)
···
         task_unlock(current);
 }
 
+extern void cpuset_post_attach_flush(void);
+
 #else /* !CONFIG_CPUSETS */
 
 static inline bool cpusets_enabled(void) { return false; }
···
 static inline bool read_mems_allowed_retry(unsigned int seq)
 {
         return false;
+}
+
+static inline void cpuset_post_attach_flush(void)
+{
 }
 
 #endif /* !CONFIG_CPUSETS */

kernel/cgroup.c (+23 -8)
···
 #include <linux/kthread.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
+#include <linux/cpuset.h>
 #include <net/sock.h>
 
 /*
···
 out_unlock_threadgroup:
         percpu_up_write(&cgroup_threadgroup_rwsem);
         cgroup_kn_unlock(of->kn);
+        cpuset_post_attach_flush();
         return ret ?: nbytes;
 }
 
···
 
         if (ss) {
                 /* css free path */
+                struct cgroup_subsys_state *parent = css->parent;
                 int id = css->id;
-
-                if (css->parent)
-                        css_put(css->parent);
 
                 ss->css_free(css);
                 cgroup_idr_remove(&ss->css_idr, id);
                 cgroup_put(cgrp);
+
+                if (parent)
+                        css_put(parent);
         } else {
                 /* cgroup free path */
                 atomic_dec(&cgrp->root->nr_cgrps);
···
         INIT_LIST_HEAD(&css->sibling);
         INIT_LIST_HEAD(&css->children);
         css->serial_nr = css_serial_nr_next++;
+        atomic_set(&css->online_cnt, 0);
 
         if (cgroup_parent(cgrp)) {
                 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
···
         if (!ret) {
                 css->flags |= CSS_ONLINE;
                 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
+
+                atomic_inc(&css->online_cnt);
+                if (css->parent)
+                        atomic_inc(&css->parent->online_cnt);
         }
         return ret;
 }
···
                 container_of(work, struct cgroup_subsys_state, destroy_work);
 
         mutex_lock(&cgroup_mutex);
-        offline_css(css);
-        mutex_unlock(&cgroup_mutex);
 
-        css_put(css);
+        do {
+                offline_css(css);
+                css_put(css);
+                /* @css can't go away while we're holding cgroup_mutex */
+                css = css->parent;
+        } while (css && atomic_dec_and_test(&css->online_cnt));
+
+        mutex_unlock(&cgroup_mutex);
 }
 
 /* css kill confirmation processing requires process context, bounce */
···
         struct cgroup_subsys_state *css =
                 container_of(ref, struct cgroup_subsys_state, refcnt);
 
-        INIT_WORK(&css->destroy_work, css_killed_work_fn);
-        queue_work(cgroup_destroy_wq, &css->destroy_work);
+        if (atomic_dec_and_test(&css->online_cnt)) {
+                INIT_WORK(&css->destroy_work, css_killed_work_fn);
+                queue_work(cgroup_destroy_wq, &css->destroy_work);
+        }
 }
 
 /**
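
The css offlining changes above boil down to a small counting rule: each
css holds one count for itself in online_cnt plus one for every online
child, and offlining only cascades to the parent once that count reaches
zero. The fragment below is a simplified, single-threaded userspace
sketch of that rule, not kernel code; the node type and the
node_online()/node_kill() helpers are invented for the illustration.

#include <stdio.h>

struct node {
        const char *name;
        struct node *parent;
        int online_cnt;                 /* self + online children */
};

static void node_online(struct node *n, struct node *parent, const char *name)
{
        n->name = name;
        n->parent = parent;
        n->online_cnt = 1;              /* count ourselves */
        if (parent)
                parent->online_cnt++;   /* pin the parent online */
}

/* Last reference to @n is gone; offline bottom-up. */
static void node_kill(struct node *n)
{
        while (n && --n->online_cnt == 0) {
                printf("offlining %s\n", n->name);
                n = n->parent;          /* drop our pin on the parent */
        }
}

int main(void)
{
        struct node root, a, b;

        node_online(&root, NULL, "root");
        node_online(&a, &root, "child_a");
        node_online(&b, &root, "child_b");

        node_kill(&root);       /* root stays online: children still there */
        node_kill(&a);          /* offlines child_a only */
        node_kill(&b);          /* offlines child_b, then finally root */
        return 0;
}

Killing the root first produces no offline until both children have gone
away, which is the ordering the cgroup fix enforces with
atomic_dec_and_test(&css->online_cnt) under cgroup_mutex.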

kernel/cpuset.c (+49 -22)
···
 static DEFINE_MUTEX(cpuset_mutex);
 static DEFINE_SPINLOCK(callback_lock);
 
+static struct workqueue_struct *cpuset_migrate_mm_wq;
+
 /*
  * CPU / memory hotplug is handled asynchronously.
  */
···
 }
 
 /*
- * cpuset_migrate_mm
- *
- * Migrate memory region from one set of nodes to another.
- *
- * Temporarilly set tasks mems_allowed to target nodes of migration,
- * so that the migration code can allocate pages on these nodes.
- *
- * While the mm_struct we are migrating is typically from some
- * other task, the task_struct mems_allowed that we are hacking
- * is for our current task, which must allocate new pages for that
- * migrating memory region.
+ * Migrate memory region from one set of nodes to another. This is
+ * performed asynchronously as it can be called from process migration path
+ * holding locks involved in process management. All mm migrations are
+ * performed in the queued order and can be waited for by flushing
+ * cpuset_migrate_mm_wq.
  */
+
+struct cpuset_migrate_mm_work {
+        struct work_struct work;
+        struct mm_struct *mm;
+        nodemask_t from;
+        nodemask_t to;
+};
+
+static void cpuset_migrate_mm_workfn(struct work_struct *work)
+{
+        struct cpuset_migrate_mm_work *mwork =
+                container_of(work, struct cpuset_migrate_mm_work, work);
+
+        /* on a wq worker, no need to worry about %current's mems_allowed */
+        do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
+        mmput(mwork->mm);
+        kfree(mwork);
+}
 
 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
                               const nodemask_t *to)
 {
-        struct task_struct *tsk = current;
+        struct cpuset_migrate_mm_work *mwork;
 
-        tsk->mems_allowed = *to;
+        mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
+        if (mwork) {
+                mwork->mm = mm;
+                mwork->from = *from;
+                mwork->to = *to;
+                INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
+                queue_work(cpuset_migrate_mm_wq, &mwork->work);
+        } else {
+                mmput(mm);
+        }
+}
 
-        do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
-
-        rcu_read_lock();
-        guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
-        rcu_read_unlock();
+void cpuset_post_attach_flush(void)
+{
+        flush_workqueue(cpuset_migrate_mm_wq);
 }
···
                 mpol_rebind_mm(mm, &cs->mems_allowed);
                 if (migrate)
                         cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
-                mmput(mm);
+                else
+                        mmput(mm);
         }
         css_task_iter_end(&it);
 
···
                  * @old_mems_allowed is the right nodesets that we
                  * migrate mm from.
                  */
-                if (is_memory_migrate(cs)) {
+                if (is_memory_migrate(cs))
                         cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
                                           &cpuset_attach_nodemask_to);
-                }
-                mmput(mm);
+                else
+                        mmput(mm);
         }
 }
 
···
         mutex_unlock(&cpuset_mutex);
         kernfs_unbreak_active_protection(of->kn);
         css_put(&cs->css);
+        flush_workqueue(cpuset_migrate_mm_wq);
         return retval ?: nbytes;
 }
 
···
         top_cpuset.effective_mems = node_states[N_MEMORY];
 
         register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+
+        cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
+        BUG_ON(!cpuset_migrate_mm_wq);
 }
 
 /**
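
The cpuset side follows the same deferral pattern: cpuset_migrate_mm()
now only packages the mm and the nodemasks into a work item and queues
it on cpuset_migrate_mm_wq, and cpuset_post_attach_flush() waits for the
queue to drain once the locks have been dropped. The fragment below is a
simplified, single-threaded userspace sketch of that queue-then-flush
idea, not kernel code; mm_work, queue_mm_work() and flush_mm_work() are
invented names, and a plain list stands in for the ordered workqueue.

#include <stdio.h>
#include <stdlib.h>

struct mm_work {
        struct mm_work *next;
        int from_node;
        int to_node;
};

static struct mm_work *head;
static struct mm_work **tail = &head;

/* Attach path: cheap, never migrates anything itself. */
static void queue_mm_work(int from_node, int to_node)
{
        struct mm_work *w = calloc(1, sizeof(*w));

        if (!w)
                return;                 /* best effort, like the real code */
        w->from_node = from_node;
        w->to_node = to_node;
        *tail = w;
        tail = &w->next;
}

/* Post-attach path: locks are gone, drain the queue in order. */
static void flush_mm_work(void)
{
        while (head) {
                struct mm_work *w = head;

                head = w->next;
                printf("migrating pages: node %d -> node %d\n",
                       w->from_node, w->to_node);
                free(w);
        }
        tail = &head;
}

int main(void)
{
        queue_mm_work(0, 1);    /* task attached to a new cpuset */
        queue_mm_work(1, 2);    /* cpuset's mems_allowed rewritten */
        flush_mm_work();        /* cpuset_post_attach_flush() analogue */
        return 0;
}

Because queueing is the only thing done while cgroup_mutex and the
threadgroup rwsem are held, the expensive do_migrate_pages() call no
longer runs under those locks; the flush happens afterwards, from the
cgroup attach path and the cpuset write path shown in the hunks above.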