memcg: fix deadlock between cpuset and memcg

Commit b1dd693e ("memcg: avoid deadlock between move charge and
try_charge()") can cause another deadlock on mmap_sem during task
migration if cpuset and memcg are mounted onto the same mount point.

After the commit, cgroup_attach_task() has a sequence like:

cgroup_attach_task()
  ss->can_attach()
    cpuset_can_attach()
    mem_cgroup_can_attach()
      down_read(&mmap_sem)          (1)
  ss->attach()
    cpuset_attach()
      mpol_rebind_mm()
        down_write(&mmap_sem)       (2)
        up_write(&mmap_sem)
      cpuset_migrate_mm()
        do_migrate_pages()
          down_read(&mmap_sem)
          up_read(&mmap_sem)
    mem_cgroup_move_task()
      mem_cgroup_clear_mc()
        up_read(&mmap_sem)

We can deadlock at (2) because we have already acquired the mmap_sem at (1).
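
For illustration only (this program is not part of the patch), the same
read-then-write self-deadlock can be reproduced in user space with a POSIX
rwlock standing in for mmap_sem; the file name is made up, and the
"blocks forever" behaviour assumes glibc's default rwlock (POSIX leaves
write-locking a lock you already hold for reading undefined):

  /* rwsem_self_deadlock.c (hypothetical): gcc -pthread rwsem_self_deadlock.c */
  #include <pthread.h>
  #include <stdio.h>

  static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

  int main(void)
  {
          pthread_rwlock_rdlock(&mmap_sem);   /* (1) taken in ->can_attach() */
          printf("read lock held, now requesting the write lock...\n");
          pthread_rwlock_wrlock(&mmap_sem);   /* (2) never returns */
          printf("not reached\n");
          return 0;
  }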

But the commit itself is necessary to fix deadlocks that existed before
it, such as the two scenarios below (a small user-space illustration of
this lock-versus-waitqueue cycle follows Ex.2):

Ex.1)
               move charge              |        try charge
  --------------------------------------+------------------------------
    mem_cgroup_can_attach()             |  down_write(&mmap_sem)
      mc.moving_task = current          |    ..
      mem_cgroup_precharge_mc()         |  __mem_cgroup_try_charge()
        mem_cgroup_count_precharge()    |    prepare_to_wait()
          down_read(&mmap_sem)          |    if (mc.moving_task)
          -> cannot acquire the lock    |    -> true
                                        |      schedule()
                                        |      -> move charge should wake it up

Ex.2)
               move charge              |        try charge
  --------------------------------------+------------------------------
    mem_cgroup_can_attach()             |
      mc.moving_task = current          |
      mem_cgroup_precharge_mc()         |
        mem_cgroup_count_precharge()    |
          down_read(&mmap_sem)          |
          ..                            |
          up_read(&mmap_sem)            |
                                        |  down_write(&mmap_sem)
    mem_cgroup_move_task()              |    ..
      mem_cgroup_move_charge()          |  __mem_cgroup_try_charge()
        down_read(&mmap_sem)            |    prepare_to_wait()
        -> cannot acquire the lock      |    if (mc.moving_task)
                                        |    -> true
                                        |      schedule()
                                        |      -> move charge should wake it up
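
For illustration only (not part of the patch), the cycle in Ex.1/Ex.2 can
be reproduced in user space with a pthread rwlock standing in for mmap_sem
and a condition variable standing in for mc.waitq; all names below are
made-up stand-ins, and the sleep() only forces the interleaving shown above:

  /* movecharge_deadlock.c (hypothetical): gcc -pthread movecharge_deadlock.c */
  #include <pthread.h>
  #include <unistd.h>

  static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;
  static pthread_mutex_t  mc_lock  = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t   mc_waitq = PTHREAD_COND_INITIALIZER;
  static int moving_task = 1;      /* "mc.moving_task = current" already done */

  static void *try_charge(void *unused)
  {
          pthread_rwlock_wrlock(&mmap_sem);   /* a path holding mmap_sem for write */
          pthread_mutex_lock(&mc_lock);
          while (moving_task)                 /* "if (mc.moving_task) schedule()" */
                  pthread_cond_wait(&mc_waitq, &mc_lock);
          pthread_mutex_unlock(&mc_lock);
          pthread_rwlock_unlock(&mmap_sem);
          return NULL;
  }

  static void *move_charge(void *unused)
  {
          sleep(1);                           /* force the interleaving of Ex.1 */
          pthread_rwlock_rdlock(&mmap_sem);   /* blocks behind the writer above... */
          pthread_mutex_lock(&mc_lock);       /* ...so this wakeup is never reached */
          moving_task = 0;
          pthread_cond_broadcast(&mc_waitq);
          pthread_mutex_unlock(&mc_lock);
          pthread_rwlock_unlock(&mmap_sem);
          return NULL;
  }

  int main(void)
  {
          pthread_t a, b;
          pthread_create(&a, NULL, try_charge, NULL);
          pthread_create(&b, NULL, move_charge, NULL);
          pthread_join(a, NULL);              /* never returns: the threads deadlock */
          pthread_join(b, NULL);
          return 0;
  }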

This patch fixes all of these problems by:
1. Reverting the commit.
2. To fix Ex.1, setting mc.moving_task only after mem_cgroup_count_precharge()
   has released the mmap_sem.
3. To fix Ex.2, using down_read_trylock() instead of down_read() in
   mem_cgroup_move_charge() and, if the lock cannot be acquired, cancelling
   all extra charges, waking up all waiters, and retrying the trylock
   (a minimal user-space sketch of this retry loop follows).
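
As a rough sketch of the retry shape described in 3. (the real hunk is in
mem_cgroup_move_charge() in the diff below); cancel_precharges_and_wake()
and walk_vmas_and_move() are made-up stand-ins for __mem_cgroup_clear_mc()
and the per-VMA walk:

  /* trylock_retry.c (hypothetical): gcc -pthread trylock_retry.c */
  #include <pthread.h>
  #include <sched.h>

  static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

  /* Stand-in for __mem_cgroup_clear_mc(): give back precharges, wake waiters. */
  static void cancel_precharges_and_wake(void) { }
  /* Stand-in for the walk_page_range() loop that actually moves the charges. */
  static void walk_vmas_and_move(void) { }

  static void move_charge(void)
  {
  retry:
          if (pthread_rwlock_tryrdlock(&mmap_sem) != 0) {
                  /*
                   * Whoever holds mmap_sem for write may be asleep waiting
                   * for us, so cancel the extra charges, wake everyone up
                   * so the lock can be released, yield, and try again.
                   * Moving charges is best-effort, so dropping the
                   * precharges here is acceptable.
                   */
                  cancel_precharges_and_wake();
                  sched_yield();              /* stand-in for cond_resched() */
                  goto retry;
          }
          walk_vmas_and_move();
          pthread_rwlock_unlock(&mmap_sem);
  }

  int main(void)
  {
          move_charge();
          return 0;
  }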

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reported-by: Ben Blum <bblum@andrew.cmu.edu>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Paul Menage <menage@google.com>
Cc: Hiroyuki Kamezawa <kamezawa.hiroyuki@gmail.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by Daisuke Nishimura and committed by Linus Torvalds dfe076b0 043d18b1

+49 -35
mm/memcontrol.c
···
         unsigned long moved_charge;
         unsigned long moved_swap;
         struct task_struct *moving_task;        /* a task moving charges */
-        struct mm_struct *mm;
         wait_queue_head_t waitq;                /* a waitq for other context */
 } mc = {
         .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
···
         unsigned long precharge;
         struct vm_area_struct *vma;

-        /* We've already held the mmap_sem */
+        down_read(&mm->mmap_sem);
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
                 struct mm_walk mem_cgroup_count_precharge_walk = {
                         .pmd_entry = mem_cgroup_count_precharge_pte_range,
···
                 walk_page_range(vma->vm_start, vma->vm_end,
                                 &mem_cgroup_count_precharge_walk);
         }
+        up_read(&mm->mmap_sem);

         precharge = mc.precharge;
         mc.precharge = 0;
···

 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
 {
-        return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
+        unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+        VM_BUG_ON(mc.moving_task);
+        mc.moving_task = current;
+        return mem_cgroup_do_precharge(precharge);
 }

-static void mem_cgroup_clear_mc(void)
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
 {
         struct mem_cgroup *from = mc.from;
         struct mem_cgroup *to = mc.to;
···
                                         PAGE_SIZE * mc.moved_swap);
                 }
                 /* we've already done mem_cgroup_get(mc.to) */
-
                 mc.moved_swap = 0;
         }
-        if (mc.mm) {
-                up_read(&mc.mm->mmap_sem);
-                mmput(mc.mm);
-        }
+        memcg_oom_recover(from);
+        memcg_oom_recover(to);
+        wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+        struct mem_cgroup *from = mc.from;
+
+        /*
+         * we must clear moving_task before waking up waiters at the end of
+         * task migration.
+         */
+        mc.moving_task = NULL;
+        __mem_cgroup_clear_mc();
         spin_lock(&mc.lock);
         mc.from = NULL;
         mc.to = NULL;
         spin_unlock(&mc.lock);
-        mc.moving_task = NULL;
-        mc.mm = NULL;
         mem_cgroup_end_move(from);
-        memcg_oom_recover(from);
-        memcg_oom_recover(to);
-        wake_up_all(&mc.waitq);
 }

 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
···
                         return 0;
                 /* We move charges only when we move a owner of the mm */
                 if (mm->owner == p) {
-                        /*
-                         * We do all the move charge works under one mmap_sem to
-                         * avoid deadlock with down_write(&mmap_sem)
-                         * -> try_charge() -> if (mc.moving_task) -> sleep.
-                         */
-                        down_read(&mm->mmap_sem);
-
                         VM_BUG_ON(mc.from);
                         VM_BUG_ON(mc.to);
                         VM_BUG_ON(mc.precharge);
                         VM_BUG_ON(mc.moved_charge);
                         VM_BUG_ON(mc.moved_swap);
-                        VM_BUG_ON(mc.moving_task);
-                        VM_BUG_ON(mc.mm);
-
                         mem_cgroup_start_move(from);
                         spin_lock(&mc.lock);
                         mc.from = from;
                         mc.to = mem;
-                        mc.precharge = 0;
-                        mc.moved_charge = 0;
-                        mc.moved_swap = 0;
                         spin_unlock(&mc.lock);
-                        mc.moving_task = current;
-                        mc.mm = mm;
+                        /* We set mc.moving_task later */

                         ret = mem_cgroup_precharge_mc(mm);
                         if (ret)
                                 mem_cgroup_clear_mc();
-                        /* We call up_read() and mmput() in clear_mc(). */
-                } else
-                        mmput(mm);
+                }
+                mmput(mm);
         }
         return ret;
 }
···
         struct vm_area_struct *vma;

         lru_add_drain_all();
-        /* We've already held the mmap_sem */
+retry:
+        if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+                /*
+                 * Someone who are holding the mmap_sem might be waiting in
+                 * waitq. So we cancel all extra charges, wake up all waiters,
+                 * and retry. Because we cancel precharges, we might not be able
+                 * to move enough charges, but moving charge is a best-effort
+                 * feature anyway, so it wouldn't be a big problem.
+                 */
+                __mem_cgroup_clear_mc();
+                cond_resched();
+                goto retry;
+        }
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
                 int ret;
                 struct mm_walk mem_cgroup_move_charge_walk = {
···
                          */
                         break;
         }
+        up_read(&mm->mmap_sem);
 }

 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
···
                                 struct task_struct *p,
                                 bool threadgroup)
 {
-        if (!mc.mm)
+        struct mm_struct *mm;
+
+        if (!mc.to)
                 /* no need to move charge */
                 return;

-        mem_cgroup_move_charge(mc.mm);
+        mm = get_task_mm(p);
+        if (mm) {
+                mem_cgroup_move_charge(mm);
+                mmput(mm);
+        }
         mem_cgroup_clear_mc();
 }
 #else /* !CONFIG_MMU */