Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

memcg: remove PCG_MOVE_LOCK flag from page_cgroup

PCG_MOVE_LOCK is used as a bit spinlock to avoid the race between overwriting
pc->mem_cgroup and the per-memcg page statistics accounting. This lock helps
to avoid the race, but the race is very rare because moving tasks between
cgroups is not a usual job. So, spending one flag bit per page on it seems too
costly.

This patch converts this lock into a per-memcg spinlock and removes
PCG_MOVE_LOCK.

If a finer-grained lock is required, we'll be able to add some hashing later,
but I'd like to start from this.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

KAMEZAWA Hiroyuki and committed by
Linus Torvalds
312734c0 619d094b

+32 -29
-19
include/linux/page_cgroup.h
··· 7 7 PCG_USED, /* this object is in use. */ 8 8 PCG_MIGRATION, /* under page migration */ 9 9 /* flags for mem_cgroup and file and I/O status */ 10 - PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ 11 10 PCG_FILE_MAPPED, /* page is accounted as "mapped" */ 12 11 __NR_PCG_FLAGS, 13 12 }; ··· 86 87 static inline void unlock_page_cgroup(struct page_cgroup *pc) 87 88 { 88 89 bit_spin_unlock(PCG_LOCK, &pc->flags); 89 - } 90 - 91 - static inline void move_lock_page_cgroup(struct page_cgroup *pc, 92 - unsigned long *flags) 93 - { 94 - /* 95 - * We know updates to pc->flags of page cache's stats are from both of 96 - * usual context or IRQ context. Disable IRQ to avoid deadlock. 97 - */ 98 - local_irq_save(*flags); 99 - bit_spin_lock(PCG_MOVE_LOCK, &pc->flags); 100 - } 101 - 102 - static inline void move_unlock_page_cgroup(struct page_cgroup *pc, 103 - unsigned long *flags) 104 - { 105 - bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags); 106 - local_irq_restore(*flags); 107 90 } 108 91 109 92 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
+32 -10
mm/memcontrol.c
··· 300 300 * set > 0 if pages under this cgroup are moving to other cgroup. 301 301 */ 302 302 atomic_t moving_account; 303 + /* taken only while moving_account > 0 */ 304 + spinlock_t move_lock; 303 305 /* 304 306 * percpu counter. 305 307 */ ··· 1378 1376 return false; 1379 1377 } 1380 1378 1379 + /* 1380 + * Take this lock when 1381 + * - a code tries to modify page's memcg while it's USED. 1382 + * - a code tries to modify page state accounting in a memcg. 1383 + * see mem_cgroup_stealed(), too. 1384 + */ 1385 + static void move_lock_mem_cgroup(struct mem_cgroup *memcg, 1386 + unsigned long *flags) 1387 + { 1388 + spin_lock_irqsave(&memcg->move_lock, *flags); 1389 + } 1390 + 1391 + static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, 1392 + unsigned long *flags) 1393 + { 1394 + spin_unlock_irqrestore(&memcg->move_lock, *flags); 1395 + } 1396 + 1381 1397 /** 1382 1398 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1383 1399 * @memcg: The memory cgroup that went over limit ··· 1920 1900 1921 1901 if (mem_cgroup_disabled()) 1922 1902 return; 1923 - 1903 + again: 1924 1904 rcu_read_lock(); 1925 1905 memcg = pc->mem_cgroup; 1926 1906 if (unlikely(!memcg || !PageCgroupUsed(pc))) ··· 1928 1908 /* pc->mem_cgroup is unstable ? 
*/ 1929 1909 if (unlikely(mem_cgroup_stealed(memcg))) { 1930 1910 /* take a lock against to access pc->mem_cgroup */ 1931 - move_lock_page_cgroup(pc, &flags); 1911 + move_lock_mem_cgroup(memcg, &flags); 1912 + if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 1913 + move_unlock_mem_cgroup(memcg, &flags); 1914 + rcu_read_unlock(); 1915 + goto again; 1916 + } 1932 1917 need_unlock = true; 1933 - memcg = pc->mem_cgroup; 1934 - if (!memcg || !PageCgroupUsed(pc)) 1935 - goto out; 1936 1918 } 1937 1919 1938 1920 switch (idx) { ··· 1953 1931 1954 1932 out: 1955 1933 if (unlikely(need_unlock)) 1956 - move_unlock_page_cgroup(pc, &flags); 1934 + move_unlock_mem_cgroup(memcg, &flags); 1957 1935 rcu_read_unlock(); 1958 1936 } 1959 1937 ··· 2522 2500 2523 2501 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2524 2502 2525 - #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2526 - (1 << PCG_MIGRATION)) 2503 + #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION)) 2527 2504 /* 2528 2505 * Because tail pages are not marked as "used", set it. We're under 2529 2506 * zone->lru_lock, 'splitting on pmd' and compound_lock. ··· 2593 2572 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 2594 2573 goto unlock; 2595 2574 2596 - move_lock_page_cgroup(pc, &flags); 2575 + move_lock_mem_cgroup(from, &flags); 2597 2576 2598 2577 if (PageCgroupFileMapped(pc)) { 2599 2578 /* Update mapped_file data for mem_cgroup */ ··· 2617 2596 * guaranteed that "to" is never removed. So, we don't check rmdir 2618 2597 * status here. 2619 2598 */ 2620 - move_unlock_page_cgroup(pc, &flags); 2599 + move_unlock_mem_cgroup(from, &flags); 2621 2600 ret = 0; 2622 2601 unlock: 2623 2602 unlock_page_cgroup(pc); ··· 4992 4971 atomic_set(&memcg->refcnt, 1); 4993 4972 memcg->move_charge_at_immigrate = 0; 4994 4973 mutex_init(&memcg->thresholds_lock); 4974 + spin_lock_init(&memcg->move_lock); 4995 4975 return &memcg->css; 4996 4976 free_out: 4997 4977 __mem_cgroup_free(memcg);