Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

memcg: simplify LRU handling by new rule

Now, at LRU handling, the memory cgroup needs to do complicated work to
find a valid pc->mem_cgroup, which may be overwritten.

This patch is for relaxing the protocol. This patch guarantees
- when pc->mem_cgroup is overwritten, page must not be on LRU.

By this, the LRU routine can trust pc->mem_cgroup and doesn't need to check
bits on pc->flags. This new rule may add a small overhead to swapin. But
in most cases, LRU handling gets faster.

After this patch, PCG_ACCT_LRU bit is obsolete and removed.

[akpm@linux-foundation.org: remove unneeded VM_BUG_ON(), restore hannes's christmas tree]
[akpm@linux-foundation.org: clean up code comment]
[hughd@google.com: fix NULL mem_cgroup_try_charge]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Miklos Szeredi <mszeredi@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

KAMEZAWA Hiroyuki and committed by
Linus Torvalds
38c5d72f 4e5f01c2

+54 -77
-8
include/linux/page_cgroup.h
··· 10 10 /* flags for mem_cgroup and file and I/O status */ 11 11 PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */ 12 12 PCG_FILE_MAPPED, /* page is accounted as "mapped" */ 13 - /* No lock in page_cgroup */ 14 - PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ 15 13 __NR_PCG_FLAGS, 16 14 }; 17 15 ··· 72 74 TESTPCGFLAG(Used, USED) 73 75 CLEARPCGFLAG(Used, USED) 74 76 SETPCGFLAG(Used, USED) 75 - 76 - SETPCGFLAG(AcctLRU, ACCT_LRU) 77 - CLEARPCGFLAG(AcctLRU, ACCT_LRU) 78 - TESTPCGFLAG(AcctLRU, ACCT_LRU) 79 - TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU) 80 - 81 77 82 78 SETPCGFLAG(FileMapped, FILE_MAPPED) 83 79 CLEARPCGFLAG(FileMapped, FILE_MAPPED)
+54 -69
mm/memcontrol.c
··· 1040 1040 return &zone->lruvec; 1041 1041 1042 1042 pc = lookup_page_cgroup(page); 1043 - VM_BUG_ON(PageCgroupAcctLRU(pc)); 1044 - /* 1045 - * putback: charge: 1046 - * SetPageLRU SetPageCgroupUsed 1047 - * smp_mb smp_mb 1048 - * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU 1049 - * 1050 - * Ensure that one of the two sides adds the page to the memcg 1051 - * LRU during a race. 1052 - */ 1053 - smp_mb(); 1054 - /* 1055 - * If the page is uncharged, it may be freed soon, but it 1056 - * could also be swap cache (readahead, swapoff) that needs to 1057 - * be reclaimable in the future. root_mem_cgroup will babysit 1058 - * it for the time being. 1059 - */ 1060 - if (PageCgroupUsed(pc)) { 1061 - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1062 - smp_rmb(); 1063 - memcg = pc->mem_cgroup; 1064 - SetPageCgroupAcctLRU(pc); 1065 - } else 1066 - memcg = root_mem_cgroup; 1043 + memcg = pc->mem_cgroup; 1067 1044 mz = page_cgroup_zoneinfo(memcg, page); 1068 1045 /* compound_order() is stabilized through lru_lock */ 1069 1046 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); ··· 1067 1090 return; 1068 1091 1069 1092 pc = lookup_page_cgroup(page); 1070 - /* 1071 - * root_mem_cgroup babysits uncharged LRU pages, but 1072 - * PageCgroupUsed is cleared when the page is about to get 1073 - * freed. PageCgroupAcctLRU remembers whether the 1074 - * LRU-accounting happened against pc->mem_cgroup or 1075 - * root_mem_cgroup. 1076 - */ 1077 - if (TestClearPageCgroupAcctLRU(pc)) { 1078 - VM_BUG_ON(!pc->mem_cgroup); 1079 - memcg = pc->mem_cgroup; 1080 - } else 1081 - memcg = root_mem_cgroup; 1093 + memcg = pc->mem_cgroup; 1094 + VM_BUG_ON(!memcg); 1082 1095 mz = page_cgroup_zoneinfo(memcg, page); 1083 1096 /* huge page split is done under lru_lock. so, we have no races. */ 1084 1097 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); ··· 2184 2217 } 2185 2218 2186 2219 /* 2187 - * Unlike exported interface, "oom" parameter is added. 
if oom==true, 2188 - * oom-killer can be invoked. 2220 + * __mem_cgroup_try_charge() does 2221 + * 1. detect memcg to be charged against from passed *mm and *ptr, 2222 + * 2. update res_counter 2223 + * 3. call memory reclaim if necessary. 2224 + * 2225 + * In some special case, if the task is fatal, fatal_signal_pending() or 2226 + * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup 2227 + * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon 2228 + * as possible without any hazards. 2: all pages should have a valid 2229 + * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg 2230 + * pointer, that is treated as a charge to root_mem_cgroup. 2231 + * 2232 + * So __mem_cgroup_try_charge() will return 2233 + * 0 ... on success, filling *ptr with a valid memcg pointer. 2234 + * -ENOMEM ... charge failure because of resource limits. 2235 + * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. 2236 + * 2237 + * Unlike the exported interface, an "oom" parameter is added. if oom==true, 2238 + * the oom-killer can be invoked. 2189 2239 */ 2190 2240 static int __mem_cgroup_try_charge(struct mm_struct *mm, 2191 2241 gfp_t gfp_mask, ··· 2231 2247 * set, if so charge the init_mm (happens for pagecache usage). 2232 2248 */ 2233 2249 if (!*ptr && !mm) 2234 - goto bypass; 2250 + *ptr = root_mem_cgroup; 2235 2251 again: 2236 2252 if (*ptr) { /* css should be a valid one */ 2237 2253 memcg = *ptr; ··· 2257 2273 * task-struct. So, mm->owner can be NULL. 
2258 2274 */ 2259 2275 memcg = mem_cgroup_from_task(p); 2260 - if (!memcg || mem_cgroup_is_root(memcg)) { 2276 + if (!memcg) 2277 + memcg = root_mem_cgroup; 2278 + if (mem_cgroup_is_root(memcg)) { 2261 2279 rcu_read_unlock(); 2262 2280 goto done; 2263 2281 } ··· 2334 2348 *ptr = NULL; 2335 2349 return -ENOMEM; 2336 2350 bypass: 2337 - *ptr = NULL; 2338 - return 0; 2351 + *ptr = root_mem_cgroup; 2352 + return -EINTR; 2339 2353 } 2340 2354 2341 2355 /* ··· 2443 2457 2444 2458 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); 2445 2459 unlock_page_cgroup(pc); 2460 + WARN_ON_ONCE(PageLRU(page)); 2446 2461 /* 2447 2462 * "charge_statistics" updated event counter. Then, check it. 2448 2463 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. ··· 2455 2468 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2456 2469 2457 2470 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2458 - (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) 2471 + (1 << PCG_MIGRATION)) 2459 2472 /* 2460 2473 * Because tail pages are not marked as "used", set it. We're under 2461 2474 * zone->lru_lock, 'splitting on pmd' and compound_lock. ··· 2465 2478 void mem_cgroup_split_huge_fixup(struct page *head) 2466 2479 { 2467 2480 struct page_cgroup *head_pc = lookup_page_cgroup(head); 2481 + struct mem_cgroup_per_zone *mz; 2468 2482 struct page_cgroup *pc; 2483 + enum lru_list lru; 2469 2484 int i; 2470 2485 2471 2486 if (mem_cgroup_disabled()) ··· 2476 2487 pc = head_pc + i; 2477 2488 pc->mem_cgroup = head_pc->mem_cgroup; 2478 2489 smp_wmb();/* see __commit_charge() */ 2479 - /* 2480 - * LRU flags cannot be copied because we need to add tail 2481 - * page to LRU by generic call and our hooks will be called. 2482 - */ 2483 2490 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 2484 2491 } 2485 - 2486 - if (PageCgroupAcctLRU(head_pc)) { 2487 - enum lru_list lru; 2488 - struct mem_cgroup_per_zone *mz; 2489 - /* 2490 - * We hold lru_lock, then, reduce counter directly. 
2491 - */ 2492 - lru = page_lru(head); 2493 - mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); 2494 - MEM_CGROUP_ZSTAT(mz, lru) -= HPAGE_PMD_NR - 1; 2495 - } 2492 + /* 2493 + * Tail pages will be added to LRU. 2494 + * We hold lru_lock,then,reduce counter directly. 2495 + */ 2496 + lru = page_lru(head); 2497 + mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); 2498 + MEM_CGROUP_ZSTAT(mz, lru) -= HPAGE_PMD_NR - 1; 2496 2499 } 2497 2500 #endif 2498 2501 ··· 2601 2620 2602 2621 parent = mem_cgroup_from_cont(pcg); 2603 2622 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2604 - if (ret || !parent) 2623 + if (ret) 2605 2624 goto put_back; 2606 2625 2607 2626 if (nr_pages > 1) ··· 2648 2667 2649 2668 pc = lookup_page_cgroup(page); 2650 2669 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 2651 - if (ret || !memcg) 2670 + if (ret == -ENOMEM) 2652 2671 return ret; 2653 - 2654 2672 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); 2655 2673 return 0; 2656 2674 } ··· 2716 2736 if (!page_is_file_cache(page)) 2717 2737 type = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2718 2738 2719 - if (!PageSwapCache(page)) { 2739 + if (!PageSwapCache(page)) 2720 2740 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); 2721 - WARN_ON_ONCE(PageLRU(page)); 2722 - } else { /* page is swapcache/shmem */ 2741 + else { /* page is swapcache/shmem */ 2723 2742 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); 2724 2743 if (!ret) 2725 2744 __mem_cgroup_commit_charge_swapin(page, memcg, type); ··· 2760 2781 *memcgp = memcg; 2761 2782 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); 2762 2783 css_put(&memcg->css); 2784 + if (ret == -EINTR) 2785 + ret = 0; 2763 2786 return ret; 2764 2787 charge_cur_mm: 2765 2788 if (unlikely(!mm)) 2766 2789 mm = &init_mm; 2767 - return __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 2790 + ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 2791 + if (ret == -EINTR) 2792 + ret = 
0; 2793 + return ret; 2768 2794 } 2769 2795 2770 2796 static void ··· 3229 3245 *memcgp = memcg; 3230 3246 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); 3231 3247 css_put(&memcg->css);/* drop extra refcnt */ 3232 - if (ret || *memcgp == NULL) { 3248 + if (ret) { 3233 3249 if (PageAnon(page)) { 3234 3250 lock_page_cgroup(pc); 3235 3251 ClearPageCgroupMigration(pc); ··· 3239 3255 */ 3240 3256 mem_cgroup_uncharge_page(page); 3241 3257 } 3258 + /* we'll need to revisit this error code (we have -EINTR) */ 3242 3259 return -ENOMEM; 3243 3260 } 3244 3261 /* ··· 3659 3674 pc = lookup_page_cgroup(page); 3660 3675 3661 3676 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); 3662 - if (ret == -ENOMEM) 3677 + if (ret == -ENOMEM || ret == -EINTR) 3663 3678 break; 3664 3679 3665 3680 if (ret == -EBUSY || ret == -EINVAL) { ··· 5050 5065 } 5051 5066 ret = __mem_cgroup_try_charge(NULL, 5052 5067 GFP_KERNEL, 1, &memcg, false); 5053 - if (ret || !memcg) 5068 + if (ret) 5054 5069 /* mem_cgroup_clear_mc() will do uncharge later */ 5055 - return -ENOMEM; 5070 + return ret; 5056 5071 mc.precharge++; 5057 5072 } 5058 5073 return ret;