Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: embed the memcg pointer directly into struct page

Memory cgroups used to have 5 per-page pointers. To allow users to
disable that amount of overhead during runtime, those pointers were
allocated in a separate array, with a translation layer between them and
struct page.

There is now only one page pointer remaining: the memcg pointer, that
indicates which cgroup the page is associated with when charged. The
complexity of runtime allocation and the runtime translation overhead is
no longer justified to save that *potential* 0.19% of memory. With
CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
after the page->private member and doesn't even increase struct page,
and then this patch actually saves space. Remaining users that care can
still compile their kernels without CONFIG_MEMCG.

text data bss dec hex filename
8828345 1725264 983040 11536649 b00909 vmlinux.old
8827425 1725264 966656 11519345 afc571 vmlinux.new

[mhocko@suse.cz: update Documentation/cgroups/memory.txt]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Johannes Weiner and committed by Linus Torvalds.
1306a85a 22811c6b

+46 -487
+5
Documentation/cgroups/memory.txt
··· 1 1 Memory Resource Controller 2 2 3 + NOTE: This document is hopelessly outdated and it asks for a complete 4 + rewrite. It still contains a useful information so we are keeping it 5 + here but make sure to check the current code if you need a deeper 6 + understanding. 7 + 3 8 NOTE: The Memory Resource Controller has generically been referred to as the 4 9 memory controller in this document. Do not confuse memory controller 5 10 used here with the memory controller that is used in hardware.
+1 -5
include/linux/memcontrol.h
··· 25 25 #include <linux/jump_label.h> 26 26 27 27 struct mem_cgroup; 28 - struct page_cgroup; 29 28 struct page; 30 29 struct mm_struct; 31 30 struct kmem_cache; ··· 465 466 * memcg_kmem_uncharge_pages: uncharge pages from memcg 466 467 * @page: pointer to struct page being freed 467 468 * @order: allocation order. 468 - * 469 - * there is no need to specify memcg here, since it is embedded in page_cgroup 470 469 */ 471 470 static inline void 472 471 memcg_kmem_uncharge_pages(struct page *page, int order) ··· 481 484 * 482 485 * Needs to be called after memcg_kmem_newpage_charge, regardless of success or 483 486 * failure of the allocation. if @page is NULL, this function will revert the 484 - * charges. Otherwise, it will commit the memcg given by @memcg to the 485 - * corresponding page_cgroup. 487 + * charges. Otherwise, it will commit @page to @memcg. 486 488 */ 487 489 static inline void 488 490 memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
+5
include/linux/mm_types.h
··· 22 22 #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) 23 23 24 24 struct address_space; 25 + struct mem_cgroup; 25 26 26 27 #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) 27 28 #define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ ··· 167 166 struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ 168 167 struct page *first_page; /* Compound tail pages */ 169 168 }; 169 + 170 + #ifdef CONFIG_MEMCG 171 + struct mem_cgroup *mem_cgroup; 172 + #endif 170 173 171 174 /* 172 175 * On machines where all RAM is mapped into kernel address space,
-12
include/linux/mmzone.h
··· 722 722 int nr_zones; 723 723 #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ 724 724 struct page *node_mem_map; 725 - #ifdef CONFIG_MEMCG 726 - struct page_cgroup *node_page_cgroup; 727 - #endif 728 725 #endif 729 726 #ifndef CONFIG_NO_BOOTMEM 730 727 struct bootmem_data *bdata; ··· 1075 1078 #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) 1076 1079 1077 1080 struct page; 1078 - struct page_cgroup; 1079 1081 struct mem_section { 1080 1082 /* 1081 1083 * This is, logically, a pointer to an array of struct ··· 1092 1096 1093 1097 /* See declaration of similar field in struct zone */ 1094 1098 unsigned long *pageblock_flags; 1095 - #ifdef CONFIG_MEMCG 1096 - /* 1097 - * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use 1098 - * section. (see memcontrol.h/page_cgroup.h about this.) 1099 - */ 1100 - struct page_cgroup *page_cgroup; 1101 - unsigned long pad; 1102 - #endif 1103 1099 /* 1104 1100 * WARNING: mem_section must be a power-of-2 in size for the 1105 1101 * calculation and use of SECTION_ROOT_MASK to make sense.
-53
include/linux/page_cgroup.h
··· 1 1 #ifndef __LINUX_PAGE_CGROUP_H 2 2 #define __LINUX_PAGE_CGROUP_H 3 3 4 - struct pglist_data; 5 - 6 - #ifdef CONFIG_MEMCG 7 - struct mem_cgroup; 8 - 9 - /* 10 - * Page Cgroup can be considered as an extended mem_map. 11 - * A page_cgroup page is associated with every page descriptor. The 12 - * page_cgroup helps us identify information about the cgroup 13 - * All page cgroups are allocated at boot or memory hotplug event, 14 - * then the page cgroup for pfn always exists. 15 - */ 16 - struct page_cgroup { 17 - struct mem_cgroup *mem_cgroup; 18 - }; 19 - 20 - extern void pgdat_page_cgroup_init(struct pglist_data *pgdat); 21 - 22 - #ifdef CONFIG_SPARSEMEM 23 - static inline void page_cgroup_init_flatmem(void) 24 - { 25 - } 26 - extern void page_cgroup_init(void); 27 - #else 28 - extern void page_cgroup_init_flatmem(void); 29 - static inline void page_cgroup_init(void) 30 - { 31 - } 32 - #endif 33 - 34 - struct page_cgroup *lookup_page_cgroup(struct page *page); 35 - 36 - #else /* !CONFIG_MEMCG */ 37 - struct page_cgroup; 38 - 39 - static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat) 40 - { 41 - } 42 - 43 - static inline struct page_cgroup *lookup_page_cgroup(struct page *page) 44 - { 45 - return NULL; 46 - } 47 - 48 - static inline void page_cgroup_init(void) 49 - { 50 - } 51 - 52 - static inline void page_cgroup_init_flatmem(void) 53 - { 54 - } 55 - #endif /* CONFIG_MEMCG */ 56 - 57 4 #include <linux/swap.h> 58 5 59 6 #ifdef CONFIG_MEMCG_SWAP
-7
init/main.c
··· 51 51 #include <linux/mempolicy.h> 52 52 #include <linux/key.h> 53 53 #include <linux/buffer_head.h> 54 - #include <linux/page_cgroup.h> 55 54 #include <linux/debug_locks.h> 56 55 #include <linux/debugobjects.h> 57 56 #include <linux/lockdep.h> ··· 484 485 */ 485 486 static void __init mm_init(void) 486 487 { 487 - /* 488 - * page_cgroup requires contiguous pages, 489 - * bigger than MAX_ORDER unless SPARSEMEM. 490 - */ 491 - page_cgroup_init_flatmem(); 492 488 mem_init(); 493 489 kmem_cache_init(); 494 490 percpu_init_late(); ··· 621 627 initrd_start = 0; 622 628 } 623 629 #endif 624 - page_cgroup_init(); 625 630 debug_objects_mem_init(); 626 631 kmemleak_init(); 627 632 setup_per_cpu_pageset();
+35 -89
mm/memcontrol.c
··· 1274 1274 { 1275 1275 struct mem_cgroup_per_zone *mz; 1276 1276 struct mem_cgroup *memcg; 1277 - struct page_cgroup *pc; 1278 1277 struct lruvec *lruvec; 1279 1278 1280 1279 if (mem_cgroup_disabled()) { ··· 1281 1282 goto out; 1282 1283 } 1283 1284 1284 - pc = lookup_page_cgroup(page); 1285 - memcg = pc->mem_cgroup; 1285 + memcg = page->mem_cgroup; 1286 1286 /* 1287 1287 * Swapcache readahead pages are added to the LRU - and 1288 1288 * possibly migrated - before they are charged. ··· 2018 2020 unsigned long *flags) 2019 2021 { 2020 2022 struct mem_cgroup *memcg; 2021 - struct page_cgroup *pc; 2022 2023 2023 2024 rcu_read_lock(); 2024 2025 2025 2026 if (mem_cgroup_disabled()) 2026 2027 return NULL; 2027 - 2028 - pc = lookup_page_cgroup(page); 2029 2028 again: 2030 - memcg = pc->mem_cgroup; 2029 + memcg = page->mem_cgroup; 2031 2030 if (unlikely(!memcg)) 2032 2031 return NULL; 2033 2032 ··· 2033 2038 return memcg; 2034 2039 2035 2040 spin_lock_irqsave(&memcg->move_lock, *flags); 2036 - if (memcg != pc->mem_cgroup) { 2041 + if (memcg != page->mem_cgroup) { 2037 2042 spin_unlock_irqrestore(&memcg->move_lock, *flags); 2038 2043 goto again; 2039 2044 } ··· 2400 2405 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2401 2406 { 2402 2407 struct mem_cgroup *memcg; 2403 - struct page_cgroup *pc; 2404 2408 unsigned short id; 2405 2409 swp_entry_t ent; 2406 2410 2407 2411 VM_BUG_ON_PAGE(!PageLocked(page), page); 2408 2412 2409 - pc = lookup_page_cgroup(page); 2410 - memcg = pc->mem_cgroup; 2411 - 2413 + memcg = page->mem_cgroup; 2412 2414 if (memcg) { 2413 2415 if (!css_tryget_online(&memcg->css)) 2414 2416 memcg = NULL; ··· 2455 2463 static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2456 2464 bool lrucare) 2457 2465 { 2458 - struct page_cgroup *pc = lookup_page_cgroup(page); 2459 2466 int isolated; 2460 2467 2461 - VM_BUG_ON_PAGE(pc->mem_cgroup, page); 2468 + VM_BUG_ON_PAGE(page->mem_cgroup, page); 2462 2469 2463 2470 /* 2464 2471 * 
In some cases, SwapCache and FUSE(splice_buf->radixtree), the page ··· 2468 2477 2469 2478 /* 2470 2479 * Nobody should be changing or seriously looking at 2471 - * pc->mem_cgroup at this point: 2480 + * page->mem_cgroup at this point: 2472 2481 * 2473 2482 * - the page is uncharged 2474 2483 * ··· 2480 2489 * - a page cache insertion, a swapin fault, or a migration 2481 2490 * have the page locked 2482 2491 */ 2483 - pc->mem_cgroup = memcg; 2492 + page->mem_cgroup = memcg; 2484 2493 2485 2494 if (lrucare) 2486 2495 unlock_page_lru(page, isolated); ··· 2963 2972 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 2964 2973 int order) 2965 2974 { 2966 - struct page_cgroup *pc; 2967 - 2968 2975 VM_BUG_ON(mem_cgroup_is_root(memcg)); 2969 2976 2970 2977 /* The page allocation failed. Revert */ ··· 2970 2981 memcg_uncharge_kmem(memcg, 1 << order); 2971 2982 return; 2972 2983 } 2973 - pc = lookup_page_cgroup(page); 2974 - pc->mem_cgroup = memcg; 2984 + page->mem_cgroup = memcg; 2975 2985 } 2976 2986 2977 2987 void __memcg_kmem_uncharge_pages(struct page *page, int order) 2978 2988 { 2979 - struct page_cgroup *pc = lookup_page_cgroup(page); 2980 - struct mem_cgroup *memcg = pc->mem_cgroup; 2989 + struct mem_cgroup *memcg = page->mem_cgroup; 2981 2990 2982 2991 if (!memcg) 2983 2992 return; ··· 2983 2996 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 2984 2997 2985 2998 memcg_uncharge_kmem(memcg, 1 << order); 2986 - pc->mem_cgroup = NULL; 2999 + page->mem_cgroup = NULL; 2987 3000 } 2988 3001 #else 2989 3002 static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) ··· 3001 3014 */ 3002 3015 void mem_cgroup_split_huge_fixup(struct page *head) 3003 3016 { 3004 - struct page_cgroup *pc = lookup_page_cgroup(head); 3005 3017 int i; 3006 3018 3007 3019 if (mem_cgroup_disabled()) 3008 3020 return; 3009 3021 3010 3022 for (i = 1; i < HPAGE_PMD_NR; i++) 3011 - pc[i].mem_cgroup = pc[0].mem_cgroup; 3023 + head[i].mem_cgroup = 
head->mem_cgroup; 3012 3024 3013 - __this_cpu_sub(pc[0].mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3025 + __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3014 3026 HPAGE_PMD_NR); 3015 3027 } 3016 3028 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ ··· 3018 3032 * mem_cgroup_move_account - move account of the page 3019 3033 * @page: the page 3020 3034 * @nr_pages: number of regular pages (>1 for huge pages) 3021 - * @pc: page_cgroup of the page. 3022 3035 * @from: mem_cgroup which the page is moved from. 3023 3036 * @to: mem_cgroup which the page is moved to. @from != @to. 3024 3037 * ··· 3030 3045 */ 3031 3046 static int mem_cgroup_move_account(struct page *page, 3032 3047 unsigned int nr_pages, 3033 - struct page_cgroup *pc, 3034 3048 struct mem_cgroup *from, 3035 3049 struct mem_cgroup *to) 3036 3050 { ··· 3049 3065 goto out; 3050 3066 3051 3067 /* 3052 - * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup 3068 + * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup 3053 3069 * of its source page while we change it: page migration takes 3054 3070 * both pages off the LRU, but page cache replacement doesn't. 3055 3071 */ ··· 3057 3073 goto out; 3058 3074 3059 3075 ret = -EINVAL; 3060 - if (pc->mem_cgroup != from) 3076 + if (page->mem_cgroup != from) 3061 3077 goto out_unlock; 3062 3078 3063 3079 spin_lock_irqsave(&from->move_lock, flags); ··· 3077 3093 } 3078 3094 3079 3095 /* 3080 - * It is safe to change pc->mem_cgroup here because the page 3096 + * It is safe to change page->mem_cgroup here because the page 3081 3097 * is referenced, charged, and isolated - we can't race with 3082 3098 * uncharging, charging, migration, or LRU putback. 
3083 3099 */ 3084 3100 3085 3101 /* caller should have done css_get */ 3086 - pc->mem_cgroup = to; 3102 + page->mem_cgroup = to; 3087 3103 spin_unlock_irqrestore(&from->move_lock, flags); 3088 3104 3089 3105 ret = 0; ··· 3158 3174 #endif 3159 3175 3160 3176 #ifdef CONFIG_DEBUG_VM 3161 - static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3162 - { 3163 - struct page_cgroup *pc; 3164 - 3165 - pc = lookup_page_cgroup(page); 3166 - /* 3167 - * Can be NULL while feeding pages into the page allocator for 3168 - * the first time, i.e. during boot or memory hotplug; 3169 - * or when mem_cgroup_disabled(). 3170 - */ 3171 - if (likely(pc) && pc->mem_cgroup) 3172 - return pc; 3173 - return NULL; 3174 - } 3175 - 3176 3177 bool mem_cgroup_bad_page_check(struct page *page) 3177 3178 { 3178 3179 if (mem_cgroup_disabled()) 3179 3180 return false; 3180 3181 3181 - return lookup_page_cgroup_used(page) != NULL; 3182 + return page->mem_cgroup != NULL; 3182 3183 } 3183 3184 3184 3185 void mem_cgroup_print_bad_page(struct page *page) 3185 3186 { 3186 - struct page_cgroup *pc; 3187 - 3188 - pc = lookup_page_cgroup_used(page); 3189 - if (pc) 3190 - pr_alert("pc:%p pc->mem_cgroup:%p\n", pc, pc->mem_cgroup); 3187 + pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); 3191 3188 } 3192 3189 #endif 3193 3190 ··· 5088 5123 unsigned long addr, pte_t ptent, union mc_target *target) 5089 5124 { 5090 5125 struct page *page = NULL; 5091 - struct page_cgroup *pc; 5092 5126 enum mc_target_type ret = MC_TARGET_NONE; 5093 5127 swp_entry_t ent = { .val = 0 }; 5094 5128 ··· 5101 5137 if (!page && !ent.val) 5102 5138 return ret; 5103 5139 if (page) { 5104 - pc = lookup_page_cgroup(page); 5105 5140 /* 5106 5141 * Do only loose check w/o serialization. 5107 - * mem_cgroup_move_account() checks the pc is valid or 5142 + * mem_cgroup_move_account() checks the page is valid or 5108 5143 * not under LRU exclusion. 
5109 5144 */ 5110 - if (pc->mem_cgroup == mc.from) { 5145 + if (page->mem_cgroup == mc.from) { 5111 5146 ret = MC_TARGET_PAGE; 5112 5147 if (target) 5113 5148 target->page = page; ··· 5134 5171 unsigned long addr, pmd_t pmd, union mc_target *target) 5135 5172 { 5136 5173 struct page *page = NULL; 5137 - struct page_cgroup *pc; 5138 5174 enum mc_target_type ret = MC_TARGET_NONE; 5139 5175 5140 5176 page = pmd_page(pmd); 5141 5177 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5142 5178 if (!move_anon()) 5143 5179 return ret; 5144 - pc = lookup_page_cgroup(page); 5145 - if (pc->mem_cgroup == mc.from) { 5180 + if (page->mem_cgroup == mc.from) { 5146 5181 ret = MC_TARGET_PAGE; 5147 5182 if (target) { 5148 5183 get_page(page); ··· 5339 5378 enum mc_target_type target_type; 5340 5379 union mc_target target; 5341 5380 struct page *page; 5342 - struct page_cgroup *pc; 5343 5381 5344 5382 /* 5345 5383 * We don't take compound_lock() here but no race with splitting thp ··· 5359 5399 if (target_type == MC_TARGET_PAGE) { 5360 5400 page = target.page; 5361 5401 if (!isolate_lru_page(page)) { 5362 - pc = lookup_page_cgroup(page); 5363 5402 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 5364 - pc, mc.from, mc.to)) { 5403 + mc.from, mc.to)) { 5365 5404 mc.precharge -= HPAGE_PMD_NR; 5366 5405 mc.moved_charge += HPAGE_PMD_NR; 5367 5406 } ··· 5388 5429 page = target.page; 5389 5430 if (isolate_lru_page(page)) 5390 5431 goto put; 5391 - pc = lookup_page_cgroup(page); 5392 - if (!mem_cgroup_move_account(page, 1, pc, 5393 - mc.from, mc.to)) { 5432 + if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { 5394 5433 mc.precharge--; 5395 5434 /* we uncharge from mc.from later. 
*/ 5396 5435 mc.moved_charge++; ··· 5576 5619 void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 5577 5620 { 5578 5621 struct mem_cgroup *memcg; 5579 - struct page_cgroup *pc; 5580 5622 unsigned short oldid; 5581 5623 5582 5624 VM_BUG_ON_PAGE(PageLRU(page), page); ··· 5584 5628 if (!do_swap_account) 5585 5629 return; 5586 5630 5587 - pc = lookup_page_cgroup(page); 5588 - memcg = pc->mem_cgroup; 5631 + memcg = page->mem_cgroup; 5589 5632 5590 5633 /* Readahead page, never charged */ 5591 5634 if (!memcg) ··· 5594 5639 VM_BUG_ON_PAGE(oldid, page); 5595 5640 mem_cgroup_swap_statistics(memcg, true); 5596 5641 5597 - pc->mem_cgroup = NULL; 5642 + page->mem_cgroup = NULL; 5598 5643 5599 5644 if (!mem_cgroup_is_root(memcg)) 5600 5645 page_counter_uncharge(&memcg->memory, 1); ··· 5661 5706 goto out; 5662 5707 5663 5708 if (PageSwapCache(page)) { 5664 - struct page_cgroup *pc = lookup_page_cgroup(page); 5665 5709 /* 5666 5710 * Every swap fault against a single page tries to charge the 5667 5711 * page, bail as early as possible. shmem_unuse() encounters ··· 5668 5714 * the page lock, which serializes swap cache removal, which 5669 5715 * in turn serializes uncharging. 5670 5716 */ 5671 - if (pc->mem_cgroup) 5717 + if (page->mem_cgroup) 5672 5718 goto out; 5673 5719 } 5674 5720 ··· 5821 5867 next = page_list->next; 5822 5868 do { 5823 5869 unsigned int nr_pages = 1; 5824 - struct page_cgroup *pc; 5825 5870 5826 5871 page = list_entry(next, struct page, lru); 5827 5872 next = page->lru.next; ··· 5828 5875 VM_BUG_ON_PAGE(PageLRU(page), page); 5829 5876 VM_BUG_ON_PAGE(page_count(page), page); 5830 5877 5831 - pc = lookup_page_cgroup(page); 5832 - if (!pc->mem_cgroup) 5878 + if (!page->mem_cgroup) 5833 5879 continue; 5834 5880 5835 5881 /* 5836 5882 * Nobody should be changing or seriously looking at 5837 - * pc->mem_cgroup at this point, we have fully 5883 + * page->mem_cgroup at this point, we have fully 5838 5884 * exclusive access to the page. 
5839 5885 */ 5840 5886 5841 - if (memcg != pc->mem_cgroup) { 5887 + if (memcg != page->mem_cgroup) { 5842 5888 if (memcg) { 5843 5889 uncharge_batch(memcg, pgpgout, nr_anon, nr_file, 5844 5890 nr_huge, page); 5845 5891 pgpgout = nr_anon = nr_file = nr_huge = 0; 5846 5892 } 5847 - memcg = pc->mem_cgroup; 5893 + memcg = page->mem_cgroup; 5848 5894 } 5849 5895 5850 5896 if (PageTransHuge(page)) { ··· 5857 5905 else 5858 5906 nr_file += nr_pages; 5859 5907 5860 - pc->mem_cgroup = NULL; 5908 + page->mem_cgroup = NULL; 5861 5909 5862 5910 pgpgout++; 5863 5911 } while (next != page_list); ··· 5876 5924 */ 5877 5925 void mem_cgroup_uncharge(struct page *page) 5878 5926 { 5879 - struct page_cgroup *pc; 5880 - 5881 5927 if (mem_cgroup_disabled()) 5882 5928 return; 5883 5929 5884 5930 /* Don't touch page->lru of any random page, pre-check: */ 5885 - pc = lookup_page_cgroup(page); 5886 - if (!pc->mem_cgroup) 5931 + if (!page->mem_cgroup) 5887 5932 return; 5888 5933 5889 5934 INIT_LIST_HEAD(&page->lru); ··· 5917 5968 bool lrucare) 5918 5969 { 5919 5970 struct mem_cgroup *memcg; 5920 - struct page_cgroup *pc; 5921 5971 int isolated; 5922 5972 5923 5973 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); ··· 5931 5983 return; 5932 5984 5933 5985 /* Page cache replacement: new page already charged? */ 5934 - pc = lookup_page_cgroup(newpage); 5935 - if (pc->mem_cgroup) 5986 + if (newpage->mem_cgroup) 5936 5987 return; 5937 5988 5938 5989 /* ··· 5940 5993 * uncharged page when the PFN walker finds a page that 5941 5994 * reclaim just put back on the LRU but has not released yet. 5942 5995 */ 5943 - pc = lookup_page_cgroup(oldpage); 5944 - memcg = pc->mem_cgroup; 5996 + memcg = oldpage->mem_cgroup; 5945 5997 if (!memcg) 5946 5998 return; 5947 5999 5948 6000 if (lrucare) 5949 6001 lock_page_lru(oldpage, &isolated); 5950 6002 5951 - pc->mem_cgroup = NULL; 6003 + oldpage->mem_cgroup = NULL; 5952 6004 5953 6005 if (lrucare) 5954 6006 unlock_page_lru(oldpage, isolated);
-2
mm/page_alloc.c
··· 48 48 #include <linux/backing-dev.h> 49 49 #include <linux/fault-inject.h> 50 50 #include <linux/page-isolation.h> 51 - #include <linux/page_cgroup.h> 52 51 #include <linux/debugobjects.h> 53 52 #include <linux/kmemleak.h> 54 53 #include <linux/compaction.h> ··· 4852 4853 #endif 4853 4854 init_waitqueue_head(&pgdat->kswapd_wait); 4854 4855 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4855 - pgdat_page_cgroup_init(pgdat); 4856 4856 4857 4857 for (j = 0; j < MAX_NR_ZONES; j++) { 4858 4858 struct zone *zone = pgdat->node_zones + j;
-319
mm/page_cgroup.c
··· 1 1 #include <linux/mm.h> 2 - #include <linux/mmzone.h> 3 - #include <linux/bootmem.h> 4 - #include <linux/bit_spinlock.h> 5 2 #include <linux/page_cgroup.h> 6 - #include <linux/hash.h> 7 - #include <linux/slab.h> 8 - #include <linux/memory.h> 9 3 #include <linux/vmalloc.h> 10 - #include <linux/cgroup.h> 11 4 #include <linux/swapops.h> 12 - #include <linux/kmemleak.h> 13 - 14 - static unsigned long total_usage; 15 - 16 - #if !defined(CONFIG_SPARSEMEM) 17 - 18 - 19 - void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) 20 - { 21 - pgdat->node_page_cgroup = NULL; 22 - } 23 - 24 - struct page_cgroup *lookup_page_cgroup(struct page *page) 25 - { 26 - unsigned long pfn = page_to_pfn(page); 27 - unsigned long offset; 28 - struct page_cgroup *base; 29 - 30 - base = NODE_DATA(page_to_nid(page))->node_page_cgroup; 31 - #ifdef CONFIG_DEBUG_VM 32 - /* 33 - * The sanity checks the page allocator does upon freeing a 34 - * page can reach here before the page_cgroup arrays are 35 - * allocated when feeding a range of pages to the allocator 36 - * for the first time during bootup or memory hotplug. 
37 - */ 38 - if (unlikely(!base)) 39 - return NULL; 40 - #endif 41 - offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; 42 - return base + offset; 43 - } 44 - 45 - static int __init alloc_node_page_cgroup(int nid) 46 - { 47 - struct page_cgroup *base; 48 - unsigned long table_size; 49 - unsigned long nr_pages; 50 - 51 - nr_pages = NODE_DATA(nid)->node_spanned_pages; 52 - if (!nr_pages) 53 - return 0; 54 - 55 - table_size = sizeof(struct page_cgroup) * nr_pages; 56 - 57 - base = memblock_virt_alloc_try_nid_nopanic( 58 - table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 59 - BOOTMEM_ALLOC_ACCESSIBLE, nid); 60 - if (!base) 61 - return -ENOMEM; 62 - NODE_DATA(nid)->node_page_cgroup = base; 63 - total_usage += table_size; 64 - return 0; 65 - } 66 - 67 - void __init page_cgroup_init_flatmem(void) 68 - { 69 - 70 - int nid, fail; 71 - 72 - if (mem_cgroup_disabled()) 73 - return; 74 - 75 - for_each_online_node(nid) { 76 - fail = alloc_node_page_cgroup(nid); 77 - if (fail) 78 - goto fail; 79 - } 80 - printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); 81 - printk(KERN_INFO "please try 'cgroup_disable=memory' option if you" 82 - " don't want memory cgroups\n"); 83 - return; 84 - fail: 85 - printk(KERN_CRIT "allocation of page_cgroup failed.\n"); 86 - printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n"); 87 - panic("Out of memory"); 88 - } 89 - 90 - #else /* CONFIG_FLAT_NODE_MEM_MAP */ 91 - 92 - struct page_cgroup *lookup_page_cgroup(struct page *page) 93 - { 94 - unsigned long pfn = page_to_pfn(page); 95 - struct mem_section *section = __pfn_to_section(pfn); 96 - #ifdef CONFIG_DEBUG_VM 97 - /* 98 - * The sanity checks the page allocator does upon freeing a 99 - * page can reach here before the page_cgroup arrays are 100 - * allocated when feeding a range of pages to the allocator 101 - * for the first time during bootup or memory hotplug. 
102 - */ 103 - if (!section->page_cgroup) 104 - return NULL; 105 - #endif 106 - return section->page_cgroup + pfn; 107 - } 108 - 109 - static void *__meminit alloc_page_cgroup(size_t size, int nid) 110 - { 111 - gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; 112 - void *addr = NULL; 113 - 114 - addr = alloc_pages_exact_nid(nid, size, flags); 115 - if (addr) { 116 - kmemleak_alloc(addr, size, 1, flags); 117 - return addr; 118 - } 119 - 120 - if (node_state(nid, N_HIGH_MEMORY)) 121 - addr = vzalloc_node(size, nid); 122 - else 123 - addr = vzalloc(size); 124 - 125 - return addr; 126 - } 127 - 128 - static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) 129 - { 130 - struct mem_section *section; 131 - struct page_cgroup *base; 132 - unsigned long table_size; 133 - 134 - section = __pfn_to_section(pfn); 135 - 136 - if (section->page_cgroup) 137 - return 0; 138 - 139 - table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 140 - base = alloc_page_cgroup(table_size, nid); 141 - 142 - /* 143 - * The value stored in section->page_cgroup is (base - pfn) 144 - * and it does not point to the memory block allocated above, 145 - * causing kmemleak false positives. 146 - */ 147 - kmemleak_not_leak(base); 148 - 149 - if (!base) { 150 - printk(KERN_ERR "page cgroup allocation failure\n"); 151 - return -ENOMEM; 152 - } 153 - 154 - /* 155 - * The passed "pfn" may not be aligned to SECTION. For the calculation 156 - * we need to apply a mask. 
157 - */ 158 - pfn &= PAGE_SECTION_MASK; 159 - section->page_cgroup = base - pfn; 160 - total_usage += table_size; 161 - return 0; 162 - } 163 - #ifdef CONFIG_MEMORY_HOTPLUG 164 - static void free_page_cgroup(void *addr) 165 - { 166 - if (is_vmalloc_addr(addr)) { 167 - vfree(addr); 168 - } else { 169 - struct page *page = virt_to_page(addr); 170 - size_t table_size = 171 - sizeof(struct page_cgroup) * PAGES_PER_SECTION; 172 - 173 - BUG_ON(PageReserved(page)); 174 - kmemleak_free(addr); 175 - free_pages_exact(addr, table_size); 176 - } 177 - } 178 - 179 - static void __free_page_cgroup(unsigned long pfn) 180 - { 181 - struct mem_section *ms; 182 - struct page_cgroup *base; 183 - 184 - ms = __pfn_to_section(pfn); 185 - if (!ms || !ms->page_cgroup) 186 - return; 187 - base = ms->page_cgroup + pfn; 188 - free_page_cgroup(base); 189 - ms->page_cgroup = NULL; 190 - } 191 - 192 - static int __meminit online_page_cgroup(unsigned long start_pfn, 193 - unsigned long nr_pages, 194 - int nid) 195 - { 196 - unsigned long start, end, pfn; 197 - int fail = 0; 198 - 199 - start = SECTION_ALIGN_DOWN(start_pfn); 200 - end = SECTION_ALIGN_UP(start_pfn + nr_pages); 201 - 202 - if (nid == -1) { 203 - /* 204 - * In this case, "nid" already exists and contains valid memory. 205 - * "start_pfn" passed to us is a pfn which is an arg for 206 - * online__pages(), and start_pfn should exist. 
207 - */ 208 - nid = pfn_to_nid(start_pfn); 209 - VM_BUG_ON(!node_state(nid, N_ONLINE)); 210 - } 211 - 212 - for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { 213 - if (!pfn_present(pfn)) 214 - continue; 215 - fail = init_section_page_cgroup(pfn, nid); 216 - } 217 - if (!fail) 218 - return 0; 219 - 220 - /* rollback */ 221 - for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) 222 - __free_page_cgroup(pfn); 223 - 224 - return -ENOMEM; 225 - } 226 - 227 - static int __meminit offline_page_cgroup(unsigned long start_pfn, 228 - unsigned long nr_pages, int nid) 229 - { 230 - unsigned long start, end, pfn; 231 - 232 - start = SECTION_ALIGN_DOWN(start_pfn); 233 - end = SECTION_ALIGN_UP(start_pfn + nr_pages); 234 - 235 - for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) 236 - __free_page_cgroup(pfn); 237 - return 0; 238 - 239 - } 240 - 241 - static int __meminit page_cgroup_callback(struct notifier_block *self, 242 - unsigned long action, void *arg) 243 - { 244 - struct memory_notify *mn = arg; 245 - int ret = 0; 246 - switch (action) { 247 - case MEM_GOING_ONLINE: 248 - ret = online_page_cgroup(mn->start_pfn, 249 - mn->nr_pages, mn->status_change_nid); 250 - break; 251 - case MEM_OFFLINE: 252 - offline_page_cgroup(mn->start_pfn, 253 - mn->nr_pages, mn->status_change_nid); 254 - break; 255 - case MEM_CANCEL_ONLINE: 256 - offline_page_cgroup(mn->start_pfn, 257 - mn->nr_pages, mn->status_change_nid); 258 - break; 259 - case MEM_GOING_OFFLINE: 260 - break; 261 - case MEM_ONLINE: 262 - case MEM_CANCEL_OFFLINE: 263 - break; 264 - } 265 - 266 - return notifier_from_errno(ret); 267 - } 268 - 269 - #endif 270 - 271 - void __init page_cgroup_init(void) 272 - { 273 - unsigned long pfn; 274 - int nid; 275 - 276 - if (mem_cgroup_disabled()) 277 - return; 278 - 279 - for_each_node_state(nid, N_MEMORY) { 280 - unsigned long start_pfn, end_pfn; 281 - 282 - start_pfn = node_start_pfn(nid); 283 - end_pfn = node_end_pfn(nid); 284 - /* 285 - * start_pfn and end_pfn 
may not be aligned to SECTION and the 286 - * page->flags of out of node pages are not initialized. So we 287 - * scan [start_pfn, the biggest section's pfn < end_pfn) here. 288 - */ 289 - for (pfn = start_pfn; 290 - pfn < end_pfn; 291 - pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { 292 - 293 - if (!pfn_valid(pfn)) 294 - continue; 295 - /* 296 - * Nodes's pfns can be overlapping. 297 - * We know some arch can have a nodes layout such as 298 - * -------------pfn--------------> 299 - * N0 | N1 | N2 | N0 | N1 | N2|.... 300 - */ 301 - if (pfn_to_nid(pfn) != nid) 302 - continue; 303 - if (init_section_page_cgroup(pfn, nid)) 304 - goto oom; 305 - } 306 - } 307 - hotplug_memory_notifier(page_cgroup_callback, 0); 308 - printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); 309 - printk(KERN_INFO "please try 'cgroup_disable=memory' option if you " 310 - "don't want memory cgroups\n"); 311 - return; 312 - oom: 313 - printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); 314 - panic("Out of memory"); 315 - } 316 - 317 - void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) 318 - { 319 - return; 320 - } 321 - 322 - #endif 323 - 324 5 325 6 #ifdef CONFIG_MEMCG_SWAP 326 7