Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

memcg: add per cgroup writeback pages accounting

Add memcg routines to count writeback pages; dirty pages will also be
accounted later.

After Kame's commit 89c06bd52fb9 ("memcg: use new logic for page stat
accounting"), we can use 'struct page' flag to test page state instead
of per page_cgroup flag. But memcg has a feature to move a page from one
cgroup to another, which may race between "move" and "page stat
accounting". So in order to avoid the race we have designed a new lock:

mem_cgroup_begin_update_page_stat()
modify page information -->(a)
mem_cgroup_update_page_stat() -->(b)
mem_cgroup_end_update_page_stat()

It requires both (a) and (b) (writeback pages accounting) to be protected
by mem_cgroup_{begin/end}_update_page_stat(). It's a full no-op for
!CONFIG_MEMCG, almost a no-op if memcg is disabled (but compiled in), an
RCU read lock in most cases (no task is moving), and spin_lock_irqsave
on top in the slow path.

There're two writeback interfaces to modify: test_{clear/set}_page_writeback().
And the lock order is:
--> memcg->move_lock
--> mapping->tree_lock

Signed-off-by: Sha Zhengju <handai.szj@taobao.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Greg Thelen <gthelen@google.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Sha Zhengju and committed by
Linus Torvalds
3ea67d06 658b72c5

+39 -7
+1
include/linux/memcontrol.h
··· 42 42 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 43 43 MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ 44 44 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 45 + MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ 45 46 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ 46 47 MEM_CGROUP_STAT_NSTATS, 47 48 };
+23 -7
mm/memcontrol.c
··· 89 89 "rss", 90 90 "rss_huge", 91 91 "mapped_file", 92 + "writeback", 92 93 "swap", 93 94 }; 94 95 ··· 3655 3654 } 3656 3655 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3657 3656 3657 + static inline 3658 + void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, 3659 + struct mem_cgroup *to, 3660 + unsigned int nr_pages, 3661 + enum mem_cgroup_stat_index idx) 3662 + { 3663 + /* Update stat data for mem_cgroup */ 3664 + preempt_disable(); 3665 + WARN_ON_ONCE(from->stat->count[idx] < nr_pages); 3666 + __this_cpu_add(from->stat->count[idx], -nr_pages); 3667 + __this_cpu_add(to->stat->count[idx], nr_pages); 3668 + preempt_enable(); 3669 + } 3670 + 3658 3671 /** 3659 3672 * mem_cgroup_move_account - move account of the page 3660 3673 * @page: the page ··· 3714 3699 3715 3700 move_lock_mem_cgroup(from, &flags); 3716 3701 3717 - if (!anon && page_mapped(page)) { 3718 - /* Update mapped_file data for mem_cgroup */ 3719 - preempt_disable(); 3720 - __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 3721 - __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 3722 - preempt_enable(); 3723 - } 3702 + if (!anon && page_mapped(page)) 3703 + mem_cgroup_move_account_page_stat(from, to, nr_pages, 3704 + MEM_CGROUP_STAT_FILE_MAPPED); 3705 + 3706 + if (PageWriteback(page)) 3707 + mem_cgroup_move_account_page_stat(from, to, nr_pages, 3708 + MEM_CGROUP_STAT_WRITEBACK); 3709 + 3724 3710 mem_cgroup_charge_statistics(from, page, anon, -nr_pages); 3725 3711 3726 3712 /* caller should have done css_get */
+15
mm/page-writeback.c
··· 2143 2143 2144 2144 /* 2145 2145 * Helper function for set_page_writeback family. 2146 + * 2147 + * The caller must hold mem_cgroup_begin/end_update_page_stat() lock 2148 + * while calling this function. 2149 + * See test_set_page_writeback for example. 2150 + * 2146 2151 * NOTE: Unlike account_page_dirtied this does not rely on being atomic 2147 2152 * wrt interrupts. 2148 2153 */ 2149 2154 void account_page_writeback(struct page *page) 2150 2155 { 2156 + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); 2151 2157 inc_zone_page_state(page, NR_WRITEBACK); 2152 2158 } 2153 2159 EXPORT_SYMBOL(account_page_writeback); ··· 2370 2364 { 2371 2365 struct address_space *mapping = page_mapping(page); 2372 2366 int ret; 2367 + bool locked; 2368 + unsigned long memcg_flags; 2373 2369 2370 + mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); 2374 2371 if (mapping) { 2375 2372 struct backing_dev_info *bdi = mapping->backing_dev_info; 2376 2373 unsigned long flags; ··· 2394 2385 ret = TestClearPageWriteback(page); 2395 2386 } 2396 2387 if (ret) { 2388 + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); 2397 2389 dec_zone_page_state(page, NR_WRITEBACK); 2398 2390 inc_zone_page_state(page, NR_WRITTEN); 2399 2391 } 2392 + mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); 2400 2393 return ret; 2401 2394 } 2402 2395 ··· 2406 2395 { 2407 2396 struct address_space *mapping = page_mapping(page); 2408 2397 int ret; 2398 + bool locked; 2399 + unsigned long memcg_flags; 2409 2400 2401 + mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); 2410 2402 if (mapping) { 2411 2403 struct backing_dev_info *bdi = mapping->backing_dev_info; 2412 2404 unsigned long flags; ··· 2436 2422 } 2437 2423 if (!ret) 2438 2424 account_page_writeback(page); 2425 + mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); 2439 2426 return ret; 2440 2427 2441 2428 }