Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: memcontrol: lockless page counters

Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page. The
counter interface is also convoluted and does too many things.

Introduce a new lockless word-sized page counter API, then change all
memory accounting over to it. The translation from and to bytes then only
happens when interfacing with userspace.

The removed locking overhead is noticeable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144-threads, the following test
shows the performance differences of 288 memcgs concurrently running a
page fault benchmark:

vanilla:

18631648.500498 task-clock (msec) # 140.643 CPUs utilized ( +- 0.33% )
1,380,638 context-switches # 0.074 K/sec ( +- 0.75% )
24,390 cpu-migrations # 0.001 K/sec ( +- 8.44% )
1,843,305,768 page-faults # 0.099 M/sec ( +- 0.00% )
50,134,994,088,218 cycles # 2.691 GHz ( +- 0.33% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
8,049,712,224,651 instructions # 0.16 insns per cycle ( +- 0.04% )
1,586,970,584,979 branches # 85.176 M/sec ( +- 0.05% )
1,724,989,949 branch-misses # 0.11% of all branches ( +- 0.48% )

132.474343877 seconds time elapsed ( +- 0.21% )

lockless:

12195979.037525 task-clock (msec) # 133.480 CPUs utilized ( +- 0.18% )
832,850 context-switches # 0.068 K/sec ( +- 0.54% )
15,624 cpu-migrations # 0.001 K/sec ( +- 10.17% )
1,843,304,774 page-faults # 0.151 M/sec ( +- 0.00% )
32,811,216,801,141 cycles # 2.690 GHz ( +- 0.18% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
9,999,265,091,727 instructions # 0.30 insns per cycle ( +- 0.10% )
2,076,759,325,203 branches # 170.282 M/sec ( +- 0.12% )
1,656,917,214 branch-misses # 0.08% of all branches ( +- 0.55% )

91.369330729 seconds time elapsed ( +- 0.45% )

On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32 bit and also makes
the code a lot more readable.

Notable differences between the old and new API:

- res_counter_charge() and res_counter_charge_nofail() become
page_counter_try_charge() and page_counter_charge() resp. to match
the more common kernel naming scheme of try_do()/do()

- res_counter_uncharge_until() is only ever used to cancel a local
counter and never to uncharge bigger segments of a hierarchy, so
it's replaced by the simpler page_counter_cancel()

- res_counter_set_limit() is replaced by page_counter_limit(), which
expects its callers to serialize against themselves

- res_counter_memparse_write_strategy() is replaced by
page_counter_memparse(), which rounds down to the nearest page size -
rather than up. This is more reasonable for explicitly requested
hard upper limits.

- to keep charging light-weight, page_counter_try_charge() charges
speculatively, only to roll back if the result exceeds the limit.
Because of this, a failing bigger charge can temporarily lock out
smaller charges that would otherwise succeed. The error is bounded
to the difference between the smallest and the biggest possible
charge size, so for memcg, this means that a failing THP charge can
send base page charges into reclaim up to 2MB (4MB) before the limit
would have been reached. This should be acceptable.

[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse]
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Johannes Weiner and committed by
Linus Torvalds
3e32cb2e 8df0c2dc

+617 -406
+2 -2
Documentation/cgroups/memory.txt
··· 52 52 tasks # attach a task(thread) and show list of threads 53 53 cgroup.procs # show list of processes 54 54 cgroup.event_control # an interface for event_fd() 55 - memory.usage_in_bytes # show current res_counter usage for memory 55 + memory.usage_in_bytes # show current usage for memory 56 56 (See 5.5 for details) 57 - memory.memsw.usage_in_bytes # show current res_counter usage for memory+Swap 57 + memory.memsw.usage_in_bytes # show current usage for memory+Swap 58 58 (See 5.5 for details) 59 59 memory.limit_in_bytes # set/show limit of memory usage 60 60 memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage
+2 -3
include/linux/memcontrol.h
··· 447 447 /* 448 448 * __GFP_NOFAIL allocations will move on even if charging is not 449 449 * possible. Therefore we don't even try, and have this allocation 450 - * unaccounted. We could in theory charge it with 451 - * res_counter_charge_nofail, but we hope those allocations are rare, 452 - * and won't be worth the trouble. 450 + * unaccounted. We could in theory charge it forcibly, but we hope 451 + * those allocations are rare, and won't be worth the trouble. 453 452 */ 454 453 if (gfp & __GFP_NOFAIL) 455 454 return true;
+51
include/linux/page_counter.h
··· 1 + #ifndef _LINUX_PAGE_COUNTER_H 2 + #define _LINUX_PAGE_COUNTER_H 3 + 4 + #include <linux/atomic.h> 5 + #include <linux/kernel.h> 6 + #include <asm/page.h> 7 + 8 + struct page_counter { 9 + atomic_long_t count; 10 + unsigned long limit; 11 + struct page_counter *parent; 12 + 13 + /* legacy */ 14 + unsigned long watermark; 15 + unsigned long failcnt; 16 + }; 17 + 18 + #if BITS_PER_LONG == 32 19 + #define PAGE_COUNTER_MAX LONG_MAX 20 + #else 21 + #define PAGE_COUNTER_MAX (LONG_MAX / PAGE_SIZE) 22 + #endif 23 + 24 + static inline void page_counter_init(struct page_counter *counter, 25 + struct page_counter *parent) 26 + { 27 + atomic_long_set(&counter->count, 0); 28 + counter->limit = PAGE_COUNTER_MAX; 29 + counter->parent = parent; 30 + } 31 + 32 + static inline unsigned long page_counter_read(struct page_counter *counter) 33 + { 34 + return atomic_long_read(&counter->count); 35 + } 36 + 37 + int page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); 38 + void page_counter_charge(struct page_counter *counter, unsigned long nr_pages); 39 + int page_counter_try_charge(struct page_counter *counter, 40 + unsigned long nr_pages, 41 + struct page_counter **fail); 42 + int page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); 43 + int page_counter_limit(struct page_counter *counter, unsigned long limit); 44 + int page_counter_memparse(const char *buf, unsigned long *nr_pages); 45 + 46 + static inline void page_counter_reset_watermark(struct page_counter *counter) 47 + { 48 + counter->watermark = page_counter_read(counter); 49 + } 50 + 51 + #endif /* _LINUX_PAGE_COUNTER_H */
+9 -17
include/net/sock.h
··· 54 54 #include <linux/security.h> 55 55 #include <linux/slab.h> 56 56 #include <linux/uaccess.h> 57 + #include <linux/page_counter.h> 57 58 #include <linux/memcontrol.h> 58 - #include <linux/res_counter.h> 59 59 #include <linux/static_key.h> 60 60 #include <linux/aio.h> 61 61 #include <linux/sched.h> ··· 1062 1062 }; 1063 1063 1064 1064 struct cg_proto { 1065 - struct res_counter memory_allocated; /* Current allocated memory. */ 1065 + struct page_counter memory_allocated; /* Current allocated memory. */ 1066 1066 struct percpu_counter sockets_allocated; /* Current number of sockets. */ 1067 1067 int memory_pressure; 1068 1068 long sysctl_mem[3]; ··· 1214 1214 unsigned long amt, 1215 1215 int *parent_status) 1216 1216 { 1217 - struct res_counter *fail; 1218 - int ret; 1217 + page_counter_charge(&prot->memory_allocated, amt); 1219 1218 1220 - ret = res_counter_charge_nofail(&prot->memory_allocated, 1221 - amt << PAGE_SHIFT, &fail); 1222 - if (ret < 0) 1219 + if (page_counter_read(&prot->memory_allocated) > 1220 + prot->memory_allocated.limit) 1223 1221 *parent_status = OVER_LIMIT; 1224 1222 } 1225 1223 1226 1224 static inline void memcg_memory_allocated_sub(struct cg_proto *prot, 1227 1225 unsigned long amt) 1228 1226 { 1229 - res_counter_uncharge(&prot->memory_allocated, amt << PAGE_SHIFT); 1230 - } 1231 - 1232 - static inline u64 memcg_memory_allocated_read(struct cg_proto *prot) 1233 - { 1234 - u64 ret; 1235 - ret = res_counter_read_u64(&prot->memory_allocated, RES_USAGE); 1236 - return ret >> PAGE_SHIFT; 1227 + page_counter_uncharge(&prot->memory_allocated, amt); 1237 1228 } 1238 1229 1239 1230 static inline long 1240 1231 sk_memory_allocated(const struct sock *sk) 1241 1232 { 1242 1233 struct proto *prot = sk->sk_prot; 1234 + 1243 1235 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) 1244 - return memcg_memory_allocated_read(sk->sk_cgrp); 1236 + return page_counter_read(&sk->sk_cgrp->memory_allocated); 1245 1237 1246 1238 return 
atomic_long_read(prot->memory_allocated); 1247 1239 } ··· 1247 1255 memcg_memory_allocated_add(sk->sk_cgrp, amt, parent_status); 1248 1256 /* update the root cgroup regardless */ 1249 1257 atomic_long_add_return(amt, prot->memory_allocated); 1250 - return memcg_memory_allocated_read(sk->sk_cgrp); 1258 + return page_counter_read(&sk->sk_cgrp->memory_allocated); 1251 1259 } 1252 1260 1253 1261 return atomic_long_add_return(amt, prot->memory_allocated);
+4 -1
init/Kconfig
··· 978 978 This option enables controller independent resource accounting 979 979 infrastructure that works with cgroups. 980 980 981 + config PAGE_COUNTER 982 + bool 983 + 981 984 config MEMCG 982 985 bool "Memory Resource Controller for Control Groups" 983 - depends on RESOURCE_COUNTERS 986 + select PAGE_COUNTER 984 987 select EVENTFD 985 988 help 986 989 Provides a memory resource controller that manages both anonymous
+1
mm/Makefile
··· 55 55 obj-$(CONFIG_MIGRATION) += migrate.o 56 56 obj-$(CONFIG_QUICKLIST) += quicklist.o 57 57 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o 58 + obj-$(CONFIG_PAGE_COUNTER) += page_counter.o 58 59 obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o 59 60 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o 60 61 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
+297 -340
mm/memcontrol.c
··· 25 25 * GNU General Public License for more details. 26 26 */ 27 27 28 - #include <linux/res_counter.h> 28 + #include <linux/page_counter.h> 29 29 #include <linux/memcontrol.h> 30 30 #include <linux/cgroup.h> 31 31 #include <linux/mm.h> ··· 165 165 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 166 166 167 167 struct rb_node tree_node; /* RB tree node */ 168 - unsigned long long usage_in_excess;/* Set to the value by which */ 168 + unsigned long usage_in_excess;/* Set to the value by which */ 169 169 /* the soft limit is exceeded*/ 170 170 bool on_tree; 171 171 struct mem_cgroup *memcg; /* Back pointer, we cannot */ ··· 198 198 199 199 struct mem_cgroup_threshold { 200 200 struct eventfd_ctx *eventfd; 201 - u64 threshold; 201 + unsigned long threshold; 202 202 }; 203 203 204 204 /* For threshold */ ··· 284 284 */ 285 285 struct mem_cgroup { 286 286 struct cgroup_subsys_state css; 287 - /* 288 - * the counter to account for memory usage 289 - */ 290 - struct res_counter res; 287 + 288 + /* Accounted resources */ 289 + struct page_counter memory; 290 + struct page_counter memsw; 291 + struct page_counter kmem; 292 + 293 + unsigned long soft_limit; 291 294 292 295 /* vmpressure notifications */ 293 296 struct vmpressure vmpressure; ··· 298 295 /* css_online() has been completed */ 299 296 int initialized; 300 297 301 - /* 302 - * the counter to account for mem+swap usage. 303 - */ 304 - struct res_counter memsw; 305 - 306 - /* 307 - * the counter to account for kernel memory usage. 308 - */ 309 - struct res_counter kmem; 310 298 /* 311 299 * Should the accounting and control be hierarchical, per subtree? 
312 300 */ ··· 644 650 * This check can't live in kmem destruction function, 645 651 * since the charges will outlive the cgroup 646 652 */ 647 - WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); 653 + WARN_ON(page_counter_read(&memcg->kmem)); 648 654 } 649 655 #else 650 656 static void disarm_kmem_keys(struct mem_cgroup *memcg) ··· 700 706 701 707 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, 702 708 struct mem_cgroup_tree_per_zone *mctz, 703 - unsigned long long new_usage_in_excess) 709 + unsigned long new_usage_in_excess) 704 710 { 705 711 struct rb_node **p = &mctz->rb_root.rb_node; 706 712 struct rb_node *parent = NULL; ··· 749 755 spin_unlock_irqrestore(&mctz->lock, flags); 750 756 } 751 757 758 + static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 759 + { 760 + unsigned long nr_pages = page_counter_read(&memcg->memory); 761 + unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); 762 + unsigned long excess = 0; 763 + 764 + if (nr_pages > soft_limit) 765 + excess = nr_pages - soft_limit; 766 + 767 + return excess; 768 + } 752 769 753 770 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 754 771 { 755 - unsigned long long excess; 772 + unsigned long excess; 756 773 struct mem_cgroup_per_zone *mz; 757 774 struct mem_cgroup_tree_per_zone *mctz; 758 775 ··· 774 769 */ 775 770 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 776 771 mz = mem_cgroup_page_zoneinfo(memcg, page); 777 - excess = res_counter_soft_limit_excess(&memcg->res); 772 + excess = soft_limit_excess(memcg); 778 773 /* 779 774 * We have to update the tree if mz is on RB-tree or 780 775 * mem is over its softlimit. ··· 830 825 * position in the tree. 
831 826 */ 832 827 __mem_cgroup_remove_exceeded(mz, mctz); 833 - if (!res_counter_soft_limit_excess(&mz->memcg->res) || 828 + if (!soft_limit_excess(mz->memcg) || 834 829 !css_tryget_online(&mz->memcg->css)) 835 830 goto retry; 836 831 done: ··· 1497 1492 return inactive * inactive_ratio < active; 1498 1493 } 1499 1494 1500 - #define mem_cgroup_from_res_counter(counter, member) \ 1495 + #define mem_cgroup_from_counter(counter, member) \ 1501 1496 container_of(counter, struct mem_cgroup, member) 1502 1497 1503 1498 /** ··· 1509 1504 */ 1510 1505 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1511 1506 { 1512 - unsigned long long margin; 1507 + unsigned long margin = 0; 1508 + unsigned long count; 1509 + unsigned long limit; 1513 1510 1514 - margin = res_counter_margin(&memcg->res); 1515 - if (do_swap_account) 1516 - margin = min(margin, res_counter_margin(&memcg->memsw)); 1517 - return margin >> PAGE_SHIFT; 1511 + count = page_counter_read(&memcg->memory); 1512 + limit = ACCESS_ONCE(memcg->memory.limit); 1513 + if (count < limit) 1514 + margin = limit - count; 1515 + 1516 + if (do_swap_account) { 1517 + count = page_counter_read(&memcg->memsw); 1518 + limit = ACCESS_ONCE(memcg->memsw.limit); 1519 + if (count <= limit) 1520 + margin = min(margin, limit - count); 1521 + } 1522 + 1523 + return margin; 1518 1524 } 1519 1525 1520 1526 int mem_cgroup_swappiness(struct mem_cgroup *memcg) ··· 1660 1644 1661 1645 rcu_read_unlock(); 1662 1646 1663 - pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", 1664 - res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1665 - res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1666 - res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1667 - pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", 1668 - res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1669 - res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1670 - res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1671 - pr_info("kmem: usage 
%llukB, limit %llukB, failcnt %llu\n", 1672 - res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, 1673 - res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, 1674 - res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); 1647 + pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 1648 + K((u64)page_counter_read(&memcg->memory)), 1649 + K((u64)memcg->memory.limit), memcg->memory.failcnt); 1650 + pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", 1651 + K((u64)page_counter_read(&memcg->memsw)), 1652 + K((u64)memcg->memsw.limit), memcg->memsw.failcnt); 1653 + pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", 1654 + K((u64)page_counter_read(&memcg->kmem)), 1655 + K((u64)memcg->kmem.limit), memcg->kmem.failcnt); 1675 1656 1676 1657 for_each_mem_cgroup_tree(iter, memcg) { 1677 1658 pr_info("Memory cgroup stats for "); ··· 1708 1695 /* 1709 1696 * Return the memory (and swap, if configured) limit for a memcg. 1710 1697 */ 1711 - static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1698 + static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) 1712 1699 { 1713 - u64 limit; 1700 + unsigned long limit; 1714 1701 1715 - limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1716 - 1717 - /* 1718 - * Do not consider swap space if we cannot swap due to swappiness 1719 - */ 1702 + limit = memcg->memory.limit; 1720 1703 if (mem_cgroup_swappiness(memcg)) { 1721 - u64 memsw; 1704 + unsigned long memsw_limit; 1722 1705 1723 - limit += total_swap_pages << PAGE_SHIFT; 1724 - memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1725 - 1726 - /* 1727 - * If memsw is finite and limits the amount of swap space 1728 - * available to this memcg, return that limit. 
1729 - */ 1730 - limit = min(limit, memsw); 1706 + memsw_limit = memcg->memsw.limit; 1707 + limit = min(limit + total_swap_pages, memsw_limit); 1731 1708 } 1732 - 1733 1709 return limit; 1734 1710 } 1735 1711 ··· 1742 1740 } 1743 1741 1744 1742 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1745 - totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; 1743 + totalpages = mem_cgroup_get_limit(memcg) ? : 1; 1746 1744 for_each_mem_cgroup_tree(iter, memcg) { 1747 1745 struct css_task_iter it; 1748 1746 struct task_struct *task; ··· 1945 1943 .priority = 0, 1946 1944 }; 1947 1945 1948 - excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 1946 + excess = soft_limit_excess(root_memcg); 1949 1947 1950 1948 while (1) { 1951 1949 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); ··· 1976 1974 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1977 1975 zone, &nr_scanned); 1978 1976 *total_scanned += nr_scanned; 1979 - if (!res_counter_soft_limit_excess(&root_memcg->res)) 1977 + if (!soft_limit_excess(root_memcg)) 1980 1978 break; 1981 1979 } 1982 1980 mem_cgroup_iter_break(root_memcg, victim); ··· 2318 2316 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2319 2317 { 2320 2318 struct memcg_stock_pcp *stock; 2321 - bool ret = true; 2319 + bool ret = false; 2322 2320 2323 2321 if (nr_pages > CHARGE_BATCH) 2324 - return false; 2322 + return ret; 2325 2323 2326 2324 stock = &get_cpu_var(memcg_stock); 2327 - if (memcg == stock->cached && stock->nr_pages >= nr_pages) 2325 + if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 2328 2326 stock->nr_pages -= nr_pages; 2329 - else /* need to call res_counter_charge */ 2330 - ret = false; 2327 + ret = true; 2328 + } 2331 2329 put_cpu_var(memcg_stock); 2332 2330 return ret; 2333 2331 } 2334 2332 2335 2333 /* 2336 - * Returns stocks cached in percpu to res_counter and reset cached information. 
2334 + * Returns stocks cached in percpu and reset cached information. 2337 2335 */ 2338 2336 static void drain_stock(struct memcg_stock_pcp *stock) 2339 2337 { 2340 2338 struct mem_cgroup *old = stock->cached; 2341 2339 2342 2340 if (stock->nr_pages) { 2343 - unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2344 - 2345 - res_counter_uncharge(&old->res, bytes); 2341 + page_counter_uncharge(&old->memory, stock->nr_pages); 2346 2342 if (do_swap_account) 2347 - res_counter_uncharge(&old->memsw, bytes); 2343 + page_counter_uncharge(&old->memsw, stock->nr_pages); 2348 2344 stock->nr_pages = 0; 2349 2345 } 2350 2346 stock->cached = NULL; ··· 2371 2371 } 2372 2372 2373 2373 /* 2374 - * Cache charges(val) which is from res_counter, to local per_cpu area. 2374 + * Cache charges(val) to local per_cpu area. 2375 2375 * This will be consumed by consume_stock() function, later. 2376 2376 */ 2377 2377 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) ··· 2431 2431 /* 2432 2432 * Tries to drain stocked charges in other cpus. This function is asynchronous 2433 2433 * and just put a work per cpu for draining localy on each cpu. Caller can 2434 - * expects some charges will be back to res_counter later but cannot wait for 2435 - * it. 2434 + * expects some charges will be back later but cannot wait for it. 
2436 2435 */ 2437 2436 static void drain_all_stock_async(struct mem_cgroup *root_memcg) 2438 2437 { ··· 2505 2506 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2506 2507 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2507 2508 struct mem_cgroup *mem_over_limit; 2508 - struct res_counter *fail_res; 2509 + struct page_counter *counter; 2509 2510 unsigned long nr_reclaimed; 2510 - unsigned long long size; 2511 2511 bool may_swap = true; 2512 2512 bool drained = false; 2513 2513 int ret = 0; ··· 2517 2519 if (consume_stock(memcg, nr_pages)) 2518 2520 goto done; 2519 2521 2520 - size = batch * PAGE_SIZE; 2521 2522 if (!do_swap_account || 2522 - !res_counter_charge(&memcg->memsw, size, &fail_res)) { 2523 - if (!res_counter_charge(&memcg->res, size, &fail_res)) 2523 + !page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2524 + if (!page_counter_try_charge(&memcg->memory, batch, &counter)) 2524 2525 goto done_restock; 2525 2526 if (do_swap_account) 2526 - res_counter_uncharge(&memcg->memsw, size); 2527 - mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2527 + page_counter_uncharge(&memcg->memsw, batch); 2528 + mem_over_limit = mem_cgroup_from_counter(counter, memory); 2528 2529 } else { 2529 - mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2530 + mem_over_limit = mem_cgroup_from_counter(counter, memsw); 2530 2531 may_swap = false; 2531 2532 } 2532 2533 ··· 2608 2611 2609 2612 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2610 2613 { 2611 - unsigned long bytes = nr_pages * PAGE_SIZE; 2612 - 2613 2614 if (mem_cgroup_is_root(memcg)) 2614 2615 return; 2615 2616 2616 - res_counter_uncharge(&memcg->res, bytes); 2617 + page_counter_uncharge(&memcg->memory, nr_pages); 2617 2618 if (do_swap_account) 2618 - res_counter_uncharge(&memcg->memsw, bytes); 2619 - } 2620 - 2621 - /* 2622 - * Cancel chrages in this cgroup....doesn't propagate to parent cgroup. 2623 - * This is useful when moving usage to parent cgroup. 
2624 - */ 2625 - static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, 2626 - unsigned int nr_pages) 2627 - { 2628 - unsigned long bytes = nr_pages * PAGE_SIZE; 2629 - 2630 - if (mem_cgroup_is_root(memcg)) 2631 - return; 2632 - 2633 - res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2634 - if (do_swap_account) 2635 - res_counter_uncharge_until(&memcg->memsw, 2636 - memcg->memsw.parent, bytes); 2619 + page_counter_uncharge(&memcg->memsw, nr_pages); 2637 2620 } 2638 2621 2639 2622 /* ··· 2737 2760 unlock_page_lru(page, isolated); 2738 2761 } 2739 2762 2740 - static DEFINE_MUTEX(set_limit_mutex); 2741 - 2742 2763 #ifdef CONFIG_MEMCG_KMEM 2743 2764 /* 2744 2765 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or ··· 2779 2804 } 2780 2805 #endif 2781 2806 2782 - static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) 2807 + static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, 2808 + unsigned long nr_pages) 2783 2809 { 2784 - struct res_counter *fail_res; 2810 + struct page_counter *counter; 2785 2811 int ret = 0; 2786 2812 2787 - ret = res_counter_charge(&memcg->kmem, size, &fail_res); 2788 - if (ret) 2813 + ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); 2814 + if (ret < 0) 2789 2815 return ret; 2790 2816 2791 - ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); 2817 + ret = try_charge(memcg, gfp, nr_pages); 2792 2818 if (ret == -EINTR) { 2793 2819 /* 2794 2820 * try_charge() chose to bypass to root due to OOM kill or ··· 2806 2830 * when the allocation triggers should have been already 2807 2831 * directed to the root cgroup in memcontrol.h 2808 2832 */ 2809 - res_counter_charge_nofail(&memcg->res, size, &fail_res); 2833 + page_counter_charge(&memcg->memory, nr_pages); 2810 2834 if (do_swap_account) 2811 - res_counter_charge_nofail(&memcg->memsw, size, 2812 - &fail_res); 2835 + page_counter_charge(&memcg->memsw, nr_pages); 2813 2836 ret = 0; 2814 2837 } else if 
(ret) 2815 - res_counter_uncharge(&memcg->kmem, size); 2838 + page_counter_uncharge(&memcg->kmem, nr_pages); 2816 2839 2817 2840 return ret; 2818 2841 } 2819 2842 2820 - static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) 2843 + static void memcg_uncharge_kmem(struct mem_cgroup *memcg, 2844 + unsigned long nr_pages) 2821 2845 { 2822 - res_counter_uncharge(&memcg->res, size); 2846 + page_counter_uncharge(&memcg->memory, nr_pages); 2823 2847 if (do_swap_account) 2824 - res_counter_uncharge(&memcg->memsw, size); 2848 + page_counter_uncharge(&memcg->memsw, nr_pages); 2825 2849 2826 2850 /* Not down to 0 */ 2827 - if (res_counter_uncharge(&memcg->kmem, size)) 2851 + if (page_counter_uncharge(&memcg->kmem, nr_pages)) 2828 2852 return; 2829 2853 2830 2854 /* ··· 3100 3124 3101 3125 int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) 3102 3126 { 3127 + unsigned int nr_pages = 1 << order; 3103 3128 int res; 3104 3129 3105 - res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, 3106 - PAGE_SIZE << order); 3130 + res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); 3107 3131 if (!res) 3108 - atomic_add(1 << order, &cachep->memcg_params->nr_pages); 3132 + atomic_add(nr_pages, &cachep->memcg_params->nr_pages); 3109 3133 return res; 3110 3134 } 3111 3135 3112 3136 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) 3113 3137 { 3114 - memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); 3115 - atomic_sub(1 << order, &cachep->memcg_params->nr_pages); 3138 + unsigned int nr_pages = 1 << order; 3139 + 3140 + memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); 3141 + atomic_sub(nr_pages, &cachep->memcg_params->nr_pages); 3116 3142 } 3117 3143 3118 3144 /* ··· 3235 3257 return true; 3236 3258 } 3237 3259 3238 - ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); 3260 + ret = memcg_charge_kmem(memcg, gfp, 1 << order); 3239 3261 if (!ret) 3240 3262 *_memcg = memcg; 3241 3263 ··· 3252 3274 
3253 3275 /* The page allocation failed. Revert */ 3254 3276 if (!page) { 3255 - memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3277 + memcg_uncharge_kmem(memcg, 1 << order); 3256 3278 return; 3257 3279 } 3258 3280 /* ··· 3285 3307 return; 3286 3308 3287 3309 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 3288 - memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3310 + memcg_uncharge_kmem(memcg, 1 << order); 3289 3311 } 3290 3312 #else 3291 3313 static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) ··· 3463 3485 3464 3486 ret = mem_cgroup_move_account(page, nr_pages, 3465 3487 pc, child, parent); 3466 - if (!ret) 3467 - __mem_cgroup_cancel_local_charge(child, nr_pages); 3488 + if (!ret) { 3489 + /* Take charge off the local counters */ 3490 + page_counter_cancel(&child->memory, nr_pages); 3491 + if (do_swap_account) 3492 + page_counter_cancel(&child->memsw, nr_pages); 3493 + } 3468 3494 3469 3495 if (nr_pages > 1) 3470 3496 compound_unlock_irqrestore(page, flags); ··· 3498 3516 * 3499 3517 * Returns 0 on success, -EINVAL on failure. 3500 3518 * 3501 - * The caller must have charged to @to, IOW, called res_counter_charge() about 3519 + * The caller must have charged to @to, IOW, called page_counter_charge() about 3502 3520 * both res and memsw, and called css_get(). 3503 3521 */ 3504 3522 static int mem_cgroup_move_swap_account(swp_entry_t entry, ··· 3514 3532 mem_cgroup_swap_statistics(to, true); 3515 3533 /* 3516 3534 * This function is only called from task migration context now. 3517 - * It postpones res_counter and refcount handling till the end 3535 + * It postpones page_counter and refcount handling till the end 3518 3536 * of task migration(mem_cgroup_clear_mc()) for performance 3519 3537 * improvement. 
But we cannot postpone css_get(to) because if 3520 3538 * the process that has been moved to @to does swap-in, the ··· 3572 3590 } 3573 3591 #endif 3574 3592 3593 + static DEFINE_MUTEX(memcg_limit_mutex); 3594 + 3575 3595 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3576 - unsigned long long val) 3596 + unsigned long limit) 3577 3597 { 3598 + unsigned long curusage; 3599 + unsigned long oldusage; 3600 + bool enlarge = false; 3578 3601 int retry_count; 3579 - int ret = 0; 3580 - int children = mem_cgroup_count_children(memcg); 3581 - u64 curusage, oldusage; 3582 - int enlarge; 3602 + int ret; 3583 3603 3584 3604 /* 3585 3605 * For keeping hierarchical_reclaim simple, how long we should retry 3586 3606 * is depends on callers. We set our retry-count to be function 3587 3607 * of # of children which we should visit in this loop. 3588 3608 */ 3589 - retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3609 + retry_count = MEM_CGROUP_RECLAIM_RETRIES * 3610 + mem_cgroup_count_children(memcg); 3590 3611 3591 - oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3612 + oldusage = page_counter_read(&memcg->memory); 3592 3613 3593 - enlarge = 0; 3594 - while (retry_count) { 3614 + do { 3595 3615 if (signal_pending(current)) { 3596 3616 ret = -EINTR; 3597 3617 break; 3598 3618 } 3599 - /* 3600 - * Rather than hide all in some function, I do this in 3601 - * open coded manner. You see what this really does. 3602 - * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 
3603 - */ 3604 - mutex_lock(&set_limit_mutex); 3605 - if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) { 3619 + 3620 + mutex_lock(&memcg_limit_mutex); 3621 + if (limit > memcg->memsw.limit) { 3622 + mutex_unlock(&memcg_limit_mutex); 3606 3623 ret = -EINVAL; 3607 - mutex_unlock(&set_limit_mutex); 3608 3624 break; 3609 3625 } 3610 - 3611 - if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val) 3612 - enlarge = 1; 3613 - 3614 - ret = res_counter_set_limit(&memcg->res, val); 3615 - mutex_unlock(&set_limit_mutex); 3626 + if (limit > memcg->memory.limit) 3627 + enlarge = true; 3628 + ret = page_counter_limit(&memcg->memory, limit); 3629 + mutex_unlock(&memcg_limit_mutex); 3616 3630 3617 3631 if (!ret) 3618 3632 break; 3619 3633 3620 3634 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); 3621 3635 3622 - curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3636 + curusage = page_counter_read(&memcg->memory); 3623 3637 /* Usage is reduced ? */ 3624 3638 if (curusage >= oldusage) 3625 3639 retry_count--; 3626 3640 else 3627 3641 oldusage = curusage; 3628 - } 3642 + } while (retry_count); 3643 + 3629 3644 if (!ret && enlarge) 3630 3645 memcg_oom_recover(memcg); 3631 3646 ··· 3630 3651 } 3631 3652 3632 3653 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3633 - unsigned long long val) 3654 + unsigned long limit) 3634 3655 { 3656 + unsigned long curusage; 3657 + unsigned long oldusage; 3658 + bool enlarge = false; 3635 3659 int retry_count; 3636 - u64 oldusage, curusage; 3637 - int children = mem_cgroup_count_children(memcg); 3638 - int ret = -EBUSY; 3639 - int enlarge = 0; 3660 + int ret; 3640 3661 3641 3662 /* see mem_cgroup_resize_res_limit */ 3642 - retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3643 - oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3644 - while (retry_count) { 3663 + retry_count = MEM_CGROUP_RECLAIM_RETRIES * 3664 + mem_cgroup_count_children(memcg); 3665 + 3666 + oldusage = 
page_counter_read(&memcg->memsw); 3667 + 3668 + do { 3645 3669 if (signal_pending(current)) { 3646 3670 ret = -EINTR; 3647 3671 break; 3648 3672 } 3649 - /* 3650 - * Rather than hide all in some function, I do this in 3651 - * open coded manner. You see what this really does. 3652 - * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3653 - */ 3654 - mutex_lock(&set_limit_mutex); 3655 - if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) { 3673 + 3674 + mutex_lock(&memcg_limit_mutex); 3675 + if (limit < memcg->memory.limit) { 3676 + mutex_unlock(&memcg_limit_mutex); 3656 3677 ret = -EINVAL; 3657 - mutex_unlock(&set_limit_mutex); 3658 3678 break; 3659 3679 } 3660 - if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) 3661 - enlarge = 1; 3662 - ret = res_counter_set_limit(&memcg->memsw, val); 3663 - mutex_unlock(&set_limit_mutex); 3680 + if (limit > memcg->memsw.limit) 3681 + enlarge = true; 3682 + ret = page_counter_limit(&memcg->memsw, limit); 3683 + mutex_unlock(&memcg_limit_mutex); 3664 3684 3665 3685 if (!ret) 3666 3686 break; 3667 3687 3668 3688 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); 3669 3689 3670 - curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3690 + curusage = page_counter_read(&memcg->memsw); 3671 3691 /* Usage is reduced ? 
*/ 3672 3692 if (curusage >= oldusage) 3673 3693 retry_count--; 3674 3694 else 3675 3695 oldusage = curusage; 3676 - } 3696 + } while (retry_count); 3697 + 3677 3698 if (!ret && enlarge) 3678 3699 memcg_oom_recover(memcg); 3700 + 3679 3701 return ret; 3680 3702 } 3681 3703 ··· 3689 3709 unsigned long reclaimed; 3690 3710 int loop = 0; 3691 3711 struct mem_cgroup_tree_per_zone *mctz; 3692 - unsigned long long excess; 3712 + unsigned long excess; 3693 3713 unsigned long nr_scanned; 3694 3714 3695 3715 if (order > 0) ··· 3743 3763 } while (1); 3744 3764 } 3745 3765 __mem_cgroup_remove_exceeded(mz, mctz); 3746 - excess = res_counter_soft_limit_excess(&mz->memcg->res); 3766 + excess = soft_limit_excess(mz->memcg); 3747 3767 /* 3748 3768 * One school of thought says that we should not add 3749 3769 * back the node to the tree if reclaim returns 0. ··· 3836 3856 static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) 3837 3857 { 3838 3858 int node, zid; 3839 - u64 usage; 3840 3859 3841 3860 do { 3842 3861 /* This is for making all *used* pages to be on LRU. */ ··· 3867 3888 * right after the check. RES_USAGE should be safe as we always 3868 3889 * charge before adding to the LRU. 
3869 3890 */ 3870 - usage = res_counter_read_u64(&memcg->res, RES_USAGE) - 3871 - res_counter_read_u64(&memcg->kmem, RES_USAGE); 3872 - } while (usage > 0); 3891 + } while (page_counter_read(&memcg->memory) - 3892 + page_counter_read(&memcg->kmem) > 0); 3873 3893 } 3874 3894 3875 3895 /* ··· 3908 3930 /* we call try-to-free pages for make this cgroup empty */ 3909 3931 lru_add_drain_all(); 3910 3932 /* try to free all pages in this cgroup */ 3911 - while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 3933 + while (nr_retries && page_counter_read(&memcg->memory)) { 3912 3934 int progress; 3913 3935 3914 3936 if (signal_pending(current)) ··· 3979 4001 return retval; 3980 4002 } 3981 4003 3982 - static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, 3983 - enum mem_cgroup_stat_index idx) 4004 + static unsigned long tree_stat(struct mem_cgroup *memcg, 4005 + enum mem_cgroup_stat_index idx) 3984 4006 { 3985 4007 struct mem_cgroup *iter; 3986 4008 long val = 0; ··· 3998 4020 { 3999 4021 u64 val; 4000 4022 4001 - if (!mem_cgroup_is_root(memcg)) { 4023 + if (mem_cgroup_is_root(memcg)) { 4024 + val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); 4025 + val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); 4026 + if (swap) 4027 + val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); 4028 + } else { 4002 4029 if (!swap) 4003 - return res_counter_read_u64(&memcg->res, RES_USAGE); 4030 + val = page_counter_read(&memcg->memory); 4004 4031 else 4005 - return res_counter_read_u64(&memcg->memsw, RES_USAGE); 4032 + val = page_counter_read(&memcg->memsw); 4006 4033 } 4007 - 4008 - /* 4009 - * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS 4010 - * as well as in MEM_CGROUP_STAT_RSS_HUGE. 
4011 - */ 4012 - val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); 4013 - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 4014 - 4015 - if (swap) 4016 - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); 4017 - 4018 4034 return val << PAGE_SHIFT; 4019 4035 } 4020 4036 4037 + enum { 4038 + RES_USAGE, 4039 + RES_LIMIT, 4040 + RES_MAX_USAGE, 4041 + RES_FAILCNT, 4042 + RES_SOFT_LIMIT, 4043 + }; 4021 4044 4022 4045 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 4023 4046 struct cftype *cft) 4024 4047 { 4025 4048 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4026 - enum res_type type = MEMFILE_TYPE(cft->private); 4027 - int name = MEMFILE_ATTR(cft->private); 4049 + struct page_counter *counter; 4028 4050 4029 - switch (type) { 4051 + switch (MEMFILE_TYPE(cft->private)) { 4030 4052 case _MEM: 4031 - if (name == RES_USAGE) 4032 - return mem_cgroup_usage(memcg, false); 4033 - return res_counter_read_u64(&memcg->res, name); 4034 - case _MEMSWAP: 4035 - if (name == RES_USAGE) 4036 - return mem_cgroup_usage(memcg, true); 4037 - return res_counter_read_u64(&memcg->memsw, name); 4038 - case _KMEM: 4039 - return res_counter_read_u64(&memcg->kmem, name); 4053 + counter = &memcg->memory; 4040 4054 break; 4055 + case _MEMSWAP: 4056 + counter = &memcg->memsw; 4057 + break; 4058 + case _KMEM: 4059 + counter = &memcg->kmem; 4060 + break; 4061 + default: 4062 + BUG(); 4063 + } 4064 + 4065 + switch (MEMFILE_ATTR(cft->private)) { 4066 + case RES_USAGE: 4067 + if (counter == &memcg->memory) 4068 + return mem_cgroup_usage(memcg, false); 4069 + if (counter == &memcg->memsw) 4070 + return mem_cgroup_usage(memcg, true); 4071 + return (u64)page_counter_read(counter) * PAGE_SIZE; 4072 + case RES_LIMIT: 4073 + return (u64)counter->limit * PAGE_SIZE; 4074 + case RES_MAX_USAGE: 4075 + return (u64)counter->watermark * PAGE_SIZE; 4076 + case RES_FAILCNT: 4077 + return counter->failcnt; 4078 + case RES_SOFT_LIMIT: 4079 + return 
(u64)memcg->soft_limit * PAGE_SIZE; 4041 4080 default: 4042 4081 BUG(); 4043 4082 } ··· 4063 4068 #ifdef CONFIG_MEMCG_KMEM 4064 4069 /* should be called with activate_kmem_mutex held */ 4065 4070 static int __memcg_activate_kmem(struct mem_cgroup *memcg, 4066 - unsigned long long limit) 4071 + unsigned long nr_pages) 4067 4072 { 4068 4073 int err = 0; 4069 4074 int memcg_id; ··· 4110 4115 * We couldn't have accounted to this cgroup, because it hasn't got the 4111 4116 * active bit set yet, so this should succeed. 4112 4117 */ 4113 - err = res_counter_set_limit(&memcg->kmem, limit); 4118 + err = page_counter_limit(&memcg->kmem, nr_pages); 4114 4119 VM_BUG_ON(err); 4115 4120 4116 4121 static_key_slow_inc(&memcg_kmem_enabled_key); ··· 4126 4131 } 4127 4132 4128 4133 static int memcg_activate_kmem(struct mem_cgroup *memcg, 4129 - unsigned long long limit) 4134 + unsigned long nr_pages) 4130 4135 { 4131 4136 int ret; 4132 4137 4133 4138 mutex_lock(&activate_kmem_mutex); 4134 - ret = __memcg_activate_kmem(memcg, limit); 4139 + ret = __memcg_activate_kmem(memcg, nr_pages); 4135 4140 mutex_unlock(&activate_kmem_mutex); 4136 4141 return ret; 4137 4142 } 4138 4143 4139 4144 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 4140 - unsigned long long val) 4145 + unsigned long limit) 4141 4146 { 4142 4147 int ret; 4143 4148 4149 + mutex_lock(&memcg_limit_mutex); 4144 4150 if (!memcg_kmem_is_active(memcg)) 4145 - ret = memcg_activate_kmem(memcg, val); 4151 + ret = memcg_activate_kmem(memcg, limit); 4146 4152 else 4147 - ret = res_counter_set_limit(&memcg->kmem, val); 4153 + ret = page_counter_limit(&memcg->kmem, limit); 4154 + mutex_unlock(&memcg_limit_mutex); 4148 4155 return ret; 4149 4156 } 4150 4157 ··· 4164 4167 * after this point, because it has at least one child already. 
4165 4168 */ 4166 4169 if (memcg_kmem_is_active(parent)) 4167 - ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); 4170 + ret = __memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); 4168 4171 mutex_unlock(&activate_kmem_mutex); 4169 4172 return ret; 4170 4173 } 4171 4174 #else 4172 4175 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 4173 - unsigned long long val) 4176 + unsigned long limit) 4174 4177 { 4175 4178 return -EINVAL; 4176 4179 } ··· 4184 4187 char *buf, size_t nbytes, loff_t off) 4185 4188 { 4186 4189 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4187 - enum res_type type; 4188 - int name; 4189 - unsigned long long val; 4190 + unsigned long nr_pages; 4190 4191 int ret; 4191 4192 4192 4193 buf = strstrip(buf); 4193 - type = MEMFILE_TYPE(of_cft(of)->private); 4194 - name = MEMFILE_ATTR(of_cft(of)->private); 4194 + ret = page_counter_memparse(buf, &nr_pages); 4195 + if (ret) 4196 + return ret; 4195 4197 4196 - switch (name) { 4198 + switch (MEMFILE_ATTR(of_cft(of)->private)) { 4197 4199 case RES_LIMIT: 4198 4200 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 4199 4201 ret = -EINVAL; 4200 4202 break; 4201 4203 } 4202 - /* This function does all necessary parse...reuse it */ 4203 - ret = res_counter_memparse_write_strategy(buf, &val); 4204 - if (ret) 4204 + switch (MEMFILE_TYPE(of_cft(of)->private)) { 4205 + case _MEM: 4206 + ret = mem_cgroup_resize_limit(memcg, nr_pages); 4205 4207 break; 4206 - if (type == _MEM) 4207 - ret = mem_cgroup_resize_limit(memcg, val); 4208 - else if (type == _MEMSWAP) 4209 - ret = mem_cgroup_resize_memsw_limit(memcg, val); 4210 - else if (type == _KMEM) 4211 - ret = memcg_update_kmem_limit(memcg, val); 4212 - else 4213 - return -EINVAL; 4208 + case _MEMSWAP: 4209 + ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); 4210 + break; 4211 + case _KMEM: 4212 + ret = memcg_update_kmem_limit(memcg, nr_pages); 4213 + break; 4214 + } 4214 4215 break; 4215 4216 case RES_SOFT_LIMIT: 4216 - ret = 
res_counter_memparse_write_strategy(buf, &val); 4217 - if (ret) 4218 - break; 4219 - /* 4220 - * For memsw, soft limits are hard to implement in terms 4221 - * of semantics, for now, we support soft limits for 4222 - * control without swap 4223 - */ 4224 - if (type == _MEM) 4225 - ret = res_counter_set_soft_limit(&memcg->res, val); 4226 - else 4227 - ret = -EINVAL; 4228 - break; 4229 - default: 4230 - ret = -EINVAL; /* should be BUG() ? */ 4217 + memcg->soft_limit = nr_pages; 4218 + ret = 0; 4231 4219 break; 4232 4220 } 4233 4221 return ret ?: nbytes; 4234 - } 4235 - 4236 - static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 4237 - unsigned long long *mem_limit, unsigned long long *memsw_limit) 4238 - { 4239 - unsigned long long min_limit, min_memsw_limit, tmp; 4240 - 4241 - min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4242 - min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4243 - if (!memcg->use_hierarchy) 4244 - goto out; 4245 - 4246 - while (memcg->css.parent) { 4247 - memcg = mem_cgroup_from_css(memcg->css.parent); 4248 - if (!memcg->use_hierarchy) 4249 - break; 4250 - tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 4251 - min_limit = min(min_limit, tmp); 4252 - tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4253 - min_memsw_limit = min(min_memsw_limit, tmp); 4254 - } 4255 - out: 4256 - *mem_limit = min_limit; 4257 - *memsw_limit = min_memsw_limit; 4258 4222 } 4259 4223 4260 4224 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 4261 4225 size_t nbytes, loff_t off) 4262 4226 { 4263 4227 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4264 - int name; 4265 - enum res_type type; 4228 + struct page_counter *counter; 4266 4229 4267 - type = MEMFILE_TYPE(of_cft(of)->private); 4268 - name = MEMFILE_ATTR(of_cft(of)->private); 4230 + switch (MEMFILE_TYPE(of_cft(of)->private)) { 4231 + case _MEM: 4232 + counter = &memcg->memory; 4233 + break; 4234 + case _MEMSWAP: 4235 + counter = 
&memcg->memsw; 4236 + break; 4237 + case _KMEM: 4238 + counter = &memcg->kmem; 4239 + break; 4240 + default: 4241 + BUG(); 4242 + } 4269 4243 4270 - switch (name) { 4244 + switch (MEMFILE_ATTR(of_cft(of)->private)) { 4271 4245 case RES_MAX_USAGE: 4272 - if (type == _MEM) 4273 - res_counter_reset_max(&memcg->res); 4274 - else if (type == _MEMSWAP) 4275 - res_counter_reset_max(&memcg->memsw); 4276 - else if (type == _KMEM) 4277 - res_counter_reset_max(&memcg->kmem); 4278 - else 4279 - return -EINVAL; 4246 + page_counter_reset_watermark(counter); 4280 4247 break; 4281 4248 case RES_FAILCNT: 4282 - if (type == _MEM) 4283 - res_counter_reset_failcnt(&memcg->res); 4284 - else if (type == _MEMSWAP) 4285 - res_counter_reset_failcnt(&memcg->memsw); 4286 - else if (type == _KMEM) 4287 - res_counter_reset_failcnt(&memcg->kmem); 4288 - else 4289 - return -EINVAL; 4249 + counter->failcnt = 0; 4290 4250 break; 4251 + default: 4252 + BUG(); 4291 4253 } 4292 4254 4293 4255 return nbytes; ··· 4343 4387 static int memcg_stat_show(struct seq_file *m, void *v) 4344 4388 { 4345 4389 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 4390 + unsigned long memory, memsw; 4346 4391 struct mem_cgroup *mi; 4347 4392 unsigned int i; 4348 4393 ··· 4363 4406 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 4364 4407 4365 4408 /* Hierarchical information */ 4366 - { 4367 - unsigned long long limit, memsw_limit; 4368 - memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 4369 - seq_printf(m, "hierarchical_memory_limit %llu\n", limit); 4370 - if (do_swap_account) 4371 - seq_printf(m, "hierarchical_memsw_limit %llu\n", 4372 - memsw_limit); 4409 + memory = memsw = PAGE_COUNTER_MAX; 4410 + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 4411 + memory = min(memory, mi->memory.limit); 4412 + memsw = min(memsw, mi->memsw.limit); 4373 4413 } 4414 + seq_printf(m, "hierarchical_memory_limit %llu\n", 4415 + (u64)memory * PAGE_SIZE); 4416 + if (do_swap_account) 4417 + seq_printf(m, 
"hierarchical_memsw_limit %llu\n", 4418 + (u64)memsw * PAGE_SIZE); 4374 4419 4375 4420 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4376 4421 long long val = 0; ··· 4456 4497 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4457 4498 { 4458 4499 struct mem_cgroup_threshold_ary *t; 4459 - u64 usage; 4500 + unsigned long usage; 4460 4501 int i; 4461 4502 4462 4503 rcu_read_lock(); ··· 4555 4596 { 4556 4597 struct mem_cgroup_thresholds *thresholds; 4557 4598 struct mem_cgroup_threshold_ary *new; 4558 - u64 threshold, usage; 4599 + unsigned long threshold; 4600 + unsigned long usage; 4559 4601 int i, size, ret; 4560 4602 4561 - ret = res_counter_memparse_write_strategy(args, &threshold); 4603 + ret = page_counter_memparse(args, &threshold); 4562 4604 if (ret) 4563 4605 return ret; 4564 4606 ··· 4649 4689 { 4650 4690 struct mem_cgroup_thresholds *thresholds; 4651 4691 struct mem_cgroup_threshold_ary *new; 4652 - u64 usage; 4692 + unsigned long usage; 4653 4693 int i, j, size; 4654 4694 4655 4695 mutex_lock(&memcg->thresholds_lock); ··· 4843 4883 4844 4884 memcg_kmem_mark_dead(memcg); 4845 4885 4846 - if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) 4886 + if (page_counter_read(&memcg->kmem)) 4847 4887 return; 4848 4888 4849 4889 if (memcg_kmem_test_and_clear_dead(memcg)) ··· 5323 5363 */ 5324 5364 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 5325 5365 { 5326 - if (!memcg->res.parent) 5366 + if (!memcg->memory.parent) 5327 5367 return NULL; 5328 - return mem_cgroup_from_res_counter(memcg->res.parent, res); 5368 + return mem_cgroup_from_counter(memcg->memory.parent, memory); 5329 5369 } 5330 5370 EXPORT_SYMBOL(parent_mem_cgroup); 5331 5371 ··· 5370 5410 /* root ? 
*/ 5371 5411 if (parent_css == NULL) { 5372 5412 root_mem_cgroup = memcg; 5373 - res_counter_init(&memcg->res, NULL); 5374 - res_counter_init(&memcg->memsw, NULL); 5375 - res_counter_init(&memcg->kmem, NULL); 5413 + page_counter_init(&memcg->memory, NULL); 5414 + page_counter_init(&memcg->memsw, NULL); 5415 + page_counter_init(&memcg->kmem, NULL); 5376 5416 } 5377 5417 5378 5418 memcg->last_scanned_node = MAX_NUMNODES; ··· 5411 5451 memcg->swappiness = mem_cgroup_swappiness(parent); 5412 5452 5413 5453 if (parent->use_hierarchy) { 5414 - res_counter_init(&memcg->res, &parent->res); 5415 - res_counter_init(&memcg->memsw, &parent->memsw); 5416 - res_counter_init(&memcg->kmem, &parent->kmem); 5454 + page_counter_init(&memcg->memory, &parent->memory); 5455 + page_counter_init(&memcg->memsw, &parent->memsw); 5456 + page_counter_init(&memcg->kmem, &parent->kmem); 5417 5457 5418 5458 /* 5419 5459 * No need to take a reference to the parent because cgroup 5420 5460 * core guarantees its existence. 5421 5461 */ 5422 5462 } else { 5423 - res_counter_init(&memcg->res, NULL); 5424 - res_counter_init(&memcg->memsw, NULL); 5425 - res_counter_init(&memcg->kmem, NULL); 5463 + page_counter_init(&memcg->memory, NULL); 5464 + page_counter_init(&memcg->memsw, NULL); 5465 + page_counter_init(&memcg->kmem, NULL); 5426 5466 /* 5427 5467 * Deeper hierachy with use_hierarchy == false doesn't make 5428 5468 * much sense so let cgroup subsystem know about this ··· 5504 5544 /* 5505 5545 * XXX: css_offline() would be where we should reparent all 5506 5546 * memory to prepare the cgroup for destruction. However, 5507 - * memcg does not do css_tryget_online() and res_counter charging 5547 + * memcg does not do css_tryget_online() and page_counter charging 5508 5548 * under the same RCU lock region, which means that charging 5509 5549 * could race with offlining. 
Offlining only happens to 5510 5550 * cgroups with no tasks in them but charges can show up ··· 5524 5564 * call_rcu() 5525 5565 * offline_css() 5526 5566 * reparent_charges() 5527 - * res_counter_charge() 5567 + * page_counter_try_charge() 5528 5568 * css_put() 5529 5569 * css_free() 5530 5570 * pc->mem_cgroup = dead memcg ··· 5559 5599 { 5560 5600 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5561 5601 5562 - mem_cgroup_resize_limit(memcg, ULLONG_MAX); 5563 - mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); 5564 - memcg_update_kmem_limit(memcg, ULLONG_MAX); 5565 - res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); 5602 + mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); 5603 + mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); 5604 + memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); 5605 + memcg->soft_limit = 0; 5566 5606 } 5567 5607 5568 5608 #ifdef CONFIG_MMU ··· 5876 5916 if (mc.moved_swap) { 5877 5917 /* uncharge swap account from the old cgroup */ 5878 5918 if (!mem_cgroup_is_root(mc.from)) 5879 - res_counter_uncharge(&mc.from->memsw, 5880 - PAGE_SIZE * mc.moved_swap); 5919 + page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 5920 + 5921 + /* 5922 + * we charged both to->memory and to->memsw, so we 5923 + * should uncharge to->memory. 5924 + */ 5925 + if (!mem_cgroup_is_root(mc.to)) 5926 + page_counter_uncharge(&mc.to->memory, mc.moved_swap); 5881 5927 5882 5928 for (i = 0; i < mc.moved_swap; i++) 5883 5929 css_put(&mc.from->css); 5884 5930 5885 - /* 5886 - * we charged both to->res and to->memsw, so we should 5887 - * uncharge to->res. 
5888 - */ 5889 - if (!mem_cgroup_is_root(mc.to)) 5890 - res_counter_uncharge(&mc.to->res, 5891 - PAGE_SIZE * mc.moved_swap); 5892 5931 /* we've already done css_get(mc.to) */ 5893 5932 mc.moved_swap = 0; 5894 5933 } ··· 6253 6294 memcg = mem_cgroup_lookup(id); 6254 6295 if (memcg) { 6255 6296 if (!mem_cgroup_is_root(memcg)) 6256 - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 6297 + page_counter_uncharge(&memcg->memsw, 1); 6257 6298 mem_cgroup_swap_statistics(memcg, false); 6258 6299 css_put(&memcg->css); 6259 6300 } ··· 6419 6460 6420 6461 if (!mem_cgroup_is_root(memcg)) { 6421 6462 if (nr_mem) 6422 - res_counter_uncharge(&memcg->res, 6423 - nr_mem * PAGE_SIZE); 6463 + page_counter_uncharge(&memcg->memory, nr_mem); 6424 6464 if (nr_memsw) 6425 - res_counter_uncharge(&memcg->memsw, 6426 - nr_memsw * PAGE_SIZE); 6465 + page_counter_uncharge(&memcg->memsw, nr_memsw); 6427 6466 memcg_oom_recover(memcg); 6428 6467 } 6429 6468
+207
mm/page_counter.c
··· 1 + /* 2 + * Lockless hierarchical page accounting & limiting 3 + * 4 + * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner 5 + */ 6 + 7 + #include <linux/page_counter.h> 8 + #include <linux/atomic.h> 9 + #include <linux/kernel.h> 10 + #include <linux/string.h> 11 + #include <linux/sched.h> 12 + #include <linux/bug.h> 13 + #include <asm/page.h> 14 + 15 + /** 16 + * page_counter_cancel - take pages out of the local counter 17 + * @counter: counter 18 + * @nr_pages: number of pages to cancel 19 + * 20 + * Returns whether there are remaining pages in the counter. 21 + */ 22 + int page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) 23 + { 24 + long new; 25 + 26 + new = atomic_long_sub_return(nr_pages, &counter->count); 27 + 28 + /* More uncharges than charges? */ 29 + WARN_ON_ONCE(new < 0); 30 + 31 + return new > 0; 32 + } 33 + 34 + /** 35 + * page_counter_charge - hierarchically charge pages 36 + * @counter: counter 37 + * @nr_pages: number of pages to charge 38 + * 39 + * NOTE: This does not consider any configured counter limits. 40 + */ 41 + void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) 42 + { 43 + struct page_counter *c; 44 + 45 + for (c = counter; c; c = c->parent) { 46 + long new; 47 + 48 + new = atomic_long_add_return(nr_pages, &c->count); 49 + /* 50 + * This is indeed racy, but we can live with some 51 + * inaccuracy in the watermark. 52 + */ 53 + if (new > c->watermark) 54 + c->watermark = new; 55 + } 56 + } 57 + 58 + /** 59 + * page_counter_try_charge - try to hierarchically charge pages 60 + * @counter: counter 61 + * @nr_pages: number of pages to charge 62 + * @fail: points first counter to hit its limit, if any 63 + * 64 + * Returns 0 on success, or -ENOMEM and @fail if the counter or one of 65 + * its ancestors has hit its configured limit. 
66 + */ 67 + int page_counter_try_charge(struct page_counter *counter, 68 + unsigned long nr_pages, 69 + struct page_counter **fail) 70 + { 71 + struct page_counter *c; 72 + 73 + for (c = counter; c; c = c->parent) { 74 + long new; 75 + /* 76 + * Charge speculatively to avoid an expensive CAS. If 77 + * a bigger charge fails, it might falsely lock out a 78 + * racing smaller charge and send it into reclaim 79 + * early, but the error is limited to the difference 80 + * between the two sizes, which is less than 2M/4M in 81 + * case of a THP locking out a regular page charge. 82 + * 83 + * The atomic_long_add_return() implies a full memory 84 + * barrier between incrementing the count and reading 85 + * the limit. When racing with page_counter_limit(), 86 + * we either see the new limit or the setter sees the 87 + * counter has changed and retries. 88 + */ 89 + new = atomic_long_add_return(nr_pages, &c->count); 90 + if (new > c->limit) { 91 + atomic_long_sub(nr_pages, &c->count); 92 + /* 93 + * This is racy, but we can live with some 94 + * inaccuracy in the failcnt. 95 + */ 96 + c->failcnt++; 97 + *fail = c; 98 + goto failed; 99 + } 100 + /* 101 + * Just like with failcnt, we can live with some 102 + * inaccuracy in the watermark. 103 + */ 104 + if (new > c->watermark) 105 + c->watermark = new; 106 + } 107 + return 0; 108 + 109 + failed: 110 + for (c = counter; c != *fail; c = c->parent) 111 + page_counter_cancel(c, nr_pages); 112 + 113 + return -ENOMEM; 114 + } 115 + 116 + /** 117 + * page_counter_uncharge - hierarchically uncharge pages 118 + * @counter: counter 119 + * @nr_pages: number of pages to uncharge 120 + * 121 + * Returns whether there are remaining charges in @counter. 
122 + */ 123 + int page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) 124 + { 125 + struct page_counter *c; 126 + int ret = 1; 127 + 128 + for (c = counter; c; c = c->parent) { 129 + int remainder; 130 + 131 + remainder = page_counter_cancel(c, nr_pages); 132 + if (c == counter && !remainder) 133 + ret = 0; 134 + } 135 + 136 + return ret; 137 + } 138 + 139 + /** 140 + * page_counter_limit - limit the number of pages allowed 141 + * @counter: counter 142 + * @limit: limit to set 143 + * 144 + * Returns 0 on success, -EBUSY if the current number of pages on the 145 + * counter already exceeds the specified limit. 146 + * 147 + * The caller must serialize invocations on the same counter. 148 + */ 149 + int page_counter_limit(struct page_counter *counter, unsigned long limit) 150 + { 151 + for (;;) { 152 + unsigned long old; 153 + long count; 154 + 155 + /* 156 + * Update the limit while making sure that it's not 157 + * below the concurrently-changing counter value. 158 + * 159 + * The xchg implies two full memory barriers before 160 + * and after, so the read-swap-read is ordered and 161 + * ensures coherency with page_counter_try_charge(): 162 + * that function modifies the count before checking 163 + * the limit, so if it sees the old limit, we see the 164 + * modified counter and retry. 165 + */ 166 + count = atomic_long_read(&counter->count); 167 + 168 + if (count > limit) 169 + return -EBUSY; 170 + 171 + old = xchg(&counter->limit, limit); 172 + 173 + if (atomic_long_read(&counter->count) <= count) 174 + return 0; 175 + 176 + counter->limit = old; 177 + cond_resched(); 178 + } 179 + } 180 + 181 + /** 182 + * page_counter_memparse - memparse() for page counter limits 183 + * @buf: string to parse 184 + * @nr_pages: returns the result in number of pages 185 + * 186 + * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be 187 + * limited to %PAGE_COUNTER_MAX. 
188 + */ 189 + int page_counter_memparse(const char *buf, unsigned long *nr_pages) 190 + { 191 + char unlimited[] = "-1"; 192 + char *end; 193 + u64 bytes; 194 + 195 + if (!strncmp(buf, unlimited, sizeof(unlimited))) { 196 + *nr_pages = PAGE_COUNTER_MAX; 197 + return 0; 198 + } 199 + 200 + bytes = memparse(buf, &end); 201 + if (*end != '\0') 202 + return -EINVAL; 203 + 204 + *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); 205 + 206 + return 0; 207 + }
+44 -43
net/ipv4/tcp_memcontrol.c
··· 9 9 int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 10 10 { 11 11 /* 12 - * The root cgroup does not use res_counters, but rather, 12 + * The root cgroup does not use page_counters, but rather, 13 13 * rely on the data already collected by the network 14 14 * subsystem 15 15 */ 16 - struct res_counter *res_parent = NULL; 17 - struct cg_proto *cg_proto, *parent_cg; 18 16 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 17 + struct page_counter *counter_parent = NULL; 18 + struct cg_proto *cg_proto, *parent_cg; 19 19 20 20 cg_proto = tcp_prot.proto_cgroup(memcg); 21 21 if (!cg_proto) ··· 29 29 30 30 parent_cg = tcp_prot.proto_cgroup(parent); 31 31 if (parent_cg) 32 - res_parent = &parent_cg->memory_allocated; 32 + counter_parent = &parent_cg->memory_allocated; 33 33 34 - res_counter_init(&cg_proto->memory_allocated, res_parent); 34 + page_counter_init(&cg_proto->memory_allocated, counter_parent); 35 35 percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); 36 36 37 37 return 0; ··· 50 50 } 51 51 EXPORT_SYMBOL(tcp_destroy_cgroup); 52 52 53 - static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) 53 + static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) 54 54 { 55 55 struct cg_proto *cg_proto; 56 56 int i; ··· 60 60 if (!cg_proto) 61 61 return -EINVAL; 62 62 63 - if (val > RES_COUNTER_MAX) 64 - val = RES_COUNTER_MAX; 65 - 66 - ret = res_counter_set_limit(&cg_proto->memory_allocated, val); 63 + ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages); 67 64 if (ret) 68 65 return ret; 69 66 70 67 for (i = 0; i < 3; i++) 71 - cg_proto->sysctl_mem[i] = min_t(long, val >> PAGE_SHIFT, 68 + cg_proto->sysctl_mem[i] = min_t(long, nr_pages, 72 69 sysctl_tcp_mem[i]); 73 70 74 - if (val == RES_COUNTER_MAX) 71 + if (nr_pages == PAGE_COUNTER_MAX) 75 72 clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); 76 - else if (val != RES_COUNTER_MAX) { 73 + else { 77 74 /* 78 75 * The active bit needs to be 
written after the static_key 79 76 * update. This is what guarantees that the socket activation ··· 99 102 return 0; 100 103 } 101 104 105 + enum { 106 + RES_USAGE, 107 + RES_LIMIT, 108 + RES_MAX_USAGE, 109 + RES_FAILCNT, 110 + }; 111 + 112 + static DEFINE_MUTEX(tcp_limit_mutex); 113 + 102 114 static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, 103 115 char *buf, size_t nbytes, loff_t off) 104 116 { 105 117 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 106 - unsigned long long val; 118 + unsigned long nr_pages; 107 119 int ret = 0; 108 120 109 121 buf = strstrip(buf); ··· 120 114 switch (of_cft(of)->private) { 121 115 case RES_LIMIT: 122 116 /* see memcontrol.c */ 123 - ret = res_counter_memparse_write_strategy(buf, &val); 117 + ret = page_counter_memparse(buf, &nr_pages); 124 118 if (ret) 125 119 break; 126 - ret = tcp_update_limit(memcg, val); 120 + mutex_lock(&tcp_limit_mutex); 121 + ret = tcp_update_limit(memcg, nr_pages); 122 + mutex_unlock(&tcp_limit_mutex); 127 123 break; 128 124 default: 129 125 ret = -EINVAL; ··· 134 126 return ret ?: nbytes; 135 127 } 136 128 137 - static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) 138 - { 139 - struct cg_proto *cg_proto; 140 - 141 - cg_proto = tcp_prot.proto_cgroup(memcg); 142 - if (!cg_proto) 143 - return default_val; 144 - 145 - return res_counter_read_u64(&cg_proto->memory_allocated, type); 146 - } 147 - 148 - static u64 tcp_read_usage(struct mem_cgroup *memcg) 149 - { 150 - struct cg_proto *cg_proto; 151 - 152 - cg_proto = tcp_prot.proto_cgroup(memcg); 153 - if (!cg_proto) 154 - return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT; 155 - 156 - return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE); 157 - } 158 - 159 129 static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) 160 130 { 161 131 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 132 + struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg); 162 133 u64 val; 
163 134 164 135 switch (cft->private) { 165 136 case RES_LIMIT: 166 - val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX); 137 + if (!cg_proto) 138 + return PAGE_COUNTER_MAX; 139 + val = cg_proto->memory_allocated.limit; 140 + val *= PAGE_SIZE; 167 141 break; 168 142 case RES_USAGE: 169 - val = tcp_read_usage(memcg); 143 + if (!cg_proto) 144 + val = atomic_long_read(&tcp_memory_allocated); 145 + else 146 + val = page_counter_read(&cg_proto->memory_allocated); 147 + val *= PAGE_SIZE; 170 148 break; 171 149 case RES_FAILCNT: 150 + if (!cg_proto) 151 + return 0; 152 + val = cg_proto->memory_allocated.failcnt; 153 + break; 172 154 case RES_MAX_USAGE: 173 - val = tcp_read_stat(memcg, cft->private, 0); 155 + if (!cg_proto) 156 + return 0; 157 + val = cg_proto->memory_allocated.watermark; 158 + val *= PAGE_SIZE; 174 159 break; 175 160 default: 176 161 BUG(); ··· 184 183 185 184 switch (of_cft(of)->private) { 186 185 case RES_MAX_USAGE: 187 - res_counter_reset_max(&cg_proto->memory_allocated); 186 + page_counter_reset_watermark(&cg_proto->memory_allocated); 188 187 break; 189 188 case RES_FAILCNT: 190 - res_counter_reset_failcnt(&cg_proto->memory_allocated); 189 + cg_proto->memory_allocated.failcnt = 0; 191 190 break; 192 191 } 193 192