/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include "internal.h"

#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5
struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
static int really_do_swap_account __initdata = 1; /* to remember the boot option */
#else
#define do_swap_account		(0)
#endif

static DEFINE_MUTEX(memcg_tasklist);	/* can be held under cgroup_mutex */
#define SOFTLIMIT_EVENTS_THRESH (1000)

/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_MAPPED_FILE,	/* # of pages charged as file rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
	MEM_CGROUP_STAT_EVENTS,	/* sum of pagein + pageout for internal use */
	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[0];
};

static inline void
__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
				enum mem_cgroup_stat_index idx)
{
	stat->count[idx] = 0;
}

static inline s64
__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
				enum mem_cgroup_stat_index idx)
{
	return stat->count[idx];
}

/*
 * For accounting under irq disable, no need to increment the preempt count.
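 * Callers either hold a page_cgroup bit spinlock or have called get_cpu(),
 * so preemption is already disabled here.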
102 */ 103static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, 104 enum mem_cgroup_stat_index idx, int val) 105{ 106 stat->count[idx] += val; 107} 108 109static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, 110 enum mem_cgroup_stat_index idx) 111{ 112 int cpu; 113 s64 ret = 0; 114 for_each_possible_cpu(cpu) 115 ret += stat->cpustat[cpu].count[idx]; 116 return ret; 117} 118 119static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) 120{ 121 s64 ret; 122 123 ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); 124 ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); 125 return ret; 126} 127 128/* 129 * per-zone information in memory controller. 130 */ 131struct mem_cgroup_per_zone { 132 /* 133 * spin_lock to protect the per cgroup LRU 134 */ 135 struct list_head lists[NR_LRU_LISTS]; 136 unsigned long count[NR_LRU_LISTS]; 137 138 struct zone_reclaim_stat reclaim_stat; 139 struct rb_node tree_node; /* RB tree node */ 140 unsigned long long usage_in_excess;/* Set to the value by which */ 141 /* the soft limit is exceeded*/ 142 bool on_tree; 143 struct mem_cgroup *mem; /* Back pointer, we cannot */ 144 /* use container_of */ 145}; 146/* Macro for accessing counter */ 147#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 148 149struct mem_cgroup_per_node { 150 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 151}; 152 153struct mem_cgroup_lru_info { 154 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 155}; 156 157/* 158 * Cgroups above their limits are maintained in a RB-Tree, independent of 159 * their hierarchy representation 160 */ 161 162struct mem_cgroup_tree_per_zone { 163 struct rb_root rb_root; 164 spinlock_t lock; 165}; 166 167struct mem_cgroup_tree_per_node { 168 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 169}; 170 171struct mem_cgroup_tree { 172 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 173}; 174 175static struct mem_cgroup_tree soft_limit_tree __read_mostly; 176 177/* 178 * The memory controller data structure. The memory controller controls both 179 * page cache and RSS per cgroup. We would eventually like to provide 180 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 181 * to help the administrator determine what knobs to tune. 182 * 183 * TODO: Add a water mark for the memory controller. Reclaim will begin when 184 * we hit the water mark. May be even add a low water mark, such that 185 * no reclaim occurs from a cgroup at it's low water mark, this is 186 * a feature that will be implemented much later in the future. 187 */ 188struct mem_cgroup { 189 struct cgroup_subsys_state css; 190 /* 191 * the counter to account for memory usage 192 */ 193 struct res_counter res; 194 /* 195 * the counter to account for mem+swap usage. 196 */ 197 struct res_counter memsw; 198 /* 199 * Per cgroup active and inactive list, similar to the 200 * per zone LRU lists. 201 */ 202 struct mem_cgroup_lru_info info; 203 204 /* 205 protect against reclaim related member. 206 */ 207 spinlock_t reclaim_param_lock; 208 209 int prev_priority; /* for recording reclaim priority */ 210 211 /* 212 * While reclaiming in a hiearchy, we cache the last child we 213 * reclaimed from. 214 */ 215 int last_scanned_child; 216 /* 217 * Should the accounting and control be hierarchical, per subtree? 
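	 * (Toggled from user space via the memory.use_hierarchy file; when
	 * true, charges and hierarchical reclaim span the whole subtree.)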
218 */ 219 bool use_hierarchy; 220 unsigned long last_oom_jiffies; 221 atomic_t refcnt; 222 223 unsigned int swappiness; 224 225 /* set when res.limit == memsw.limit */ 226 bool memsw_is_minimum; 227 228 /* 229 * statistics. This must be placed at the end of memcg. 230 */ 231 struct mem_cgroup_stat stat; 232}; 233 234/* 235 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 236 * limit reclaim to prevent infinite loops, if they ever occur. 237 */ 238#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) 239#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) 240 241enum charge_type { 242 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 243 MEM_CGROUP_CHARGE_TYPE_MAPPED, 244 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 245 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 246 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 247 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 248 NR_CHARGE_TYPE, 249}; 250 251/* only for here (for easy reading.) */ 252#define PCGF_CACHE (1UL << PCG_CACHE) 253#define PCGF_USED (1UL << PCG_USED) 254#define PCGF_LOCK (1UL << PCG_LOCK) 255/* Not used, but added here for completeness */ 256#define PCGF_ACCT (1UL << PCG_ACCT) 257 258/* for encoding cft->private value on file */ 259#define _MEM (0) 260#define _MEMSWAP (1) 261#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 262#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 263#define MEMFILE_ATTR(val) ((val) & 0xffff) 264 265/* 266 * Reclaim flags for mem_cgroup_hierarchical_reclaim 267 */ 268#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 269#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 270#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 271#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 272#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 273#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) 274 275static void mem_cgroup_get(struct mem_cgroup *mem); 276static void mem_cgroup_put(struct mem_cgroup *mem); 277static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 278 279static struct mem_cgroup_per_zone * 280mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 281{ 282 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 283} 284 285static struct mem_cgroup_per_zone * 286page_cgroup_zoneinfo(struct page_cgroup *pc) 287{ 288 struct mem_cgroup *mem = pc->mem_cgroup; 289 int nid = page_cgroup_nid(pc); 290 int zid = page_cgroup_zid(pc); 291 292 if (!mem) 293 return NULL; 294 295 return mem_cgroup_zoneinfo(mem, nid, zid); 296} 297 298static struct mem_cgroup_tree_per_zone * 299soft_limit_tree_node_zone(int nid, int zid) 300{ 301 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 302} 303 304static struct mem_cgroup_tree_per_zone * 305soft_limit_tree_from_page(struct page *page) 306{ 307 int nid = page_to_nid(page); 308 int zid = page_zonenum(page); 309 310 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 311} 312 313static void 314__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, 315 struct mem_cgroup_per_zone *mz, 316 struct mem_cgroup_tree_per_zone *mctz) 317{ 318 struct rb_node **p = &mctz->rb_root.rb_node; 319 struct rb_node *parent = NULL; 320 struct mem_cgroup_per_zone *mz_node; 321 322 if (mz->on_tree) 323 return; 324 325 mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res); 326 while (*p) { 327 parent = *p; 328 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 329 tree_node); 330 if (mz->usage_in_excess < mz_node->usage_in_excess) 331 p = 
&(*p)->rb_left; 332 /* 333 * We can't avoid mem cgroups that are over their soft 334 * limit by the same amount 335 */ 336 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 337 p = &(*p)->rb_right; 338 } 339 rb_link_node(&mz->tree_node, parent, p); 340 rb_insert_color(&mz->tree_node, &mctz->rb_root); 341 mz->on_tree = true; 342} 343 344static void 345__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 346 struct mem_cgroup_per_zone *mz, 347 struct mem_cgroup_tree_per_zone *mctz) 348{ 349 if (!mz->on_tree) 350 return; 351 rb_erase(&mz->tree_node, &mctz->rb_root); 352 mz->on_tree = false; 353} 354 355static void 356mem_cgroup_insert_exceeded(struct mem_cgroup *mem, 357 struct mem_cgroup_per_zone *mz, 358 struct mem_cgroup_tree_per_zone *mctz) 359{ 360 spin_lock(&mctz->lock); 361 __mem_cgroup_insert_exceeded(mem, mz, mctz); 362 spin_unlock(&mctz->lock); 363} 364 365static void 366mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 367 struct mem_cgroup_per_zone *mz, 368 struct mem_cgroup_tree_per_zone *mctz) 369{ 370 spin_lock(&mctz->lock); 371 __mem_cgroup_remove_exceeded(mem, mz, mctz); 372 spin_unlock(&mctz->lock); 373} 374 375static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) 376{ 377 bool ret = false; 378 int cpu; 379 s64 val; 380 struct mem_cgroup_stat_cpu *cpustat; 381 382 cpu = get_cpu(); 383 cpustat = &mem->stat.cpustat[cpu]; 384 val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); 385 if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { 386 __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); 387 ret = true; 388 } 389 put_cpu(); 390 return ret; 391} 392 393static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 394{ 395 unsigned long long prev_usage_in_excess, new_usage_in_excess; 396 bool updated_tree = false; 397 struct mem_cgroup_per_zone *mz; 398 struct mem_cgroup_tree_per_zone *mctz; 399 400 mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page)); 401 mctz = soft_limit_tree_from_page(page); 402 403 /* 404 * We do updates in lazy mode, mem's are removed 405 * lazily from the per-zone, per-node rb tree 406 */ 407 prev_usage_in_excess = mz->usage_in_excess; 408 409 new_usage_in_excess = res_counter_soft_limit_excess(&mem->res); 410 if (prev_usage_in_excess) { 411 mem_cgroup_remove_exceeded(mem, mz, mctz); 412 updated_tree = true; 413 } 414 if (!new_usage_in_excess) 415 goto done; 416 mem_cgroup_insert_exceeded(mem, mz, mctz); 417 418done: 419 if (updated_tree) { 420 spin_lock(&mctz->lock); 421 mz->usage_in_excess = new_usage_in_excess; 422 spin_unlock(&mctz->lock); 423 } 424} 425 426static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) 427{ 428 int node, zone; 429 struct mem_cgroup_per_zone *mz; 430 struct mem_cgroup_tree_per_zone *mctz; 431 432 for_each_node_state(node, N_POSSIBLE) { 433 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 434 mz = mem_cgroup_zoneinfo(mem, node, zone); 435 mctz = soft_limit_tree_node_zone(node, zone); 436 mem_cgroup_remove_exceeded(mem, mz, mctz); 437 } 438 } 439} 440 441static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) 442{ 443 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; 444} 445 446static struct mem_cgroup_per_zone * 447__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 448{ 449 struct rb_node *rightmost = NULL; 450 struct mem_cgroup_per_zone *mz = NULL; 451 452retry: 453 rightmost = rb_last(&mctz->rb_root); 454 if (!rightmost) 455 goto done; /* Nothing to reclaim from */ 456 457 mz = 
rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 458 /* 459 * Remove the node now but someone else can add it back, 460 * we will to add it back at the end of reclaim to its correct 461 * position in the tree. 462 */ 463 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 464 if (!res_counter_soft_limit_excess(&mz->mem->res) || 465 !css_tryget(&mz->mem->css)) 466 goto retry; 467done: 468 return mz; 469} 470 471static struct mem_cgroup_per_zone * 472mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 473{ 474 struct mem_cgroup_per_zone *mz; 475 476 spin_lock(&mctz->lock); 477 mz = __mem_cgroup_largest_soft_limit_node(mctz); 478 spin_unlock(&mctz->lock); 479 return mz; 480} 481 482static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 483 bool charge) 484{ 485 int val = (charge) ? 1 : -1; 486 struct mem_cgroup_stat *stat = &mem->stat; 487 struct mem_cgroup_stat_cpu *cpustat; 488 int cpu = get_cpu(); 489 490 cpustat = &stat->cpustat[cpu]; 491 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); 492 put_cpu(); 493} 494 495static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 496 struct page_cgroup *pc, 497 bool charge) 498{ 499 int val = (charge) ? 1 : -1; 500 struct mem_cgroup_stat *stat = &mem->stat; 501 struct mem_cgroup_stat_cpu *cpustat; 502 int cpu = get_cpu(); 503 504 cpustat = &stat->cpustat[cpu]; 505 if (PageCgroupCache(pc)) 506 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); 507 else 508 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); 509 510 if (charge) 511 __mem_cgroup_stat_add_safe(cpustat, 512 MEM_CGROUP_STAT_PGPGIN_COUNT, 1); 513 else 514 __mem_cgroup_stat_add_safe(cpustat, 515 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 516 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); 517 put_cpu(); 518} 519 520static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 521 enum lru_list idx) 522{ 523 int nid, zid; 524 struct mem_cgroup_per_zone *mz; 525 u64 total = 0; 526 527 for_each_online_node(nid) 528 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 529 mz = mem_cgroup_zoneinfo(mem, nid, zid); 530 total += MEM_CGROUP_ZSTAT(mz, idx); 531 } 532 return total; 533} 534 535static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 536{ 537 return container_of(cgroup_subsys_state(cont, 538 mem_cgroup_subsys_id), struct mem_cgroup, 539 css); 540} 541 542struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 543{ 544 /* 545 * mm_update_next_owner() may clear mm->owner to NULL 546 * if it races with swapoff, page migration, etc. 547 * So this can be called with p == NULL. 548 */ 549 if (unlikely(!p)) 550 return NULL; 551 552 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 553 struct mem_cgroup, css); 554} 555 556static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 557{ 558 struct mem_cgroup *mem = NULL; 559 560 if (!mm) 561 return NULL; 562 /* 563 * Because we have no locks, mm->owner's may be being moved to other 564 * cgroup. We use css_tryget() here even if this looks 565 * pessimistic (rather than adding locks here). 566 */ 567 rcu_read_lock(); 568 do { 569 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 570 if (unlikely(!mem)) 571 break; 572 } while (!css_tryget(&mem->css)); 573 rcu_read_unlock(); 574 return mem; 575} 576 577/* 578 * Call callback function against all cgroup under hierarchy tree. 
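 * The walk stops as soon as a callback returns a non-zero value.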
579 */ 580static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, 581 int (*func)(struct mem_cgroup *, void *)) 582{ 583 int found, ret, nextid; 584 struct cgroup_subsys_state *css; 585 struct mem_cgroup *mem; 586 587 if (!root->use_hierarchy) 588 return (*func)(root, data); 589 590 nextid = 1; 591 do { 592 ret = 0; 593 mem = NULL; 594 595 rcu_read_lock(); 596 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, 597 &found); 598 if (css && css_tryget(css)) 599 mem = container_of(css, struct mem_cgroup, css); 600 rcu_read_unlock(); 601 602 if (mem) { 603 ret = (*func)(mem, data); 604 css_put(&mem->css); 605 } 606 nextid = found + 1; 607 } while (!ret && css); 608 609 return ret; 610} 611 612static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 613{ 614 return (mem == root_mem_cgroup); 615} 616 617/* 618 * Following LRU functions are allowed to be used without PCG_LOCK. 619 * Operations are called by routine of global LRU independently from memcg. 620 * What we have to take care of here is validness of pc->mem_cgroup. 621 * 622 * Changes to pc->mem_cgroup happens when 623 * 1. charge 624 * 2. moving account 625 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 626 * It is added to LRU before charge. 627 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 628 * When moving account, the page is not on LRU. It's isolated. 629 */ 630 631void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 632{ 633 struct page_cgroup *pc; 634 struct mem_cgroup_per_zone *mz; 635 636 if (mem_cgroup_disabled()) 637 return; 638 pc = lookup_page_cgroup(page); 639 /* can happen while we handle swapcache. */ 640 if (!TestClearPageCgroupAcctLRU(pc)) 641 return; 642 VM_BUG_ON(!pc->mem_cgroup); 643 /* 644 * We don't check PCG_USED bit. It's cleared when the "page" is finally 645 * removed from global LRU. 646 */ 647 mz = page_cgroup_zoneinfo(pc); 648 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 649 if (mem_cgroup_is_root(pc->mem_cgroup)) 650 return; 651 VM_BUG_ON(list_empty(&pc->lru)); 652 list_del_init(&pc->lru); 653 return; 654} 655 656void mem_cgroup_del_lru(struct page *page) 657{ 658 mem_cgroup_del_lru_list(page, page_lru(page)); 659} 660 661void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 662{ 663 struct mem_cgroup_per_zone *mz; 664 struct page_cgroup *pc; 665 666 if (mem_cgroup_disabled()) 667 return; 668 669 pc = lookup_page_cgroup(page); 670 /* 671 * Used bit is set without atomic ops but after smp_wmb(). 672 * For making pc->mem_cgroup visible, insert smp_rmb() here. 673 */ 674 smp_rmb(); 675 /* unused or root page is not rotated. */ 676 if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) 677 return; 678 mz = page_cgroup_zoneinfo(pc); 679 list_move(&pc->lru, &mz->lists[lru]); 680} 681 682void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) 683{ 684 struct page_cgroup *pc; 685 struct mem_cgroup_per_zone *mz; 686 687 if (mem_cgroup_disabled()) 688 return; 689 pc = lookup_page_cgroup(page); 690 VM_BUG_ON(PageCgroupAcctLRU(pc)); 691 /* 692 * Used bit is set without atomic ops but after smp_wmb(). 693 * For making pc->mem_cgroup visible, insert smp_rmb() here. 
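	 * This pairs with the smp_wmb() in __mem_cgroup_commit_charge().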
	 */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return;

	mz = page_cgroup_zoneinfo(pc);
	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	SetPageCgroupAcctLRU(pc);
	if (mem_cgroup_is_root(pc->mem_cgroup))
		return;
	list_add(&pc->lru, &mz->lists[lru]);
}

/*
 * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to
 * lru because the page may be reused after it's fully uncharged (because of
 * SwapCache behavior). To handle that, unlink page_cgroup from LRU when we
 * charge it again. This function is only used to charge SwapCache. It's done
 * under lock_page() and it is expected that zone->lru_lock is never held.
 */
static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	spin_lock_irqsave(&zone->lru_lock, flags);
	/*
	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
	 * is guarded by lock_page() because the page is SwapCache.
	 */
	if (!PageCgroupUsed(pc))
		mem_cgroup_del_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}

static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	spin_lock_irqsave(&zone->lru_lock, flags);
	/* link when the page is linked to LRU but page_cgroup isn't */
	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
		mem_cgroup_add_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}


void mem_cgroup_move_lists(struct page *page,
			   enum lru_list from, enum lru_list to)
{
	if (mem_cgroup_disabled())
		return;
	mem_cgroup_del_lru_list(page, from);
	mem_cgroup_add_lru_list(page, to);
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;
	struct mem_cgroup *curr = NULL;

	task_lock(task);
	rcu_read_lock();
	curr = try_get_mem_cgroup_from_mm(task->mm);
	rcu_read_unlock();
	task_unlock(task);
	if (!curr)
		return 0;
	if (curr->use_hierarchy)
		ret = css_is_ancestor(&curr->css, &mem->css);
	else
		ret = (curr == mem);
	css_put(&curr->css);
	return ret;
}

/*
 * prev_priority control... this will be used in the memory reclaim path.
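 * The three helpers below read, lower and overwrite it under
 * reclaim_param_lock.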
775 */ 776int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 777{ 778 int prev_priority; 779 780 spin_lock(&mem->reclaim_param_lock); 781 prev_priority = mem->prev_priority; 782 spin_unlock(&mem->reclaim_param_lock); 783 784 return prev_priority; 785} 786 787void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) 788{ 789 spin_lock(&mem->reclaim_param_lock); 790 if (priority < mem->prev_priority) 791 mem->prev_priority = priority; 792 spin_unlock(&mem->reclaim_param_lock); 793} 794 795void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) 796{ 797 spin_lock(&mem->reclaim_param_lock); 798 mem->prev_priority = priority; 799 spin_unlock(&mem->reclaim_param_lock); 800} 801 802static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 803{ 804 unsigned long active; 805 unsigned long inactive; 806 unsigned long gb; 807 unsigned long inactive_ratio; 808 809 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); 810 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); 811 812 gb = (inactive + active) >> (30 - PAGE_SHIFT); 813 if (gb) 814 inactive_ratio = int_sqrt(10 * gb); 815 else 816 inactive_ratio = 1; 817 818 if (present_pages) { 819 present_pages[0] = inactive; 820 present_pages[1] = active; 821 } 822 823 return inactive_ratio; 824} 825 826int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) 827{ 828 unsigned long active; 829 unsigned long inactive; 830 unsigned long present_pages[2]; 831 unsigned long inactive_ratio; 832 833 inactive_ratio = calc_inactive_ratio(memcg, present_pages); 834 835 inactive = present_pages[0]; 836 active = present_pages[1]; 837 838 if (inactive * inactive_ratio < active) 839 return 1; 840 841 return 0; 842} 843 844int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) 845{ 846 unsigned long active; 847 unsigned long inactive; 848 849 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); 850 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); 851 852 return (active > inactive); 853} 854 855unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 856 struct zone *zone, 857 enum lru_list lru) 858{ 859 int nid = zone->zone_pgdat->node_id; 860 int zid = zone_idx(zone); 861 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 862 863 return MEM_CGROUP_ZSTAT(mz, lru); 864} 865 866struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 867 struct zone *zone) 868{ 869 int nid = zone->zone_pgdat->node_id; 870 int zid = zone_idx(zone); 871 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 872 873 return &mz->reclaim_stat; 874} 875 876struct zone_reclaim_stat * 877mem_cgroup_get_reclaim_stat_from_page(struct page *page) 878{ 879 struct page_cgroup *pc; 880 struct mem_cgroup_per_zone *mz; 881 882 if (mem_cgroup_disabled()) 883 return NULL; 884 885 pc = lookup_page_cgroup(page); 886 /* 887 * Used bit is set without atomic ops but after smp_wmb(). 888 * For making pc->mem_cgroup visible, insert smp_rmb() here. 
889 */ 890 smp_rmb(); 891 if (!PageCgroupUsed(pc)) 892 return NULL; 893 894 mz = page_cgroup_zoneinfo(pc); 895 if (!mz) 896 return NULL; 897 898 return &mz->reclaim_stat; 899} 900 901unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 902 struct list_head *dst, 903 unsigned long *scanned, int order, 904 int mode, struct zone *z, 905 struct mem_cgroup *mem_cont, 906 int active, int file) 907{ 908 unsigned long nr_taken = 0; 909 struct page *page; 910 unsigned long scan; 911 LIST_HEAD(pc_list); 912 struct list_head *src; 913 struct page_cgroup *pc, *tmp; 914 int nid = z->zone_pgdat->node_id; 915 int zid = zone_idx(z); 916 struct mem_cgroup_per_zone *mz; 917 int lru = LRU_FILE * file + active; 918 int ret; 919 920 BUG_ON(!mem_cont); 921 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 922 src = &mz->lists[lru]; 923 924 scan = 0; 925 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 926 if (scan >= nr_to_scan) 927 break; 928 929 page = pc->page; 930 if (unlikely(!PageCgroupUsed(pc))) 931 continue; 932 if (unlikely(!PageLRU(page))) 933 continue; 934 935 scan++; 936 ret = __isolate_lru_page(page, mode, file); 937 switch (ret) { 938 case 0: 939 list_move(&page->lru, dst); 940 mem_cgroup_del_lru(page); 941 nr_taken++; 942 break; 943 case -EBUSY: 944 /* we don't affect global LRU but rotate in our LRU */ 945 mem_cgroup_rotate_lru_list(page, page_lru(page)); 946 break; 947 default: 948 break; 949 } 950 } 951 952 *scanned = scan; 953 return nr_taken; 954} 955 956#define mem_cgroup_from_res_counter(counter, member) \ 957 container_of(counter, struct mem_cgroup, member) 958 959static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 960{ 961 if (do_swap_account) { 962 if (res_counter_check_under_limit(&mem->res) && 963 res_counter_check_under_limit(&mem->memsw)) 964 return true; 965 } else 966 if (res_counter_check_under_limit(&mem->res)) 967 return true; 968 return false; 969} 970 971static unsigned int get_swappiness(struct mem_cgroup *memcg) 972{ 973 struct cgroup *cgrp = memcg->css.cgroup; 974 unsigned int swappiness; 975 976 /* root ? */ 977 if (cgrp->parent == NULL) 978 return vm_swappiness; 979 980 spin_lock(&memcg->reclaim_param_lock); 981 swappiness = memcg->swappiness; 982 spin_unlock(&memcg->reclaim_param_lock); 983 984 return swappiness; 985} 986 987static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) 988{ 989 int *val = data; 990 (*val)++; 991 return 0; 992} 993 994/** 995 * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. 996 * @memcg: The memory cgroup that went over limit 997 * @p: Task that is going to be killed 998 * 999 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1000 * enabled 1001 */ 1002void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1003{ 1004 struct cgroup *task_cgrp; 1005 struct cgroup *mem_cgrp; 1006 /* 1007 * Need a buffer in BSS, can't rely on allocations. The code relies 1008 * on the assumption that OOM is serialized for memory controller. 1009 * If this assumption is broken, revisit this code. 
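	 * (The memcg OOM call in __mem_cgroup_try_charge() runs under the
	 * memcg_tasklist mutex, which provides that serialization.)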
	 */
	static char memcg_name[PATH_MAX];
	int ret;

	if (!memcg)
		return;


	rcu_read_lock();

	mem_cgrp = memcg->css.cgroup;
	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);

	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		/*
		 * Unfortunately, we are unable to convert to a useful name,
		 * but we'll still print out the usage information
		 */
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	printk(KERN_INFO "Task in %s killed", memcg_name);

	rcu_read_lock();
	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
	if (ret < 0) {
		rcu_read_unlock();
		goto done;
	}
	rcu_read_unlock();

	/*
	 * Continues from above, so we don't need a KERN_ level
	 */
	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
done:

	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->res, RES_FAILCNT));
	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
		"failcnt %llu\n",
		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
}

/*
 * This function returns the number of memcgs under the hierarchy tree.
 * Returns 1 (self count) if there are no children.
 */
static int mem_cgroup_count_children(struct mem_cgroup *mem)
{
	int num = 0;
	mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
	return num;
}

/*
 * Visit the first child (need not be the first child as per the ordering
 * of the cgroup list, since we track last_scanned_child) of @mem and use
 * that to reclaim free pages from.
 */
static struct mem_cgroup *
mem_cgroup_select_victim(struct mem_cgroup *root_mem)
{
	struct mem_cgroup *ret = NULL;
	struct cgroup_subsys_state *css;
	int nextid, found;

	if (!root_mem->use_hierarchy) {
		css_get(&root_mem->css);
		ret = root_mem;
	}

	while (!ret) {
		rcu_read_lock();
		nextid = root_mem->last_scanned_child + 1;
		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
				   &found);
		if (css && css_tryget(css))
			ret = container_of(css, struct mem_cgroup, css);

		rcu_read_unlock();
		/* Updates scanning parameter */
		spin_lock(&root_mem->reclaim_param_lock);
		if (!css) {
			/* this means start scan from ID:1 */
			root_mem->last_scanned_child = 0;
		} else
			root_mem->last_scanned_child = found;
		spin_unlock(&root_mem->reclaim_param_lock);
	}

	return ret;
}

/*
 * Scan the hierarchy if needed to reclaim memory. We remember the last child
 * we reclaimed from, so that we don't end up penalizing one child extensively
 * based on its position in the children list.
 *
 * root_mem is the original ancestor that we've been reclaiming from.
 *
 * We give up and return to the caller when we visit root_mem twice.
 * (other groups can be removed while we're walking....)
 *
 * If shrink==true, this returns immediately, to avoid freeing too much.
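 * reclaim_options is a bitmask of the MEM_CGROUP_RECLAIM_* flags defined
 * above.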
 */
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
						struct zone *zone,
						gfp_t gfp_mask,
						unsigned long reclaim_options)
{
	struct mem_cgroup *victim;
	int ret, total = 0;
	int loop = 0;
	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
	unsigned long excess = mem_cgroup_get_excess(root_mem);

	/* If memsw_is_minimum==1, swap-out is of no use. */
	if (root_mem->memsw_is_minimum)
		noswap = true;

	while (1) {
		victim = mem_cgroup_select_victim(root_mem);
		if (victim == root_mem) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy
				 */
				if (!check_soft || !total) {
					css_put(&victim->css);
					break;
				}
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not so excessive that we
				 * reclaim too much, nor so small that we keep
				 * coming back to reclaim from this cgroup
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
					css_put(&victim->css);
					break;
				}
			}
		}
		if (!mem_cgroup_local_usage(&victim->stat)) {
			/* this cgroup's local usage == 0 */
			css_put(&victim->css);
			continue;
		}
		/* we use swappiness of local cgroup */
		if (check_soft)
			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
				noswap, get_swappiness(victim), zone,
				zone->zone_pgdat->node_id);
		else
			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
						noswap, get_swappiness(victim));
		css_put(&victim->css);
		/*
		 * While shrinking usage, we can't tell here whether we should
		 * stop or reclaim more; that depends on the callers.
		 * last_scanned_child is enough to keep fairness under the
		 * tree.
		 */
		if (shrink)
			return ret;
		total += ret;
		if (check_soft) {
			if (res_counter_check_under_soft_limit(&root_mem->res))
				return total;
		} else if (mem_cgroup_check_under_limit(root_mem))
			return 1 + total;
	}
	return total;
}

bool mem_cgroup_oom_called(struct task_struct *task)
{
	bool ret = false;
	struct mem_cgroup *mem;
	struct mm_struct *mm;

	rcu_read_lock();
	mm = task->mm;
	if (!mm)
		mm = &init_mm;
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
		ret = true;
	rcu_read_unlock();
	return ret;
}

static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
{
	mem->last_oom_jiffies = jiffies;
	return 0;
}

static void record_last_oom(struct mem_cgroup *mem)
{
	mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
}

/*
 * Currently used to update mapped file statistics, but the routine can be
 * generalized to update other statistics as well.
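 * (Typically called with a val of +1 or -1 as a file page is mapped or
 * unmapped.)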
1229 */ 1230void mem_cgroup_update_mapped_file_stat(struct page *page, int val) 1231{ 1232 struct mem_cgroup *mem; 1233 struct mem_cgroup_stat *stat; 1234 struct mem_cgroup_stat_cpu *cpustat; 1235 int cpu; 1236 struct page_cgroup *pc; 1237 1238 if (!page_is_file_cache(page)) 1239 return; 1240 1241 pc = lookup_page_cgroup(page); 1242 if (unlikely(!pc)) 1243 return; 1244 1245 lock_page_cgroup(pc); 1246 mem = pc->mem_cgroup; 1247 if (!mem) 1248 goto done; 1249 1250 if (!PageCgroupUsed(pc)) 1251 goto done; 1252 1253 /* 1254 * Preemption is already disabled, we don't need get_cpu() 1255 */ 1256 cpu = smp_processor_id(); 1257 stat = &mem->stat; 1258 cpustat = &stat->cpustat[cpu]; 1259 1260 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); 1261done: 1262 unlock_page_cgroup(pc); 1263} 1264 1265/* 1266 * Unlike exported interface, "oom" parameter is added. if oom==true, 1267 * oom-killer can be invoked. 1268 */ 1269static int __mem_cgroup_try_charge(struct mm_struct *mm, 1270 gfp_t gfp_mask, struct mem_cgroup **memcg, 1271 bool oom, struct page *page) 1272{ 1273 struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit; 1274 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1275 struct res_counter *fail_res, *soft_fail_res = NULL; 1276 1277 if (unlikely(test_thread_flag(TIF_MEMDIE))) { 1278 /* Don't account this! */ 1279 *memcg = NULL; 1280 return 0; 1281 } 1282 1283 /* 1284 * We always charge the cgroup the mm_struct belongs to. 1285 * The mm_struct's mem_cgroup changes on task migration if the 1286 * thread group leader migrates. It's possible that mm is not 1287 * set, if so charge the init_mm (happens for pagecache usage). 1288 */ 1289 mem = *memcg; 1290 if (likely(!mem)) { 1291 mem = try_get_mem_cgroup_from_mm(mm); 1292 *memcg = mem; 1293 } else { 1294 css_get(&mem->css); 1295 } 1296 if (unlikely(!mem)) 1297 return 0; 1298 1299 VM_BUG_ON(css_is_removed(&mem->css)); 1300 1301 while (1) { 1302 int ret = 0; 1303 unsigned long flags = 0; 1304 1305 if (mem_cgroup_is_root(mem)) 1306 goto done; 1307 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res, 1308 &soft_fail_res); 1309 if (likely(!ret)) { 1310 if (!do_swap_account) 1311 break; 1312 ret = res_counter_charge(&mem->memsw, PAGE_SIZE, 1313 &fail_res, NULL); 1314 if (likely(!ret)) 1315 break; 1316 /* mem+swap counter fails */ 1317 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); 1318 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 1319 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1320 memsw); 1321 } else 1322 /* mem counter fails */ 1323 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1324 res); 1325 1326 if (!(gfp_mask & __GFP_WAIT)) 1327 goto nomem; 1328 1329 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 1330 gfp_mask, flags); 1331 if (ret) 1332 continue; 1333 1334 /* 1335 * try_to_free_mem_cgroup_pages() might not give us a full 1336 * picture of reclaim. Some pages are reclaimed and might be 1337 * moved to swap cache or just unmapped from the cgroup. 
1338 * Check the limit again to see if the reclaim reduced the 1339 * current usage of the cgroup before giving up 1340 * 1341 */ 1342 if (mem_cgroup_check_under_limit(mem_over_limit)) 1343 continue; 1344 1345 if (!nr_retries--) { 1346 if (oom) { 1347 mutex_lock(&memcg_tasklist); 1348 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); 1349 mutex_unlock(&memcg_tasklist); 1350 record_last_oom(mem_over_limit); 1351 } 1352 goto nomem; 1353 } 1354 } 1355 /* 1356 * Insert just the ancestor, we should trickle down to the correct 1357 * cgroup for reclaim, since the other nodes will be below their 1358 * soft limit 1359 */ 1360 if (soft_fail_res) { 1361 mem_over_soft_limit = 1362 mem_cgroup_from_res_counter(soft_fail_res, res); 1363 if (mem_cgroup_soft_limit_check(mem_over_soft_limit)) 1364 mem_cgroup_update_tree(mem_over_soft_limit, page); 1365 } 1366done: 1367 return 0; 1368nomem: 1369 css_put(&mem->css); 1370 return -ENOMEM; 1371} 1372 1373/* 1374 * A helper function to get mem_cgroup from ID. must be called under 1375 * rcu_read_lock(). The caller must check css_is_removed() or some if 1376 * it's concern. (dropping refcnt from swap can be called against removed 1377 * memcg.) 1378 */ 1379static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 1380{ 1381 struct cgroup_subsys_state *css; 1382 1383 /* ID 0 is unused ID */ 1384 if (!id) 1385 return NULL; 1386 css = css_lookup(&mem_cgroup_subsys, id); 1387 if (!css) 1388 return NULL; 1389 return container_of(css, struct mem_cgroup, css); 1390} 1391 1392static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) 1393{ 1394 struct mem_cgroup *mem; 1395 struct page_cgroup *pc; 1396 unsigned short id; 1397 swp_entry_t ent; 1398 1399 VM_BUG_ON(!PageLocked(page)); 1400 1401 if (!PageSwapCache(page)) 1402 return NULL; 1403 1404 pc = lookup_page_cgroup(page); 1405 lock_page_cgroup(pc); 1406 if (PageCgroupUsed(pc)) { 1407 mem = pc->mem_cgroup; 1408 if (mem && !css_tryget(&mem->css)) 1409 mem = NULL; 1410 } else { 1411 ent.val = page_private(page); 1412 id = lookup_swap_cgroup(ent); 1413 rcu_read_lock(); 1414 mem = mem_cgroup_lookup(id); 1415 if (mem && !css_tryget(&mem->css)) 1416 mem = NULL; 1417 rcu_read_unlock(); 1418 } 1419 unlock_page_cgroup(pc); 1420 return mem; 1421} 1422 1423/* 1424 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be 1425 * USED state. If already USED, uncharge and return. 1426 */ 1427 1428static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 1429 struct page_cgroup *pc, 1430 enum charge_type ctype) 1431{ 1432 /* try_charge() can return NULL to *memcg, taking care of it. */ 1433 if (!mem) 1434 return; 1435 1436 lock_page_cgroup(pc); 1437 if (unlikely(PageCgroupUsed(pc))) { 1438 unlock_page_cgroup(pc); 1439 if (!mem_cgroup_is_root(mem)) { 1440 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); 1441 if (do_swap_account) 1442 res_counter_uncharge(&mem->memsw, PAGE_SIZE, 1443 NULL); 1444 } 1445 css_put(&mem->css); 1446 return; 1447 } 1448 1449 pc->mem_cgroup = mem; 1450 /* 1451 * We access a page_cgroup asynchronously without lock_page_cgroup(). 1452 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 1453 * is accessed after testing USED bit. To make pc->mem_cgroup visible 1454 * before USED bit, we need memory barrier here. 1455 * See mem_cgroup_add_lru_list(), etc. 
1456 */ 1457 smp_wmb(); 1458 switch (ctype) { 1459 case MEM_CGROUP_CHARGE_TYPE_CACHE: 1460 case MEM_CGROUP_CHARGE_TYPE_SHMEM: 1461 SetPageCgroupCache(pc); 1462 SetPageCgroupUsed(pc); 1463 break; 1464 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 1465 ClearPageCgroupCache(pc); 1466 SetPageCgroupUsed(pc); 1467 break; 1468 default: 1469 break; 1470 } 1471 1472 mem_cgroup_charge_statistics(mem, pc, true); 1473 1474 unlock_page_cgroup(pc); 1475} 1476 1477/** 1478 * mem_cgroup_move_account - move account of the page 1479 * @pc: page_cgroup of the page. 1480 * @from: mem_cgroup which the page is moved from. 1481 * @to: mem_cgroup which the page is moved to. @from != @to. 1482 * 1483 * The caller must confirm following. 1484 * - page is not on LRU (isolate_page() is useful.) 1485 * 1486 * returns 0 at success, 1487 * returns -EBUSY when lock is busy or "pc" is unstable. 1488 * 1489 * This function does "uncharge" from old cgroup but doesn't do "charge" to 1490 * new cgroup. It should be done by a caller. 1491 */ 1492 1493static int mem_cgroup_move_account(struct page_cgroup *pc, 1494 struct mem_cgroup *from, struct mem_cgroup *to) 1495{ 1496 struct mem_cgroup_per_zone *from_mz, *to_mz; 1497 int nid, zid; 1498 int ret = -EBUSY; 1499 struct page *page; 1500 int cpu; 1501 struct mem_cgroup_stat *stat; 1502 struct mem_cgroup_stat_cpu *cpustat; 1503 1504 VM_BUG_ON(from == to); 1505 VM_BUG_ON(PageLRU(pc->page)); 1506 1507 nid = page_cgroup_nid(pc); 1508 zid = page_cgroup_zid(pc); 1509 from_mz = mem_cgroup_zoneinfo(from, nid, zid); 1510 to_mz = mem_cgroup_zoneinfo(to, nid, zid); 1511 1512 if (!trylock_page_cgroup(pc)) 1513 return ret; 1514 1515 if (!PageCgroupUsed(pc)) 1516 goto out; 1517 1518 if (pc->mem_cgroup != from) 1519 goto out; 1520 1521 if (!mem_cgroup_is_root(from)) 1522 res_counter_uncharge(&from->res, PAGE_SIZE, NULL); 1523 mem_cgroup_charge_statistics(from, pc, false); 1524 1525 page = pc->page; 1526 if (page_is_file_cache(page) && page_mapped(page)) { 1527 cpu = smp_processor_id(); 1528 /* Update mapped_file data for mem_cgroup "from" */ 1529 stat = &from->stat; 1530 cpustat = &stat->cpustat[cpu]; 1531 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, 1532 -1); 1533 1534 /* Update mapped_file data for mem_cgroup "to" */ 1535 stat = &to->stat; 1536 cpustat = &stat->cpustat[cpu]; 1537 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, 1538 1); 1539 } 1540 1541 if (do_swap_account && !mem_cgroup_is_root(from)) 1542 res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL); 1543 css_put(&from->css); 1544 1545 css_get(&to->css); 1546 pc->mem_cgroup = to; 1547 mem_cgroup_charge_statistics(to, pc, true); 1548 ret = 0; 1549out: 1550 unlock_page_cgroup(pc); 1551 /* 1552 * We charges against "to" which may not have any tasks. Then, "to" 1553 * can be under rmdir(). But in current implementation, caller of 1554 * this function is just force_empty() and it's garanteed that 1555 * "to" is never removed. So, we don't check rmdir status here. 1556 */ 1557 return ret; 1558} 1559 1560/* 1561 * move charges to its parent. 1562 */ 1563 1564static int mem_cgroup_move_parent(struct page_cgroup *pc, 1565 struct mem_cgroup *child, 1566 gfp_t gfp_mask) 1567{ 1568 struct page *page = pc->page; 1569 struct cgroup *cg = child->css.cgroup; 1570 struct cgroup *pcg = cg->parent; 1571 struct mem_cgroup *parent; 1572 int ret; 1573 1574 /* Is ROOT ? 
*/ 1575 if (!pcg) 1576 return -EINVAL; 1577 1578 1579 parent = mem_cgroup_from_cont(pcg); 1580 1581 1582 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); 1583 if (ret || !parent) 1584 return ret; 1585 1586 if (!get_page_unless_zero(page)) { 1587 ret = -EBUSY; 1588 goto uncharge; 1589 } 1590 1591 ret = isolate_lru_page(page); 1592 1593 if (ret) 1594 goto cancel; 1595 1596 ret = mem_cgroup_move_account(pc, child, parent); 1597 1598 putback_lru_page(page); 1599 if (!ret) { 1600 put_page(page); 1601 /* drop extra refcnt by try_charge() */ 1602 css_put(&parent->css); 1603 return 0; 1604 } 1605 1606cancel: 1607 put_page(page); 1608uncharge: 1609 /* drop extra refcnt by try_charge() */ 1610 css_put(&parent->css); 1611 /* uncharge if move fails */ 1612 if (!mem_cgroup_is_root(parent)) { 1613 res_counter_uncharge(&parent->res, PAGE_SIZE, NULL); 1614 if (do_swap_account) 1615 res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL); 1616 } 1617 return ret; 1618} 1619 1620/* 1621 * Charge the memory controller for page usage. 1622 * Return 1623 * 0 if the charge was successful 1624 * < 0 if the cgroup is over its limit 1625 */ 1626static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 1627 gfp_t gfp_mask, enum charge_type ctype, 1628 struct mem_cgroup *memcg) 1629{ 1630 struct mem_cgroup *mem; 1631 struct page_cgroup *pc; 1632 int ret; 1633 1634 pc = lookup_page_cgroup(page); 1635 /* can happen at boot */ 1636 if (unlikely(!pc)) 1637 return 0; 1638 prefetchw(pc); 1639 1640 mem = memcg; 1641 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); 1642 if (ret || !mem) 1643 return ret; 1644 1645 __mem_cgroup_commit_charge(mem, pc, ctype); 1646 return 0; 1647} 1648 1649int mem_cgroup_newpage_charge(struct page *page, 1650 struct mm_struct *mm, gfp_t gfp_mask) 1651{ 1652 if (mem_cgroup_disabled()) 1653 return 0; 1654 if (PageCompound(page)) 1655 return 0; 1656 /* 1657 * If already mapped, we don't have to account. 1658 * If page cache, page->mapping has address_space. 1659 * But page->mapping may have out-of-use anon_vma pointer, 1660 * detecit it by PageAnon() check. newly-mapped-anon's page->mapping 1661 * is NULL. 1662 */ 1663 if (page_mapped(page) || (page->mapping && !PageAnon(page))) 1664 return 0; 1665 if (unlikely(!mm)) 1666 mm = &init_mm; 1667 return mem_cgroup_charge_common(page, mm, gfp_mask, 1668 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); 1669} 1670 1671static void 1672__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 1673 enum charge_type ctype); 1674 1675int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 1676 gfp_t gfp_mask) 1677{ 1678 struct mem_cgroup *mem = NULL; 1679 int ret; 1680 1681 if (mem_cgroup_disabled()) 1682 return 0; 1683 if (PageCompound(page)) 1684 return 0; 1685 /* 1686 * Corner case handling. This is called from add_to_page_cache() 1687 * in usual. But some FS (shmem) precharges this page before calling it 1688 * and call add_to_page_cache() with GFP_NOWAIT. 1689 * 1690 * For GFP_NOWAIT case, the page may be pre-charged before calling 1691 * add_to_page_cache(). (See shmem.c) check it here and avoid to call 1692 * charge twice. (It works but has to pay a bit larger cost.) 1693 * And when the page is SwapCache, it should take swap information 1694 * into account. This is under lock_page() now. 
1695 */ 1696 if (!(gfp_mask & __GFP_WAIT)) { 1697 struct page_cgroup *pc; 1698 1699 1700 pc = lookup_page_cgroup(page); 1701 if (!pc) 1702 return 0; 1703 lock_page_cgroup(pc); 1704 if (PageCgroupUsed(pc)) { 1705 unlock_page_cgroup(pc); 1706 return 0; 1707 } 1708 unlock_page_cgroup(pc); 1709 } 1710 1711 if (unlikely(!mm && !mem)) 1712 mm = &init_mm; 1713 1714 if (page_is_file_cache(page)) 1715 return mem_cgroup_charge_common(page, mm, gfp_mask, 1716 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 1717 1718 /* shmem */ 1719 if (PageSwapCache(page)) { 1720 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 1721 if (!ret) 1722 __mem_cgroup_commit_charge_swapin(page, mem, 1723 MEM_CGROUP_CHARGE_TYPE_SHMEM); 1724 } else 1725 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 1726 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); 1727 1728 return ret; 1729} 1730 1731/* 1732 * While swap-in, try_charge -> commit or cancel, the page is locked. 1733 * And when try_charge() successfully returns, one refcnt to memcg without 1734 * struct page_cgroup is aquired. This refcnt will be cumsumed by 1735 * "commit()" or removed by "cancel()" 1736 */ 1737int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 1738 struct page *page, 1739 gfp_t mask, struct mem_cgroup **ptr) 1740{ 1741 struct mem_cgroup *mem; 1742 int ret; 1743 1744 if (mem_cgroup_disabled()) 1745 return 0; 1746 1747 if (!do_swap_account) 1748 goto charge_cur_mm; 1749 /* 1750 * A racing thread's fault, or swapoff, may have already updated 1751 * the pte, and even removed page from swap cache: return success 1752 * to go on to do_swap_page()'s pte_same() test, which should fail. 1753 */ 1754 if (!PageSwapCache(page)) 1755 return 0; 1756 mem = try_get_mem_cgroup_from_swapcache(page); 1757 if (!mem) 1758 goto charge_cur_mm; 1759 *ptr = mem; 1760 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); 1761 /* drop extra refcnt from tryget */ 1762 css_put(&mem->css); 1763 return ret; 1764charge_cur_mm: 1765 if (unlikely(!mm)) 1766 mm = &init_mm; 1767 return __mem_cgroup_try_charge(mm, mask, ptr, true, page); 1768} 1769 1770static void 1771__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 1772 enum charge_type ctype) 1773{ 1774 struct page_cgroup *pc; 1775 1776 if (mem_cgroup_disabled()) 1777 return; 1778 if (!ptr) 1779 return; 1780 cgroup_exclude_rmdir(&ptr->css); 1781 pc = lookup_page_cgroup(page); 1782 mem_cgroup_lru_del_before_commit_swapcache(page); 1783 __mem_cgroup_commit_charge(ptr, pc, ctype); 1784 mem_cgroup_lru_add_after_commit_swapcache(page); 1785 /* 1786 * Now swap is on-memory. This means this page may be 1787 * counted both as mem and swap....double count. 1788 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 1789 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 1790 * may call delete_from_swap_cache() before reach here. 1791 */ 1792 if (do_swap_account && PageSwapCache(page)) { 1793 swp_entry_t ent = {.val = page_private(page)}; 1794 unsigned short id; 1795 struct mem_cgroup *memcg; 1796 1797 id = swap_cgroup_record(ent, 0); 1798 rcu_read_lock(); 1799 memcg = mem_cgroup_lookup(id); 1800 if (memcg) { 1801 /* 1802 * This recorded memcg can be obsolete one. 
So, avoid 1803 * calling css_tryget 1804 */ 1805 if (!mem_cgroup_is_root(memcg)) 1806 res_counter_uncharge(&memcg->memsw, PAGE_SIZE, 1807 NULL); 1808 mem_cgroup_swap_statistics(memcg, false); 1809 mem_cgroup_put(memcg); 1810 } 1811 rcu_read_unlock(); 1812 } 1813 /* 1814 * At swapin, we may charge account against cgroup which has no tasks. 1815 * So, rmdir()->pre_destroy() can be called while we do this charge. 1816 * In that case, we need to call pre_destroy() again. check it here. 1817 */ 1818 cgroup_release_and_wakeup_rmdir(&ptr->css); 1819} 1820 1821void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 1822{ 1823 __mem_cgroup_commit_charge_swapin(page, ptr, 1824 MEM_CGROUP_CHARGE_TYPE_MAPPED); 1825} 1826 1827void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 1828{ 1829 if (mem_cgroup_disabled()) 1830 return; 1831 if (!mem) 1832 return; 1833 if (!mem_cgroup_is_root(mem)) { 1834 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); 1835 if (do_swap_account) 1836 res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); 1837 } 1838 css_put(&mem->css); 1839} 1840 1841 1842/* 1843 * uncharge if !page_mapped(page) 1844 */ 1845static struct mem_cgroup * 1846__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 1847{ 1848 struct page_cgroup *pc; 1849 struct mem_cgroup *mem = NULL; 1850 struct mem_cgroup_per_zone *mz; 1851 bool soft_limit_excess = false; 1852 1853 if (mem_cgroup_disabled()) 1854 return NULL; 1855 1856 if (PageSwapCache(page)) 1857 return NULL; 1858 1859 /* 1860 * Check if our page_cgroup is valid 1861 */ 1862 pc = lookup_page_cgroup(page); 1863 if (unlikely(!pc || !PageCgroupUsed(pc))) 1864 return NULL; 1865 1866 lock_page_cgroup(pc); 1867 1868 mem = pc->mem_cgroup; 1869 1870 if (!PageCgroupUsed(pc)) 1871 goto unlock_out; 1872 1873 switch (ctype) { 1874 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 1875 case MEM_CGROUP_CHARGE_TYPE_DROP: 1876 if (page_mapped(page)) 1877 goto unlock_out; 1878 break; 1879 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 1880 if (!PageAnon(page)) { /* Shared memory */ 1881 if (page->mapping && !page_is_file_cache(page)) 1882 goto unlock_out; 1883 } else if (page_mapped(page)) /* Anon */ 1884 goto unlock_out; 1885 break; 1886 default: 1887 break; 1888 } 1889 1890 if (!mem_cgroup_is_root(mem)) { 1891 res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); 1892 if (do_swap_account && 1893 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) 1894 res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); 1895 } 1896 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1897 mem_cgroup_swap_statistics(mem, true); 1898 mem_cgroup_charge_statistics(mem, pc, false); 1899 1900 ClearPageCgroupUsed(pc); 1901 /* 1902 * pc->mem_cgroup is not cleared here. It will be accessed when it's 1903 * freed from LRU. This is safe because uncharged page is expected not 1904 * to be reused (freed soon). Exception is SwapCache, it's handled by 1905 * special functions. 1906 */ 1907 1908 mz = page_cgroup_zoneinfo(pc); 1909 unlock_page_cgroup(pc); 1910 1911 if (soft_limit_excess && mem_cgroup_soft_limit_check(mem)) 1912 mem_cgroup_update_tree(mem, page); 1913 /* at swapout, this memcg will be accessed to record to swap */ 1914 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1915 css_put(&mem->css); 1916 1917 return mem; 1918 1919unlock_out: 1920 unlock_page_cgroup(pc); 1921 return NULL; 1922} 1923 1924void mem_cgroup_uncharge_page(struct page *page) 1925{ 1926 /* early check. 
*/ 1927 if (page_mapped(page)) 1928 return; 1929 if (page->mapping && !PageAnon(page)) 1930 return; 1931 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 1932} 1933 1934void mem_cgroup_uncharge_cache_page(struct page *page) 1935{ 1936 VM_BUG_ON(page_mapped(page)); 1937 VM_BUG_ON(page->mapping); 1938 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 1939} 1940 1941#ifdef CONFIG_SWAP 1942/* 1943 * called after __delete_from_swap_cache() and drop "page" account. 1944 * memcg information is recorded to swap_cgroup of "ent" 1945 */ 1946void 1947mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 1948{ 1949 struct mem_cgroup *memcg; 1950 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 1951 1952 if (!swapout) /* this was a swap cache but the swap is unused ! */ 1953 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 1954 1955 memcg = __mem_cgroup_uncharge_common(page, ctype); 1956 1957 /* record memcg information */ 1958 if (do_swap_account && swapout && memcg) { 1959 swap_cgroup_record(ent, css_id(&memcg->css)); 1960 mem_cgroup_get(memcg); 1961 } 1962 if (swapout && memcg) 1963 css_put(&memcg->css); 1964} 1965#endif 1966 1967#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 1968/* 1969 * called from swap_entry_free(). remove record in swap_cgroup and 1970 * uncharge "memsw" account. 1971 */ 1972void mem_cgroup_uncharge_swap(swp_entry_t ent) 1973{ 1974 struct mem_cgroup *memcg; 1975 unsigned short id; 1976 1977 if (!do_swap_account) 1978 return; 1979 1980 id = swap_cgroup_record(ent, 0); 1981 rcu_read_lock(); 1982 memcg = mem_cgroup_lookup(id); 1983 if (memcg) { 1984 /* 1985 * We uncharge this because swap is freed. 1986 * This memcg can be obsolete one. We avoid calling css_tryget 1987 */ 1988 if (!mem_cgroup_is_root(memcg)) 1989 res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL); 1990 mem_cgroup_swap_statistics(memcg, false); 1991 mem_cgroup_put(memcg); 1992 } 1993 rcu_read_unlock(); 1994} 1995#endif 1996 1997/* 1998 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 1999 * page belongs to. 2000 */ 2001int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) 2002{ 2003 struct page_cgroup *pc; 2004 struct mem_cgroup *mem = NULL; 2005 int ret = 0; 2006 2007 if (mem_cgroup_disabled()) 2008 return 0; 2009 2010 pc = lookup_page_cgroup(page); 2011 lock_page_cgroup(pc); 2012 if (PageCgroupUsed(pc)) { 2013 mem = pc->mem_cgroup; 2014 css_get(&mem->css); 2015 } 2016 unlock_page_cgroup(pc); 2017 2018 if (mem) { 2019 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, 2020 page); 2021 css_put(&mem->css); 2022 } 2023 *ptr = mem; 2024 return ret; 2025} 2026 2027/* remove redundant charge if migration failed*/ 2028void mem_cgroup_end_migration(struct mem_cgroup *mem, 2029 struct page *oldpage, struct page *newpage) 2030{ 2031 struct page *target, *unused; 2032 struct page_cgroup *pc; 2033 enum charge_type ctype; 2034 2035 if (!mem) 2036 return; 2037 cgroup_exclude_rmdir(&mem->css); 2038 /* at migration success, oldpage->mapping is NULL. */ 2039 if (oldpage->mapping) { 2040 target = oldpage; 2041 unused = NULL; 2042 } else { 2043 target = newpage; 2044 unused = oldpage; 2045 } 2046 2047 if (PageAnon(target)) 2048 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 2049 else if (page_is_file_cache(target)) 2050 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 2051 else 2052 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2053 2054 /* unused page is not on radix-tree now. 
 */
	if (unused)
		__mem_cgroup_uncharge_common(unused, ctype);

	pc = lookup_page_cgroup(target);
	/*
	 * __mem_cgroup_commit_charge() checks the PCG_USED bit of page_cgroup.
	 * So, double-counting is effectively avoided.
	 */
	__mem_cgroup_commit_charge(mem, pc, ctype);

	/*
	 * Both oldpage and newpage are still under lock_page().
	 * Then, we don't have to worry about races in the radix-tree.
	 * But we have to be careful about whether this page is mapped or not.
	 *
	 * There is a case for !page_mapped(). At the start of
	 * migration, oldpage was mapped. But now, it's zapped.
	 * But we know the *target* page is not freed/reused under us.
	 * mem_cgroup_uncharge_page() does all necessary checks.
	 */
	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
		mem_cgroup_uncharge_page(target);
	/*
	 * At migration, we may charge against a cgroup which has no tasks.
	 * So, rmdir()->pre_destroy() can be called while we do this charge.
	 * In that case, we need to call pre_destroy() again. Check it here.
	 */
	cgroup_release_and_wakeup_rmdir(&mem->css);
}

/*
 * A call to try to shrink memory usage on charge failure at shmem's swapin.
 * Calling hierarchical_reclaim is not enough because we should update
 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
 * Moreover, considering the hierarchy, we should reclaim from the mem_over_limit,
 * not from the memcg which this page would be charged to.
 * try_charge_swapin does all of this work properly.
 */
int mem_cgroup_shmem_charge_fallback(struct page *page,
			    struct mm_struct *mm,
			    gfp_t gfp_mask)
{
	struct mem_cgroup *mem = NULL;
	int ret;

	if (mem_cgroup_disabled())
		return 0;

	ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
	if (!ret)
		mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */

	return ret;
}

static DEFINE_MUTEX(set_limit_mutex);

static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
				unsigned long long val)
{
	int retry_count;
	int progress;
	u64 memswlimit;
	int ret = 0;
	int children = mem_cgroup_count_children(memcg);
	u64 curusage, oldusage;

	/*
	 * For keeping hierarchical_reclaim simple, how long we should retry
	 * depends on the caller. We set our retry-count to be a function
	 * of the number of children we have to visit in this loop.
	 */
	retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;

	oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);

	while (retry_count) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		/*
		 * Rather than hide all this in some helper function, keep it
		 * open-coded so it is clear what really happens here.
		 * We have to guarantee mem->res.limit < mem->memsw.limit.
		 */
		mutex_lock(&set_limit_mutex);
		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
		if (memswlimit < val) {
			ret = -EINVAL;
			mutex_unlock(&set_limit_mutex);
			break;
		}
		ret = res_counter_set_limit(&memcg->res, val);
		if (!ret) {
			if (memswlimit == val)
				memcg->memsw_is_minimum = true;
			else
				memcg->memsw_is_minimum = false;
		}
		mutex_unlock(&set_limit_mutex);

		if (!ret)
			break;

		progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
						GFP_KERNEL,
						MEM_CGROUP_RECLAIM_SHRINK);
		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
		/* Was the usage reduced? */
		if (curusage >= oldusage)
			retry_count--;
		else
			oldusage = curusage;
	}

	return ret;
}

static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
					unsigned long long val)
{
	int retry_count;
	u64 memlimit, oldusage, curusage;
	int children = mem_cgroup_count_children(memcg);
	int ret = -EBUSY;

	/* see mem_cgroup_resize_limit() */
	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
	oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
	while (retry_count) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		/*
		 * Rather than hide all this in some helper function, keep it
		 * open-coded so it is clear what really happens here.
		 * We have to guarantee mem->res.limit < mem->memsw.limit.
		 */
		mutex_lock(&set_limit_mutex);
		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
		if (memlimit > val) {
			ret = -EINVAL;
			mutex_unlock(&set_limit_mutex);
			break;
		}
		ret = res_counter_set_limit(&memcg->memsw, val);
		if (!ret) {
			if (memlimit == val)
				memcg->memsw_is_minimum = true;
			else
				memcg->memsw_is_minimum = false;
		}
		mutex_unlock(&set_limit_mutex);

		if (!ret)
			break;

		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
						MEM_CGROUP_RECLAIM_NOSWAP |
						MEM_CGROUP_RECLAIM_SHRINK);
		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
		/* Was the usage reduced? */
		if (curusage >= oldusage)
			retry_count--;
		else
			oldusage = curusage;
	}
	return ret;
}

unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
					    gfp_t gfp_mask, int nid,
					    int zid)
{
	unsigned long nr_reclaimed = 0;
	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
	unsigned long reclaimed;
	int loop = 0;
	struct mem_cgroup_tree_per_zone *mctz;

	if (order > 0)
		return 0;

	mctz = soft_limit_tree_node_zone(nid, zid);
	/*
	 * This loop can run for a while, especially if mem_cgroups
	 * continuously keep exceeding their soft limit and putting the
	 * system under pressure.
	 */
	do {
		if (next_mz)
			mz = next_mz;
		else
			mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
						gfp_mask,
						MEM_CGROUP_RECLAIM_SOFT);
		nr_reclaimed += reclaimed;
		spin_lock(&mctz->lock);

		/*
		 * If we failed to reclaim anything from this memory cgroup,
		 * it is time to move on to the next cgroup.
		 */
		next_mz = NULL;
		if (!reclaimed) {
			do {
				/*
				 * Loop until we find yet another one.
2269 * 2270 * By the time we get the soft_limit lock 2271 * again, someone might have aded the 2272 * group back on the RB tree. Iterate to 2273 * make sure we get a different mem. 2274 * mem_cgroup_largest_soft_limit_node returns 2275 * NULL if no other cgroup is present on 2276 * the tree 2277 */ 2278 next_mz = 2279 __mem_cgroup_largest_soft_limit_node(mctz); 2280 if (next_mz == mz) { 2281 css_put(&next_mz->mem->css); 2282 next_mz = NULL; 2283 } else /* next_mz == NULL or other memcg */ 2284 break; 2285 } while (1); 2286 } 2287 mz->usage_in_excess = 2288 res_counter_soft_limit_excess(&mz->mem->res); 2289 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 2290 /* 2291 * One school of thought says that we should not add 2292 * back the node to the tree if reclaim returns 0. 2293 * But our reclaim could return 0, simply because due 2294 * to priority we are exposing a smaller subset of 2295 * memory to reclaim from. Consider this as a longer 2296 * term TODO. 2297 */ 2298 if (mz->usage_in_excess) 2299 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz); 2300 spin_unlock(&mctz->lock); 2301 css_put(&mz->mem->css); 2302 loop++; 2303 /* 2304 * Could not reclaim anything and there are no more 2305 * mem cgroups to try or we seem to be looping without 2306 * reclaiming anything. 2307 */ 2308 if (!nr_reclaimed && 2309 (next_mz == NULL || 2310 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 2311 break; 2312 } while (!nr_reclaimed); 2313 if (next_mz) 2314 css_put(&next_mz->mem->css); 2315 return nr_reclaimed; 2316} 2317 2318/* 2319 * This routine traverse page_cgroup in given list and drop them all. 2320 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 2321 */ 2322static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 2323 int node, int zid, enum lru_list lru) 2324{ 2325 struct zone *zone; 2326 struct mem_cgroup_per_zone *mz; 2327 struct page_cgroup *pc, *busy; 2328 unsigned long flags, loop; 2329 struct list_head *list; 2330 int ret = 0; 2331 2332 zone = &NODE_DATA(node)->node_zones[zid]; 2333 mz = mem_cgroup_zoneinfo(mem, node, zid); 2334 list = &mz->lists[lru]; 2335 2336 loop = MEM_CGROUP_ZSTAT(mz, lru); 2337 /* give some margin against EBUSY etc...*/ 2338 loop += 256; 2339 busy = NULL; 2340 while (loop--) { 2341 ret = 0; 2342 spin_lock_irqsave(&zone->lru_lock, flags); 2343 if (list_empty(list)) { 2344 spin_unlock_irqrestore(&zone->lru_lock, flags); 2345 break; 2346 } 2347 pc = list_entry(list->prev, struct page_cgroup, lru); 2348 if (busy == pc) { 2349 list_move(&pc->lru, list); 2350 busy = 0; 2351 spin_unlock_irqrestore(&zone->lru_lock, flags); 2352 continue; 2353 } 2354 spin_unlock_irqrestore(&zone->lru_lock, flags); 2355 2356 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); 2357 if (ret == -ENOMEM) 2358 break; 2359 2360 if (ret == -EBUSY || ret == -EINVAL) { 2361 /* found lock contention or "pc" is obsolete. */ 2362 busy = pc; 2363 cond_resched(); 2364 } else 2365 busy = NULL; 2366 } 2367 2368 if (!ret && !list_empty(list)) 2369 return -EBUSY; 2370 return ret; 2371} 2372 2373/* 2374 * make mem_cgroup's charge to be 0 if there is no task. 2375 * This enables deleting this mem_cgroup. 2376 */ 2377static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 2378{ 2379 int ret; 2380 int node, zid, shrink; 2381 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2382 struct cgroup *cgrp = mem->css.cgroup; 2383 2384 css_get(&mem->css); 2385 2386 shrink = 0; 2387 /* should free all ? 
	/* should free all ? */
	if (free_all)
		goto try_to_free;
move_account:
	while (mem->res.usage > 0) {
		ret = -EBUSY;
		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
			goto out;
		ret = -EINTR;
		if (signal_pending(current))
			goto out;
		/* This is for making all *used* pages be on the LRU. */
		lru_add_drain_all();
		ret = 0;
		for_each_node_state(node, N_HIGH_MEMORY) {
			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
				enum lru_list l;
				for_each_lru(l) {
					ret = mem_cgroup_force_empty_list(mem,
							node, zid, l);
					if (ret)
						break;
				}
			}
			if (ret)
				break;
		}
		/* it seems the parent cgroup doesn't have enough memory */
		if (ret == -ENOMEM)
			goto try_to_free;
		cond_resched();
	}
	ret = 0;
out:
	css_put(&mem->css);
	return ret;

try_to_free:
	/* returns -EBUSY if there is a task or if we come here twice. */
	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
		ret = -EBUSY;
		goto out;
	}
	/* we call try-to-free pages to make this cgroup empty */
	lru_add_drain_all();
	/* try to free all pages in this cgroup */
	shrink = 1;
	while (nr_retries && mem->res.usage > 0) {
		int progress;

		if (signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
						false, get_swappiness(mem));
		if (!progress) {
			nr_retries--;
			/* maybe some writeback is necessary */
			congestion_wait(BLK_RW_ASYNC, HZ/10);
		}

	}
	lru_add_drain();
	/* try move_account...there may be some *locked* pages. */
	if (mem->res.usage)
		goto move_account;
	ret = 0;
	goto out;
}

int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
}


static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
{
	return mem_cgroup_from_cont(cont)->use_hierarchy;
}

static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
					u64 val)
{
	int retval = 0;
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	struct cgroup *parent = cont->parent;
	struct mem_cgroup *parent_mem = NULL;

	if (parent)
		parent_mem = mem_cgroup_from_cont(parent);

	cgroup_lock();
	/*
	 * If the parent's use_hierarchy is set, we can't make any
	 * modifications in the child subtrees. If it is unset, then the
	 * change can occur, provided the current cgroup has no children.
	 *
	 * For the root cgroup, parent_mem is NULL; we allow the value to be
	 * set if there are no children.
2488 */ 2489 if ((!parent_mem || !parent_mem->use_hierarchy) && 2490 (val == 1 || val == 0)) { 2491 if (list_empty(&cont->children)) 2492 mem->use_hierarchy = val; 2493 else 2494 retval = -EBUSY; 2495 } else 2496 retval = -EINVAL; 2497 cgroup_unlock(); 2498 2499 return retval; 2500} 2501 2502struct mem_cgroup_idx_data { 2503 s64 val; 2504 enum mem_cgroup_stat_index idx; 2505}; 2506 2507static int 2508mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 2509{ 2510 struct mem_cgroup_idx_data *d = data; 2511 d->val += mem_cgroup_read_stat(&mem->stat, d->idx); 2512 return 0; 2513} 2514 2515static void 2516mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 2517 enum mem_cgroup_stat_index idx, s64 *val) 2518{ 2519 struct mem_cgroup_idx_data d; 2520 d.idx = idx; 2521 d.val = 0; 2522 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); 2523 *val = d.val; 2524} 2525 2526static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 2527{ 2528 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2529 u64 idx_val, val; 2530 int type, name; 2531 2532 type = MEMFILE_TYPE(cft->private); 2533 name = MEMFILE_ATTR(cft->private); 2534 switch (type) { 2535 case _MEM: 2536 if (name == RES_USAGE && mem_cgroup_is_root(mem)) { 2537 mem_cgroup_get_recursive_idx_stat(mem, 2538 MEM_CGROUP_STAT_CACHE, &idx_val); 2539 val = idx_val; 2540 mem_cgroup_get_recursive_idx_stat(mem, 2541 MEM_CGROUP_STAT_RSS, &idx_val); 2542 val += idx_val; 2543 val <<= PAGE_SHIFT; 2544 } else 2545 val = res_counter_read_u64(&mem->res, name); 2546 break; 2547 case _MEMSWAP: 2548 if (name == RES_USAGE && mem_cgroup_is_root(mem)) { 2549 mem_cgroup_get_recursive_idx_stat(mem, 2550 MEM_CGROUP_STAT_CACHE, &idx_val); 2551 val = idx_val; 2552 mem_cgroup_get_recursive_idx_stat(mem, 2553 MEM_CGROUP_STAT_RSS, &idx_val); 2554 val += idx_val; 2555 mem_cgroup_get_recursive_idx_stat(mem, 2556 MEM_CGROUP_STAT_SWAPOUT, &idx_val); 2557 val <<= PAGE_SHIFT; 2558 } else 2559 val = res_counter_read_u64(&mem->memsw, name); 2560 break; 2561 default: 2562 BUG(); 2563 break; 2564 } 2565 return val; 2566} 2567/* 2568 * The user of this function is... 2569 * RES_LIMIT. 2570 */ 2571static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 2572 const char *buffer) 2573{ 2574 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 2575 int type, name; 2576 unsigned long long val; 2577 int ret; 2578 2579 type = MEMFILE_TYPE(cft->private); 2580 name = MEMFILE_ATTR(cft->private); 2581 switch (name) { 2582 case RES_LIMIT: 2583 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 2584 ret = -EINVAL; 2585 break; 2586 } 2587 /* This function does all necessary parse...reuse it */ 2588 ret = res_counter_memparse_write_strategy(buffer, &val); 2589 if (ret) 2590 break; 2591 if (type == _MEM) 2592 ret = mem_cgroup_resize_limit(memcg, val); 2593 else 2594 ret = mem_cgroup_resize_memsw_limit(memcg, val); 2595 break; 2596 case RES_SOFT_LIMIT: 2597 ret = res_counter_memparse_write_strategy(buffer, &val); 2598 if (ret) 2599 break; 2600 /* 2601 * For memsw, soft limits are hard to implement in terms 2602 * of semantics, for now, we support soft limits for 2603 * control without swap 2604 */ 2605 if (type == _MEM) 2606 ret = res_counter_set_soft_limit(&memcg->res, val); 2607 else 2608 ret = -EINVAL; 2609 break; 2610 default: 2611 ret = -EINVAL; /* should be BUG() ? 
*/ 2612 break; 2613 } 2614 return ret; 2615} 2616 2617static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 2618 unsigned long long *mem_limit, unsigned long long *memsw_limit) 2619{ 2620 struct cgroup *cgroup; 2621 unsigned long long min_limit, min_memsw_limit, tmp; 2622 2623 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 2624 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2625 cgroup = memcg->css.cgroup; 2626 if (!memcg->use_hierarchy) 2627 goto out; 2628 2629 while (cgroup->parent) { 2630 cgroup = cgroup->parent; 2631 memcg = mem_cgroup_from_cont(cgroup); 2632 if (!memcg->use_hierarchy) 2633 break; 2634 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 2635 min_limit = min(min_limit, tmp); 2636 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 2637 min_memsw_limit = min(min_memsw_limit, tmp); 2638 } 2639out: 2640 *mem_limit = min_limit; 2641 *memsw_limit = min_memsw_limit; 2642 return; 2643} 2644 2645static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 2646{ 2647 struct mem_cgroup *mem; 2648 int type, name; 2649 2650 mem = mem_cgroup_from_cont(cont); 2651 type = MEMFILE_TYPE(event); 2652 name = MEMFILE_ATTR(event); 2653 switch (name) { 2654 case RES_MAX_USAGE: 2655 if (type == _MEM) 2656 res_counter_reset_max(&mem->res); 2657 else 2658 res_counter_reset_max(&mem->memsw); 2659 break; 2660 case RES_FAILCNT: 2661 if (type == _MEM) 2662 res_counter_reset_failcnt(&mem->res); 2663 else 2664 res_counter_reset_failcnt(&mem->memsw); 2665 break; 2666 } 2667 2668 return 0; 2669} 2670 2671 2672/* For read statistics */ 2673enum { 2674 MCS_CACHE, 2675 MCS_RSS, 2676 MCS_MAPPED_FILE, 2677 MCS_PGPGIN, 2678 MCS_PGPGOUT, 2679 MCS_SWAP, 2680 MCS_INACTIVE_ANON, 2681 MCS_ACTIVE_ANON, 2682 MCS_INACTIVE_FILE, 2683 MCS_ACTIVE_FILE, 2684 MCS_UNEVICTABLE, 2685 NR_MCS_STAT, 2686}; 2687 2688struct mcs_total_stat { 2689 s64 stat[NR_MCS_STAT]; 2690}; 2691 2692struct { 2693 char *local_name; 2694 char *total_name; 2695} memcg_stat_strings[NR_MCS_STAT] = { 2696 {"cache", "total_cache"}, 2697 {"rss", "total_rss"}, 2698 {"mapped_file", "total_mapped_file"}, 2699 {"pgpgin", "total_pgpgin"}, 2700 {"pgpgout", "total_pgpgout"}, 2701 {"swap", "total_swap"}, 2702 {"inactive_anon", "total_inactive_anon"}, 2703 {"active_anon", "total_active_anon"}, 2704 {"inactive_file", "total_inactive_file"}, 2705 {"active_file", "total_active_file"}, 2706 {"unevictable", "total_unevictable"} 2707}; 2708 2709 2710static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) 2711{ 2712 struct mcs_total_stat *s = data; 2713 s64 val; 2714 2715 /* per cpu stat */ 2716 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); 2717 s->stat[MCS_CACHE] += val * PAGE_SIZE; 2718 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 2719 s->stat[MCS_RSS] += val * PAGE_SIZE; 2720 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); 2721 s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; 2722 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); 2723 s->stat[MCS_PGPGIN] += val; 2724 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 2725 s->stat[MCS_PGPGOUT] += val; 2726 if (do_swap_account) { 2727 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); 2728 s->stat[MCS_SWAP] += val * PAGE_SIZE; 2729 } 2730 2731 /* per zone stat */ 2732 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 2733 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 2734 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); 2735 
s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 2736 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); 2737 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 2738 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); 2739 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 2740 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 2741 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 2742 return 0; 2743} 2744 2745static void 2746mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 2747{ 2748 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); 2749} 2750 2751static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 2752 struct cgroup_map_cb *cb) 2753{ 2754 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 2755 struct mcs_total_stat mystat; 2756 int i; 2757 2758 memset(&mystat, 0, sizeof(mystat)); 2759 mem_cgroup_get_local_stat(mem_cont, &mystat); 2760 2761 for (i = 0; i < NR_MCS_STAT; i++) { 2762 if (i == MCS_SWAP && !do_swap_account) 2763 continue; 2764 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 2765 } 2766 2767 /* Hierarchical information */ 2768 { 2769 unsigned long long limit, memsw_limit; 2770 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 2771 cb->fill(cb, "hierarchical_memory_limit", limit); 2772 if (do_swap_account) 2773 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 2774 } 2775 2776 memset(&mystat, 0, sizeof(mystat)); 2777 mem_cgroup_get_total_stat(mem_cont, &mystat); 2778 for (i = 0; i < NR_MCS_STAT; i++) { 2779 if (i == MCS_SWAP && !do_swap_account) 2780 continue; 2781 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 2782 } 2783 2784#ifdef CONFIG_DEBUG_VM 2785 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 2786 2787 { 2788 int nid, zid; 2789 struct mem_cgroup_per_zone *mz; 2790 unsigned long recent_rotated[2] = {0, 0}; 2791 unsigned long recent_scanned[2] = {0, 0}; 2792 2793 for_each_online_node(nid) 2794 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 2795 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 2796 2797 recent_rotated[0] += 2798 mz->reclaim_stat.recent_rotated[0]; 2799 recent_rotated[1] += 2800 mz->reclaim_stat.recent_rotated[1]; 2801 recent_scanned[0] += 2802 mz->reclaim_stat.recent_scanned[0]; 2803 recent_scanned[1] += 2804 mz->reclaim_stat.recent_scanned[1]; 2805 } 2806 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 2807 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 2808 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 2809 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 2810 } 2811#endif 2812 2813 return 0; 2814} 2815 2816static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 2817{ 2818 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 2819 2820 return get_swappiness(memcg); 2821} 2822 2823static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 2824 u64 val) 2825{ 2826 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 2827 struct mem_cgroup *parent; 2828 2829 if (val > 100) 2830 return -EINVAL; 2831 2832 if (cgrp->parent == NULL) 2833 return -EINVAL; 2834 2835 parent = mem_cgroup_from_cont(cgrp->parent); 2836 2837 cgroup_lock(); 2838 2839 /* If under hierarchy, only empty-root can set this value */ 2840 if ((parent->use_hierarchy) || 2841 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 2842 cgroup_unlock(); 2843 return -EINVAL; 2844 } 2845 2846 spin_lock(&memcg->reclaim_param_lock); 2847 memcg->swappiness = val; 2848 
spin_unlock(&memcg->reclaim_param_lock); 2849 2850 cgroup_unlock(); 2851 2852 return 0; 2853} 2854 2855 2856static struct cftype mem_cgroup_files[] = { 2857 { 2858 .name = "usage_in_bytes", 2859 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 2860 .read_u64 = mem_cgroup_read, 2861 }, 2862 { 2863 .name = "max_usage_in_bytes", 2864 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 2865 .trigger = mem_cgroup_reset, 2866 .read_u64 = mem_cgroup_read, 2867 }, 2868 { 2869 .name = "limit_in_bytes", 2870 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 2871 .write_string = mem_cgroup_write, 2872 .read_u64 = mem_cgroup_read, 2873 }, 2874 { 2875 .name = "soft_limit_in_bytes", 2876 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 2877 .write_string = mem_cgroup_write, 2878 .read_u64 = mem_cgroup_read, 2879 }, 2880 { 2881 .name = "failcnt", 2882 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 2883 .trigger = mem_cgroup_reset, 2884 .read_u64 = mem_cgroup_read, 2885 }, 2886 { 2887 .name = "stat", 2888 .read_map = mem_control_stat_show, 2889 }, 2890 { 2891 .name = "force_empty", 2892 .trigger = mem_cgroup_force_empty_write, 2893 }, 2894 { 2895 .name = "use_hierarchy", 2896 .write_u64 = mem_cgroup_hierarchy_write, 2897 .read_u64 = mem_cgroup_hierarchy_read, 2898 }, 2899 { 2900 .name = "swappiness", 2901 .read_u64 = mem_cgroup_swappiness_read, 2902 .write_u64 = mem_cgroup_swappiness_write, 2903 }, 2904}; 2905 2906#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2907static struct cftype memsw_cgroup_files[] = { 2908 { 2909 .name = "memsw.usage_in_bytes", 2910 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 2911 .read_u64 = mem_cgroup_read, 2912 }, 2913 { 2914 .name = "memsw.max_usage_in_bytes", 2915 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 2916 .trigger = mem_cgroup_reset, 2917 .read_u64 = mem_cgroup_read, 2918 }, 2919 { 2920 .name = "memsw.limit_in_bytes", 2921 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 2922 .write_string = mem_cgroup_write, 2923 .read_u64 = mem_cgroup_read, 2924 }, 2925 { 2926 .name = "memsw.failcnt", 2927 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 2928 .trigger = mem_cgroup_reset, 2929 .read_u64 = mem_cgroup_read, 2930 }, 2931}; 2932 2933static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 2934{ 2935 if (!do_swap_account) 2936 return 0; 2937 return cgroup_add_files(cont, ss, memsw_cgroup_files, 2938 ARRAY_SIZE(memsw_cgroup_files)); 2939}; 2940#else 2941static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 2942{ 2943 return 0; 2944} 2945#endif 2946 2947static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 2948{ 2949 struct mem_cgroup_per_node *pn; 2950 struct mem_cgroup_per_zone *mz; 2951 enum lru_list l; 2952 int zone, tmp = node; 2953 /* 2954 * This routine is called against possible nodes. 2955 * But it's BUG to call kmalloc() against offline node. 2956 * 2957 * TODO: this routine can waste much memory for nodes which will 2958 * never be onlined. It's better to use memory hotplug callback 2959 * function. 
2960 */ 2961 if (!node_state(node, N_NORMAL_MEMORY)) 2962 tmp = -1; 2963 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 2964 if (!pn) 2965 return 1; 2966 2967 mem->info.nodeinfo[node] = pn; 2968 memset(pn, 0, sizeof(*pn)); 2969 2970 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 2971 mz = &pn->zoneinfo[zone]; 2972 for_each_lru(l) 2973 INIT_LIST_HEAD(&mz->lists[l]); 2974 mz->usage_in_excess = 0; 2975 mz->on_tree = false; 2976 mz->mem = mem; 2977 } 2978 return 0; 2979} 2980 2981static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 2982{ 2983 kfree(mem->info.nodeinfo[node]); 2984} 2985 2986static int mem_cgroup_size(void) 2987{ 2988 int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); 2989 return sizeof(struct mem_cgroup) + cpustat_size; 2990} 2991 2992static struct mem_cgroup *mem_cgroup_alloc(void) 2993{ 2994 struct mem_cgroup *mem; 2995 int size = mem_cgroup_size(); 2996 2997 if (size < PAGE_SIZE) 2998 mem = kmalloc(size, GFP_KERNEL); 2999 else 3000 mem = vmalloc(size); 3001 3002 if (mem) 3003 memset(mem, 0, size); 3004 return mem; 3005} 3006 3007/* 3008 * At destroying mem_cgroup, references from swap_cgroup can remain. 3009 * (scanning all at force_empty is too costly...) 3010 * 3011 * Instead of clearing all references at force_empty, we remember 3012 * the number of reference from swap_cgroup and free mem_cgroup when 3013 * it goes down to 0. 3014 * 3015 * Removal of cgroup itself succeeds regardless of refs from swap. 3016 */ 3017 3018static void __mem_cgroup_free(struct mem_cgroup *mem) 3019{ 3020 int node; 3021 3022 mem_cgroup_remove_from_trees(mem); 3023 free_css_id(&mem_cgroup_subsys, &mem->css); 3024 3025 for_each_node_state(node, N_POSSIBLE) 3026 free_mem_cgroup_per_zone_info(mem, node); 3027 3028 if (mem_cgroup_size() < PAGE_SIZE) 3029 kfree(mem); 3030 else 3031 vfree(mem); 3032} 3033 3034static void mem_cgroup_get(struct mem_cgroup *mem) 3035{ 3036 atomic_inc(&mem->refcnt); 3037} 3038 3039static void mem_cgroup_put(struct mem_cgroup *mem) 3040{ 3041 if (atomic_dec_and_test(&mem->refcnt)) { 3042 struct mem_cgroup *parent = parent_mem_cgroup(mem); 3043 __mem_cgroup_free(mem); 3044 if (parent) 3045 mem_cgroup_put(parent); 3046 } 3047} 3048 3049/* 3050 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 
3051 */ 3052static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 3053{ 3054 if (!mem->res.parent) 3055 return NULL; 3056 return mem_cgroup_from_res_counter(mem->res.parent, res); 3057} 3058 3059#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3060static void __init enable_swap_cgroup(void) 3061{ 3062 if (!mem_cgroup_disabled() && really_do_swap_account) 3063 do_swap_account = 1; 3064} 3065#else 3066static void __init enable_swap_cgroup(void) 3067{ 3068} 3069#endif 3070 3071static int mem_cgroup_soft_limit_tree_init(void) 3072{ 3073 struct mem_cgroup_tree_per_node *rtpn; 3074 struct mem_cgroup_tree_per_zone *rtpz; 3075 int tmp, node, zone; 3076 3077 for_each_node_state(node, N_POSSIBLE) { 3078 tmp = node; 3079 if (!node_state(node, N_NORMAL_MEMORY)) 3080 tmp = -1; 3081 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 3082 if (!rtpn) 3083 return 1; 3084 3085 soft_limit_tree.rb_tree_per_node[node] = rtpn; 3086 3087 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 3088 rtpz = &rtpn->rb_tree_per_zone[zone]; 3089 rtpz->rb_root = RB_ROOT; 3090 spin_lock_init(&rtpz->lock); 3091 } 3092 } 3093 return 0; 3094} 3095 3096static struct cgroup_subsys_state * __ref 3097mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 3098{ 3099 struct mem_cgroup *mem, *parent; 3100 long error = -ENOMEM; 3101 int node; 3102 3103 mem = mem_cgroup_alloc(); 3104 if (!mem) 3105 return ERR_PTR(error); 3106 3107 for_each_node_state(node, N_POSSIBLE) 3108 if (alloc_mem_cgroup_per_zone_info(mem, node)) 3109 goto free_out; 3110 3111 /* root ? */ 3112 if (cont->parent == NULL) { 3113 enable_swap_cgroup(); 3114 parent = NULL; 3115 root_mem_cgroup = mem; 3116 if (mem_cgroup_soft_limit_tree_init()) 3117 goto free_out; 3118 3119 } else { 3120 parent = mem_cgroup_from_cont(cont->parent); 3121 mem->use_hierarchy = parent->use_hierarchy; 3122 } 3123 3124 if (parent && parent->use_hierarchy) { 3125 res_counter_init(&mem->res, &parent->res); 3126 res_counter_init(&mem->memsw, &parent->memsw); 3127 /* 3128 * We increment refcnt of the parent to ensure that we can 3129 * safely access it on res_counter_charge/uncharge. 3130 * This refcnt will be decremented when freeing this 3131 * mem_cgroup(see mem_cgroup_put). 
3132 */ 3133 mem_cgroup_get(parent); 3134 } else { 3135 res_counter_init(&mem->res, NULL); 3136 res_counter_init(&mem->memsw, NULL); 3137 } 3138 mem->last_scanned_child = 0; 3139 spin_lock_init(&mem->reclaim_param_lock); 3140 3141 if (parent) 3142 mem->swappiness = get_swappiness(parent); 3143 atomic_set(&mem->refcnt, 1); 3144 return &mem->css; 3145free_out: 3146 __mem_cgroup_free(mem); 3147 root_mem_cgroup = NULL; 3148 return ERR_PTR(error); 3149} 3150 3151static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 3152 struct cgroup *cont) 3153{ 3154 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3155 3156 return mem_cgroup_force_empty(mem, false); 3157} 3158 3159static void mem_cgroup_destroy(struct cgroup_subsys *ss, 3160 struct cgroup *cont) 3161{ 3162 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3163 3164 mem_cgroup_put(mem); 3165} 3166 3167static int mem_cgroup_populate(struct cgroup_subsys *ss, 3168 struct cgroup *cont) 3169{ 3170 int ret; 3171 3172 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 3173 ARRAY_SIZE(mem_cgroup_files)); 3174 3175 if (!ret) 3176 ret = register_memsw_files(cont, ss); 3177 return ret; 3178} 3179 3180static void mem_cgroup_move_task(struct cgroup_subsys *ss, 3181 struct cgroup *cont, 3182 struct cgroup *old_cont, 3183 struct task_struct *p, 3184 bool threadgroup) 3185{ 3186 mutex_lock(&memcg_tasklist); 3187 /* 3188 * FIXME: It's better to move charges of this process from old 3189 * memcg to new memcg. But it's just on TODO-List now. 3190 */ 3191 mutex_unlock(&memcg_tasklist); 3192} 3193 3194struct cgroup_subsys mem_cgroup_subsys = { 3195 .name = "memory", 3196 .subsys_id = mem_cgroup_subsys_id, 3197 .create = mem_cgroup_create, 3198 .pre_destroy = mem_cgroup_pre_destroy, 3199 .destroy = mem_cgroup_destroy, 3200 .populate = mem_cgroup_populate, 3201 .attach = mem_cgroup_move_task, 3202 .early_init = 0, 3203 .use_id = 1, 3204}; 3205 3206#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3207 3208static int __init disable_swap_account(char *s) 3209{ 3210 really_do_swap_account = 0; 3211 return 1; 3212} 3213__setup("noswapaccount", disable_swap_account); 3214#endif