Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'akpm' (patches from Andrew Morton)

Merge fixes from Andrew Morton:
"Bunch of fixes.

And a reversion of mhocko's "Soft limit rework" patch series. This is
actually your fault for opening the merge window when I was off racing ;)

I didn't read the email thread before sending everything off.
Johannes Weiner raised significant issues:

http://www.spinics.net/lists/cgroups/msg08813.html

and we agreed to back it all out"

I clearly need to be more aware of Andrew's racing schedule.

* akpm:
MAINTAINERS: update mach-bcm related email address
checkpatch: make extern in .h prototypes quieter
cciss: fix info leak in cciss_ioctl32_passthru()
cpqarray: fix info leak in ida_locked_ioctl()
kernel/reboot.c: re-enable the function of variable reboot_default
audit: fix endless wait in audit_log_start()
revert "memcg, vmscan: integrate soft reclaim tighter with zone shrinking code"
revert "memcg: get rid of soft-limit tree infrastructure"
revert "vmscan, memcg: do softlimit reclaim also for targeted reclaim"
revert "memcg: enhance memcg iterator to support predicates"
revert "memcg: track children in soft limit excess to improve soft limit"
revert "memcg, vmscan: do not attempt soft limit reclaim if it would not scan anything"
revert "memcg: track all children over limit in the root"
revert "memcg, vmscan: do not fall into reclaim-all pass too quickly"
fs/ocfs2/super.c: use a bigger nodestr in ocfs2_dismount_volume
watchdog: update watchdog_thresh properly
watchdog: update watchdog attributes atomically

+524 -259
+2 -1
MAINTAINERS
··· 1812 1812 F: drivers/net/ethernet/broadcom/bnx2x/ 1813 1813 1814 1814 BROADCOM BCM281XX/BCM11XXX ARM ARCHITECTURE 1815 - M: Christian Daudt <csd@broadcom.com> 1815 + M: Christian Daudt <bcm@fixthebug.org> 1816 + L: bcm-kernel-feedback-list@broadcom.com 1816 1817 T: git git://git.github.com/broadcom/bcm11351 1817 1818 S: Maintained 1818 1819 F: arch/arm/mach-bcm/
+1
drivers/block/cciss.c
··· 1189 1189 int err; 1190 1190 u32 cp; 1191 1191 1192 + memset(&arg64, 0, sizeof(arg64)); 1192 1193 err = 0; 1193 1194 err |= 1194 1195 copy_from_user(&arg64.LUN_info, &arg32->LUN_info,
+1
drivers/block/cpqarray.c
··· 1193 1193 ida_pci_info_struct pciinfo; 1194 1194 1195 1195 if (!arg) return -EINVAL; 1196 + memset(&pciinfo, 0, sizeof(pciinfo)); 1196 1197 pciinfo.bus = host->pci_dev->bus->number; 1197 1198 pciinfo.dev_fn = host->pci_dev->devfn; 1198 1199 pciinfo.board_id = host->board_id;
+1 -1
fs/ocfs2/super.c
··· 1924 1924 { 1925 1925 int tmp, hangup_needed = 0; 1926 1926 struct ocfs2_super *osb = NULL; 1927 - char nodestr[8]; 1927 + char nodestr[12]; 1928 1928 1929 1929 trace_ocfs2_dismount_volume(sb); 1930 1930
+10 -45
include/linux/memcontrol.h
··· 53 53 unsigned int generation; 54 54 }; 55 55 56 - enum mem_cgroup_filter_t { 57 - VISIT, /* visit current node */ 58 - SKIP, /* skip the current node and continue traversal */ 59 - SKIP_TREE, /* skip the whole subtree and continue traversal */ 60 - }; 61 - 62 - /* 63 - * mem_cgroup_filter_t predicate might instruct mem_cgroup_iter_cond how to 64 - * iterate through the hierarchy tree. Each tree element is checked by the 65 - * predicate before it is returned by the iterator. If a filter returns 66 - * SKIP or SKIP_TREE then the iterator code continues traversal (with the 67 - * next node down the hierarchy or the next node that doesn't belong under the 68 - * memcg's subtree). 69 - */ 70 - typedef enum mem_cgroup_filter_t 71 - (*mem_cgroup_iter_filter)(struct mem_cgroup *memcg, struct mem_cgroup *root); 72 - 73 56 #ifdef CONFIG_MEMCG 74 57 /* 75 58 * All "charge" functions with gfp_mask should use GFP_KERNEL or ··· 120 137 extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, 121 138 struct page *oldpage, struct page *newpage, bool migration_ok); 122 139 123 - struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, 124 - struct mem_cgroup *prev, 125 - struct mem_cgroup_reclaim_cookie *reclaim, 126 - mem_cgroup_iter_filter cond); 127 - 128 - static inline struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 129 - struct mem_cgroup *prev, 130 - struct mem_cgroup_reclaim_cookie *reclaim) 131 - { 132 - return mem_cgroup_iter_cond(root, prev, reclaim, NULL); 133 - } 134 - 140 + struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, 141 + struct mem_cgroup *, 142 + struct mem_cgroup_reclaim_cookie *); 135 143 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); 136 144 137 145 /* ··· 234 260 mem_cgroup_update_page_stat(page, idx, -1); 235 261 } 236 262 237 - enum mem_cgroup_filter_t 238 - mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, 239 - struct mem_cgroup *root); 263 + unsigned long 
mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 264 + gfp_t gfp_mask, 265 + unsigned long *total_scanned); 240 266 241 267 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); 242 268 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, ··· 350 376 struct page *oldpage, struct page *newpage, bool migration_ok) 351 377 { 352 378 } 353 - static inline struct mem_cgroup * 354 - mem_cgroup_iter_cond(struct mem_cgroup *root, 355 - struct mem_cgroup *prev, 356 - struct mem_cgroup_reclaim_cookie *reclaim, 357 - mem_cgroup_iter_filter cond) 358 - { 359 - /* first call must return non-NULL, second return NULL */ 360 - return (struct mem_cgroup *)(unsigned long)!prev; 361 - } 362 379 363 380 static inline struct mem_cgroup * 364 381 mem_cgroup_iter(struct mem_cgroup *root, ··· 436 471 } 437 472 438 473 static inline 439 - enum mem_cgroup_filter_t 440 - mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, 441 - struct mem_cgroup *root) 474 + unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 475 + gfp_t gfp_mask, 476 + unsigned long *total_scanned) 442 477 { 443 - return VISIT; 478 + return 0; 444 479 } 445 480 446 481 static inline void mem_cgroup_split_huge_fixup(struct page *head)
+6
include/linux/smp.h
··· 155 155 156 156 static inline void kick_all_cpus_sync(void) { } 157 157 158 + static inline void __smp_call_function_single(int cpuid, 159 + struct call_single_data *data, int wait) 160 + { 161 + on_each_cpu(data->func, data->info, wait); 162 + } 163 + 158 164 #endif /* !SMP */ 159 165 160 166 /*
+3 -2
kernel/audit.c
··· 1117 1117 1118 1118 sleep_time = timeout_start + audit_backlog_wait_time - 1119 1119 jiffies; 1120 - if ((long)sleep_time > 0) 1120 + if ((long)sleep_time > 0) { 1121 1121 wait_for_auditd(sleep_time); 1122 - continue; 1122 + continue; 1123 + } 1123 1124 } 1124 1125 if (audit_rate_check() && printk_ratelimit()) 1125 1126 printk(KERN_WARNING
+8 -1
kernel/reboot.c
··· 32 32 #endif 33 33 enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; 34 34 35 - int reboot_default; 35 + /* 36 + * This variable is used privately to keep track of whether or not 37 + * reboot_type is still set to its default value (i.e., reboot= hasn't 38 + * been set on the command line). This is needed so that we can 39 + * suppress DMI scanning for reboot quirks. Without it, it's 40 + * impossible to override a faulty reboot quirk without recompiling. 41 + */ 42 + int reboot_default = 1; 36 43 int reboot_cpu; 37 44 enum reboot_type reboot_type = BOOT_ACPI; 38 45 int reboot_force;
+55 -5
kernel/watchdog.c
··· 486 486 .unpark = watchdog_enable, 487 487 }; 488 488 489 - static int watchdog_enable_all_cpus(void) 489 + static void restart_watchdog_hrtimer(void *info) 490 + { 491 + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 492 + int ret; 493 + 494 + /* 495 + * No need to cancel and restart hrtimer if it is currently executing 496 + * because it will reprogram itself with the new period now. 497 + * We should never see it unqueued here because we are running per-cpu 498 + * with interrupts disabled. 499 + */ 500 + ret = hrtimer_try_to_cancel(hrtimer); 501 + if (ret == 1) 502 + hrtimer_start(hrtimer, ns_to_ktime(sample_period), 503 + HRTIMER_MODE_REL_PINNED); 504 + } 505 + 506 + static void update_timers(int cpu) 507 + { 508 + struct call_single_data data = {.func = restart_watchdog_hrtimer}; 509 + /* 510 + * Make sure that perf event counter will adopt to a new 511 + * sampling period. Updating the sampling period directly would 512 + * be much nicer but we do not have an API for that now so 513 + * let's use a big hammer. 514 + * Hrtimer will adopt the new period on the next tick but this 515 + * might be late already so we have to restart the timer as well. 
516 + */ 517 + watchdog_nmi_disable(cpu); 518 + __smp_call_function_single(cpu, &data, 1); 519 + watchdog_nmi_enable(cpu); 520 + } 521 + 522 + static void update_timers_all_cpus(void) 523 + { 524 + int cpu; 525 + 526 + get_online_cpus(); 527 + preempt_disable(); 528 + for_each_online_cpu(cpu) 529 + update_timers(cpu); 530 + preempt_enable(); 531 + put_online_cpus(); 532 + } 533 + 534 + static int watchdog_enable_all_cpus(bool sample_period_changed) 490 535 { 491 536 int err = 0; 492 537 ··· 541 496 pr_err("Failed to create watchdog threads, disabled\n"); 542 497 else 543 498 watchdog_running = 1; 499 + } else if (sample_period_changed) { 500 + update_timers_all_cpus(); 544 501 } 545 502 546 503 return err; ··· 567 520 void __user *buffer, size_t *lenp, loff_t *ppos) 568 521 { 569 522 int err, old_thresh, old_enabled; 523 + static DEFINE_MUTEX(watchdog_proc_mutex); 570 524 525 + mutex_lock(&watchdog_proc_mutex); 571 526 old_thresh = ACCESS_ONCE(watchdog_thresh); 572 527 old_enabled = ACCESS_ONCE(watchdog_user_enabled); 573 528 574 529 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 575 530 if (err || !write) 576 - return err; 531 + goto out; 577 532 578 533 set_sample_period(); 579 534 /* ··· 584 535 * watchdog_*_all_cpus() function takes care of this. 585 536 */ 586 537 if (watchdog_user_enabled && watchdog_thresh) 587 - err = watchdog_enable_all_cpus(); 538 + err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); 588 539 else 589 540 watchdog_disable_all_cpus(); 590 541 ··· 593 544 watchdog_thresh = old_thresh; 594 545 watchdog_user_enabled = old_enabled; 595 546 } 596 - 547 + out: 548 + mutex_unlock(&watchdog_proc_mutex); 597 549 return err; 598 550 } 599 551 #endif /* CONFIG_SYSCTL */ ··· 604 554 set_sample_period(); 605 555 606 556 if (watchdog_user_enabled) 607 - watchdog_enable_all_cpus(); 557 + watchdog_enable_all_cpus(false); 608 558 }
+404 -150
mm/memcontrol.c
··· 39 39 #include <linux/limits.h> 40 40 #include <linux/export.h> 41 41 #include <linux/mutex.h> 42 + #include <linux/rbtree.h> 42 43 #include <linux/slab.h> 43 44 #include <linux/swap.h> 44 45 #include <linux/swapops.h> ··· 161 160 162 161 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 163 162 163 + struct rb_node tree_node; /* RB tree node */ 164 + unsigned long long usage_in_excess;/* Set to the value by which */ 165 + /* the soft limit is exceeded*/ 166 + bool on_tree; 164 167 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 165 168 /* use container_of */ 166 169 }; ··· 172 167 struct mem_cgroup_per_node { 173 168 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 174 169 }; 170 + 171 + /* 172 + * Cgroups above their limits are maintained in a RB-Tree, independent of 173 + * their hierarchy representation 174 + */ 175 + 176 + struct mem_cgroup_tree_per_zone { 177 + struct rb_root rb_root; 178 + spinlock_t lock; 179 + }; 180 + 181 + struct mem_cgroup_tree_per_node { 182 + struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 183 + }; 184 + 185 + struct mem_cgroup_tree { 186 + struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 187 + }; 188 + 189 + static struct mem_cgroup_tree soft_limit_tree __read_mostly; 175 190 176 191 struct mem_cgroup_threshold { 177 192 struct eventfd_ctx *eventfd; ··· 328 303 atomic_t numainfo_events; 329 304 atomic_t numainfo_updating; 330 305 #endif 331 - /* 332 - * Protects soft_contributed transitions. 333 - * See mem_cgroup_update_soft_limit 334 - */ 335 - spinlock_t soft_lock; 336 - 337 - /* 338 - * If true then this group has increased parents' children_in_excess 339 - * when it got over the soft limit. 340 - * When a group falls bellow the soft limit, parents' children_in_excess 341 - * is decreased and soft_contributed changed to false. 
342 - */ 343 - bool soft_contributed; 344 - 345 - /* Number of children that are in soft limit excess */ 346 - atomic_t children_in_excess; 347 306 348 307 struct mem_cgroup_per_node *nodeinfo[0]; 349 308 /* WARNING: nodeinfo must be the last member here */ ··· 431 422 * limit reclaim to prevent infinite loops, if they ever occur. 432 423 */ 433 424 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 425 + #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 434 426 435 427 enum charge_type { 436 428 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, ··· 658 648 return mem_cgroup_zoneinfo(memcg, nid, zid); 659 649 } 660 650 651 + static struct mem_cgroup_tree_per_zone * 652 + soft_limit_tree_node_zone(int nid, int zid) 653 + { 654 + return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 655 + } 656 + 657 + static struct mem_cgroup_tree_per_zone * 658 + soft_limit_tree_from_page(struct page *page) 659 + { 660 + int nid = page_to_nid(page); 661 + int zid = page_zonenum(page); 662 + 663 + return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 664 + } 665 + 666 + static void 667 + __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, 668 + struct mem_cgroup_per_zone *mz, 669 + struct mem_cgroup_tree_per_zone *mctz, 670 + unsigned long long new_usage_in_excess) 671 + { 672 + struct rb_node **p = &mctz->rb_root.rb_node; 673 + struct rb_node *parent = NULL; 674 + struct mem_cgroup_per_zone *mz_node; 675 + 676 + if (mz->on_tree) 677 + return; 678 + 679 + mz->usage_in_excess = new_usage_in_excess; 680 + if (!mz->usage_in_excess) 681 + return; 682 + while (*p) { 683 + parent = *p; 684 + mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 685 + tree_node); 686 + if (mz->usage_in_excess < mz_node->usage_in_excess) 687 + p = &(*p)->rb_left; 688 + /* 689 + * We can't avoid mem cgroups that are over their soft 690 + * limit by the same amount 691 + */ 692 + else if (mz->usage_in_excess >= mz_node->usage_in_excess) 693 + p = &(*p)->rb_right; 694 + } 695 + 
rb_link_node(&mz->tree_node, parent, p); 696 + rb_insert_color(&mz->tree_node, &mctz->rb_root); 697 + mz->on_tree = true; 698 + } 699 + 700 + static void 701 + __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, 702 + struct mem_cgroup_per_zone *mz, 703 + struct mem_cgroup_tree_per_zone *mctz) 704 + { 705 + if (!mz->on_tree) 706 + return; 707 + rb_erase(&mz->tree_node, &mctz->rb_root); 708 + mz->on_tree = false; 709 + } 710 + 711 + static void 712 + mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, 713 + struct mem_cgroup_per_zone *mz, 714 + struct mem_cgroup_tree_per_zone *mctz) 715 + { 716 + spin_lock(&mctz->lock); 717 + __mem_cgroup_remove_exceeded(memcg, mz, mctz); 718 + spin_unlock(&mctz->lock); 719 + } 720 + 721 + 722 + static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 723 + { 724 + unsigned long long excess; 725 + struct mem_cgroup_per_zone *mz; 726 + struct mem_cgroup_tree_per_zone *mctz; 727 + int nid = page_to_nid(page); 728 + int zid = page_zonenum(page); 729 + mctz = soft_limit_tree_from_page(page); 730 + 731 + /* 732 + * Necessary to update all ancestors when hierarchy is used. 733 + * because their event counter is not touched. 734 + */ 735 + for (; memcg; memcg = parent_mem_cgroup(memcg)) { 736 + mz = mem_cgroup_zoneinfo(memcg, nid, zid); 737 + excess = res_counter_soft_limit_excess(&memcg->res); 738 + /* 739 + * We have to update the tree if mz is on RB-tree or 740 + * mem is over its softlimit. 741 + */ 742 + if (excess || mz->on_tree) { 743 + spin_lock(&mctz->lock); 744 + /* if on-tree, remove it */ 745 + if (mz->on_tree) 746 + __mem_cgroup_remove_exceeded(memcg, mz, mctz); 747 + /* 748 + * Insert again. mz->usage_in_excess will be updated. 749 + * If excess is 0, no tree ops. 
750 + */ 751 + __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); 752 + spin_unlock(&mctz->lock); 753 + } 754 + } 755 + } 756 + 757 + static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 758 + { 759 + int node, zone; 760 + struct mem_cgroup_per_zone *mz; 761 + struct mem_cgroup_tree_per_zone *mctz; 762 + 763 + for_each_node(node) { 764 + for (zone = 0; zone < MAX_NR_ZONES; zone++) { 765 + mz = mem_cgroup_zoneinfo(memcg, node, zone); 766 + mctz = soft_limit_tree_node_zone(node, zone); 767 + mem_cgroup_remove_exceeded(memcg, mz, mctz); 768 + } 769 + } 770 + } 771 + 772 + static struct mem_cgroup_per_zone * 773 + __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 774 + { 775 + struct rb_node *rightmost = NULL; 776 + struct mem_cgroup_per_zone *mz; 777 + 778 + retry: 779 + mz = NULL; 780 + rightmost = rb_last(&mctz->rb_root); 781 + if (!rightmost) 782 + goto done; /* Nothing to reclaim from */ 783 + 784 + mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 785 + /* 786 + * Remove the node now but someone else can add it back, 787 + * we will to add it back at the end of reclaim to its correct 788 + * position in the tree. 789 + */ 790 + __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); 791 + if (!res_counter_soft_limit_excess(&mz->memcg->res) || 792 + !css_tryget(&mz->memcg->css)) 793 + goto retry; 794 + done: 795 + return mz; 796 + } 797 + 798 + static struct mem_cgroup_per_zone * 799 + mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 800 + { 801 + struct mem_cgroup_per_zone *mz; 802 + 803 + spin_lock(&mctz->lock); 804 + mz = __mem_cgroup_largest_soft_limit_node(mctz); 805 + spin_unlock(&mctz->lock); 806 + return mz; 807 + } 808 + 661 809 /* 662 810 * Implementation Note: reading percpu statistics for memcg. 
663 811 * ··· 990 822 } 991 823 992 824 /* 993 - * Called from rate-limited memcg_check_events when enough 994 - * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure 995 - * that all the parents up the hierarchy will be notified that this group 996 - * is in excess or that it is not in excess anymore. mmecg->soft_contributed 997 - * makes the transition a single action whenever the state flips from one to 998 - * the other. 999 - */ 1000 - static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) 1001 - { 1002 - unsigned long long excess = res_counter_soft_limit_excess(&memcg->res); 1003 - struct mem_cgroup *parent = memcg; 1004 - int delta = 0; 1005 - 1006 - spin_lock(&memcg->soft_lock); 1007 - if (excess) { 1008 - if (!memcg->soft_contributed) { 1009 - delta = 1; 1010 - memcg->soft_contributed = true; 1011 - } 1012 - } else { 1013 - if (memcg->soft_contributed) { 1014 - delta = -1; 1015 - memcg->soft_contributed = false; 1016 - } 1017 - } 1018 - 1019 - /* 1020 - * Necessary to update all ancestors when hierarchy is used 1021 - * because their event counter is not touched. 1022 - * We track children even outside the hierarchy for the root 1023 - * cgroup because tree walk starting at root should visit 1024 - * all cgroups and we want to prevent from pointless tree 1025 - * walk if no children is below the limit. 1026 - */ 1027 - while (delta && (parent = parent_mem_cgroup(parent))) 1028 - atomic_add(delta, &parent->children_in_excess); 1029 - if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) 1030 - atomic_add(delta, &root_mem_cgroup->children_in_excess); 1031 - spin_unlock(&memcg->soft_lock); 1032 - } 1033 - 1034 - /* 1035 825 * Check events in order. 
1036 826 * 1037 827 */ ··· 1012 886 1013 887 mem_cgroup_threshold(memcg); 1014 888 if (unlikely(do_softlimit)) 1015 - mem_cgroup_update_soft_limit(memcg); 889 + mem_cgroup_update_tree(memcg, page); 1016 890 #if MAX_NUMNODES > 1 1017 891 if (unlikely(do_numainfo)) 1018 892 atomic_inc(&memcg->numainfo_events); ··· 1055 929 return memcg; 1056 930 } 1057 931 1058 - static enum mem_cgroup_filter_t 1059 - mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, 1060 - mem_cgroup_iter_filter cond) 1061 - { 1062 - if (!cond) 1063 - return VISIT; 1064 - return cond(memcg, root); 1065 - } 1066 - 1067 932 /* 1068 933 * Returns a next (in a pre-order walk) alive memcg (with elevated css 1069 934 * ref. count) or NULL if the whole root's subtree has been visited. ··· 1062 945 * helper function to be used by mem_cgroup_iter 1063 946 */ 1064 947 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, 1065 - struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) 948 + struct mem_cgroup *last_visited) 1066 949 { 1067 950 struct cgroup_subsys_state *prev_css, *next_css; 1068 951 ··· 1080 963 if (next_css) { 1081 964 struct mem_cgroup *mem = mem_cgroup_from_css(next_css); 1082 965 1083 - switch (mem_cgroup_filter(mem, root, cond)) { 1084 - case SKIP: 966 + if (css_tryget(&mem->css)) 967 + return mem; 968 + else { 1085 969 prev_css = next_css; 1086 970 goto skip_node; 1087 - case SKIP_TREE: 1088 - if (mem == root) 1089 - return NULL; 1090 - /* 1091 - * css_rightmost_descendant is not an optimal way to 1092 - * skip through a subtree (especially for imbalanced 1093 - * trees leaning to right) but that's what we have right 1094 - * now. More effective solution would be traversing 1095 - * right-up for first non-NULL without calling 1096 - * css_next_descendant_pre afterwards. 
1097 - */ 1098 - prev_css = css_rightmost_descendant(next_css); 1099 - goto skip_node; 1100 - case VISIT: 1101 - if (css_tryget(&mem->css)) 1102 - return mem; 1103 - else { 1104 - prev_css = next_css; 1105 - goto skip_node; 1106 - } 1107 - break; 1108 971 } 1109 972 } 1110 973 ··· 1148 1051 * @root: hierarchy root 1149 1052 * @prev: previously returned memcg, NULL on first invocation 1150 1053 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1151 - * @cond: filter for visited nodes, NULL for no filter 1152 1054 * 1153 1055 * Returns references to children of the hierarchy below @root, or 1154 1056 * @root itself, or %NULL after a full round-trip. ··· 1160 1064 * divide up the memcgs in the hierarchy among all concurrent 1161 1065 * reclaimers operating on the same zone and priority. 1162 1066 */ 1163 - struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, 1067 + struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1164 1068 struct mem_cgroup *prev, 1165 - struct mem_cgroup_reclaim_cookie *reclaim, 1166 - mem_cgroup_iter_filter cond) 1069 + struct mem_cgroup_reclaim_cookie *reclaim) 1167 1070 { 1168 1071 struct mem_cgroup *memcg = NULL; 1169 1072 struct mem_cgroup *last_visited = NULL; 1170 1073 1171 - if (mem_cgroup_disabled()) { 1172 - /* first call must return non-NULL, second return NULL */ 1173 - return (struct mem_cgroup *)(unsigned long)!prev; 1174 - } 1074 + if (mem_cgroup_disabled()) 1075 + return NULL; 1175 1076 1176 1077 if (!root) 1177 1078 root = root_mem_cgroup; ··· 1179 1086 if (!root->use_hierarchy && root != root_mem_cgroup) { 1180 1087 if (prev) 1181 1088 goto out_css_put; 1182 - if (mem_cgroup_filter(root, root, cond) == VISIT) 1183 - return root; 1184 - return NULL; 1089 + return root; 1185 1090 } 1186 1091 1187 1092 rcu_read_lock(); ··· 1202 1111 last_visited = mem_cgroup_iter_load(iter, root, &seq); 1203 1112 } 1204 1113 1205 - memcg = __mem_cgroup_iter_next(root, last_visited, cond); 1114 + memcg = 
__mem_cgroup_iter_next(root, last_visited); 1206 1115 1207 1116 if (reclaim) { 1208 1117 mem_cgroup_iter_update(iter, last_visited, memcg, seq); ··· 1213 1122 reclaim->generation = iter->generation; 1214 1123 } 1215 1124 1216 - /* 1217 - * We have finished the whole tree walk or no group has been 1218 - * visited because filter told us to skip the root node. 1219 - */ 1220 - if (!memcg && (prev || (cond && !last_visited))) 1125 + if (prev && !memcg) 1221 1126 goto out_unlock; 1222 1127 } 1223 1128 out_unlock: ··· 1854 1767 return total; 1855 1768 } 1856 1769 1857 - #if MAX_NUMNODES > 1 1858 1770 /** 1859 1771 * test_mem_cgroup_node_reclaimable 1860 1772 * @memcg: the target memcg ··· 1876 1790 return false; 1877 1791 1878 1792 } 1793 + #if MAX_NUMNODES > 1 1879 1794 1880 1795 /* 1881 1796 * Always updating the nodemask is not very good - even if we have an empty ··· 1944 1857 return node; 1945 1858 } 1946 1859 1860 + /* 1861 + * Check all nodes whether it contains reclaimable pages or not. 1862 + * For quick scan, we make use of scan_nodes. This will allow us to skip 1863 + * unused nodes. But scan_nodes is lazily updated and may not cotain 1864 + * enough new information. We need to do double check. 1865 + */ 1866 + static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1867 + { 1868 + int nid; 1869 + 1870 + /* 1871 + * quick check...making use of scan_node. 1872 + * We can skip unused nodes. 1873 + */ 1874 + if (!nodes_empty(memcg->scan_nodes)) { 1875 + for (nid = first_node(memcg->scan_nodes); 1876 + nid < MAX_NUMNODES; 1877 + nid = next_node(nid, memcg->scan_nodes)) { 1878 + 1879 + if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1880 + return true; 1881 + } 1882 + } 1883 + /* 1884 + * Check rest of nodes. 
1885 + */ 1886 + for_each_node_state(nid, N_MEMORY) { 1887 + if (node_isset(nid, memcg->scan_nodes)) 1888 + continue; 1889 + if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1890 + return true; 1891 + } 1892 + return false; 1893 + } 1894 + 1947 1895 #else 1948 1896 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1949 1897 { 1950 1898 return 0; 1951 1899 } 1952 1900 1901 + static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1902 + { 1903 + return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 1904 + } 1953 1905 #endif 1954 1906 1955 - /* 1956 - * A group is eligible for the soft limit reclaim under the given root 1957 - * hierarchy if 1958 - * a) it is over its soft limit 1959 - * b) any parent up the hierarchy is over its soft limit 1960 - * 1961 - * If the given group doesn't have any children over the limit then it 1962 - * doesn't make any sense to iterate its subtree. 1963 - */ 1964 - enum mem_cgroup_filter_t 1965 - mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, 1966 - struct mem_cgroup *root) 1907 + static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1908 + struct zone *zone, 1909 + gfp_t gfp_mask, 1910 + unsigned long *total_scanned) 1967 1911 { 1968 - struct mem_cgroup *parent; 1912 + struct mem_cgroup *victim = NULL; 1913 + int total = 0; 1914 + int loop = 0; 1915 + unsigned long excess; 1916 + unsigned long nr_scanned; 1917 + struct mem_cgroup_reclaim_cookie reclaim = { 1918 + .zone = zone, 1919 + .priority = 0, 1920 + }; 1969 1921 1970 - if (!memcg) 1971 - memcg = root_mem_cgroup; 1972 - parent = memcg; 1922 + excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 1973 1923 1974 - if (res_counter_soft_limit_excess(&memcg->res)) 1975 - return VISIT; 1976 - 1977 - /* 1978 - * If any parent up to the root in the hierarchy is over its soft limit 1979 - * then we have to obey and reclaim from this group as well. 
1980 - */ 1981 - while ((parent = parent_mem_cgroup(parent))) { 1982 - if (res_counter_soft_limit_excess(&parent->res)) 1983 - return VISIT; 1984 - if (parent == root) 1924 + while (1) { 1925 + victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1926 + if (!victim) { 1927 + loop++; 1928 + if (loop >= 2) { 1929 + /* 1930 + * If we have not been able to reclaim 1931 + * anything, it might because there are 1932 + * no reclaimable pages under this hierarchy 1933 + */ 1934 + if (!total) 1935 + break; 1936 + /* 1937 + * We want to do more targeted reclaim. 1938 + * excess >> 2 is not to excessive so as to 1939 + * reclaim too much, nor too less that we keep 1940 + * coming back to reclaim from this cgroup 1941 + */ 1942 + if (total >= (excess >> 2) || 1943 + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 1944 + break; 1945 + } 1946 + continue; 1947 + } 1948 + if (!mem_cgroup_reclaimable(victim, false)) 1949 + continue; 1950 + total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1951 + zone, &nr_scanned); 1952 + *total_scanned += nr_scanned; 1953 + if (!res_counter_soft_limit_excess(&root_memcg->res)) 1985 1954 break; 1986 1955 } 1987 - 1988 - if (!atomic_read(&memcg->children_in_excess)) 1989 - return SKIP_TREE; 1990 - return SKIP; 1956 + mem_cgroup_iter_break(root_memcg, victim); 1957 + return total; 1991 1958 } 1992 1959 1993 1960 static DEFINE_SPINLOCK(memcg_oom_lock); ··· 2953 2812 unlock_page_cgroup(pc); 2954 2813 2955 2814 /* 2956 - * "charge_statistics" updated event counter. 2815 + * "charge_statistics" updated event counter. Then, check it. 2816 + * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2817 + * if they exceeds softlimit. 
2957 2818 */ 2958 2819 memcg_check_events(memcg, page); 2959 2820 } ··· 4790 4647 return ret; 4791 4648 } 4792 4649 4650 + unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 4651 + gfp_t gfp_mask, 4652 + unsigned long *total_scanned) 4653 + { 4654 + unsigned long nr_reclaimed = 0; 4655 + struct mem_cgroup_per_zone *mz, *next_mz = NULL; 4656 + unsigned long reclaimed; 4657 + int loop = 0; 4658 + struct mem_cgroup_tree_per_zone *mctz; 4659 + unsigned long long excess; 4660 + unsigned long nr_scanned; 4661 + 4662 + if (order > 0) 4663 + return 0; 4664 + 4665 + mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 4666 + /* 4667 + * This loop can run a while, specially if mem_cgroup's continuously 4668 + * keep exceeding their soft limit and putting the system under 4669 + * pressure 4670 + */ 4671 + do { 4672 + if (next_mz) 4673 + mz = next_mz; 4674 + else 4675 + mz = mem_cgroup_largest_soft_limit_node(mctz); 4676 + if (!mz) 4677 + break; 4678 + 4679 + nr_scanned = 0; 4680 + reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 4681 + gfp_mask, &nr_scanned); 4682 + nr_reclaimed += reclaimed; 4683 + *total_scanned += nr_scanned; 4684 + spin_lock(&mctz->lock); 4685 + 4686 + /* 4687 + * If we failed to reclaim anything from this memory cgroup 4688 + * it is time to move on to the next cgroup 4689 + */ 4690 + next_mz = NULL; 4691 + if (!reclaimed) { 4692 + do { 4693 + /* 4694 + * Loop until we find yet another one. 4695 + * 4696 + * By the time we get the soft_limit lock 4697 + * again, someone might have aded the 4698 + * group back on the RB tree. Iterate to 4699 + * make sure we get a different mem. 
4700 + * mem_cgroup_largest_soft_limit_node returns 4701 + * NULL if no other cgroup is present on 4702 + * the tree 4703 + */ 4704 + next_mz = 4705 + __mem_cgroup_largest_soft_limit_node(mctz); 4706 + if (next_mz == mz) 4707 + css_put(&next_mz->memcg->css); 4708 + else /* next_mz == NULL or other memcg */ 4709 + break; 4710 + } while (1); 4711 + } 4712 + __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); 4713 + excess = res_counter_soft_limit_excess(&mz->memcg->res); 4714 + /* 4715 + * One school of thought says that we should not add 4716 + * back the node to the tree if reclaim returns 0. 4717 + * But our reclaim could return 0, simply because due 4718 + * to priority we are exposing a smaller subset of 4719 + * memory to reclaim from. Consider this as a longer 4720 + * term TODO. 4721 + */ 4722 + /* If excess == 0, no tree ops */ 4723 + __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); 4724 + spin_unlock(&mctz->lock); 4725 + css_put(&mz->memcg->css); 4726 + loop++; 4727 + /* 4728 + * Could not reclaim anything and there are no more 4729 + * mem cgroups to try or we seem to be looping without 4730 + * reclaiming anything. 
4731 + */
4732 + if (!nr_reclaimed &&
4733 + (next_mz == NULL ||
4734 + loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
4735 + break;
4736 + } while (!nr_reclaimed);
4737 + if (next_mz)
4738 + css_put(&next_mz->memcg->css);
4739 + return nr_reclaimed;
4740 + }
4741 +
4793 4742 /**
4794 4743 * mem_cgroup_force_empty_list - clears LRU of a group
4795 4744 * @memcg: group to clear
···
6146 5911 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6147 5912 mz = &pn->zoneinfo[zone];
6148 5913 lruvec_init(&mz->lruvec);
5914 + mz->usage_in_excess = 0;
5915 + mz->on_tree = false;
6149 5916 mz->memcg = memcg;
6150 5917 }
6151 5918 memcg->nodeinfo[node] = pn;
···
6203 5966 int node;
6204 5967 size_t size = memcg_size();
6205 5968
5969 + mem_cgroup_remove_from_trees(memcg);
6206 5970 free_css_id(&mem_cgroup_subsys, &memcg->css);
6207 5971
6208 5972 for_each_node(node)
···
6240 6002 }
6241 6003 EXPORT_SYMBOL(parent_mem_cgroup);
6242 6004
6005 + static void __init mem_cgroup_soft_limit_tree_init(void)
6006 + {
6007 + struct mem_cgroup_tree_per_node *rtpn;
6008 + struct mem_cgroup_tree_per_zone *rtpz;
6009 + int tmp, node, zone;
6010 +
6011 + for_each_node(node) {
6012 + tmp = node;
6013 + if (!node_state(node, N_NORMAL_MEMORY))
6014 + tmp = -1;
6015 + rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
6016 + BUG_ON(!rtpn);
6017 +
6018 + soft_limit_tree.rb_tree_per_node[node] = rtpn;
6019 +
6020 + for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6021 + rtpz = &rtpn->rb_tree_per_zone[zone];
6022 + rtpz->rb_root = RB_ROOT;
6023 + spin_lock_init(&rtpz->lock);
6024 + }
6025 + }
6026 + }
6027 +
6243 6028 static struct cgroup_subsys_state * __ref
6244 6029 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6245 6030 {
···
6292 6031 mutex_init(&memcg->thresholds_lock);
6293 6032 spin_lock_init(&memcg->move_lock);
6294 6033 vmpressure_init(&memcg->vmpressure);
6295 - spin_lock_init(&memcg->soft_lock);
6296 6034
6297 6035 return &memcg->css;
6298 6036
···
6369 6109
6370 6110 mem_cgroup_invalidate_reclaim_iterators(memcg);
6371 6111 mem_cgroup_reparent_charges(memcg);
6372 - if (memcg->soft_contributed) {
6373 - while ((memcg = parent_mem_cgroup(memcg)))
6374 - atomic_dec(&memcg->children_in_excess);
6375 -
6376 - if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
6377 - atomic_dec(&root_mem_cgroup->children_in_excess);
6378 - }
6379 6112 mem_cgroup_destroy_all_caches(memcg);
6380 6113 vmpressure_cleanup(&memcg->vmpressure);
6381 6114 }
···
7043 6790 {
7044 6791 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
7045 6792 enable_swap_cgroup();
6793 + mem_cgroup_soft_limit_tree_init();
7046 6794 memcg_stock_init();
7047 6795 return 0;
7048 6796 }
+31 -52
mm/vmscan.c
···
139 139 {
140 140 return !sc->target_mem_cgroup;
141 141 }
142 -
143 - static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
144 - {
145 - struct mem_cgroup *root = sc->target_mem_cgroup;
146 - return !mem_cgroup_disabled() &&
147 - mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE;
148 - }
149 142 #else
150 143 static bool global_reclaim(struct scan_control *sc)
151 144 {
152 145 return true;
153 - }
154 -
155 - static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
156 - {
157 - return false;
158 146 }
159 147 #endif
160 148
···
2164 2176 }
2165 2177 }
2166 2178
2167 - static int
2168 - __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
2179 + static void shrink_zone(struct zone *zone, struct scan_control *sc)
2169 2180 {
2170 2181 unsigned long nr_reclaimed, nr_scanned;
2171 - int groups_scanned = 0;
2172 2182
2173 2183 do {
2174 2184 struct mem_cgroup *root = sc->target_mem_cgroup;
···
2174 2188 .zone = zone,
2175 2189 .priority = sc->priority,
2176 2190 };
2177 - struct mem_cgroup *memcg = NULL;
2178 - mem_cgroup_iter_filter filter = (soft_reclaim) ?
2179 - mem_cgroup_soft_reclaim_eligible : NULL;
2191 + struct mem_cgroup *memcg;
2180 2192
2181 2193 nr_reclaimed = sc->nr_reclaimed;
2182 2194 nr_scanned = sc->nr_scanned;
2183 2195
2184 - while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
2196 + memcg = mem_cgroup_iter(root, NULL, &reclaim);
2197 + do {
2185 2198 struct lruvec *lruvec;
2186 2199
2187 - groups_scanned++;
2188 2200 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2189 2201
2190 2202 shrink_lruvec(lruvec, sc);
···
2202 2218 mem_cgroup_iter_break(root, memcg);
2203 2219 break;
2204 2220 }
2205 - }
2221 + memcg = mem_cgroup_iter(root, memcg, &reclaim);
2222 + } while (memcg);
2206 2223
2207 2224 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
2208 2225 sc->nr_scanned - nr_scanned,
···
2211 2226
2212 2227 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
2213 2228 sc->nr_scanned - nr_scanned, sc));
2214 -
2215 - return groups_scanned;
2216 - }
2217 -
2218 -
2219 - static void shrink_zone(struct zone *zone, struct scan_control *sc)
2220 - {
2221 - bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
2222 - unsigned long nr_scanned = sc->nr_scanned;
2223 - int scanned_groups;
2224 -
2225 - scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim);
2226 - /*
2227 - * memcg iterator might race with other reclaimer or start from
2228 - * a incomplete tree walk so the tree walk in __shrink_zone
2229 - * might have missed groups that are above the soft limit. Try
2230 - * another loop to catch up with others. Do it just once to
2231 - * prevent from reclaim latencies when other reclaimers always
2232 - * preempt this one.
2233 - */
2234 - if (do_soft_reclaim && !scanned_groups)
2235 - __shrink_zone(zone, sc, do_soft_reclaim);
2236 -
2237 - /*
2238 - * No group is over the soft limit or those that are do not have
2239 - * pages in the zone we are reclaiming so we have to reclaim everybody
2240 - */
2241 - if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) {
2242 - __shrink_zone(zone, sc, false);
2243 - return;
2244 - }
2245 2229 }
2246 2230
2247 2231 /* Returns true if compaction should go ahead for a high-order request */
···
2274 2320 {
2275 2321 struct zoneref *z;
2276 2322 struct zone *zone;
2323 + unsigned long nr_soft_reclaimed;
2324 + unsigned long nr_soft_scanned;
2277 2325 bool aborted_reclaim = false;
2278 2326
2279 2327 /*
···
2315 2359 continue;
2316 2360 }
2317 2361 }
2362 + /*
2363 + * This steals pages from memory cgroups over softlimit
2364 + * and returns the number of reclaimed pages and
2365 + * scanned pages. This works for global memory pressure
2366 + * and balancing, not for a memcg's limit.
2367 + */
2368 + nr_soft_scanned = 0;
2369 + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2370 + sc->order, sc->gfp_mask,
2371 + &nr_soft_scanned);
2372 + sc->nr_reclaimed += nr_soft_reclaimed;
2373 + sc->nr_scanned += nr_soft_scanned;
2318 2374 /* need some check for avoid more shrink_zone() */
2319 2375 }
2320 2376
···
2920 2952 {
2921 2953 int i;
2922 2954 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2955 + unsigned long nr_soft_reclaimed;
2956 + unsigned long nr_soft_scanned;
2923 2957 struct scan_control sc = {
2924 2958 .gfp_mask = GFP_KERNEL,
2925 2959 .priority = DEF_PRIORITY,
···
3035 3065 continue;
3036 3066
3037 3067 sc.nr_scanned = 0;
3068 +
3069 + nr_soft_scanned = 0;
3070 + /*
3071 + * Call soft limit reclaim before calling shrink_zone.
3072 + */
3073 + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
3074 + order, sc.gfp_mask,
3075 + &nr_soft_scanned);
3076 + sc.nr_reclaimed += nr_soft_reclaimed;
3038 3077
3039 3078 /*
3040 3079 * There should be no need to raise the scanning
+2 -2
scripts/checkpatch.pl
···
3975 3975 # check for new externs in .h files.
3976 3976 if ($realfile =~ /\.h$/ &&
3977 3977 $line =~ /^\+\s*(extern\s+)$Type\s*$Ident\s*\(/s) {
3978 - if (WARN("AVOID_EXTERNS",
3979 - "extern prototypes should be avoided in .h files\n" . $herecurr) &&
3978 + if (CHK("AVOID_EXTERNS",
3979 + "extern prototypes should be avoided in .h files\n" . $herecurr) &&
3980 3980 $fix) {
3981 3981 $fixed[$linenr - 1] =~ s/(.*)\bextern\b\s*(.*)/$1$2/;
3982 3982 }