Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

oom: select task from tasklist for mempolicy ooms

The oom killer presently kills current whenever there is no more memory
free or reclaimable on its mempolicy's nodes. There is no guarantee that
current is a memory-hogging task or that killing it will free any
substantial amount of memory, however.

In such situations, it is better to scan the tasklist for tasks that are
allowed to allocate on current's set of nodes and kill the task with the
highest badness() score. This ensures that the most memory-hogging task,
or the one configured by the user with /proc/pid/oom_adj, is always
selected in such scenarios.

Signed-off-by: David Rientjes <rientjes@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

David Rientjes and committed by
Linus Torvalds
6f48d0eb 5e9d834a

+124 -37
+12 -1
include/linux/mempolicy.h
··· 210 210 unsigned long addr, gfp_t gfp_flags, 211 211 struct mempolicy **mpol, nodemask_t **nodemask); 212 212 extern bool init_nodemask_of_mempolicy(nodemask_t *mask); 213 + extern bool mempolicy_nodemask_intersects(struct task_struct *tsk, 214 + const nodemask_t *mask); 213 215 extern unsigned slab_node(struct mempolicy *policy); 214 216 215 217 extern enum zone_type policy_zone; ··· 340 338 return node_zonelist(0, gfp_flags); 341 339 } 342 340 343 - static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; } 341 + static inline bool init_nodemask_of_mempolicy(nodemask_t *m) 342 + { 343 + return false; 344 + } 345 + 346 + static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk, 347 + const nodemask_t *mask) 348 + { 349 + return false; 350 + } 344 351 345 352 static inline int do_migrate_pages(struct mm_struct *mm, 346 353 const nodemask_t *from_nodes,
+44
mm/mempolicy.c
··· 1712 1712 } 1713 1713 #endif 1714 1714 1715 + /* 1716 + * mempolicy_nodemask_intersects 1717 + * 1718 + * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default 1719 + * policy. Otherwise, check for intersection between mask and the policy 1720 + * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local' 1721 + * policy, always return true since it may allocate elsewhere on fallback. 1722 + * 1723 + * Takes task_lock(tsk) to prevent freeing of its mempolicy. 1724 + */ 1725 + bool mempolicy_nodemask_intersects(struct task_struct *tsk, 1726 + const nodemask_t *mask) 1727 + { 1728 + struct mempolicy *mempolicy; 1729 + bool ret = true; 1730 + 1731 + if (!mask) 1732 + return ret; 1733 + task_lock(tsk); 1734 + mempolicy = tsk->mempolicy; 1735 + if (!mempolicy) 1736 + goto out; 1737 + 1738 + switch (mempolicy->mode) { 1739 + case MPOL_PREFERRED: 1740 + /* 1741 + * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to 1742 + * allocate from, they may fallback to other nodes when oom. 1743 + * Thus, it's possible for tsk to have allocated memory from 1744 + * nodes in mask. 1745 + */ 1746 + break; 1747 + case MPOL_BIND: 1748 + case MPOL_INTERLEAVE: 1749 + ret = nodes_intersects(mempolicy->v.nodes, *mask); 1750 + break; 1751 + default: 1752 + BUG(); 1753 + } 1754 + out: 1755 + task_unlock(tsk); 1756 + return ret; 1757 + } 1758 + 1715 1759 /* Allocate a page in interleaved policy. 1716 1760 Own path because it needs to do special accounting. */ 1717 1761 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
+68 -36
mm/oom_kill.c
··· 27 27 #include <linux/module.h> 28 28 #include <linux/notifier.h> 29 29 #include <linux/memcontrol.h> 30 + #include <linux/mempolicy.h> 30 31 #include <linux/security.h> 31 32 32 33 int sysctl_panic_on_oom; ··· 36 35 static DEFINE_SPINLOCK(zone_scan_lock); 37 36 /* #define DEBUG */ 38 37 39 - /* 40 - * Is all threads of the target process nodes overlap ours? 38 + #ifdef CONFIG_NUMA 39 + /** 40 + * has_intersects_mems_allowed() - check task eligibility for kill 41 + * @tsk: task struct of which task to consider 42 + * @mask: nodemask passed to page allocator for mempolicy ooms 43 + * 44 + * Task eligibility is determined by whether or not a candidate task, @tsk, 45 + * shares the same mempolicy nodes as current if it is bound by such a policy 46 + * and whether or not it has the same set of allowed cpuset nodes. 41 47 */ 42 - static int has_intersects_mems_allowed(struct task_struct *tsk) 48 + static bool has_intersects_mems_allowed(struct task_struct *tsk, 49 + const nodemask_t *mask) 43 50 { 44 - struct task_struct *t; 51 + struct task_struct *start = tsk; 45 52 46 - t = tsk; 47 53 do { 48 - if (cpuset_mems_allowed_intersects(current, t)) 49 - return 1; 50 - t = next_thread(t); 51 - } while (t != tsk); 52 - 53 - return 0; 54 + if (mask) { 55 + /* 56 + * If this is a mempolicy constrained oom, tsk's 57 + * cpuset is irrelevant. Only return true if its 58 + * mempolicy intersects current, otherwise it may be 59 + * needlessly killed. 60 + */ 61 + if (mempolicy_nodemask_intersects(tsk, mask)) 62 + return true; 63 + } else { 64 + /* 65 + * This is not a mempolicy constrained oom, so only 66 + * check the mems of tsk's cpuset. 
67 + */ 68 + if (cpuset_mems_allowed_intersects(current, tsk)) 69 + return true; 70 + } 71 + tsk = next_thread(tsk); 72 + } while (tsk != start); 73 + return false; 54 74 } 75 + #else 76 + static bool has_intersects_mems_allowed(struct task_struct *tsk, 77 + const nodemask_t *mask) 78 + { 79 + return true; 80 + } 81 + #endif /* CONFIG_NUMA */ 55 82 83 + /* 84 + * The process p may have detached its own ->mm while exiting or through 85 + * use_mm(), but one or more of its subthreads may still have a valid 86 + * pointer. Return p, or any of its subthreads with a valid ->mm, with 87 + * task_lock() held. 88 + */ 56 89 static struct task_struct *find_lock_task_mm(struct task_struct *p) 57 90 { 58 91 struct task_struct *t = p; ··· 141 106 * The memory size of the process is the basis for the badness. 142 107 */ 143 108 points = p->mm->total_vm; 144 - 145 - /* 146 - * After this unlock we can no longer dereference local variable `mm' 147 - */ 148 109 task_unlock(p); 149 110 150 111 /* ··· 284 253 * (not docbooked, we don't want this one cluttering up the manual) 285 254 */ 286 255 static struct task_struct *select_bad_process(unsigned long *ppoints, 287 - struct mem_cgroup *mem) 256 + struct mem_cgroup *mem, enum oom_constraint constraint, 257 + const nodemask_t *mask) 288 258 { 289 259 struct task_struct *p; 290 260 struct task_struct *chosen = NULL; ··· 301 269 continue; 302 270 if (mem && !task_in_mem_cgroup(p, mem)) 303 271 continue; 304 - if (!has_intersects_mems_allowed(p)) 272 + if (!has_intersects_mems_allowed(p, 273 + constraint == CONSTRAINT_MEMORY_POLICY ? mask : 274 + NULL)) 305 275 continue; 306 276 307 277 /* ··· 531 497 panic("out of memory(memcg). 
panic_on_oom is selected.\n"); 532 498 read_lock(&tasklist_lock); 533 499 retry: 534 - p = select_bad_process(&points, mem); 500 + p = select_bad_process(&points, mem, CONSTRAINT_NONE, NULL); 535 501 if (!p || PTR_ERR(p) == -1UL) 536 502 goto out; 537 503 ··· 610 576 /* 611 577 * Must be called with tasklist_lock held for read. 612 578 */ 613 - static void __out_of_memory(gfp_t gfp_mask, int order) 579 + static void __out_of_memory(gfp_t gfp_mask, int order, 580 + enum oom_constraint constraint, const nodemask_t *mask) 614 581 { 615 582 struct task_struct *p; 616 583 unsigned long points; ··· 625 590 * Rambo mode: Shoot down a process and hope it solves whatever 626 591 * issues we may have. 627 592 */ 628 - p = select_bad_process(&points, NULL); 593 + p = select_bad_process(&points, NULL, constraint, mask); 629 594 630 595 if (PTR_ERR(p) == -1UL) 631 596 return; ··· 659 624 panic("out of memory from page fault. panic_on_oom is selected.\n"); 660 625 661 626 read_lock(&tasklist_lock); 662 - __out_of_memory(0, 0); /* unknown gfp_mask and order */ 627 + /* unknown gfp_mask and order */ 628 + __out_of_memory(0, 0, CONSTRAINT_NONE, NULL); 663 629 read_unlock(&tasklist_lock); 664 630 665 631 /* ··· 676 640 * @zonelist: zonelist pointer 677 641 * @gfp_mask: memory allocation flags 678 642 * @order: amount of memory being requested as a power of 2 643 + * @nodemask: nodemask passed to page allocator 679 644 * 680 645 * If we run out of memory, we have the choice between either 681 646 * killing a random task (bad), letting the system crash (worse) ··· 715 678 */ 716 679 constraint = constrained_alloc(zonelist, gfp_mask, nodemask); 717 680 read_lock(&tasklist_lock); 718 - 719 - switch (constraint) { 720 - case CONSTRAINT_MEMORY_POLICY: 721 - oom_kill_process(current, gfp_mask, order, 0, NULL, 722 - "No available memory (MPOL_BIND)"); 723 - break; 724 - 725 - case CONSTRAINT_NONE: 726 - if (sysctl_panic_on_oom) { 681 + if (unlikely(sysctl_panic_on_oom)) { 682 + /* 683 + * 
panic_on_oom only affects CONSTRAINT_NONE, the kernel 684 + * should not panic for cpuset or mempolicy induced memory 685 + * failures. 686 + */ 687 + if (constraint == CONSTRAINT_NONE) { 727 688 dump_header(NULL, gfp_mask, order, NULL); 728 - panic("out of memory. panic_on_oom is selected\n"); 689 + read_unlock(&tasklist_lock); 690 + panic("Out of memory: panic_on_oom is enabled\n"); 729 691 } 730 - /* Fall-through */ 731 - case CONSTRAINT_CPUSET: 732 - __out_of_memory(gfp_mask, order); 733 - break; 734 692 } 735 - 693 + __out_of_memory(gfp_mask, order, constraint, nodemask); 736 694 read_unlock(&tasklist_lock); 737 695 738 696 /*