Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: page_alloc: embed OOM killing naturally into allocation slowpath

The OOM killing invocation does a lot of duplicative checks against the
task's allocation context. Rework it to take advantage of the existing
checks in the allocator slowpath.

The OOM killer is invoked when the allocator is unable to reclaim any
pages but the allocation has to keep looping. Instead of having a check
for __GFP_NORETRY hidden in oom_gfp_allowed(), just move the OOM
invocation to the true branch of should_alloc_retry(). The __GFP_FS
check from oom_gfp_allowed() can then be moved into the OOM avoidance
branch in __alloc_pages_may_oom(), along with the PF_DUMPCORE test.

__alloc_pages_may_oom() can then signal to the caller whether the OOM
killer was invoked, instead of requiring it to duplicate the order and
high_zoneidx checks to guess this when deciding whether to continue.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Johannes Weiner; committed by Linus Torvalds.
Commit: 9879de73 (parent: 26bc420b)

Total: +35 -52

include/linux/oom.h: +0 -5
···
  85   85      oom_killer_disabled = false;
  86   86  }
  87   87
  88    -  static inline bool oom_gfp_allowed(gfp_t gfp_mask)
  89    -  {
  90    -      return (gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY);
  91    -  }
  92    -
  93   88  extern struct task_struct *find_lock_task_mm(struct task_struct *p);
  94   89
  95   90  static inline bool task_will_free_mem(struct task_struct *task)
mm/page_alloc.c: +35 -47
···
2332 2332  __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2333 2333          struct zonelist *zonelist, enum zone_type high_zoneidx,
2334 2334          nodemask_t *nodemask, struct zone *preferred_zone,
2335    -          int classzone_idx, int migratetype)
2335    +          int classzone_idx, int migratetype, unsigned long *did_some_progress)
2336 2336  {
2337 2337      struct page *page;
2338 2338
2339    -      /* Acquire the per-zone oom lock for each zone */
2339    +      *did_some_progress = 0;
2340    +
2341    +      if (oom_killer_disabled)
2342    +          return NULL;
2343    +
2344    +      /*
2345    +       * Acquire the per-zone oom lock for each zone. If that
2346    +       * fails, somebody else is making progress for us.
2347    +       */
2340 2348      if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
2349    +          *did_some_progress = 1;
2341 2350          schedule_timeout_uninterruptible(1);
2342 2351          return NULL;
2343 2352      }
···
2372 2363          goto out;
2373 2364
2374 2365      if (!(gfp_mask & __GFP_NOFAIL)) {
2366    +          /* Coredumps can quickly deplete all memory reserves */
2367    +          if (current->flags & PF_DUMPCORE)
2368    +              goto out;
2375 2369          /* The OOM killer will not help higher order allocs */
2376 2370          if (order > PAGE_ALLOC_COSTLY_ORDER)
2377 2371              goto out;
2378 2372          /* The OOM killer does not needlessly kill tasks for lowmem */
2379 2373          if (high_zoneidx < ZONE_NORMAL)
2374    +              goto out;
2375    +          /* The OOM killer does not compensate for light reclaim */
2376    +          if (!(gfp_mask & __GFP_FS))
2380 2377              goto out;
2381 2378          /*
2382 2379           * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
···
2396 2381  }
2397 2382  /* Exhausted what can be done so it's blamo time */
2398 2383  out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2399    -
2384    +  *did_some_progress = 1;
2400 2385  out:
2401 2386      oom_zonelist_unlock(zonelist, gfp_mask);
2402 2387      return page;
···
2673 2658      (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2674 2659          goto nopage;
2675 2660
2676    -  restart:
2661    +  retry:
2677 2662      if (!(gfp_mask & __GFP_NO_KSWAPD))
2678 2663          wake_all_kswapds(order, zonelist, high_zoneidx,
2679 2664              preferred_zone, nodemask);
···
2696 2681          classzone_idx = zonelist_zone_idx(preferred_zoneref);
2697 2682      }
2698 2683
2699    -  rebalance:
2700 2684      /* This is the last chance, in general, before the goto nopage. */
2701 2685      page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2702 2686          high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
···
2802 2788      if (page)
2803 2789          goto got_pg;
2804 2790
2805    -      /*
2806    -       * If we failed to make any progress reclaiming, then we are
2807    -       * running out of options and have to consider going OOM
2808    -       */
2809    -      if (!did_some_progress) {
2810    -          if (oom_gfp_allowed(gfp_mask)) {
2811    -              if (oom_killer_disabled)
2812    -                  goto nopage;
2813    -              /* Coredumps can quickly deplete all memory reserves */
2814    -              if ((current->flags & PF_DUMPCORE) &&
2815    -                  !(gfp_mask & __GFP_NOFAIL))
2816    -                  goto nopage;
2817    -              page = __alloc_pages_may_oom(gfp_mask, order,
2818    -                      zonelist, high_zoneidx,
2819    -                      nodemask, preferred_zone,
2820    -                      classzone_idx, migratetype);
2821    -              if (page)
2822    -                  goto got_pg;
2823    -
2824    -              if (!(gfp_mask & __GFP_NOFAIL)) {
2825    -                  /*
2826    -                   * The oom killer is not called for high-order
2827    -                   * allocations that may fail, so if no progress
2828    -                   * is being made, there are no other options and
2829    -                   * retrying is unlikely to help.
2830    -                   */
2831    -                  if (order > PAGE_ALLOC_COSTLY_ORDER)
2832    -                      goto nopage;
2833    -                  /*
2834    -                   * The oom killer is not called for lowmem
2835    -                   * allocations to prevent needlessly killing
2836    -                   * innocent tasks.
2837    -                   */
2838    -                  if (high_zoneidx < ZONE_NORMAL)
2839    -                      goto nopage;
2840    -              }
2841    -
2842    -              goto restart;
2843    -          }
2844    -      }
2845    -
2846 2791      /* Check if we should retry the allocation */
2847 2792      pages_reclaimed += did_some_progress;
2848 2793      if (should_alloc_retry(gfp_mask, order, did_some_progress,
2849 2794                      pages_reclaimed)) {
2795    +          /*
2796    +           * If we fail to make progress by freeing individual
2797    +           * pages, but the allocation wants us to keep going,
2798    +           * start OOM killing tasks.
2799    +           */
2800    +          if (!did_some_progress) {
2801    +              page = __alloc_pages_may_oom(gfp_mask, order, zonelist,
2802    +                          high_zoneidx, nodemask,
2803    +                          preferred_zone, classzone_idx,
2804    +                          migratetype,&did_some_progress);
2805    +              if (page)
2806    +                  goto got_pg;
2807    +              if (!did_some_progress)
2808    +                  goto nopage;
2809    +          }
2850 2810          /* Wait for some write requests to complete then retry */
2851 2811          wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2852    -          goto rebalance;
2812    +          goto retry;
2853 2813      } else {
2854 2814          /*
2855 2815           * High-order allocations do not necessarily loop after