Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: vmscan: fix do_try_to_free_pages() livelock

This patch is based on KOSAKI's work and I add a little more description,
please refer https://lkml.org/lkml/2012/6/14/74.

Currently, I found the system can enter a state where there are lots of free
pages in a zone but only order-0 and order-1 pages, which means the zone is
heavily fragmented; then a high-order allocation can make the direct reclaim
path stall for a long time (e.g., 60 seconds), especially in a no-swap and
no-compaction environment. This problem happened on v3.4, but the issue seems
to still live in the current tree. The reason is that do_try_to_free_pages
enters a livelock:

kswapd will go to sleep if the zones have been fully scanned and are still
not balanced, as kswapd thinks there's little point trying all over again
to avoid an infinite loop. Instead it changes order from high-order to
0-order because kswapd thinks order-0 is the most important. Look at
73ce02e9 in detail. If watermarks are ok, kswapd will go back to sleep
and may leave zone->all_unreclaimable = 0. It assumes high-order users
can still perform direct reclaim if they wish.

Direct reclaim continues to reclaim for a high order which is not a
COSTLY_ORDER, without invoking the oom-killer, until kswapd turns on
zone->all_unreclaimable. This is to avoid a too-early oom-kill.
So it means direct_reclaim depends on kswapd to break this loop.

In the worst case, direct-reclaim may continue page reclaim forever while
kswapd sleeps forever, until someone like a watchdog detects it and finally
kills the process. As described in:
http://thread.gmane.org/gmane.linux.kernel.mm/103737

We can't turn on zone->all_unreclaimable from the direct reclaim path because
the direct reclaim path doesn't take any lock, so that way is racy. Thus this
patch removes the zone->all_unreclaimable field completely and recalculates
the zone's reclaimable state every time.

Note: we can't take the approach of having direct-reclaim look at
zone->pages_scanned directly while kswapd continues to use
zone->all_unreclaimable, because it is racy. Commit 929bea7c71 (vmscan:
all_unreclaimable() use zone->all_unreclaimable as a name) describes the
details.

[akpm@linux-foundation.org: uninline zone_reclaimable_pages() and zone_reclaimable()]
Cc: Aaditya Kumar <aaditya.kumar.30@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Nick Piggin <npiggin@gmail.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Bob Liu <lliubbo@gmail.com>
Cc: Neil Zhang <zhangwm@marvell.com>
Cc: Russell King - ARM Linux <linux@arm.linux.org.uk>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Minchan Kim <minchan@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Lisa Du <cldu@marvell.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Lisa Du and committed by
Linus Torvalds
6e543d57 7a8010cd

+44 -42
+1
include/linux/mm_inline.h
··· 2 2 #define LINUX_MM_INLINE_H 3 3 4 4 #include <linux/huge_mm.h> 5 + #include <linux/swap.h> 5 6 6 7 /** 7 8 * page_is_file_cache - should the page be on a file LRU or anon LRU?
-1
include/linux/mmzone.h
··· 353 353 * free areas of different sizes 354 354 */ 355 355 spinlock_t lock; 356 - int all_unreclaimable; /* All pages pinned */ 357 356 #if defined CONFIG_COMPACTION || defined CONFIG_CMA 358 357 /* Set to true when the PG_migrate_skip bits should be cleared */ 359 358 bool compact_blockskip_flush;
-1
include/linux/vmstat.h
··· 143 143 } 144 144 145 145 extern unsigned long global_reclaimable_pages(void); 146 - extern unsigned long zone_reclaimable_pages(struct zone *zone); 147 146 148 147 #ifdef CONFIG_NUMA 149 148 /*
+2
mm/internal.h
··· 85 85 */ 86 86 extern int isolate_lru_page(struct page *page); 87 87 extern void putback_lru_page(struct page *page); 88 + extern unsigned long zone_reclaimable_pages(struct zone *zone); 89 + extern bool zone_reclaimable(struct zone *zone); 88 90 89 91 /* 90 92 * in mm/rmap.c:
+1 -1
mm/migrate.c
··· 1471 1471 if (!populated_zone(zone)) 1472 1472 continue; 1473 1473 1474 - if (zone->all_unreclaimable) 1474 + if (!zone_reclaimable(zone)) 1475 1475 continue; 1476 1476 1477 1477 /* Avoid waking kswapd by allocating pages_to_migrate pages. */
+3
mm/page-writeback.c
··· 36 36 #include <linux/pagevec.h> 37 37 #include <linux/timer.h> 38 38 #include <linux/sched/rt.h> 39 + #include <linux/mm_inline.h> 39 40 #include <trace/events/writeback.h> 41 + 42 + #include "internal.h" 40 43 41 44 /* 42 45 * Sleep at most 200ms at a time in balance_dirty_pages().
+2 -3
mm/page_alloc.c
··· 56 56 #include <linux/ftrace_event.h> 57 57 #include <linux/memcontrol.h> 58 58 #include <linux/prefetch.h> 59 + #include <linux/mm_inline.h> 59 60 #include <linux/migrate.h> 60 61 #include <linux/page-debug-flags.h> 61 62 #include <linux/hugetlb.h> ··· 648 647 int to_free = count; 649 648 650 649 spin_lock(&zone->lock); 651 - zone->all_unreclaimable = 0; 652 650 zone->pages_scanned = 0; 653 651 654 652 while (to_free) { ··· 696 696 int migratetype) 697 697 { 698 698 spin_lock(&zone->lock); 699 - zone->all_unreclaimable = 0; 700 699 zone->pages_scanned = 0; 701 700 702 701 __free_one_page(page, zone, order, migratetype); ··· 3163 3164 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3164 3165 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3165 3166 zone->pages_scanned, 3166 - (zone->all_unreclaimable ? "yes" : "no") 3167 + (!zone_reclaimable(zone) ? "yes" : "no") 3167 3168 ); 3168 3169 printk("lowmem_reserve[]:"); 3169 3170 for (i = 0; i < MAX_NR_ZONES; i++)
+31 -35
mm/vmscan.c
··· 146 146 } 147 147 #endif 148 148 149 + unsigned long zone_reclaimable_pages(struct zone *zone) 150 + { 151 + int nr; 152 + 153 + nr = zone_page_state(zone, NR_ACTIVE_FILE) + 154 + zone_page_state(zone, NR_INACTIVE_FILE); 155 + 156 + if (get_nr_swap_pages() > 0) 157 + nr += zone_page_state(zone, NR_ACTIVE_ANON) + 158 + zone_page_state(zone, NR_INACTIVE_ANON); 159 + 160 + return nr; 161 + } 162 + 163 + bool zone_reclaimable(struct zone *zone) 164 + { 165 + return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; 166 + } 167 + 149 168 static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) 150 169 { 151 170 if (!mem_cgroup_disabled()) ··· 1808 1789 * latencies, so it's better to scan a minimum amount there as 1809 1790 * well. 1810 1791 */ 1811 - if (current_is_kswapd() && zone->all_unreclaimable) 1792 + if (current_is_kswapd() && !zone_reclaimable(zone)) 1812 1793 force_scan = true; 1813 1794 if (!global_reclaim(sc)) 1814 1795 force_scan = true; ··· 2263 2244 if (global_reclaim(sc)) { 2264 2245 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2265 2246 continue; 2266 - if (zone->all_unreclaimable && 2267 - sc->priority != DEF_PRIORITY) 2247 + if (sc->priority != DEF_PRIORITY && 2248 + !zone_reclaimable(zone)) 2268 2249 continue; /* Let kswapd poll it */ 2269 2250 if (IS_ENABLED(CONFIG_COMPACTION)) { 2270 2251 /* ··· 2302 2283 return aborted_reclaim; 2303 2284 } 2304 2285 2305 - static bool zone_reclaimable(struct zone *zone) 2306 - { 2307 - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; 2308 - } 2309 - 2310 2286 /* All zones in zonelist are unreclaimable? */ 2311 2287 static bool all_unreclaimable(struct zonelist *zonelist, 2312 2288 struct scan_control *sc) ··· 2315 2301 continue; 2316 2302 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2317 2303 continue; 2318 - if (!zone->all_unreclaimable) 2304 + if (zone_reclaimable(zone)) 2319 2305 return false; 2320 2306 } 2321 2307 ··· 2726 2712 * DEF_PRIORITY. 
Effectively, it considers them balanced so 2727 2713 * they must be considered balanced here as well! 2728 2714 */ 2729 - if (zone->all_unreclaimable) { 2715 + if (!zone_reclaimable(zone)) { 2730 2716 balanced_pages += zone->managed_pages; 2731 2717 continue; 2732 2718 } ··· 2787 2773 unsigned long lru_pages, 2788 2774 unsigned long *nr_attempted) 2789 2775 { 2790 - unsigned long nr_slab; 2791 2776 int testorder = sc->order; 2792 2777 unsigned long balance_gap; 2793 2778 struct reclaim_state *reclaim_state = current->reclaim_state; ··· 2831 2818 shrink_zone(zone, sc); 2832 2819 2833 2820 reclaim_state->reclaimed_slab = 0; 2834 - nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); 2821 + shrink_slab(&shrink, sc->nr_scanned, lru_pages); 2835 2822 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2836 2823 2837 2824 /* Account for the number of pages attempted to reclaim */ 2838 2825 *nr_attempted += sc->nr_to_reclaim; 2839 - 2840 - if (nr_slab == 0 && !zone_reclaimable(zone)) 2841 - zone->all_unreclaimable = 1; 2842 2826 2843 2827 zone_clear_flag(zone, ZONE_WRITEBACK); 2844 2828 ··· 2845 2835 * BDIs but as pressure is relieved, speculatively avoid congestion 2846 2836 * waits. 
2847 2837 */ 2848 - if (!zone->all_unreclaimable && 2838 + if (zone_reclaimable(zone) && 2849 2839 zone_balanced(zone, testorder, 0, classzone_idx)) { 2850 2840 zone_clear_flag(zone, ZONE_CONGESTED); 2851 2841 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); ··· 2911 2901 if (!populated_zone(zone)) 2912 2902 continue; 2913 2903 2914 - if (zone->all_unreclaimable && 2915 - sc.priority != DEF_PRIORITY) 2904 + if (sc.priority != DEF_PRIORITY && 2905 + !zone_reclaimable(zone)) 2916 2906 continue; 2917 2907 2918 2908 /* ··· 2990 2980 if (!populated_zone(zone)) 2991 2981 continue; 2992 2982 2993 - if (zone->all_unreclaimable && 2994 - sc.priority != DEF_PRIORITY) 2983 + if (sc.priority != DEF_PRIORITY && 2984 + !zone_reclaimable(zone)) 2995 2985 continue; 2996 2986 2997 2987 sc.nr_scanned = 0; ··· 3271 3261 if (get_nr_swap_pages() > 0) 3272 3262 nr += global_page_state(NR_ACTIVE_ANON) + 3273 3263 global_page_state(NR_INACTIVE_ANON); 3274 - 3275 - return nr; 3276 - } 3277 - 3278 - unsigned long zone_reclaimable_pages(struct zone *zone) 3279 - { 3280 - int nr; 3281 - 3282 - nr = zone_page_state(zone, NR_ACTIVE_FILE) + 3283 - zone_page_state(zone, NR_INACTIVE_FILE); 3284 - 3285 - if (get_nr_swap_pages() > 0) 3286 - nr += zone_page_state(zone, NR_ACTIVE_ANON) + 3287 - zone_page_state(zone, NR_INACTIVE_ANON); 3288 3264 3289 3265 return nr; 3290 3266 } ··· 3572 3576 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) 3573 3577 return ZONE_RECLAIM_FULL; 3574 3578 3575 - if (zone->all_unreclaimable) 3579 + if (!zone_reclaimable(zone)) 3576 3580 return ZONE_RECLAIM_FULL; 3577 3581 3578 3582 /*
+4 -1
mm/vmstat.c
··· 19 19 #include <linux/math64.h> 20 20 #include <linux/writeback.h> 21 21 #include <linux/compaction.h> 22 + #include <linux/mm_inline.h> 23 + 24 + #include "internal.h" 22 25 23 26 #ifdef CONFIG_VM_EVENT_COUNTERS 24 27 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; ··· 1091 1088 "\n all_unreclaimable: %u" 1092 1089 "\n start_pfn: %lu" 1093 1090 "\n inactive_ratio: %u", 1094 - zone->all_unreclaimable, 1091 + !zone_reclaimable(zone), 1095 1092 zone->zone_start_pfn, 1096 1093 zone->inactive_ratio); 1097 1094 seq_putc(m, '\n');