mm: compaction: Partially revert capture of suitable high-order page

Eric Wong reported on 3.7 and 3.8-rc2 that ppoll() got stuck when
waiting for POLLIN on a local TCP socket. It was easier to trigger if
there were disk IO and dirty pages at the same time, and he bisected it to
commit 1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page
immediately when it is made available").

The intention of that patch was to improve high-order allocations under
memory pressure after changes made to reclaim in 3.6 drastically hurt
THP allocations, but the approach was flawed. For Eric, the problem was
that page->pfmemalloc was not being cleared for captured pages, which
interacted badly with swap-over-NFS support and caused packets to be
dropped. However, I identified a few more problems with the patch,
including the fact that it can increase contention on zone->lock in
some cases, which could result in async direct compaction being
aborted early.
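
For context, page->pfmemalloc is normally set or cleared in
get_page_from_freelist() for every page the allocator hands out, while
the capture path returned its page straight to
__alloc_pages_direct_compact() and skipped that step, so a stale flag
could survive. What follows is a rough, from-memory sketch of the
relevant 3.7-era logic in mm/page_alloc.c, not code touched by this
patch:

        /* Tail of get_page_from_freelist(), roughly as it was in 3.7 */
        if (page)
                /*
                 * Only an allocation that needed ALLOC_NO_WATERMARKS may
                 * keep using the PFMEMALLOC reserves; for everything else
                 * the flag is cleared here. Captured pages never reached
                 * this point.
                 */
                page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);

        return page;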

In retrospect the capture patch took the wrong approach. What it should
have done is mark the pageblock being migrated as MIGRATE_ISOLATE if it
was allocating for THP and avoided races that way. While the patch
appeared to improve allocation success rates at the time, the benefit
is marginal given the relative complexity, and it should be revisited
from scratch in the context of the other reclaim-related changes that
have taken place since the patch was first written and tested. This
patch partially reverts commit 1fb3f8ca0e92 ("mm: compaction: capture a
suitable high-order page immediately when it is made available").
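
The MIGRATE_ISOLATE alternative described above is not implemented by
this patch. Purely as an illustration of the suggested direction, and
using a hypothetical helper name, it could mirror the existing page
isolation code along these lines:

/*
 * Hypothetical sketch only, not part of this patch or any kernel
 * release: isolate the pageblock containing @page so that parallel
 * allocators cannot take pages from it while compaction migrates it,
 * then restore the original migratetype. Error handling, per-cpu list
 * draining and the THP-only policy checks are omitted.
 */
static void isolate_pageblock_for_thp(struct zone *zone, struct page *page)
{
        unsigned long flags;
        int old_mt;

        spin_lock_irqsave(&zone->lock, flags);
        old_mt = get_pageblock_migratetype(page);
        set_pageblock_migratetype(page, MIGRATE_ISOLATE);
        move_freepages_block(zone, page, MIGRATE_ISOLATE);
        spin_unlock_irqrestore(&zone->lock, flags);

        /* ... migrate the pages in the pageblock ... */

        spin_lock_irqsave(&zone->lock, flags);
        set_pageblock_migratetype(page, old_mt);
        move_freepages_block(zone, page, old_mt);
        spin_unlock_irqrestore(&zone->lock, flags);
}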

Reported-and-tested-by: Eric Wong <normalperson@yhbt.net>
Tested-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

5 files changed, 23 insertions(+), 110 deletions(-)

include/linux/compaction.h  +2 -2

···
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *mask,
-                       bool sync, bool *contended, struct page **page);
+                       bool sync, bool *contended);
 extern int compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
···
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
                        int order, gfp_t gfp_mask, nodemask_t *nodemask,
-                       bool sync, bool *contended, struct page **page)
+                       bool sync, bool *contended)
 {
         return COMPACT_CONTINUE;
 }
include/linux/mm.h  -1

···
 
 void split_page(struct page *page, unsigned int order);
 int split_free_page(struct page *page);
-int capture_free_page(struct page *page, int alloc_order, int migratetype);
 
 /*
  * Compound pages have a destructor function. Provide a
mm/compaction.c  +13 -79

···
 static int compact_finished(struct zone *zone,
                             struct compact_control *cc)
 {
+        unsigned int order;
         unsigned long watermark;
 
         if (fatal_signal_pending(current))
···
                 return COMPACT_CONTINUE;
 
         /* Direct compactor: Is a suitable page free? */
-        if (cc->page) {
-                /* Was a suitable page captured? */
-                if (*cc->page)
-                        return COMPACT_PARTIAL;
-        } else {
-                unsigned int order;
-                for (order = cc->order; order < MAX_ORDER; order++) {
-                        struct free_area *area = &zone->free_area[cc->order];
-                        /* Job done if page is free of the right migratetype */
-                        if (!list_empty(&area->free_list[cc->migratetype]))
-                                return COMPACT_PARTIAL;
+        for (order = cc->order; order < MAX_ORDER; order++) {
+                struct free_area *area = &zone->free_area[order];
 
-                        /* Job done if allocation would set block type */
-                        if (cc->order >= pageblock_order && area->nr_free)
-                                return COMPACT_PARTIAL;
-                }
+                /* Job done if page is free of the right migratetype */
+                if (!list_empty(&area->free_list[cc->migratetype]))
+                        return COMPACT_PARTIAL;
+
+                /* Job done if allocation would set block type */
+                if (cc->order >= pageblock_order && area->nr_free)
+                        return COMPACT_PARTIAL;
         }
 
         return COMPACT_CONTINUE;
···
                 return COMPACT_PARTIAL;
 
         return COMPACT_CONTINUE;
-}
-
-static void compact_capture_page(struct compact_control *cc)
-{
-        unsigned long flags;
-        int mtype, mtype_low, mtype_high;
-
-        if (!cc->page || *cc->page)
-                return;
-
-        /*
-         * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
-         * regardless of the migratetype of the freelist is is captured from.
-         * This is fine because the order for a high-order MIGRATE_MOVABLE
-         * allocation is typically at least a pageblock size and overall
-         * fragmentation is not impaired. Other allocation types must
-         * capture pages from their own migratelist because otherwise they
-         * could pollute other pageblocks like MIGRATE_MOVABLE with
-         * difficult to move pages and making fragmentation worse overall.
-         */
-        if (cc->migratetype == MIGRATE_MOVABLE) {
-                mtype_low = 0;
-                mtype_high = MIGRATE_PCPTYPES;
-        } else {
-                mtype_low = cc->migratetype;
-                mtype_high = cc->migratetype + 1;
-        }
-
-        /* Speculatively examine the free lists without zone lock */
-        for (mtype = mtype_low; mtype < mtype_high; mtype++) {
-                int order;
-                for (order = cc->order; order < MAX_ORDER; order++) {
-                        struct page *page;
-                        struct free_area *area;
-                        area = &(cc->zone->free_area[order]);
-                        if (list_empty(&area->free_list[mtype]))
-                                continue;
-
-                        /* Take the lock and attempt capture of the page */
-                        if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
-                                return;
-                        if (!list_empty(&area->free_list[mtype])) {
-                                page = list_entry(area->free_list[mtype].next,
-                                                        struct page, lru);
-                                if (capture_free_page(page, cc->order, mtype)) {
-                                        spin_unlock_irqrestore(&cc->zone->lock,
-                                                                        flags);
-                                        *cc->page = page;
-                                        return;
-                                }
-                        }
-                        spin_unlock_irqrestore(&cc->zone->lock, flags);
-                }
-        }
 }
 
 static int compact_zone(struct zone *zone, struct compact_control *cc)
···
                                 goto out;
                         }
                 }
-
-                /* Capture a page now if it is a suitable size */
-                compact_capture_page(cc);
         }
 
 out:
···
 
 static unsigned long compact_zone_order(struct zone *zone,
                                  int order, gfp_t gfp_mask,
-                                 bool sync, bool *contended,
-                                 struct page **page)
+                                 bool sync, bool *contended)
 {
         unsigned long ret;
         struct compact_control cc = {
···
                 .migratetype = allocflags_to_migratetype(gfp_mask),
                 .zone = zone,
                 .sync = sync,
-                .page = page,
         };
         INIT_LIST_HEAD(&cc.freepages);
         INIT_LIST_HEAD(&cc.migratepages);
···
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
                         int order, gfp_t gfp_mask, nodemask_t *nodemask,
-                        bool sync, bool *contended, struct page **page)
+                        bool sync, bool *contended)
 {
         enum zone_type high_zoneidx = gfp_zone(gfp_mask);
         int may_enter_fs = gfp_mask & __GFP_FS;
···
                 int status;
 
                 status = compact_zone_order(zone, order, gfp_mask, sync,
-                                                contended, page);
+                                                contended);
                 rc = max(status, rc);
 
                 /* If a normal allocation would succeed, stop compacting */
···
         struct compact_control cc = {
                 .order = order,
                 .sync = false,
-                .page = NULL,
         };
 
         return __compact_pgdat(pgdat, &cc);
···
         struct compact_control cc = {
                 .order = -1,
                 .sync = true,
-                .page = NULL,
         };
 
         return __compact_pgdat(NODE_DATA(nid), &cc);
mm/internal.h  -1

···
         int migratetype;                /* MOVABLE, RECLAIMABLE etc */
         struct zone *zone;
         bool contended;                 /* True if a lock was contended */
-        struct page **page;             /* Page captured of requested size */
 };
 
 unsigned long
mm/page_alloc.c  +8 -27

···
                 set_page_refcounted(page + i);
 }
 
-/*
- * Similar to the split_page family of functions except that the page
- * required at the given order and being isolated now to prevent races
- * with parallel allocators
- */
-int capture_free_page(struct page *page, int alloc_order, int migratetype)
+static int __isolate_free_page(struct page *page, unsigned int order)
 {
-        unsigned int order;
         unsigned long watermark;
         struct zone *zone;
         int mt;
···
         BUG_ON(!PageBuddy(page));
 
         zone = page_zone(page);
-        order = page_order(page);
         mt = get_pageblock_migratetype(page);
 
         if (mt != MIGRATE_ISOLATE) {
···
                 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
                         return 0;
 
-                __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
+                __mod_zone_freepage_state(zone, -(1UL << order), mt);
         }
 
         /* Remove page from free list */
···
         zone->free_area[order].nr_free--;
         rmv_page_order(page);
 
-        if (alloc_order != order)
-                expand(zone, page, alloc_order, order,
-                        &zone->free_area[order], migratetype);
-
-        /* Set the pageblock if the captured page is at least a pageblock */
+        /* Set the pageblock if the isolated page is at least a pageblock */
         if (order >= pageblock_order - 1) {
                 struct page *endpage = page + (1 << order) - 1;
                 for (; page < endpage; page += pageblock_nr_pages) {
···
                 }
         }
 
-        return 1UL << alloc_order;
+        return 1UL << order;
 }
 
 /*
···
         unsigned int order;
         int nr_pages;
 
-        BUG_ON(!PageBuddy(page));
         order = page_order(page);
 
-        nr_pages = capture_free_page(page, order, 0);
+        nr_pages = __isolate_free_page(page, order);
         if (!nr_pages)
                 return 0;
···
         bool *contended_compaction, bool *deferred_compaction,
         unsigned long *did_some_progress)
 {
-        struct page *page = NULL;
-
         if (!order)
                 return NULL;
···
         current->flags |= PF_MEMALLOC;
         *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
                                                 nodemask, sync_migration,
-                                                contended_compaction, &page);
+                                                contended_compaction);
         current->flags &= ~PF_MEMALLOC;
 
-        /* If compaction captured a page, prep and use it */
-        if (page) {
-                prep_new_page(page, order, gfp_mask);
-                goto got_page;
-        }
-
         if (*did_some_progress != COMPACT_SKIPPED) {
+                struct page *page;
+
                 /* Page migration frees to the PCP lists but we want merging */
                 drain_pages(get_cpu());
                 put_cpu();
···
                                 alloc_flags & ~ALLOC_NO_WATERMARKS,
                                 preferred_zone, migratetype);
                 if (page) {
-got_page:
                         preferred_zone->compact_blockskip_flush = false;
                         preferred_zone->compact_considered = 0;
                         preferred_zone->compact_defer_shift = 0;