mm: compaction: Partially revert capture of suitable high-order page

Eric Wong reported on 3.7 and 3.8-rc2 that ppoll() got stuck when
waiting for POLLIN on a local TCP socket. It was easier to trigger if
there was disk IO and dirty pages at the same time and he bisected it to
commit 1fb3f8ca0e92 ("mm: compaction: capture a suitable high-order page
immediately when it is made available").

The intention of that patch was to improve high-order allocations under
memory pressure after changes made to reclaim in 3.6 drastically hurt
THP allocations, but the approach was flawed. For Eric, the problem was
that page->pfmemalloc was not being cleared for captured pages, leading
to a poor interaction with swap-over-NFS support that caused packets to
be dropped. However, I identified a few more problems with the patch,
including the fact that it can increase contention on zone->lock in
some cases, which could result in async direct compaction being
aborted early.
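
To make the pfmemalloc interaction concrete: in the normal allocation
path the flag is refreshed every time a page leaves the allocator,
whereas the capture path handed pages out directly and could leave a
stale value behind. Roughly, paraphrasing the two relevant checks
rather than quoting the exact code of that era:

	/* mm/page_alloc.c, get_page_from_freelist(): the flag is set on
	 * every successful allocation; captured pages bypassed this. */
	page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);

	/* net/core/filter.c, sk_filter(): skbs built from pfmemalloc pages
	 * are only accepted by SOCK_MEMALLOC sockets, so a stale flag means
	 * ordinary TCP packets are dropped and ppoll() never sees POLLIN. */
	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
		return -ENOMEM;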

In retrospect, the capture patch took the wrong approach. What it
should have done is mark the pageblock being migrated as
MIGRATE_ISOLATE if it was allocating for THP and avoid races that way.
While the patch was shown to improve allocation success rates at the
time, the benefit is marginal given the relative complexity, and it
should be revisited from scratch in the context of the other
reclaim-related changes that have taken place since the patch was first
written and tested. This patch partially reverts commit 1fb3f8ca0e92
("mm: compaction: capture a suitable high-order page immediately when
it is made available").
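
For illustration only, a rough sketch of the isolation-based approach
described above; this is hypothetical and not code from this revert or
any later patch:

	if (cc->order >= pageblock_order) {
		int old_mt = get_pageblock_migratetype(page);

		/* Keep parallel allocators off the pageblock while it is
		 * migrated for the THP-sized request. */
		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
		move_freepages_block(cc->zone, page, MIGRATE_ISOLATE);

		/* ... migrate and allocate from the isolated block ... */

		set_pageblock_migratetype(page, old_mt);
		move_freepages_block(cc->zone, page, old_mt);
	}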

Reported-and-tested-by: Eric Wong <normalperson@yhbt.net>
Tested-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

5 files changed, 23 insertions(+), 110 deletions(-)

include/linux/compaction.h (+2 -2)
···
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask,
-			bool sync, bool *contended, struct page **page);
+			bool sync, bool *contended);
 extern int compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
···
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended, struct page **page)
+			bool sync, bool *contended)
 {
 	return COMPACT_CONTINUE;
 }

include/linux/mm.h (-1)
···
 
 void split_page(struct page *page, unsigned int order);
 int split_free_page(struct page *page);
-int capture_free_page(struct page *page, int alloc_order, int migratetype);
 
 /*
  * Compound pages have a destructor function. Provide a

mm/compaction.c (+13 -79)
···
 static int compact_finished(struct zone *zone,
 			    struct compact_control *cc)
 {
+	unsigned int order;
 	unsigned long watermark;
 
 	if (fatal_signal_pending(current))
···
 		return COMPACT_CONTINUE;
 
 	/* Direct compactor: Is a suitable page free? */
-	if (cc->page) {
-		/* Was a suitable page captured? */
-		if (*cc->page)
-			return COMPACT_PARTIAL;
-	} else {
-		unsigned int order;
-		for (order = cc->order; order < MAX_ORDER; order++) {
-			struct free_area *area = &zone->free_area[cc->order];
-			/* Job done if page is free of the right migratetype */
-			if (!list_empty(&area->free_list[cc->migratetype]))
-				return COMPACT_PARTIAL;
+	for (order = cc->order; order < MAX_ORDER; order++) {
+		struct free_area *area = &zone->free_area[order];
 
-			/* Job done if allocation would set block type */
-			if (cc->order >= pageblock_order && area->nr_free)
-				return COMPACT_PARTIAL;
-		}
+		/* Job done if page is free of the right migratetype */
+		if (!list_empty(&area->free_list[cc->migratetype]))
+			return COMPACT_PARTIAL;
+
+		/* Job done if allocation would set block type */
+		if (cc->order >= pageblock_order && area->nr_free)
+			return COMPACT_PARTIAL;
 	}
 
 	return COMPACT_CONTINUE;
···
 		return COMPACT_PARTIAL;
 
 	return COMPACT_CONTINUE;
-}
-
-static void compact_capture_page(struct compact_control *cc)
-{
-	unsigned long flags;
-	int mtype, mtype_low, mtype_high;
-
-	if (!cc->page || *cc->page)
-		return;
-
-	/*
-	 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
-	 * regardless of the migratetype of the freelist is is captured from.
-	 * This is fine because the order for a high-order MIGRATE_MOVABLE
-	 * allocation is typically at least a pageblock size and overall
-	 * fragmentation is not impaired. Other allocation types must
-	 * capture pages from their own migratelist because otherwise they
-	 * could pollute other pageblocks like MIGRATE_MOVABLE with
-	 * difficult to move pages and making fragmentation worse overall.
-	 */
-	if (cc->migratetype == MIGRATE_MOVABLE) {
-		mtype_low = 0;
-		mtype_high = MIGRATE_PCPTYPES;
-	} else {
-		mtype_low = cc->migratetype;
-		mtype_high = cc->migratetype + 1;
-	}
-
-	/* Speculatively examine the free lists without zone lock */
-	for (mtype = mtype_low; mtype < mtype_high; mtype++) {
-		int order;
-		for (order = cc->order; order < MAX_ORDER; order++) {
-			struct page *page;
-			struct free_area *area;
-			area = &(cc->zone->free_area[order]);
-			if (list_empty(&area->free_list[mtype]))
-				continue;
-
-			/* Take the lock and attempt capture of the page */
-			if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
-				return;
-			if (!list_empty(&area->free_list[mtype])) {
-				page = list_entry(area->free_list[mtype].next,
-							struct page, lru);
-				if (capture_free_page(page, cc->order, mtype)) {
-					spin_unlock_irqrestore(&cc->zone->lock,
-								flags);
-					*cc->page = page;
-					return;
-				}
-			}
-			spin_unlock_irqrestore(&cc->zone->lock, flags);
-		}
-	}
 }
 
 static int compact_zone(struct zone *zone, struct compact_control *cc)
···
 				goto out;
 			}
 		}
-
-		/* Capture a page now if it is a suitable size */
-		compact_capture_page(cc);
 	}
 
 out:
···
 
 static unsigned long compact_zone_order(struct zone *zone,
 				 int order, gfp_t gfp_mask,
-				 bool sync, bool *contended,
-				 struct page **page)
+				 bool sync, bool *contended)
 {
 	unsigned long ret;
 	struct compact_control cc = {
···
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
 		.sync = sync,
-		.page = page,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
···
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended, struct page **page)
+			bool sync, bool *contended)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
···
 		int status;
 
 		status = compact_zone_order(zone, order, gfp_mask, sync,
-						contended, page);
+						contended);
 		rc = max(status, rc);
 
 		/* If a normal allocation would succeed, stop compacting */
···
 	struct compact_control cc = {
 		.order = order,
 		.sync = false,
-		.page = NULL,
 	};
 
 	return __compact_pgdat(pgdat, &cc);
···
 	struct compact_control cc = {
 		.order = -1,
 		.sync = true,
-		.page = NULL,
 	};
 
 	return __compact_pgdat(NODE_DATA(nid), &cc);

mm/internal.h (-1)
···
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
 	bool contended;			/* True if a lock was contended */
-	struct page **page;		/* Page captured of requested size */
 };
 
 unsigned long

mm/page_alloc.c (+8 -27)
···
 	set_page_refcounted(page + i);
 }
 
-/*
- * Similar to the split_page family of functions except that the page
- * required at the given order and being isolated now to prevent races
- * with parallel allocators
- */
-int capture_free_page(struct page *page, int alloc_order, int migratetype)
+static int __isolate_free_page(struct page *page, unsigned int order)
 {
-	unsigned int order;
 	unsigned long watermark;
 	struct zone *zone;
 	int mt;
···
 	BUG_ON(!PageBuddy(page));
 
 	zone = page_zone(page);
-	order = page_order(page);
 	mt = get_pageblock_migratetype(page);
···
 		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
 			return 0;
 
-		__mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
+		__mod_zone_freepage_state(zone, -(1UL << order), mt);
 	}
 
 	/* Remove page from free list */
···
 	zone->free_area[order].nr_free--;
 	rmv_page_order(page);
 
-	if (alloc_order != order)
-		expand(zone, page, alloc_order, order,
-			&zone->free_area[order], migratetype);
-
-	/* Set the pageblock if the captured page is at least a pageblock */
+	/* Set the pageblock if the isolated page is at least a pageblock */
 	if (order >= pageblock_order - 1) {
 		struct page *endpage = page + (1 << order) - 1;
 		for (; page < endpage; page += pageblock_nr_pages) {
···
 		}
 	}
 
-	return 1UL << alloc_order;
+	return 1UL << order;
 }
 
 /*
···
 	unsigned int order;
 	int nr_pages;
 
-	BUG_ON(!PageBuddy(page));
 	order = page_order(page);
 
-	nr_pages = capture_free_page(page, order, 0);
+	nr_pages = __isolate_free_page(page, order);
 	if (!nr_pages)
 		return 0;
 
···
 	bool *contended_compaction, bool *deferred_compaction,
 	unsigned long *did_some_progress)
 {
-	struct page *page = NULL;
-
 	if (!order)
 		return NULL;
 
···
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
 						nodemask, sync_migration,
-						contended_compaction, &page);
+						contended_compaction);
 	current->flags &= ~PF_MEMALLOC;
 
-	/* If compaction captured a page, prep and use it */
-	if (page) {
-		prep_new_page(page, order, gfp_mask);
-		goto got_page;
-	}
-
 	if (*did_some_progress != COMPACT_SKIPPED) {
+		struct page *page;
+
 		/* Page migration frees to the PCP lists but we want merging */
 		drain_pages(get_cpu());
 		put_cpu();
···
 				alloc_flags & ~ALLOC_NO_WATERMARKS,
 				preferred_zone, migratetype);
 		if (page) {
-got_page:
 			preferred_zone->compact_blockskip_flush = false;
 			preferred_zone->compact_considered = 0;
 			preferred_zone->compact_defer_shift = 0;