Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

vmscan: narrow the scenarios in which lumpy reclaim uses synchronous reclaim

shrink_page_list() can decide to give up reclaiming a page under a
number of conditions such as

1. trylock_page() failure
2. page is unevictable
3. zone reclaim and page is mapped
4. PageWriteback() is true
5. page is swapbacked and swap is full
6. add_to_swap() failure
7. page is dirty and gfpmask don't have GFP_IO, GFP_FS
8. page is pinned
9. IO queue is congested
10. pageout() start IO, but not finished

With lumpy reclaim, failures result in entering synchronous lumpy reclaim
but this can be unnecessary. In cases (2), (3), (5), (6), (7) and (8),
there is no point retrying. This patch causes lumpy reclaim to abort when
it is known it will fail.

Case (9) is more interesting. The current behavior is:
1. start shrink_page_list(async)
2. found queue_congested()
3. skip pageout write
4. still start shrink_page_list(sync)
5. wait on a lot of pages
6. again, found queue_congested()
7. give up pageout write again

So, it's useless time wasting. However, just skipping page reclaim is
also not good as x86 allocating a huge page needs 512 pages for example.
It can have more dirty pages than queue congestion threshold (~=128).

After this patch, pageout() behaves as follows;

- If order > PAGE_ALLOC_COSTLY_ORDER
Ignore queue congestion always.
- If order <= PAGE_ALLOC_COSTLY_ORDER
skip write page and disable lumpy reclaim.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

KOSAKI Motohiro and committed by
Linus Torvalds
7d3579e8 bc57e00f

+78 -48
+3 -3
include/trace/events/vmscan.h
··· 25 25 26 26 #define trace_reclaim_flags(page, sync) ( \ 27 27 (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ 28 - (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ 28 + (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ 29 29 ) 30 30 31 31 #define trace_shrink_flags(file, sync) ( \ 32 - (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_MIXED : \ 32 + (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ 33 33 (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ 34 - (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ 34 + (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ 35 35 ) 36 36 37 37 TRACE_EVENT(mm_vmscan_kswapd_sleep,
+75 -45
mm/vmscan.c
··· 51 51 #define CREATE_TRACE_POINTS 52 52 #include <trace/events/vmscan.h> 53 53 54 + enum lumpy_mode { 55 + LUMPY_MODE_NONE, 56 + LUMPY_MODE_ASYNC, 57 + LUMPY_MODE_SYNC, 58 + }; 59 + 54 60 struct scan_control { 55 61 /* Incremented by the number of inactive pages that were scanned */ 56 62 unsigned long nr_scanned; ··· 88 82 * Intend to reclaim enough continuous memory rather than reclaim 89 83 * enough amount of memory. i.e, mode for high order allocation. 90 84 */ 91 - bool lumpy_reclaim_mode; 85 + enum lumpy_mode lumpy_reclaim_mode; 92 86 93 87 /* Which cgroup do we reclaim from */ 94 88 struct mem_cgroup *mem_cgroup; ··· 271 265 return ret; 272 266 } 273 267 268 + static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, 269 + bool sync) 270 + { 271 + enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; 272 + 273 + /* 274 + * Some reclaim have alredy been failed. No worth to try synchronous 275 + * lumpy reclaim. 276 + */ 277 + if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) 278 + return; 279 + 280 + /* 281 + * If we need a large contiguous chunk of memory, or have 282 + * trouble getting a small set of contiguous pages, we 283 + * will reclaim both active and inactive pages. 
284 + */ 285 + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 286 + sc->lumpy_reclaim_mode = mode; 287 + else if (sc->order && priority < DEF_PRIORITY - 2) 288 + sc->lumpy_reclaim_mode = mode; 289 + else 290 + sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; 291 + } 292 + 293 + static void disable_lumpy_reclaim_mode(struct scan_control *sc) 294 + { 295 + sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; 296 + } 297 + 274 298 static inline int is_page_cache_freeable(struct page *page) 275 299 { 276 300 /* ··· 311 275 return page_count(page) - page_has_private(page) == 2; 312 276 } 313 277 314 - static int may_write_to_queue(struct backing_dev_info *bdi) 278 + static int may_write_to_queue(struct backing_dev_info *bdi, 279 + struct scan_control *sc) 315 280 { 316 281 if (current->flags & PF_SWAPWRITE) 317 282 return 1; 318 283 if (!bdi_write_congested(bdi)) 319 284 return 1; 320 285 if (bdi == current->backing_dev_info) 286 + return 1; 287 + 288 + /* lumpy reclaim for hugepage often need a lot of write */ 289 + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 321 290 return 1; 322 291 return 0; 323 292 } ··· 348 307 unlock_page(page); 349 308 } 350 309 351 - /* Request for sync pageout. */ 352 - enum pageout_io { 353 - PAGEOUT_IO_ASYNC, 354 - PAGEOUT_IO_SYNC, 355 - }; 356 - 357 310 /* possible outcome of pageout() */ 358 311 typedef enum { 359 312 /* failed to write page out, page is locked */ ··· 365 330 * Calls ->writepage(). 
366 331 */ 367 332 static pageout_t pageout(struct page *page, struct address_space *mapping, 368 - enum pageout_io sync_writeback) 333 + struct scan_control *sc) 369 334 { 370 335 /* 371 336 * If the page is dirty, only perform writeback if that write ··· 401 366 } 402 367 if (mapping->a_ops->writepage == NULL) 403 368 return PAGE_ACTIVATE; 404 - if (!may_write_to_queue(mapping->backing_dev_info)) 369 + if (!may_write_to_queue(mapping->backing_dev_info, sc)) { 370 + disable_lumpy_reclaim_mode(sc); 405 371 return PAGE_KEEP; 372 + } 406 373 407 374 if (clear_page_dirty_for_io(page)) { 408 375 int res; ··· 430 393 * direct reclaiming a large contiguous area and the 431 394 * first attempt to free a range of pages fails. 432 395 */ 433 - if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) 396 + if (PageWriteback(page) && 397 + sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) 434 398 wait_on_page_writeback(page); 435 399 436 400 if (!PageWriteback(page)) { ··· 439 401 ClearPageReclaim(page); 440 402 } 441 403 trace_mm_vmscan_writepage(page, 442 - trace_reclaim_flags(page, sync_writeback)); 404 + trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); 443 405 inc_zone_page_state(page, NR_VMSCAN_WRITE); 444 406 return PAGE_SUCCESS; 445 407 } ··· 617 579 referenced_page = TestClearPageReferenced(page); 618 580 619 581 /* Lumpy reclaim - ignore references */ 620 - if (sc->lumpy_reclaim_mode) 582 + if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) 621 583 return PAGEREF_RECLAIM; 622 584 623 585 /* ··· 681 643 * shrink_page_list() returns the number of reclaimed pages 682 644 */ 683 645 static unsigned long shrink_page_list(struct list_head *page_list, 684 - struct scan_control *sc, 685 - enum pageout_io sync_writeback) 646 + struct scan_control *sc) 686 647 { 687 648 LIST_HEAD(ret_pages); 688 649 LIST_HEAD(free_pages); ··· 730 693 * for any page for which writeback has already 731 694 * started. 
732 695 */ 733 - if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) 696 + if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && 697 + may_enter_fs) 734 698 wait_on_page_writeback(page); 735 - else 736 - goto keep_locked; 699 + else { 700 + unlock_page(page); 701 + goto keep_lumpy; 702 + } 737 703 } 738 704 739 705 references = page_check_references(page, sc); ··· 790 750 goto keep_locked; 791 751 792 752 /* Page is dirty, try to write it out here */ 793 - switch (pageout(page, mapping, sync_writeback)) { 753 + switch (pageout(page, mapping, sc)) { 794 754 case PAGE_KEEP: 795 755 goto keep_locked; 796 756 case PAGE_ACTIVATE: 797 757 goto activate_locked; 798 758 case PAGE_SUCCESS: 799 - if (PageWriteback(page) || PageDirty(page)) 759 + if (PageWriteback(page)) 760 + goto keep_lumpy; 761 + if (PageDirty(page)) 800 762 goto keep; 763 + 801 764 /* 802 765 * A synchronous write - probably a ramdisk. Go 803 766 * ahead and try to reclaim the page. ··· 883 840 try_to_free_swap(page); 884 841 unlock_page(page); 885 842 putback_lru_page(page); 843 + disable_lumpy_reclaim_mode(sc); 886 844 continue; 887 845 888 846 activate_locked: ··· 896 852 keep_locked: 897 853 unlock_page(page); 898 854 keep: 855 + disable_lumpy_reclaim_mode(sc); 856 + keep_lumpy: 899 857 list_add(&page->lru, &ret_pages); 900 858 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 901 859 } ··· 1298 1252 return false; 1299 1253 1300 1254 /* Only stall on lumpy reclaim */ 1301 - if (!sc->lumpy_reclaim_mode) 1255 + if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) 1302 1256 return false; 1303 1257 1304 1258 /* If we have relaimed everything on the isolated list, no stall */ ··· 1343 1297 return SWAP_CLUSTER_MAX; 1344 1298 } 1345 1299 1346 - 1300 + set_lumpy_reclaim_mode(priority, sc, false); 1347 1301 lru_add_drain(); 1348 1302 spin_lock_irq(&zone->lru_lock); 1349 1303 1350 1304 if (scanning_global_lru(sc)) { 1351 1305 nr_taken = isolate_pages_global(nr_to_scan, 1352 1306 &page_list, &nr_scanned, sc->order, 
1353 - sc->lumpy_reclaim_mode ? 1354 - ISOLATE_BOTH : ISOLATE_INACTIVE, 1307 + sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? 1308 + ISOLATE_INACTIVE : ISOLATE_BOTH, 1355 1309 zone, 0, file); 1356 1310 zone->pages_scanned += nr_scanned; 1357 1311 if (current_is_kswapd()) ··· 1363 1317 } else { 1364 1318 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1365 1319 &page_list, &nr_scanned, sc->order, 1366 - sc->lumpy_reclaim_mode ? 1367 - ISOLATE_BOTH : ISOLATE_INACTIVE, 1320 + sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? 1321 + ISOLATE_INACTIVE : ISOLATE_BOTH, 1368 1322 zone, sc->mem_cgroup, 1369 1323 0, file); 1370 1324 /* ··· 1382 1336 1383 1337 spin_unlock_irq(&zone->lru_lock); 1384 1338 1385 - nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); 1339 + nr_reclaimed = shrink_page_list(&page_list, sc); 1386 1340 1387 1341 /* Check if we should syncronously wait for writeback */ 1388 1342 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { ··· 1393 1347 nr_active = clear_active_flags(&page_list, NULL); 1394 1348 count_vm_events(PGDEACTIVATE, nr_active); 1395 1349 1396 - nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC); 1350 + set_lumpy_reclaim_mode(priority, sc, true); 1351 + nr_reclaimed += shrink_page_list(&page_list, sc); 1397 1352 } 1398 1353 1399 1354 local_irq_disable(); ··· 1786 1739 } 1787 1740 } 1788 1741 1789 - static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc) 1790 - { 1791 - /* 1792 - * If we need a large contiguous chunk of memory, or have 1793 - * trouble getting a small set of contiguous pages, we 1794 - * will reclaim both active and inactive pages. 1795 - */ 1796 - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 1797 - sc->lumpy_reclaim_mode = 1; 1798 - else if (sc->order && priority < DEF_PRIORITY - 2) 1799 - sc->lumpy_reclaim_mode = 1; 1800 - else 1801 - sc->lumpy_reclaim_mode = 0; 1802 - } 1803 - 1804 1742 /* 1805 1743 * This is a basic per-zone page freer. 
Used by both kswapd and direct reclaim. 1806 1744 */ ··· 1799 1767 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1800 1768 1801 1769 get_scan_count(zone, sc, nr, priority); 1802 - 1803 - set_lumpy_reclaim_mode(priority, sc); 1804 1770 1805 1771 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1806 1772 nr[LRU_INACTIVE_FILE]) {