mm: introduce MADV_PAGEOUT

When a process expects no accesses to a certain memory range for a long
time, it can hint the kernel that the pages in that range can be
reclaimed instantly while their data is preserved for future use. This
can reduce workingset eviction and thereby increase performance.

This patch introduces the new MADV_PAGEOUT hint to the madvise(2)
syscall. A process can use MADV_PAGEOUT to mark a memory range as not
expected to be used for a long time, so that the kernel reclaims *any
LRU* pages in it instantly. The hint helps the kernel decide which
pages to evict proactively.

A note: this intentionally does not apply the SWAP_CLUSTER_MAX LRU page
isolation limit, because the batch is already bounded by the PMD size.
If the PMD size (e.g., 256 pages) causes trouble, we could fix it later
by limiting it to SWAP_CLUSTER_MAX [1].

- man-page material

MADV_PAGEOUT (since Linux x.x)

Do not expect access in the near future, so that pages in the specified
range can be reclaimed instantly regardless of memory pressure. An
access to the range after a successful operation may therefore incur a
major page fault, but, unlike MADV_DONTNEED, the up-to-date contents
are never lost. Pages belonging to a shared mapping are only processed
if write access is allowed for the calling process.

MADV_PAGEOUT cannot be applied to locked pages, Huge TLB pages, or
VM_PFNMAP pages.
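
As an illustration of the intended usage (not part of this patch), here
is a minimal userspace sketch, assuming a system with swap enabled: it
populates an anonymous mapping, hints it out with MADV_PAGEOUT, then
touches it again, taking a major fault but still observing the original
data. The fallback #define is only needed where libc headers predate
this change; the value 21 matches the uapi headers added below.

#define _DEFAULT_SOURCE		/* for MAP_ANONYMOUS */
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21		/* reclaim these pages */
#endif

int main(void)
{
	size_t len = 64UL << 20;	/* 64MB anonymous mapping */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(buf, 0xab, len);		/* fault the pages in */

	/*
	 * Hint that this range will not be touched for a long time.
	 * The pages are reclaimed now (to swap, for anonymous memory),
	 * but unlike MADV_DONTNEED their contents are preserved.
	 */
	if (madvise(buf, len, MADV_PAGEOUT))
		perror("madvise(MADV_PAGEOUT)");

	/* Re-access: likely a major fault, but the data is intact. */
	printf("buf[0] = 0x%x\n", (unsigned char)buf[0]);

	munmap(buf, len);
	return 0;
}

On kernels without this patch, madvise() fails with EINVAL for the
unknown advice; and, per the man-page text above, file-backed shared
mappings are silently skipped unless the caller is allowed write access
to the file.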

[1] https://lore.kernel.org/lkml/20190710194719.GS29695@dhcp22.suse.cz/

[minchan@kernel.org: clear PG_active on MADV_PAGEOUT]
Link: http://lkml.kernel.org/r/20190802200643.GA181880@google.com
[akpm@linux-foundation.org: resolve conflicts with hmm.git]
Link: http://lkml.kernel.org/r/20190726023435.214162-5-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Reported-by: kbuild test robot <lkp@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Chris Zankel <chris@zankel.net>
Cc: Daniel Colascione <dancol@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Oleksandr Natalenko <oleksandr@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Sonny Rao <sonnyrao@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tim Murray <timmurray@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Minchan Kim, committed by Linus Torvalds
1a4e58cc 8940b34a

8 files changed, 251 insertions(+)

arch/alpha/include/uapi/asm/mman.h (+1)
···
 #define MADV_KEEPONFORK	19	/* Undo MADV_WIPEONFORK */
 
 #define MADV_COLD	20	/* deactivate these pages */
+#define MADV_PAGEOUT	21	/* reclaim these pages */
 
 /* compatibility flags */
 #define MAP_FILE	0
arch/mips/include/uapi/asm/mman.h (+1)
···
 #define MADV_KEEPONFORK	19	/* Undo MADV_WIPEONFORK */
 
 #define MADV_COLD	20	/* deactivate these pages */
+#define MADV_PAGEOUT	21	/* reclaim these pages */
 
 /* compatibility flags */
 #define MAP_FILE	0
arch/parisc/include/uapi/asm/mman.h (+1)
···
 #define MADV_DOFORK	11	/* do inherit across fork */
 
 #define MADV_COLD	20	/* deactivate these pages */
+#define MADV_PAGEOUT	21	/* reclaim these pages */
 
 #define MADV_MERGEABLE   65	/* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 66	/* KSM may not merge identical pages */
arch/xtensa/include/uapi/asm/mman.h (+1)
···
 #define MADV_KEEPONFORK	19	/* Undo MADV_WIPEONFORK */
 
 #define MADV_COLD	20	/* deactivate these pages */
+#define MADV_PAGEOUT	21	/* reclaim these pages */
 
 /* compatibility flags */
 #define MAP_FILE	0
include/linux/swap.h (+1)
···
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
 
+extern unsigned long reclaim_pages(struct list_head *page_list);
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
include/uapi/asm-generic/mman-common.h (+1)
···
 #define MADV_KEEPONFORK	19	/* Undo MADV_WIPEONFORK */
 
 #define MADV_COLD	20	/* deactivate these pages */
+#define MADV_PAGEOUT	21	/* reclaim these pages */
 
 /* compatibility flags */
 #define MAP_FILE	0
mm/madvise.c (+189)
···
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_COLD:
+	case MADV_PAGEOUT:
 	case MADV_FREE:
 		return 0;
 	default:
···
 	return 0;
 }
 
+static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
+				unsigned long end, struct mm_walk *walk)
+{
+	struct mmu_gather *tlb = walk->private;
+	struct mm_struct *mm = tlb->mm;
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *orig_pte, *pte, ptent;
+	spinlock_t *ptl;
+	LIST_HEAD(page_list);
+	struct page *page;
+
+	if (fatal_signal_pending(current))
+		return -EINTR;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (pmd_trans_huge(*pmd)) {
+		pmd_t orig_pmd;
+		unsigned long next = pmd_addr_end(addr, end);
+
+		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+		ptl = pmd_trans_huge_lock(pmd, vma);
+		if (!ptl)
+			return 0;
+
+		orig_pmd = *pmd;
+		if (is_huge_zero_pmd(orig_pmd))
+			goto huge_unlock;
+
+		if (unlikely(!pmd_present(orig_pmd))) {
+			VM_BUG_ON(thp_migration_supported() &&
+					!is_pmd_migration_entry(orig_pmd));
+			goto huge_unlock;
+		}
+
+		page = pmd_page(orig_pmd);
+		if (next - addr != HPAGE_PMD_SIZE) {
+			int err;
+
+			if (page_mapcount(page) != 1)
+				goto huge_unlock;
+			get_page(page);
+			spin_unlock(ptl);
+			lock_page(page);
+			err = split_huge_page(page);
+			unlock_page(page);
+			put_page(page);
+			if (!err)
+				goto regular_page;
+			return 0;
+		}
+
+		if (pmd_young(orig_pmd)) {
+			pmdp_invalidate(vma, addr, pmd);
+			orig_pmd = pmd_mkold(orig_pmd);
+
+			set_pmd_at(mm, addr, pmd, orig_pmd);
+			tlb_remove_tlb_entry(tlb, pmd, addr);
+		}
+
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+
+		if (!isolate_lru_page(page))
+			list_add(&page->lru, &page_list);
+huge_unlock:
+		spin_unlock(ptl);
+		reclaim_pages(&page_list);
+		return 0;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+regular_page:
+#endif
+	tlb_change_page_size(tlb, PAGE_SIZE);
+	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	flush_tlb_batched_pending(mm);
+	arch_enter_lazy_mmu_mode();
+	for (; addr < end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		/*
+		 * Creating a THP page is expensive so split it only if we
+		 * are sure it's worth it. Split it if we are the only owner.
+		 */
+		if (PageTransCompound(page)) {
+			if (page_mapcount(page) != 1)
+				break;
+			get_page(page);
+			if (!trylock_page(page)) {
+				put_page(page);
+				break;
+			}
+			pte_unmap_unlock(orig_pte, ptl);
+			if (split_huge_page(page)) {
+				unlock_page(page);
+				put_page(page);
+				pte_offset_map_lock(mm, pmd, addr, &ptl);
+				break;
+			}
+			unlock_page(page);
+			put_page(page);
+			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+			pte--;
+			addr -= PAGE_SIZE;
+			continue;
+		}
+
+		VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+		if (pte_young(ptent)) {
+			ptent = ptep_get_and_clear_full(mm, addr, pte,
+							tlb->fullmm);
+			ptent = pte_mkold(ptent);
+			set_pte_at(mm, addr, pte, ptent);
+			tlb_remove_tlb_entry(tlb, pte, addr);
+		}
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+
+		if (!isolate_lru_page(page))
+			list_add(&page->lru, &page_list);
+	}
+
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(orig_pte, ptl);
+	reclaim_pages(&page_list);
+	cond_resched();
+
+	return 0;
+}
+
+static const struct mm_walk_ops madvise_pageout_walk_ops = {
+	.pmd_entry = madvise_pageout_pte_range,
+};
+
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+			struct vm_area_struct *vma,
+			unsigned long addr, unsigned long end)
+{
+	tlb_start_vma(tlb, vma);
+	walk_page_range(vma->vm_mm, addr, end, &madvise_pageout_walk_ops, tlb);
+	tlb_end_vma(tlb, vma);
+}
+
+static inline bool can_do_pageout(struct vm_area_struct *vma)
+{
+	if (vma_is_anonymous(vma))
+		return true;
+	if (!vma->vm_file)
+		return false;
+	/*
+	 * paging out pagecache only for non-anonymous mappings that correspond
+	 * to the files the calling process could (if tried) open for writing;
+	 * otherwise we'd be including shared non-exclusive mappings, which
+	 * opens a side channel.
+	 */
+	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static long madvise_pageout(struct vm_area_struct *vma,
+			struct vm_area_struct **prev,
+			unsigned long start_addr, unsigned long end_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather tlb;
+
+	*prev = vma;
+	if (!can_madv_lru_vma(vma))
+		return -EINVAL;
+
+	if (!can_do_pageout(vma))
+		return 0;
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+	tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+	return 0;
+}
+
 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
···
 		return madvise_willneed(vma, prev, start, end);
 	case MADV_COLD:
 		return madvise_cold(vma, prev, start, end);
+	case MADV_PAGEOUT:
+		return madvise_pageout(vma, prev, start, end);
 	case MADV_FREE:
 	case MADV_DONTNEED:
 		return madvise_dontneed_free(vma, prev, start, end, behavior);
···
 	case MADV_DONTNEED:
 	case MADV_FREE:
 	case MADV_COLD:
+	case MADV_PAGEOUT:
 #ifdef CONFIG_KSM
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
mm/vmscan.c (+56)
···
 			nr_deactivate, nr_rotated, sc->priority, file);
 }
 
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+	int nid = -1;
+	unsigned long nr_reclaimed = 0;
+	LIST_HEAD(node_page_list);
+	struct reclaim_stat dummy_stat;
+	struct page *page;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
+		.may_writepage = 1,
+		.may_unmap = 1,
+		.may_swap = 1,
+	};
+
+	while (!list_empty(page_list)) {
+		page = lru_to_page(page_list);
+		if (nid == -1) {
+			nid = page_to_nid(page);
+			INIT_LIST_HEAD(&node_page_list);
+		}
+
+		if (nid == page_to_nid(page)) {
+			ClearPageActive(page);
+			list_move(&page->lru, &node_page_list);
+			continue;
+		}
+
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						NODE_DATA(nid),
+						&sc, 0,
+						&dummy_stat, false);
+		while (!list_empty(&node_page_list)) {
+			page = lru_to_page(&node_page_list);
+			list_del(&page->lru);
+			putback_lru_page(page);
+		}
+
+		nid = -1;
+	}
+
+	if (!list_empty(&node_page_list)) {
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						NODE_DATA(nid),
+						&sc, 0,
+						&dummy_stat, false);
+		while (!list_empty(&node_page_list)) {
+			page = lru_to_page(&node_page_list);
+			list_del(&page->lru);
+			putback_lru_page(page);
+		}
+	}
+
+	return nr_reclaimed;
+}
+
 /*
  * The inactive anon list should be small enough that the VM never has
  * to do too much work.