mm: multi-gen LRU: use {ptep,pmdp}_clear_young_notify()

When the MM_WALK capability is enabled, memory that is mostly accessed by
a VM appears younger than it really is, so it is less likely to be
evicted.  As a result, the presence of a running VM can significantly
increase swap-outs of non-VM memory, regressing performance for the rest
of the system.

Fix this regression by always calling {ptep,pmdp}_clear_young_notify()
whenever we clear the young bits on PMDs/PTEs.
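
As a rough illustration (not part of the patch; the helper name below is
made up), the change amounts to swapping the primary-MMU-only accessed-bit
helpers for their mmu_notifier-aware counterparts, so secondary MMUs such
as KVM have their accessed state tested and cleared as well:

  /* Hypothetical sketch of the helper swap; not code from this patch. */
  #include <linux/mm.h>
  #include <linux/mmu_notifier.h>

  static bool mglru_age_pte(struct vm_area_struct *vma, unsigned long addr,
  			    pte_t *pte)
  {
  	/*
  	 * Old: ptep_test_and_clear_young() clears the accessed bit in the
  	 * primary MMU's page table only, so pages accessed only through a
  	 * secondary MMU keep looking young.
  	 */
  	/* return ptep_test_and_clear_young(vma, addr, pte); */

  	/*
  	 * New: ptep_clear_young_notify() also calls
  	 * mmu_notifier_clear_young(), letting secondary MMUs (e.g. KVM)
  	 * report and clear their accessed bits too.
  	 */
  	return ptep_clear_young_notify(vma, addr, pte);
  }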

[jthoughton@google.com: fix link-time error]
Link: https://lkml.kernel.org/r/20241019012940.3656292-3-jthoughton@google.com
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
Reported-by: David Stevens <stevensd@google.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Matlack <dmatlack@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Wei Xu <weixugc@google.com>
Cc: <stable@vger.kernel.org>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>


Changed files: +55 -47

include/linux/mmzone.h: +3 -2

···
 
 void lru_gen_init_pgdat(struct pglist_data *pgdat);
 void lru_gen_init_lruvec(struct lruvec *lruvec);
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
 
 void lru_gen_init_memcg(struct mem_cgroup *memcg);
 void lru_gen_exit_memcg(struct mem_cgroup *memcg);
···
 {
 }
 
-static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 {
+	return false;
 }
 
 static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)

mm/rmap.c: +3 -6

···
 			return false;
 		}
 
-		if (pvmw.pte) {
-			if (lru_gen_enabled() &&
-			    pte_young(ptep_get(pvmw.pte))) {
-				lru_gen_look_around(&pvmw);
+		if (lru_gen_enabled() && pvmw.pte) {
+			if (lru_gen_look_around(&pvmw))
 				referenced++;
-			}
-
+		} else if (pvmw.pte) {
 			if (ptep_clear_flush_young_notify(vma, address,
 						pvmw.pte))
 				referenced++;

mm/vmscan.c: +49 -39

···
 #include <linux/khugepaged.h>
 #include <linux/rculist_nulls.h>
 #include <linux/random.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
···
 	return false;
 }
 
-static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr,
+				 struct pglist_data *pgdat)
 {
 	unsigned long pfn = pte_pfn(pte);
 
···
 	if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
 		return -1;
 
+	if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm))
+		return -1;
+
 	if (WARN_ON_ONCE(!pfn_valid(pfn)))
+		return -1;
+
+	if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
 		return -1;
 
 	return pfn;
 }
 
-static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
+static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr,
+				 struct pglist_data *pgdat)
 {
 	unsigned long pfn = pmd_pfn(pmd);
···
 	if (WARN_ON_ONCE(pmd_devmap(pmd)))
 		return -1;
 
+	if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm))
+		return -1;
+
 	if (WARN_ON_ONCE(!pfn_valid(pfn)))
+		return -1;
+
+	if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
 		return -1;
 
 	return pfn;
···
 				   struct pglist_data *pgdat, bool can_swap)
 {
 	struct folio *folio;
-
-	/* try to avoid unnecessary memory loads */
-	if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
-		return NULL;
 
 	folio = pfn_folio(pfn);
 	if (folio_nid(folio) != pgdat->node_id)
···
 		total++;
 		walk->mm_stats[MM_LEAF_TOTAL]++;
 
-		pfn = get_pte_pfn(ptent, args->vma, addr);
+		pfn = get_pte_pfn(ptent, args->vma, addr, pgdat);
 		if (pfn == -1)
 			continue;
-
-		if (!pte_young(ptent)) {
-			continue;
-		}
 
 		folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
 		if (!folio)
 			continue;
 
-		if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
-			VM_WARN_ON_ONCE(true);
+		if (!ptep_clear_young_notify(args->vma, addr, pte + i))
+			continue;
 
 		young++;
 		walk->mm_stats[MM_LEAF_YOUNG]++;
···
 		/* don't round down the first address */
 		addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first;
 
-		pfn = get_pmd_pfn(pmd[i], vma, addr);
-		if (pfn == -1)
+		if (!pmd_present(pmd[i]))
 			goto next;
 
 		if (!pmd_trans_huge(pmd[i])) {
-			if (!walk->force_scan && should_clear_pmd_young())
+			if (!walk->force_scan && should_clear_pmd_young() &&
+			    !mm_has_notifiers(args->mm))
 				pmdp_test_and_clear_young(vma, addr, pmd + i);
 			goto next;
 		}
+
+		pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat);
+		if (pfn == -1)
+			goto next;
 
 		folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
 		if (!folio)
 			goto next;
 
-		if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
+		if (!pmdp_clear_young_notify(vma, addr, pmd + i))
 			goto next;
 
 		walk->mm_stats[MM_LEAF_YOUNG]++;
···
 		}
 
 		if (pmd_trans_huge(val)) {
-			unsigned long pfn = pmd_pfn(val);
 			struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+			unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat);
 
 			walk->mm_stats[MM_LEAF_TOTAL]++;
 
-			if (!pmd_young(val)) {
-				continue;
-			}
-
-			/* try to avoid unnecessary memory loads */
-			if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
-				continue;
-
-			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
+			if (pfn != -1)
+				walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
 			continue;
 		}
 
-		if (!walk->force_scan && should_clear_pmd_young()) {
+		if (!walk->force_scan && should_clear_pmd_young() &&
+		    !mm_has_notifiers(args->mm)) {
 			if (!pmd_young(val))
 				continue;
···
  * the PTE table to the Bloom filter. This forms a feedback loop between the
  * eviction and the aging.
  */
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 {
 	int i;
 	unsigned long start;
 	unsigned long end;
 	struct lru_gen_mm_walk *walk;
-	int young = 0;
+	int young = 1;
 	pte_t *pte = pvmw->pte;
 	unsigned long addr = pvmw->address;
 	struct vm_area_struct *vma = pvmw->vma;
···
 	lockdep_assert_held(pvmw->ptl);
 	VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
 
+	if (!ptep_clear_young_notify(vma, addr, pte))
+		return false;
+
 	if (spin_is_contended(pvmw->ptl))
-		return;
+		return true;
 
 	/* exclude special VMAs containing anon pages from COW */
 	if (vma->vm_flags & VM_SPECIAL)
-		return;
+		return true;
 
 	/* avoid taking the LRU lock under the PTL when possible */
 	walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
 
 	start = max(addr & PMD_MASK, vma->vm_start);
 	end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
+
+	if (end - start == PAGE_SIZE)
+		return true;
 
 	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
 		if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
···
 
 	/* folio_update_gen() requires stable folio_memcg() */
 	if (!mem_cgroup_trylock_pages(memcg))
-		return;
+		return true;
 
 	arch_enter_lazy_mmu_mode();
 
···
 		unsigned long pfn;
 		pte_t ptent = ptep_get(pte + i);
 
-		pfn = get_pte_pfn(ptent, vma, addr);
+		pfn = get_pte_pfn(ptent, vma, addr, pgdat);
 		if (pfn == -1)
 			continue;
-
-		if (!pte_young(ptent))
-			continue;
 
 		folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
 		if (!folio)
 			continue;
 
-		if (!ptep_test_and_clear_young(vma, addr, pte + i))
-			VM_WARN_ON_ONCE(true);
+		if (!ptep_clear_young_notify(vma, addr, pte + i))
+			continue;
 
 		young++;
 
···
 	/* feedback from rmap walkers to page table walkers */
 	if (mm_state && suitable_to_scan(i, young))
 		update_bloom_filter(mm_state, max_seq, pvmw->pmd);
+
+	return true;
 }
 
 /******************************************************************************