
mm: multi-gen LRU: minimal implementation

To avoid confusion, the terms "promotion" and "demotion" will be applied
to the multi-gen LRU, as a new convention; the terms "activation" and
"deactivation" will be applied to the active/inactive LRU, as usual.

The aging produces young generations. Given an lruvec, it increments
max_seq when max_seq-min_seq+1 approaches MIN_NR_GENS. The aging promotes
hot pages to the youngest generation when it finds them accessed through
page tables; the demotion of cold pages happens consequently when it
increments max_seq. Promotion in the aging path does not involve any LRU
list operations, only the updates of the gen counter and
lrugen->nr_pages[]; demotion, unless as the result of the increment of
max_seq, requires LRU list operations, e.g., lru_deactivate_fn(). The
aging has the complexity O(nr_hot_pages), since it is only interested in
hot pages.

The eviction consumes old generations. Given an lruvec, it increments
min_seq when lrugen->lists[] indexed by min_seq%MAX_NR_GENS becomes empty.
A feedback loop modeled after the PID controller monitors refaults over
anon and file types and decides which type to evict when both types are
available from the same generation.

The protection of pages accessed multiple times through file descriptors
takes place in the eviction path. Each generation is divided into
multiple tiers. A page accessed N times through file descriptors is in
tier order_base_2(N). Tiers do not have dedicated lrugen->lists[], only
bits in folio->flags. The aforementioned feedback loop also monitors
refaults over all tiers and decides when to protect pages in which tiers
(N>1), using the first tier (N=0,1) as a baseline. The first tier
contains single-use unmapped clean pages, which are most likely the best
choices. In contrast to promotion in the aging path, the protection of a
page in the eviction path is achieved by moving this page to the next
generation, i.e., min_seq+1, if the feedback loop decides so. This
approach has the following advantages:

1. It removes the cost of activation in the buffered access path by
inferring whether pages accessed multiple times through file
descriptors are statistically hot and thus worth protecting in the
eviction path.
2. It takes pages accessed through page tables into account and avoids
overprotecting pages accessed multiple times through file
descriptors. (Pages accessed through page tables are in the first
tier, since N=0.)
3. More tiers provide better protection for pages accessed more than
twice through file descriptors, when under heavy buffered I/O
workloads.

Server benchmark results:
  Single workload:
    fio (buffered I/O): +[30, 32]%
              IOPS         BW
    5.19-rc1: 2673k        10.2GiB/s
    patch1-6: 3491k        13.3GiB/s

  Single workload:
    memcached (anon): -[4, 6]%
              Ops/sec      KB/sec
    5.19-rc1: 1161501.04   45177.25
    patch1-6: 1106168.46   43025.04

Configurations:
CPU: two Xeon 6154
Mem: total 256G

Node 1 was only used as a ram disk to reduce the variance in the
results.

patch drivers/block/brd.c <<EOF
99,100c99,100
< gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM;
< page = alloc_page(gfp_flags);
---
> gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM | __GFP_THISNODE;
> page = alloc_pages_node(1, gfp_flags, 0);
EOF

cat >>/etc/systemd/system.conf <<EOF
CPUAffinity=numa
NUMAPolicy=bind
NUMAMask=0
EOF

cat >>/etc/memcached.conf <<EOF
-m 184320
-s /var/run/memcached/memcached.sock
-a 0766
-t 36
-B binary
EOF

cat fio.sh
modprobe brd rd_nr=1 rd_size=113246208
swapoff -a
mkfs.ext4 /dev/ram0
mount -t ext4 /dev/ram0 /mnt

mkdir /sys/fs/cgroup/user.slice/test
echo 38654705664 >/sys/fs/cgroup/user.slice/test/memory.max
echo $$ >/sys/fs/cgroup/user.slice/test/cgroup.procs
fio -name=mglru --numjobs=72 --directory=/mnt --size=1408m \
--buffered=1 --ioengine=io_uring --iodepth=128 \
--iodepth_batch_submit=32 --iodepth_batch_complete=32 \
--rw=randread --random_distribution=random --norandommap \
--time_based --ramp_time=10m --runtime=5m --group_reporting

cat memcached.sh
modprobe brd rd_nr=1 rd_size=113246208
swapoff -a
mkswap /dev/ram0
swapon /dev/ram0

memtier_benchmark -S /var/run/memcached/memcached.sock \
-P memcache_binary -n allkeys --key-minimum=1 \
--key-maximum=65000000 --key-pattern=P:P -c 1 -t 36 \
--ratio 1:0 --pipeline 8 -d 2000

memtier_benchmark -S /var/run/memcached/memcached.sock \
-P memcache_binary -n allkeys --key-minimum=1 \
--key-maximum=65000000 --key-pattern=R:R -c 1 -t 36 \
--ratio 0:1 --pipeline 8 --randomize --distinct-client-seed

Client benchmark results:
kswapd profiles:
5.19-rc1
40.33% page_vma_mapped_walk (overhead)
21.80% lzo1x_1_do_compress (real work)
7.53% do_raw_spin_lock
3.95% _raw_spin_unlock_irq
2.52% vma_interval_tree_iter_next
2.37% folio_referenced_one
2.28% vma_interval_tree_subtree_search
1.97% anon_vma_interval_tree_iter_first
1.60% ptep_clear_flush
1.06% __zram_bvec_write

patch1-6
39.03% lzo1x_1_do_compress (real work)
18.47% page_vma_mapped_walk (overhead)
6.74% _raw_spin_unlock_irq
3.97% do_raw_spin_lock
2.49% ptep_clear_flush
2.48% anon_vma_interval_tree_iter_first
1.92% folio_referenced_one
1.88% __zram_bvec_write
1.48% memmove
1.31% vma_interval_tree_iter_next

Configurations:
CPU: single Snapdragon 7c
Mem: total 4G

ChromeOS MemoryPressure [1]

[1] https://chromium.googlesource.com/chromiumos/platform/tast-tests/

Link: https://lkml.kernel.org/r/20220918080010.2920238-7-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Authored by Yu Zhao and committed by Andrew Morton (ac35a490 ec1c86b2; +1025 -11)

include/linux/mm_inline.h (+36)

···
        return seq % MAX_NR_GENS;
 }

+static inline int lru_hist_from_seq(unsigned long seq)
+{
+       return seq % NR_HIST_GENS;
+}
+
+static inline int lru_tier_from_refs(int refs)
+{
+       VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));
+
+       /* see the comment in folio_lru_refs() */
+       return order_base_2(refs + 1);
+}
+
+static inline int folio_lru_refs(struct folio *folio)
+{
+       unsigned long flags = READ_ONCE(folio->flags);
+       bool workingset = flags & BIT(PG_workingset);
+
+       /*
+        * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
+        * total number of accesses is N>1, since N=0,1 both map to the first
+        * tier. lru_tier_from_refs() will account for this off-by-one. Also see
+        * the comment on MAX_NR_TIERS.
+        */
+       return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
+}
+
 static inline int folio_lru_gen(struct folio *folio)
 {
        unsigned long flags = READ_ONCE(folio->flags);
···
                __update_lru_size(lruvec, lru, zone, -delta);
                return;
        }
+
+       /* promotion */
+       if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
+               __update_lru_size(lruvec, lru, zone, -delta);
+               __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
+       }
+
+       /* demotion requires isolation, e.g., lru_deactivate_fn() */
+       VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
 }

 static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
include/linux/mmzone.h (+41)

···
 #define MIN_NR_GENS            2U
 #define MAX_NR_GENS            4U

+/*
+ * Each generation is divided into multiple tiers. A page accessed N times
+ * through file descriptors is in tier order_base_2(N). A page in the first tier
+ * (N=0,1) is marked by PG_referenced unless it was faulted in through page
+ * tables or read ahead. A page in any other tier (N>1) is marked by
+ * PG_referenced and PG_workingset. This implies a minimum of two tiers is
+ * supported without using additional bits in folio->flags.
+ *
+ * In contrast to moving across generations which requires the LRU lock, moving
+ * across tiers only involves atomic operations on folio->flags and therefore
+ * has a negligible cost in the buffered access path. In the eviction path,
+ * comparisons of refaulted/(evicted+protected) from the first tier and the
+ * rest infer whether pages accessed multiple times through file descriptors
+ * are statistically hot and thus worth protecting.
+ *
+ * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
+ * number of categories of the active/inactive LRU when keeping track of
+ * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
+ * folio->flags.
+ */
+#define MAX_NR_TIERS           4U
+
 #ifndef __GENERATING_BOUNDS_H

 struct lruvec;
···
        LRU_GEN_ANON,
        LRU_GEN_FILE,
 };
+
+#define MIN_LRU_BATCH          BITS_PER_LONG
+#define MAX_LRU_BATCH          (MIN_LRU_BATCH * 64)
+
+/* whether to keep historical stats from evicted generations */
+#ifdef CONFIG_LRU_GEN_STATS
+#define NR_HIST_GENS           MAX_NR_GENS
+#else
+#define NR_HIST_GENS           1U
+#endif

 /*
  * The youngest generation number is stored in max_seq for both anon and file
···
        struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
        /* the multi-gen LRU sizes, eventually consistent */
        long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+       /* the exponential moving average of refaulted */
+       unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
+       /* the exponential moving average of evicted+protected */
+       unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
+       /* the first tier doesn't need protection, hence the minus one */
+       unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
+       /* can be modified without holding the LRU lock */
+       atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+       atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
 };

 void lru_gen_init_lruvec(struct lruvec *lruvec);
include/linux/page-flags-layout.h (+4 -1)

···
 #error "Not enough bits in page flags"
 #endif

-#define LRU_REFS_WIDTH 0
+/* see the comment on MAX_NR_TIERS */
+#define LRU_REFS_WIDTH min(__LRU_REFS_WIDTH, BITS_PER_LONG - NR_PAGEFLAGS - \
+                           ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \
+                           NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH)

 #endif
 #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
kernel/bounds.c (+2)

···
        DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
 #ifdef CONFIG_LRU_GEN
        DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
+       DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
 #else
        DEFINE(LRU_GEN_WIDTH, 0);
+       DEFINE(__LRU_REFS_WIDTH, 0);
 #endif
        /* End of constants */
mm/Kconfig (+11)

···
          purposes. It is required to enable userfaultfd write protection on
          file-backed memory types like shmem and hugetlbfs.

+# multi-gen LRU {
 config LRU_GEN
        bool "Multi-Gen LRU"
        depends on MMU
···
        depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
        help
          A high performance LRU implementation to overcommit memory.
+
+config LRU_GEN_STATS
+       bool "Full stats for debugging"
+       depends on LRU_GEN
+       help
+         Do not enable this option unless you plan to look at historical stats
+         from evicted generations for debugging purpose.
+
+         This option has a per-memcg and per-node memory overhead.
+# }

 source "mm/damon/Kconfig"
mm/swap.c (+39)

···
        local_unlock(&cpu_fbatches.lock);
 }

+#ifdef CONFIG_LRU_GEN
+static void folio_inc_refs(struct folio *folio)
+{
+       unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+
+       if (folio_test_unevictable(folio))
+               return;
+
+       if (!folio_test_referenced(folio)) {
+               folio_set_referenced(folio);
+               return;
+       }
+
+       if (!folio_test_workingset(folio)) {
+               folio_set_workingset(folio);
+               return;
+       }
+
+       /* see the comment on MAX_NR_TIERS */
+       do {
+               new_flags = old_flags & LRU_REFS_MASK;
+               if (new_flags == LRU_REFS_MASK)
+                       break;
+
+               new_flags += BIT(LRU_REFS_PGOFF);
+               new_flags |= old_flags & ~LRU_REFS_MASK;
+       } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+}
+#else
+static void folio_inc_refs(struct folio *folio)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
 /*
  * Mark a page as having seen activity.
  *
···
  */
 void folio_mark_accessed(struct folio *folio)
 {
+       if (lru_gen_enabled()) {
+               folio_inc_refs(folio);
+               return;
+       }
+
        if (!folio_test_referenced(folio)) {
                folio_set_referenced(folio);
        } else if (folio_test_unevictable(folio)) {
mm/vmscan.c (+784 -8)

···

        if (folio_test_swapcache(folio)) {
                swp_entry_t swap = folio_swap_entry(folio);
-               mem_cgroup_swapout(folio, swap);
+
+               /* get a shadow entry before mem_cgroup_swapout() clears folio_memcg() */
                if (reclaimed && !mapping_exiting(mapping))
                        shadow = workingset_eviction(folio, target_memcg);
+               mem_cgroup_swapout(folio, swap);
                __delete_from_swap_cache(folio, swap, shadow);
                xa_unlock_irq(&mapping->i_pages);
                put_swap_page(&folio->page, swap);
···
        unsigned long file;
        struct lruvec *target_lruvec;

+       if (lru_gen_enabled())
+               return;
+
        target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

        /*
···
  *                          shorthand helpers
  ******************************************************************************/

+#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
+
+#define DEFINE_MAX_SEQ(lruvec)                                         \
+       unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
+
+#define DEFINE_MIN_SEQ(lruvec)                                         \
+       unsigned long min_seq[ANON_AND_FILE] = {                        \
+               READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]),      \
+               READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]),      \
+       }
+
 #define for_each_gen_type_zone(gen, type, zone)                                \
        for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++)                   \
                for ((type) = 0; (type) < ANON_AND_FILE; (type)++)      \
···
        VM_WARN_ON_ONCE(!mem_cgroup_disabled());

        return pgdat ? &pgdat->__lruvec : NULL;
 }

+static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
+{
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+       if (!can_demote(pgdat->node_id, sc) &&
+           mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
+               return 0;
+
+       return mem_cgroup_swappiness(memcg);
+}
+
+static int get_nr_gens(struct lruvec *lruvec, int type)
+{
+       return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
+}
+
+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
+{
+       /* see the comment on lru_gen_struct */
+       return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
+              get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
+              get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
+}
+
+/******************************************************************************
+ *                          refault feedback loop
+ ******************************************************************************/
+
+/*
+ * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
+ *
+ * The P term is refaulted/(evicted+protected) from a tier in the generation
+ * currently being evicted; the I term is the exponential moving average of the
+ * P term over the generations previously evicted, using the smoothing factor
+ * 1/2; the D term isn't supported.
+ *
+ * The setpoint (SP) is always the first tier of one type; the process variable
+ * (PV) is either any tier of the other type or any other tier of the same
+ * type.
+ *
+ * The error is the difference between the SP and the PV; the correction is to
+ * turn off protection when SP>PV or turn on protection when SP<PV.
+ *
+ * For future optimizations:
+ * 1. The D term may discount the other two terms over time so that long-lived
+ *    generations can resist stale information.
+ */
+struct ctrl_pos {
+       unsigned long refaulted;
+       unsigned long total;
+       int gain;
+};
+
+static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
+                         struct ctrl_pos *pos)
+{
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+       int hist = lru_hist_from_seq(lrugen->min_seq[type]);
+
+       pos->refaulted = lrugen->avg_refaulted[type][tier] +
+                        atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+       pos->total = lrugen->avg_total[type][tier] +
+                    atomic_long_read(&lrugen->evicted[hist][type][tier]);
+       if (tier)
+               pos->total += lrugen->protected[hist][type][tier - 1];
+       pos->gain = gain;
+}
+
+static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
+{
+       int hist, tier;
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+       bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
+       unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
+
+       lockdep_assert_held(&lruvec->lru_lock);
+
+       if (!carryover && !clear)
+               return;
+
+       hist = lru_hist_from_seq(seq);
+
+       for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+               if (carryover) {
+                       unsigned long sum;
+
+                       sum = lrugen->avg_refaulted[type][tier] +
+                             atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+                       WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
+
+                       sum = lrugen->avg_total[type][tier] +
+                             atomic_long_read(&lrugen->evicted[hist][type][tier]);
+                       if (tier)
+                               sum += lrugen->protected[hist][type][tier - 1];
+                       WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
+               }
+
+               if (clear) {
+                       atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
+                       atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
+                       if (tier)
+                               WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
+               }
+       }
+}
+
+static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
+{
+       /*
+        * Return true if the PV has a limited number of refaults or a lower
+        * refaulted/total than the SP.
+        */
+       return pv->refaulted < MIN_LRU_BATCH ||
+              pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
+              (sp->refaulted + 1) * pv->total * pv->gain;
+}
+
+/******************************************************************************
+ *                          the aging
+ ******************************************************************************/
+
+/* protect pages accessed multiple times through file descriptors */
+static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+       int type = folio_is_file_lru(folio);
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+       int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
+       unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+
+       VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
+
+       do {
+               new_gen = (old_gen + 1) % MAX_NR_GENS;
+
+               new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+               new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
+               /* for folio_end_writeback() */
+               if (reclaiming)
+                       new_flags |= BIT(PG_reclaim);
+       } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+
+       lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+
+       return new_gen;
+}
+
+static void inc_min_seq(struct lruvec *lruvec, int type)
+{
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+       reset_ctrl_pos(lruvec, type, true);
+       WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
+}
+
+static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
+{
+       int gen, type, zone;
+       bool success = false;
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+       DEFINE_MIN_SEQ(lruvec);
+
+       VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+
+       /* find the oldest populated generation */
+       for (type = !can_swap; type < ANON_AND_FILE; type++) {
+               while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
+                       gen = lru_gen_from_seq(min_seq[type]);
+
+                       for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                               if (!list_empty(&lrugen->lists[gen][type][zone]))
+                                       goto next;
+                       }
+
+                       min_seq[type]++;
+               }
+next:
+               ;
+       }
+
+       /* see the comment on lru_gen_struct */
+       if (can_swap) {
+               min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
+               min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
+       }
+
+       for (type = !can_swap; type < ANON_AND_FILE; type++) {
+               if (min_seq[type] == lrugen->min_seq[type])
+                       continue;
+
+               reset_ctrl_pos(lruvec, type, true);
+               WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
+               success = true;
+       }
+
+       return success;
+}
+
+static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap)
+{
+       int prev, next;
+       int type, zone;
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+       spin_lock_irq(&lruvec->lru_lock);
+
+       VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+
+       if (max_seq != lrugen->max_seq)
+               goto unlock;
+
+       for (type = ANON_AND_FILE - 1; type >= 0; type--) {
+               if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
+                       continue;
+
+               VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);
+
+               inc_min_seq(lruvec, type);
+       }
+
+       /*
+        * Update the active/inactive LRU sizes for compatibility. Both sides of
+        * the current max_seq need to be covered, since max_seq+1 can overlap
+        * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
+        * overlap, cold/hot inversion happens.
+        */
+       prev = lru_gen_from_seq(lrugen->max_seq - 1);
+       next = lru_gen_from_seq(lrugen->max_seq + 1);
+
+       for (type = 0; type < ANON_AND_FILE; type++) {
+               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                       enum lru_list lru = type * LRU_INACTIVE_FILE;
+                       long delta = lrugen->nr_pages[prev][type][zone] -
+                                    lrugen->nr_pages[next][type][zone];
+
+                       if (!delta)
+                               continue;
+
+                       __update_lru_size(lruvec, lru, zone, delta);
+                       __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
+               }
+       }
+
+       for (type = 0; type < ANON_AND_FILE; type++)
+               reset_ctrl_pos(lruvec, type, false);
+
+       /* make sure preceding modifications appear */
+       smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
+unlock:
+       spin_unlock_irq(&lruvec->lru_lock);
+}
+
+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
+                            struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+{
+       int gen, type, zone;
+       unsigned long old = 0;
+       unsigned long young = 0;
+       unsigned long total = 0;
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+       for (type = !can_swap; type < ANON_AND_FILE; type++) {
+               unsigned long seq;
+
+               for (seq = min_seq[type]; seq <= max_seq; seq++) {
+                       unsigned long size = 0;
+
+                       gen = lru_gen_from_seq(seq);
+
+                       for (zone = 0; zone < MAX_NR_ZONES; zone++)
+                               size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+                       total += size;
+                       if (seq == max_seq)
+                               young += size;
+                       else if (seq + MIN_NR_GENS == max_seq)
+                               old += size;
+               }
+       }
+
+       /* try to scrape all its memory if this memcg was deleted */
+       *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+
+       /*
+        * The aging tries to be lazy to reduce the overhead, while the eviction
+        * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
+        * ideal number of generations is MIN_NR_GENS+1.
+        */
+       if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
+               return true;
+       if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
+               return false;
+
+       /*
+        * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
+        * of the total number of pages for each generation. A reasonable range
+        * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
+        * aging cares about the upper bound of hot pages, while the eviction
+        * cares about the lower bound of cold pages.
+        */
+       if (young * MIN_NR_GENS > total)
+               return true;
+       if (old * (MIN_NR_GENS + 2) < total)
+               return true;
+
+       return false;
+}
+
+static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+       bool need_aging;
+       unsigned long nr_to_scan;
+       int swappiness = get_swappiness(lruvec, sc);
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+       DEFINE_MAX_SEQ(lruvec);
+       DEFINE_MIN_SEQ(lruvec);
+
+       VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
+
+       mem_cgroup_calculate_protection(NULL, memcg);
+
+       if (mem_cgroup_below_min(memcg))
+               return;
+
+       need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
+       if (need_aging)
+               inc_max_seq(lruvec, max_seq, swappiness);
+}
+
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+       struct mem_cgroup *memcg;
+
+       VM_WARN_ON_ONCE(!current_is_kswapd());
+
+       memcg = mem_cgroup_iter(NULL, NULL, NULL);
+       do {
+               struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+               age_lruvec(lruvec, sc);
+
+               cond_resched();
+       } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+}
+
+/******************************************************************************
+ *                          the eviction
+ ******************************************************************************/
+
+static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
+{
+       bool success;
+       int gen = folio_lru_gen(folio);
+       int type = folio_is_file_lru(folio);
+       int zone = folio_zonenum(folio);
+       int delta = folio_nr_pages(folio);
+       int refs = folio_lru_refs(folio);
+       int tier = lru_tier_from_refs(refs);
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+       VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
+
+       /* unevictable */
+       if (!folio_evictable(folio)) {
+               success = lru_gen_del_folio(lruvec, folio, true);
+               VM_WARN_ON_ONCE_FOLIO(!success, folio);
+               folio_set_unevictable(folio);
+               lruvec_add_folio(lruvec, folio);
+               __count_vm_events(UNEVICTABLE_PGCULLED, delta);
+               return true;
+       }
+
+       /* dirty lazyfree */
+       if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) {
+               success = lru_gen_del_folio(lruvec, folio, true);
+               VM_WARN_ON_ONCE_FOLIO(!success, folio);
+               folio_set_swapbacked(folio);
+               lruvec_add_folio_tail(lruvec, folio);
+               return true;
+       }
+
+       /* protected */
+       if (tier > tier_idx) {
+               int hist = lru_hist_from_seq(lrugen->min_seq[type]);
+
+               gen = folio_inc_gen(lruvec, folio, false);
+               list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
+
+               WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
+                          lrugen->protected[hist][type][tier - 1] + delta);
+               __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
+               return true;
+       }
+
+       /* waiting for writeback */
+       if (folio_test_locked(folio) || folio_test_writeback(folio) ||
+           (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
+               gen = folio_inc_gen(lruvec, folio, true);
+               list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
+               return true;
+       }
+
+       return false;
+}
+
+static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc)
+{
+       bool success;
+
+       /* unmapping inhibited */
+       if (!sc->may_unmap && folio_mapped(folio))
+               return false;
+
+       /* swapping inhibited */
+       if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
+           (folio_test_dirty(folio) ||
+            (folio_test_anon(folio) && !folio_test_swapcache(folio))))
+               return false;
+
+       /* raced with release_pages() */
+       if (!folio_try_get(folio))
+               return false;
+
+       /* raced with another isolation */
+       if (!folio_test_clear_lru(folio)) {
+               folio_put(folio);
+               return false;
+       }
+
+       /* see the comment on MAX_NR_TIERS */
+       if (!folio_test_referenced(folio))
+               set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
+
+       /* for shrink_page_list() */
+       folio_clear_reclaim(folio);
+       folio_clear_referenced(folio);
+
+       success = lru_gen_del_folio(lruvec, folio, true);
+       VM_WARN_ON_ONCE_FOLIO(!success, folio);
+
+       return true;
+}
+
+static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
+                      int type, int tier, struct list_head *list)
+{
+       int gen, zone;
+       enum vm_event_item item;
+       int sorted = 0;
+       int scanned = 0;
+       int isolated = 0;
+       int remaining = MAX_LRU_BATCH;
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+       VM_WARN_ON_ONCE(!list_empty(list));
+
+       if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
+               return 0;
+
+       gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+       for (zone = sc->reclaim_idx; zone >= 0; zone--) {
+               LIST_HEAD(moved);
+               int skipped = 0;
+               struct list_head *head = &lrugen->lists[gen][type][zone];
+
+               while (!list_empty(head)) {
+                       struct folio *folio = lru_to_folio(head);
+                       int delta = folio_nr_pages(folio);
+
+                       VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+                       VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+                       VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+                       VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+
+                       scanned += delta;
+
+                       if (sort_folio(lruvec, folio, tier))
+                               sorted += delta;
+                       else if (isolate_folio(lruvec, folio, sc)) {
+                               list_add(&folio->lru, list);
+                               isolated += delta;
+                       } else {
+                               list_move(&folio->lru, &moved);
+                               skipped += delta;
+                       }
+
+                       if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH)
+                               break;
+               }
+
+               if (skipped) {
+                       list_splice(&moved, head);
+                       __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
+               }
+
+               if (!remaining || isolated >= MIN_LRU_BATCH)
+                       break;
+       }
+
+       item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+       if (!cgroup_reclaim(sc)) {
+               __count_vm_events(item, isolated);
+               __count_vm_events(PGREFILL, sorted);
+       }
+       __count_memcg_events(memcg, item, isolated);
+       __count_memcg_events(memcg, PGREFILL, sorted);
+       __count_vm_events(PGSCAN_ANON + type, isolated);
+
+       /*
+        * There might not be eligible pages due to reclaim_idx, may_unmap and
+        * may_writepage. Check the remaining to prevent livelock if it's not
+        * making progress.
+        */
+       return isolated || !remaining ? scanned : 0;
+}
+
+static int get_tier_idx(struct lruvec *lruvec, int type)
+{
+       int tier;
+       struct ctrl_pos sp, pv;
+
+       /*
+        * To leave a margin for fluctuations, use a larger gain factor (1:2).
+        * This value is chosen because any other tier would have at least twice
+        * as many refaults as the first tier.
+        */
+       read_ctrl_pos(lruvec, type, 0, 1, &sp);
+       for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+               read_ctrl_pos(lruvec, type, tier, 2, &pv);
+               if (!positive_ctrl_err(&sp, &pv))
+                       break;
+       }
+
+       return tier - 1;
+}
+
+static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
+{
+       int type, tier;
+       struct ctrl_pos sp, pv;
+       int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
+
+       /*
+        * Compare the first tier of anon with that of file to determine which
+        * type to scan. Also need to compare other tiers of the selected type
+        * with the first tier of the other type to determine the last tier (of
+        * the selected type) to evict.
+        */
+       read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
+       read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
+       type = positive_ctrl_err(&sp, &pv);
+
+       read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
+       for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+               read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
+               if (!positive_ctrl_err(&sp, &pv))
+                       break;
+       }
+
+       *tier_idx = tier - 1;
+
+       return type;
+}
+
+static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+                         int *type_scanned, struct list_head *list)
+{
+       int i;
+       int type;
+       int scanned;
+       int tier = -1;
+       DEFINE_MIN_SEQ(lruvec);
+
+       /*
+        * Try to make the obvious choice first.
When anon and file are both 3662 + * available from the same generation, interpret swappiness 1 as file 3663 + * first and 200 as anon first. 3664 + */ 3665 + if (!swappiness) 3666 + type = LRU_GEN_FILE; 3667 + else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) 3668 + type = LRU_GEN_ANON; 3669 + else if (swappiness == 1) 3670 + type = LRU_GEN_FILE; 3671 + else if (swappiness == 200) 3672 + type = LRU_GEN_ANON; 3673 + else 3674 + type = get_type_to_scan(lruvec, swappiness, &tier); 3675 + 3676 + for (i = !swappiness; i < ANON_AND_FILE; i++) { 3677 + if (tier < 0) 3678 + tier = get_tier_idx(lruvec, type); 3679 + 3680 + scanned = scan_folios(lruvec, sc, type, tier, list); 3681 + if (scanned) 3682 + break; 3683 + 3684 + type = !type; 3685 + tier = -1; 3686 + } 3687 + 3688 + *type_scanned = type; 3689 + 3690 + return scanned; 3691 + } 3692 + 3693 + static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) 3694 + { 3695 + int type; 3696 + int scanned; 3697 + int reclaimed; 3698 + LIST_HEAD(list); 3699 + struct folio *folio; 3700 + enum vm_event_item item; 3701 + struct reclaim_stat stat; 3702 + struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3703 + struct pglist_data *pgdat = lruvec_pgdat(lruvec); 3704 + 3705 + spin_lock_irq(&lruvec->lru_lock); 3706 + 3707 + scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); 3708 + 3709 + scanned += try_to_inc_min_seq(lruvec, swappiness); 3710 + 3711 + if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) 3712 + scanned = 0; 3713 + 3714 + spin_unlock_irq(&lruvec->lru_lock); 3715 + 3716 + if (list_empty(&list)) 3717 + return scanned; 3718 + 3719 + reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); 3720 + 3721 + list_for_each_entry(folio, &list, lru) { 3722 + /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ 3723 + if (folio_test_workingset(folio)) 3724 + folio_set_referenced(folio); 3725 + 3726 + /* don't add rejected pages to the oldest generation */ 3727 + if 
(folio_test_reclaim(folio) && 3728 + (folio_test_dirty(folio) || folio_test_writeback(folio))) 3729 + folio_clear_active(folio); 3730 + else 3731 + folio_set_active(folio); 3732 + } 3733 + 3734 + spin_lock_irq(&lruvec->lru_lock); 3735 + 3736 + move_pages_to_lru(lruvec, &list); 3737 + 3738 + item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; 3739 + if (!cgroup_reclaim(sc)) 3740 + __count_vm_events(item, reclaimed); 3741 + __count_memcg_events(memcg, item, reclaimed); 3742 + __count_vm_events(PGSTEAL_ANON + type, reclaimed); 3743 + 3744 + spin_unlock_irq(&lruvec->lru_lock); 3745 + 3746 + mem_cgroup_uncharge_list(&list); 3747 + free_unref_page_list(&list); 3748 + 3749 + sc->nr_reclaimed += reclaimed; 3750 + 3751 + return scanned; 3752 + } 3753 + 3754 + static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, 3755 + bool can_swap) 3756 + { 3757 + bool need_aging; 3758 + unsigned long nr_to_scan; 3759 + struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3760 + DEFINE_MAX_SEQ(lruvec); 3761 + DEFINE_MIN_SEQ(lruvec); 3762 + 3763 + if (mem_cgroup_below_min(memcg) || 3764 + (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) 3765 + return 0; 3766 + 3767 + need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); 3768 + if (!need_aging) 3769 + return nr_to_scan; 3770 + 3771 + /* skip the aging path at the default priority */ 3772 + if (sc->priority == DEF_PRIORITY) 3773 + goto done; 3774 + 3775 + /* leave the work to lru_gen_age_node() */ 3776 + if (current_is_kswapd()) 3777 + return 0; 3778 + 3779 + inc_max_seq(lruvec, max_seq, can_swap); 3780 + done: 3781 + return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? 
nr_to_scan : 0; 3782 + } 3783 + 3784 + static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 3785 + { 3786 + struct blk_plug plug; 3787 + unsigned long scanned = 0; 3788 + 3789 + lru_add_drain(); 3790 + 3791 + blk_start_plug(&plug); 3792 + 3793 + while (true) { 3794 + int delta; 3795 + int swappiness; 3796 + unsigned long nr_to_scan; 3797 + 3798 + if (sc->may_swap) 3799 + swappiness = get_swappiness(lruvec, sc); 3800 + else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) 3801 + swappiness = 1; 3802 + else 3803 + swappiness = 0; 3804 + 3805 + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); 3806 + if (!nr_to_scan) 3807 + break; 3808 + 3809 + delta = evict_folios(lruvec, sc, swappiness); 3810 + if (!delta) 3811 + break; 3812 + 3813 + scanned += delta; 3814 + if (scanned >= nr_to_scan) 3815 + break; 3816 + 3817 + cond_resched(); 3818 + } 3819 + 3820 + blk_finish_plug(&plug); 3098 3821 } 3099 3822 3100 3823 /****************************************************************************** ··· 3878 3123 }; 3879 3124 late_initcall(init_lru_gen); 3880 3125 3126 + #else /* !CONFIG_LRU_GEN */ 3127 + 3128 + static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) 3129 + { 3130 + } 3131 + 3132 + static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 3133 + { 3134 + } 3135 + 3881 3136 #endif /* CONFIG_LRU_GEN */ 3882 3137 3883 3138 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ··· 3900 3135 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 3901 3136 struct blk_plug plug; 3902 3137 bool scan_adjusted; 3138 + 3139 + if (lru_gen_enabled()) { 3140 + lru_gen_shrink_lruvec(lruvec, sc); 3141 + return; 3142 + } 3903 3143 3904 3144 get_scan_count(lruvec, sc, nr); 3905 3145 ··· 4412 3642 struct lruvec *target_lruvec; 4413 3643 unsigned long refaults; 4414 3644 3645 + if (lru_gen_enabled()) 3646 + return; 3647 + 4415 3648 target_lruvec = mem_cgroup_lruvec(target_memcg, 
pgdat); 4416 3649 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); 4417 3650 target_lruvec->refaults[WORKINGSET_ANON] = refaults; ··· 4781 4008 } 4782 4009 #endif 4783 4010 4784 - static void age_active_anon(struct pglist_data *pgdat, 4785 - struct scan_control *sc) 4011 + static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) 4786 4012 { 4787 4013 struct mem_cgroup *memcg; 4788 4014 struct lruvec *lruvec; 4015 + 4016 + if (lru_gen_enabled()) { 4017 + lru_gen_age_node(pgdat, sc); 4018 + return; 4019 + } 4789 4020 4790 4021 if (!can_age_anon_pages(pgdat, sc)) 4791 4022 return; ··· 5110 4333 sc.may_swap = !nr_boost_reclaim; 5111 4334 5112 4335 /* 5113 - * Do some background aging of the anon list, to give 5114 - * pages a chance to be referenced before reclaiming. All 5115 - * pages are rotated regardless of classzone as this is 5116 - * about consistent aging. 4336 + * Do some background aging, to give pages a chance to be 4337 + * referenced before reclaiming. All pages are rotated 4338 + * regardless of classzone as this is about consistent aging. 5117 4339 */ 5118 - age_active_anon(pgdat, &sc); 4340 + kswapd_age_node(pgdat, &sc); 5119 4341 5120 4342 /* 5121 4343 * If we're getting trouble reclaiming, start doing writepage
mm/workingset.c (+108 -2)
···
 static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
 			 bool workingset)
 {
-	eviction >>= bucket_order;
 	eviction &= EVICTION_MASK;
 	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
 	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
···
 
 	*memcgidp = memcgid;
 	*pgdat = NODE_DATA(nid);
-	*evictionp = entry << bucket_order;
+	*evictionp = entry;
 	*workingsetp = workingset;
 }
+
+#ifdef CONFIG_LRU_GEN
+
+static void *lru_gen_eviction(struct folio *folio)
+{
+	int hist;
+	unsigned long token;
+	unsigned long min_seq;
+	struct lruvec *lruvec;
+	struct lru_gen_struct *lrugen;
+	int type = folio_is_file_lru(folio);
+	int delta = folio_nr_pages(folio);
+	int refs = folio_lru_refs(folio);
+	int tier = lru_tier_from_refs(refs);
+	struct mem_cgroup *memcg = folio_memcg(folio);
+	struct pglist_data *pgdat = folio_pgdat(folio);
+
+	BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
+
+	lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	lrugen = &lruvec->lrugen;
+	min_seq = READ_ONCE(lrugen->min_seq[type]);
+	token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
+
+	hist = lru_hist_from_seq(min_seq);
+	atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
+
+	return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
+}
+
+static void lru_gen_refault(struct folio *folio, void *shadow)
+{
+	int hist, tier, refs;
+	int memcg_id;
+	bool workingset;
+	unsigned long token;
+	unsigned long min_seq;
+	struct lruvec *lruvec;
+	struct lru_gen_struct *lrugen;
+	struct mem_cgroup *memcg;
+	struct pglist_data *pgdat;
+	int type = folio_is_file_lru(folio);
+	int delta = folio_nr_pages(folio);
+
+	unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
+
+	if (pgdat != folio_pgdat(folio))
+		return;
+
+	rcu_read_lock();
+
+	memcg = folio_memcg_rcu(folio);
+	if (memcg_id != mem_cgroup_id(memcg))
+		goto unlock;
+
+	lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	lrugen = &lruvec->lrugen;
+
+	min_seq = READ_ONCE(lrugen->min_seq[type]);
+	if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
+		goto unlock;
+
+	hist = lru_hist_from_seq(min_seq);
+	/* see the comment in folio_lru_refs() */
+	refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
+	tier = lru_tier_from_refs(refs);
+
+	atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
+	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
+
+	/*
+	 * Count the following two cases as stalls:
+	 * 1. For pages accessed through page tables, hotter pages pushed out
+	 *    hot pages which refaulted immediately.
+	 * 2. For pages accessed multiple times through file descriptors,
+	 *    numbers of accesses might have been out of the range.
+	 */
+	if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
+		folio_set_workingset(folio);
+		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
+	}
+unlock:
+	rcu_read_unlock();
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static void *lru_gen_eviction(struct folio *folio)
+{
+	return NULL;
+}
+
+static void lru_gen_refault(struct folio *folio, void *shadow)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
 
 /**
  * workingset_age_nonresident - age non-resident entries as LRU ages
···
 	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 
+	if (lru_gen_enabled())
+		return lru_gen_eviction(folio);
+
 	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
 	/* XXX: target_memcg can be NULL, go through lruvec */
 	memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
 	eviction = atomic_long_read(&lruvec->nonresident_age);
+	eviction >>= bucket_order;
 	workingset_age_nonresident(lruvec, folio_nr_pages(folio));
 	return pack_shadow(memcgid, pgdat, eviction,
 			   folio_test_workingset(folio));
···
 	int memcgid;
 	long nr;
 
+	if (lru_gen_enabled()) {
+		lru_gen_refault(folio, shadow);
+		return;
+	}
+
 	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+	eviction <<= bucket_order;
 
 	rcu_read_lock();
 	/*