Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

vmscan: per memory cgroup slab shrinkers

This patch adds SHRINKER_MEMCG_AWARE flag. If a shrinker has this flag
set, it will be called per memory cgroup. The memory cgroup to scan
objects from is passed in shrink_control->memcg. If the memory cgroup
is NULL, a memcg aware shrinker is supposed to scan objects from the
global list. Unaware shrinkers are only called on global pressure with
memcg=NULL.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Vladimir Davydov and committed by Linus Torvalds.
cb731d6c 4101b624

+79 -49
-14
fs/drop_caches.c
··· 37 37 iput(toput_inode); 38 38 } 39 39 40 - static void drop_slab(void) 41 - { 42 - int nr_objects; 43 - 44 - do { 45 - int nid; 46 - 47 - nr_objects = 0; 48 - for_each_online_node(nid) 49 - nr_objects += shrink_node_slabs(GFP_KERNEL, nid, 50 - 1000, 1000); 51 - } while (nr_objects > 10); 52 - } 53 - 54 40 int drop_caches_sysctl_handler(struct ctl_table *table, int write, 55 41 void __user *buffer, size_t *length, loff_t *ppos) 56 42 {
+7
include/linux/memcontrol.h
··· 413 413 return static_key_false(&memcg_kmem_enabled_key); 414 414 } 415 415 416 + bool memcg_kmem_is_active(struct mem_cgroup *memcg); 417 + 416 418 /* 417 419 * In general, we'll do everything in our power to not incur in any overhead 418 420 * for non-memcg users for the kmem functions. Not even a function call, if we ··· 540 538 for (; NULL; ) 541 539 542 540 static inline bool memcg_kmem_enabled(void) 541 + { 542 + return false; 543 + } 544 + 545 + static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) 543 546 { 544 547 return false; 545 548 }
+2 -3
include/linux/mm.h
··· 2168 2168 void __user *, size_t *, loff_t *); 2169 2169 #endif 2170 2170 2171 - unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, 2172 - unsigned long nr_scanned, 2173 - unsigned long nr_eligible); 2171 + void drop_slab(void); 2172 + void drop_slab_node(int nid); 2174 2173 2175 2174 #ifndef CONFIG_MMU 2176 2175 #define randomize_va_space 0
+5 -1
include/linux/shrinker.h
··· 20 20 21 21 /* current node being shrunk (for NUMA aware shrinkers) */ 22 22 int nid; 23 + 24 + /* current memcg being shrunk (for memcg aware shrinkers) */ 25 + struct mem_cgroup *memcg; 23 26 }; 24 27 25 28 #define SHRINK_STOP (~0UL) ··· 64 61 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ 65 62 66 63 /* Flags */ 67 - #define SHRINKER_NUMA_AWARE (1 << 0) 64 + #define SHRINKER_NUMA_AWARE (1 << 0) 65 + #define SHRINKER_MEMCG_AWARE (1 << 1) 68 66 69 67 extern int register_shrinker(struct shrinker *); 70 68 extern void unregister_shrinker(struct shrinker *);
+1 -1
mm/memcontrol.c
··· 352 352 }; 353 353 354 354 #ifdef CONFIG_MEMCG_KMEM 355 - static bool memcg_kmem_is_active(struct mem_cgroup *memcg) 355 + bool memcg_kmem_is_active(struct mem_cgroup *memcg) 356 356 { 357 357 return memcg->kmemcg_id >= 0; 358 358 }
+2 -9
mm/memory-failure.c
··· 242 242 * Only call shrink_node_slabs here (which would also shrink 243 243 * other caches) if access is not potentially fatal. 244 244 */ 245 - if (access) { 246 - int nr; 247 - int nid = page_to_nid(p); 248 - do { 249 - nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000); 250 - if (page_count(p) == 1) 251 - break; 252 - } while (nr > 10); 253 - } 245 + if (access) 246 + drop_slab_node(page_to_nid(p)); 254 247 } 255 248 EXPORT_SYMBOL_GPL(shake_page); 256 249
+62 -21
mm/vmscan.c
··· 232 232 233 233 #define SHRINK_BATCH 128 234 234 235 - static unsigned long shrink_slabs(struct shrink_control *shrinkctl, 236 - struct shrinker *shrinker, 237 - unsigned long nr_scanned, 238 - unsigned long nr_eligible) 235 + static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, 236 + struct shrinker *shrinker, 237 + unsigned long nr_scanned, 238 + unsigned long nr_eligible) 239 239 { 240 240 unsigned long freed = 0; 241 241 unsigned long long delta; ··· 344 344 } 345 345 346 346 /** 347 - * shrink_node_slabs - shrink slab caches of a given node 347 + * shrink_slab - shrink slab caches 348 348 * @gfp_mask: allocation context 349 349 * @nid: node whose slab caches to target 350 + * @memcg: memory cgroup whose slab caches to target 350 351 * @nr_scanned: pressure numerator 351 352 * @nr_eligible: pressure denominator 352 353 * ··· 355 354 * 356 355 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, 357 356 * unaware shrinkers will receive a node id of 0 instead. 357 + * 358 + * @memcg specifies the memory cgroup to target. If it is not NULL, 359 + * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan 360 + * objects from the memory cgroup specified. Otherwise all shrinkers 361 + * are called, and memcg aware shrinkers are supposed to scan the 362 + * global list then. 358 363 * 359 364 * @nr_scanned and @nr_eligible form a ratio that indicate how much of 360 365 * the available objects should be scanned. Page reclaim for example ··· 372 365 * 373 366 * Returns the number of reclaimed slab objects. 
374 367 */ 375 - unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, 376 - unsigned long nr_scanned, 377 - unsigned long nr_eligible) 368 + static unsigned long shrink_slab(gfp_t gfp_mask, int nid, 369 + struct mem_cgroup *memcg, 370 + unsigned long nr_scanned, 371 + unsigned long nr_eligible) 378 372 { 379 373 struct shrinker *shrinker; 380 374 unsigned long freed = 0; 375 + 376 + if (memcg && !memcg_kmem_is_active(memcg)) 377 + return 0; 381 378 382 379 if (nr_scanned == 0) 383 380 nr_scanned = SWAP_CLUSTER_MAX; ··· 401 390 struct shrink_control sc = { 402 391 .gfp_mask = gfp_mask, 403 392 .nid = nid, 393 + .memcg = memcg, 404 394 }; 395 + 396 + if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE)) 397 + continue; 405 398 406 399 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) 407 400 sc.nid = 0; 408 401 409 - freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); 402 + freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); 410 403 } 411 404 412 405 up_read(&shrinker_rwsem); 413 406 out: 414 407 cond_resched(); 415 408 return freed; 409 + } 410 + 411 + void drop_slab_node(int nid) 412 + { 413 + unsigned long freed; 414 + 415 + do { 416 + struct mem_cgroup *memcg = NULL; 417 + 418 + freed = 0; 419 + do { 420 + freed += shrink_slab(GFP_KERNEL, nid, memcg, 421 + 1000, 1000); 422 + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); 423 + } while (freed > 10); 424 + } 425 + 426 + void drop_slab(void) 427 + { 428 + int nid; 429 + 430 + for_each_online_node(nid) 431 + drop_slab_node(nid); 416 432 } 417 433 418 434 static inline int is_page_cache_freeable(struct page *page) ··· 2314 2276 static bool shrink_zone(struct zone *zone, struct scan_control *sc, 2315 2277 bool is_classzone) 2316 2278 { 2279 + struct reclaim_state *reclaim_state = current->reclaim_state; 2317 2280 unsigned long nr_reclaimed, nr_scanned; 2318 2281 bool reclaimable = false; 2319 2282 ··· 2333 2294 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2334 2295 do { 2335 
2296 unsigned long lru_pages; 2297 + unsigned long scanned; 2336 2298 struct lruvec *lruvec; 2337 2299 int swappiness; 2338 2300 ··· 2345 2305 2346 2306 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2347 2307 swappiness = mem_cgroup_swappiness(memcg); 2308 + scanned = sc->nr_scanned; 2348 2309 2349 2310 shrink_lruvec(lruvec, swappiness, sc, &lru_pages); 2350 2311 zone_lru_pages += lru_pages; 2312 + 2313 + if (memcg && is_classzone) 2314 + shrink_slab(sc->gfp_mask, zone_to_nid(zone), 2315 + memcg, sc->nr_scanned - scanned, 2316 + lru_pages); 2351 2317 2352 2318 /* 2353 2319 * Direct reclaim and kswapd have to scan all memory ··· 2376 2330 * Shrink the slab caches in the same proportion that 2377 2331 * the eligible LRU pages were scanned. 2378 2332 */ 2379 - if (global_reclaim(sc) && is_classzone) { 2380 - struct reclaim_state *reclaim_state; 2333 + if (global_reclaim(sc) && is_classzone) 2334 + shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, 2335 + sc->nr_scanned - nr_scanned, 2336 + zone_lru_pages); 2381 2337 2382 - shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), 2383 - sc->nr_scanned - nr_scanned, 2384 - zone_lru_pages); 2385 - 2386 - reclaim_state = current->reclaim_state; 2387 - if (reclaim_state) { 2388 - sc->nr_reclaimed += 2389 - reclaim_state->reclaimed_slab; 2390 - reclaim_state->reclaimed_slab = 0; 2391 - } 2338 + if (reclaim_state) { 2339 + sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2340 + reclaim_state->reclaimed_slab = 0; 2392 2341 } 2393 2342 2394 2343 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,