Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

list_lru: introduce list_lru_shrink_{count,walk}

Kmem accounting of memcg is unusable now, because it lacks slab shrinker
support. That means when we hit the limit we will get ENOMEM without any
chance to recover. What we should do then is to call shrink_slab, which
would reclaim old inode/dentry caches from this cgroup. This is what
this patch set is intended to do.

Basically, it does two things. First, it introduces the notion of
per-memcg slab shrinker. A shrinker that wants to reclaim objects per
cgroup should mark itself as SHRINKER_MEMCG_AWARE. Then it will be
passed the memory cgroup to scan from in shrink_control->memcg. For
such shrinkers shrink_slab iterates over the whole cgroup subtree under
the target cgroup and calls the shrinker for each kmem-active memory
cgroup.

Secondly, this patch set makes the list_lru structure per-memcg. It's
done transparently to list_lru users - everything they have to do is to
tell list_lru_init that they want memcg-aware list_lru. Then the
list_lru will automatically distribute objects among per-memcg lists
based on which cgroup the object is accounted to. This way, to make FS
shrinkers (icache, dcache) memcg-aware we only need to make them use
memcg-aware list_lru, and this is what this patch set does.

As before, this patch set only enables per-memcg kmem reclaim when the
pressure goes from memory.limit, not from memory.kmem.limit. Handling
memory.kmem.limit is going to be tricky due to GFP_NOFS allocations, and
it is still unclear whether we will have this knob in the unified
hierarchy.

This patch (of 9):

NUMA aware slab shrinkers use the list_lru structure to distribute
objects coming from different NUMA nodes to different lists. Whenever
such a shrinker needs to count or scan objects from a particular node,
it issues commands like this:

count = list_lru_count_node(lru, sc->nid);
freed = list_lru_walk_node(lru, sc->nid, isolate_func,
isolate_arg, &sc->nr_to_scan);

where sc is an instance of the shrink_control structure passed to it
from vmscan.

To simplify this, let's add special list_lru functions to be used by
shrinkers, list_lru_shrink_count() and list_lru_shrink_walk(), which
consolidate the nid and nr_to_scan arguments in the shrink_control
structure.

This will also allow us to avoid patching shrinkers that use list_lru
when we make shrink_slab() per-memcg - all we will have to do is extend
the shrink_control structure to include the target memcg and make
list_lru_shrink_{count,walk} handle this appropriately.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Suggested-by: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Vladimir Davydov, committed by Linus Torvalds
503c358c 10c1045f

+51 -43
+6 -8
fs/dcache.c
··· 930 930 /** 931 931 * prune_dcache_sb - shrink the dcache 932 932 * @sb: superblock 933 - * @nr_to_scan : number of entries to try to free 934 - * @nid: which node to scan for freeable entities 933 + * @sc: shrink control, passed to list_lru_shrink_walk() 935 934 * 936 - * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is 937 - * done when we need more memory an called from the superblock shrinker 935 + * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This 936 + * is done when we need more memory and called from the superblock shrinker 938 937 * function. 939 938 * 940 939 * This function may fail to free any resources if all the dentries are in 941 940 * use. 942 941 */ 943 - long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, 944 - int nid) 942 + long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) 945 943 { 946 944 LIST_HEAD(dispose); 947 945 long freed; 948 946 949 - freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate, 950 - &dispose, &nr_to_scan); 947 + freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, 948 + dentry_lru_isolate, &dispose); 951 949 shrink_dentry_list(&dispose); 952 950 return freed; 953 951 }
+3 -3
fs/gfs2/quota.c
··· 171 171 if (!(sc->gfp_mask & __GFP_FS)) 172 172 return SHRINK_STOP; 173 173 174 - freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate, 175 - &dispose, &sc->nr_to_scan); 174 + freed = list_lru_shrink_walk(&gfs2_qd_lru, sc, 175 + gfs2_qd_isolate, &dispose); 176 176 177 177 gfs2_qd_dispose(&dispose); 178 178 ··· 182 182 static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, 183 183 struct shrink_control *sc) 184 184 { 185 - return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid)); 185 + return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc)); 186 186 } 187 187 188 188 struct shrinker gfs2_qd_shrinker = {
+3 -4
fs/inode.c
··· 751 751 * to trim from the LRU. Inodes to be freed are moved to a temporary list and 752 752 * then are freed outside inode_lock by dispose_list(). 753 753 */ 754 - long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, 755 - int nid) 754 + long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) 756 755 { 757 756 LIST_HEAD(freeable); 758 757 long freed; 759 758 760 - freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate, 761 - &freeable, &nr_to_scan); 759 + freed = list_lru_shrink_walk(&sb->s_inode_lru, sc, 760 + inode_lru_isolate, &freeable); 762 761 dispose_list(&freeable); 763 762 return freed; 764 763 }
+3 -4
fs/internal.h
··· 14 14 struct linux_binprm; 15 15 struct path; 16 16 struct mount; 17 + struct shrink_control; 17 18 18 19 /* 19 20 * block_dev.c ··· 112 111 * inode.c 113 112 */ 114 113 extern spinlock_t inode_sb_list_lock; 115 - extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, 116 - int nid); 114 + extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); 117 115 extern void inode_add_lru(struct inode *inode); 118 116 119 117 /* ··· 129 129 */ 130 130 extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); 131 131 extern int d_set_mounted(struct dentry *dentry); 132 - extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, 133 - int nid); 132 + extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); 134 133 135 134 /* 136 135 * read_write.c
+11 -13
fs/super.c
··· 77 77 if (sb->s_op->nr_cached_objects) 78 78 fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid); 79 79 80 - inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid); 81 - dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid); 80 + inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); 81 + dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); 82 82 total_objects = dentries + inodes + fs_objects + 1; 83 83 if (!total_objects) 84 84 total_objects = 1; ··· 86 86 /* proportion the scan between the caches */ 87 87 dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); 88 88 inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); 89 + fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); 89 90 90 91 /* 91 92 * prune the dcache first as the icache is pinned by it, then 92 93 * prune the icache, followed by the filesystem specific caches 93 94 */ 94 - freed = prune_dcache_sb(sb, dentries, sc->nid); 95 - freed += prune_icache_sb(sb, inodes, sc->nid); 95 + sc->nr_to_scan = dentries; 96 + freed = prune_dcache_sb(sb, sc); 97 + sc->nr_to_scan = inodes; 98 + freed += prune_icache_sb(sb, sc); 96 99 97 - if (fs_objects) { 98 - fs_objects = mult_frac(sc->nr_to_scan, fs_objects, 99 - total_objects); 100 + if (fs_objects) 100 101 freed += sb->s_op->free_cached_objects(sb, fs_objects, 101 102 sc->nid); 102 - } 103 103 104 104 drop_super(sb); 105 105 return freed; ··· 118 118 * scalability bottleneck. The counts could get updated 119 119 * between super_cache_count and super_cache_scan anyway. 120 120 * Call to super_cache_count with shrinker_rwsem held 121 - * ensures the safety of call to list_lru_count_node() and 121 + * ensures the safety of call to list_lru_shrink_count() and 122 122 * s_op->nr_cached_objects(). 
123 123 */ 124 124 if (sb->s_op && sb->s_op->nr_cached_objects) 125 125 total_objects = sb->s_op->nr_cached_objects(sb, 126 126 sc->nid); 127 127 128 - total_objects += list_lru_count_node(&sb->s_dentry_lru, 129 - sc->nid); 130 - total_objects += list_lru_count_node(&sb->s_inode_lru, 131 - sc->nid); 128 + total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); 129 + total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); 132 130 133 131 total_objects = vfs_pressure_ratio(total_objects); 134 132 return total_objects;
+3 -4
fs/xfs/xfs_buf.c
··· 1583 1583 struct xfs_buftarg, bt_shrinker); 1584 1584 LIST_HEAD(dispose); 1585 1585 unsigned long freed; 1586 - unsigned long nr_to_scan = sc->nr_to_scan; 1587 1586 1588 - freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate, 1589 - &dispose, &nr_to_scan); 1587 + freed = list_lru_shrink_walk(&btp->bt_lru, sc, 1588 + xfs_buftarg_isolate, &dispose); 1590 1589 1591 1590 while (!list_empty(&dispose)) { 1592 1591 struct xfs_buf *bp; ··· 1604 1605 { 1605 1606 struct xfs_buftarg *btp = container_of(shrink, 1606 1607 struct xfs_buftarg, bt_shrinker); 1607 - return list_lru_count_node(&btp->bt_lru, sc->nid); 1608 + return list_lru_shrink_count(&btp->bt_lru, sc); 1608 1609 } 1609 1610 1610 1611 void
+3 -4
fs/xfs/xfs_qm.c
··· 523 523 struct xfs_qm_isolate isol; 524 524 unsigned long freed; 525 525 int error; 526 - unsigned long nr_to_scan = sc->nr_to_scan; 527 526 528 527 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) 529 528 return 0; ··· 530 531 INIT_LIST_HEAD(&isol.buffers); 531 532 INIT_LIST_HEAD(&isol.dispose); 532 533 533 - freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol, 534 - &nr_to_scan); 534 + freed = list_lru_shrink_walk(&qi->qi_lru, sc, 535 + xfs_qm_dquot_isolate, &isol); 535 536 536 537 error = xfs_buf_delwri_submit(&isol.buffers); 537 538 if (error) ··· 556 557 struct xfs_quotainfo *qi = container_of(shrink, 557 558 struct xfs_quotainfo, qi_shrinker); 558 559 559 - return list_lru_count_node(&qi->qi_lru, sc->nid); 560 + return list_lru_shrink_count(&qi->qi_lru, sc); 560 561 } 561 562 562 563 /*
+16
include/linux/list_lru.h
··· 9 9 10 10 #include <linux/list.h> 11 11 #include <linux/nodemask.h> 12 + #include <linux/shrinker.h> 12 13 13 14 /* list_lru_walk_cb has to always return one of those */ 14 15 enum lru_status { ··· 82 81 * Callers that want such a guarantee need to provide an outer lock. 83 82 */ 84 83 unsigned long list_lru_count_node(struct list_lru *lru, int nid); 84 + 85 + static inline unsigned long list_lru_shrink_count(struct list_lru *lru, 86 + struct shrink_control *sc) 87 + { 88 + return list_lru_count_node(lru, sc->nid); 89 + } 90 + 85 91 static inline unsigned long list_lru_count(struct list_lru *lru) 86 92 { 87 93 long count = 0; ··· 126 118 unsigned long list_lru_walk_node(struct list_lru *lru, int nid, 127 119 list_lru_walk_cb isolate, void *cb_arg, 128 120 unsigned long *nr_to_walk); 121 + 122 + static inline unsigned long 123 + list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc, 124 + list_lru_walk_cb isolate, void *cb_arg) 125 + { 126 + return list_lru_walk_node(lru, sc->nid, isolate, cb_arg, 127 + &sc->nr_to_scan); 128 + } 129 129 130 130 static inline unsigned long 131 131 list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
+3 -3
mm/workingset.c
··· 275 275 276 276 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ 277 277 local_irq_disable(); 278 - shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); 278 + shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); 279 279 local_irq_enable(); 280 280 281 281 pages = node_present_pages(sc->nid); ··· 376 376 377 377 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ 378 378 local_irq_disable(); 379 - ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, 380 - shadow_lru_isolate, NULL, &sc->nr_to_scan); 379 + ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc, 380 + shadow_lru_isolate, NULL); 381 381 local_irq_enable(); 382 382 return ret; 383 383 }