Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

memcg, slab: RCU protect memcg_params for root caches

We relocate a root cache's memcg_params whenever we need to grow the
memcg_caches array to accommodate all kmem-active memory cgroups.
Currently, on relocation we free the old version immediately, which can
lead to a use-after-free, because the memcg_caches array is accessed
lock-free (see cache_from_memcg_idx()). This patch fixes this by making
memcg_params RCU-protected for root caches: the old version is now freed
only after a grace period.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Vladimir Davydov and committed by Linus Torvalds
f8570263 f717eb3a

+30 -10
+7 -2
include/linux/slab.h
··· 513 513 * 514 514 * Both the root cache and the child caches will have it. For the root cache, 515 515 * this will hold a dynamically allocated array large enough to hold 516 - * information about the currently limited memcgs in the system. 516 + * information about the currently limited memcgs in the system. To allow the 517 + * array to be accessed without taking any locks, on relocation we free the old 518 + * version only after a grace period. 517 519 * 518 520 * Child caches will hold extra metadata needed for its operation. Fields are: 519 521 * ··· 530 528 struct memcg_cache_params { 531 529 bool is_root_cache; 532 530 union { 533 - struct kmem_cache *memcg_caches[0]; 531 + struct { 532 + struct rcu_head rcu_head; 533 + struct kmem_cache *memcg_caches[0]; 534 + }; 534 535 struct { 535 536 struct mem_cgroup *memcg; 536 537 struct list_head list;
+8 -7
mm/memcontrol.c
··· 3178 3178 3179 3179 if (num_groups > memcg_limited_groups_array_size) { 3180 3180 int i; 3181 + struct memcg_cache_params *new_params; 3181 3182 ssize_t size = memcg_caches_array_size(num_groups); 3182 3183 3183 3184 size *= sizeof(void *); 3184 3185 size += offsetof(struct memcg_cache_params, memcg_caches); 3185 3186 3186 - s->memcg_params = kzalloc(size, GFP_KERNEL); 3187 - if (!s->memcg_params) { 3188 - s->memcg_params = cur_params; 3187 + new_params = kzalloc(size, GFP_KERNEL); 3188 + if (!new_params) 3189 3189 return -ENOMEM; 3190 - } 3191 3190 3192 - s->memcg_params->is_root_cache = true; 3191 + new_params->is_root_cache = true; 3193 3192 3194 3193 /* 3195 3194 * There is the chance it will be bigger than ··· 3202 3203 for (i = 0; i < memcg_limited_groups_array_size; i++) { 3203 3204 if (!cur_params->memcg_caches[i]) 3204 3205 continue; 3205 - s->memcg_params->memcg_caches[i] = 3206 + new_params->memcg_caches[i] = 3206 3207 cur_params->memcg_caches[i]; 3207 3208 } 3208 3209 ··· 3215 3216 * bigger than the others. And all updates will reset this 3216 3217 * anyway. 3217 3218 */ 3218 - kfree(cur_params); 3219 + rcu_assign_pointer(s->memcg_params, new_params); 3220 + if (cur_params) 3221 + kfree_rcu(cur_params, rcu_head); 3219 3222 } 3220 3223 return 0; 3221 3224 }
+15 -1
mm/slab.h
··· 160 160 return s->name; 161 161 } 162 162 163 + /* 164 + * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. 165 + * That said the caller must assure the memcg's cache won't go away. Since once 166 + * created a memcg's cache is destroyed only along with the root cache, it is 167 + * true if we are going to allocate from the cache or hold a reference to the 168 + * root cache by other means. Otherwise, we should hold either the slab_mutex 169 + * or the memcg's slab_caches_mutex while calling this function and accessing 170 + * the returned value. 171 + */ 163 172 static inline struct kmem_cache * 164 173 cache_from_memcg_idx(struct kmem_cache *s, int idx) 165 174 { 166 175 struct kmem_cache *cachep; 176 + struct memcg_cache_params *params; 167 177 168 178 if (!s->memcg_params) 169 179 return NULL; 170 - cachep = s->memcg_params->memcg_caches[idx]; 180 + 181 + rcu_read_lock(); 182 + params = rcu_dereference(s->memcg_params); 183 + cachep = params->memcg_caches[idx]; 184 + rcu_read_unlock(); 171 185 172 186 /* 173 187 * Make sure we will access the up-to-date value. The code updating