Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Replace bpf_map_kmalloc_node() with kmalloc_nolock() to allocate bpf_async_cb structures.

The following kmemleak splat:

[ 8.105530] kmemleak: Trying to color unknown object at 0xff11000100e918c0 as Black
[ 8.106521] Call Trace:
[ 8.106521] <TASK>
[ 8.106521] dump_stack_lvl+0x4b/0x70
[ 8.106521] kvfree_call_rcu+0xcb/0x3b0
[ 8.106521] ? hrtimer_cancel+0x21/0x40
[ 8.106521] bpf_obj_free_fields+0x193/0x200
[ 8.106521] htab_map_update_elem+0x29c/0x410
[ 8.106521] bpf_prog_cfc8cd0f42c04044_overwrite_cb+0x47/0x4b
[ 8.106521] bpf_prog_8c30cd7c4db2e963_overwrite_timer+0x65/0x86
[ 8.106521] bpf_prog_test_run_syscall+0xe1/0x2a0

happens due to the combination of features and fixes, but mainly due to
commit 6d78b4473cdb ("bpf: Tell memcg to use allow_spinning=false path in bpf_timer_init()")
It's using __GFP_HIGH, which instructs slub/kmemleak internals to skip
kmemleak_alloc_recursive() on allocation, so subsequent kfree_rcu()->
kvfree_call_rcu()->kmemleak_ignore() complains with the above splat.

To fix this imbalance, replace bpf_map_kmalloc_node() with
kmalloc_nolock() and kfree_rcu() with call_rcu() + kfree_nolock() to
make sure that the objects allocated with kmalloc_nolock() are freed
with kfree_nolock() rather than the implicit kfree() that kfree_rcu()
uses internally.

Note, the kmalloc_nolock() happens under bpf_spin_lock_irqsave(), so
it will always fail in PREEMPT_RT. This is not an issue at the moment,
since bpf_timers are disabled in PREEMPT_RT. In the future,
bpf_spin_lock will be replaced with a state machine similar to
bpf_task_work.

Fixes: 6d78b4473cdb ("bpf: Tell memcg to use allow_spinning=false path in bpf_timer_init()")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: linux-mm@kvack.org
Link: https://lore.kernel.org/bpf/20251015000700.28988-1-alexei.starovoitov@gmail.com

Authored by Alexei Starovoitov and committed by Daniel Borkmann
commit 5fb750e8, parent e603a342

+33 -11
+4
include/linux/bpf.h
··· 2499 2499 #ifdef CONFIG_MEMCG 2500 2500 void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, 2501 2501 int node); 2502 + void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, 2503 + int node); 2502 2504 void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); 2503 2505 void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, 2504 2506 gfp_t flags); ··· 2513 2511 */ 2514 2512 #define bpf_map_kmalloc_node(_map, _size, _flags, _node) \ 2515 2513 kmalloc_node(_size, _flags, _node) 2514 + #define bpf_map_kmalloc_nolock(_map, _size, _flags, _node) \ 2515 + kmalloc_nolock(_size, _flags, _node) 2516 2516 #define bpf_map_kzalloc(_map, _size, _flags) \ 2517 2517 kzalloc(_size, _flags) 2518 2518 #define bpf_map_kvcalloc(_map, _n, _size, _flags) \
+14 -11
kernel/bpf/helpers.c
··· 1215 1215 rcu_read_unlock_trace(); 1216 1216 } 1217 1217 1218 + static void bpf_async_cb_rcu_free(struct rcu_head *rcu) 1219 + { 1220 + struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); 1221 + 1222 + kfree_nolock(cb); 1223 + } 1224 + 1218 1225 static void bpf_wq_delete_work(struct work_struct *work) 1219 1226 { 1220 1227 struct bpf_work *w = container_of(work, struct bpf_work, delete_work); 1221 1228 1222 1229 cancel_work_sync(&w->work); 1223 1230 1224 - kfree_rcu(w, cb.rcu); 1231 + call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free); 1225 1232 } 1226 1233 1227 1234 static void bpf_timer_delete_work(struct work_struct *work) ··· 1237 1230 1238 1231 /* Cancel the timer and wait for callback to complete if it was running. 1239 1232 * If hrtimer_cancel() can be safely called it's safe to call 1240 - * kfree_rcu(t) right after for both preallocated and non-preallocated 1233 + * call_rcu() right after for both preallocated and non-preallocated 1241 1234 * maps. The async->cb = NULL was already done and no code path can see 1242 1235 * address 't' anymore. Timer if armed for existing bpf_hrtimer before 1243 1236 * bpf_timer_cancel_and_free will have been cancelled. 1244 1237 */ 1245 1238 hrtimer_cancel(&t->timer); 1246 - kfree_rcu(t, cb.rcu); 1239 + call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); 1247 1240 } 1248 1241 1249 1242 static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, ··· 1277 1270 goto out; 1278 1271 } 1279 1272 1280 - /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until 1281 - * kmalloc_nolock() is available, avoid locking issues by using 1282 - * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM). 1283 - */ 1284 - cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node); 1273 + cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node); 1285 1274 if (!cb) { 1286 1275 ret = -ENOMEM; 1287 1276 goto out; ··· 1318 1315 * or pinned in bpffs. 
1319 1316 */ 1320 1317 WRITE_ONCE(async->cb, NULL); 1321 - kfree(cb); 1318 + kfree_nolock(cb); 1322 1319 ret = -EPERM; 1323 1320 } 1324 1321 out: ··· 1583 1580 * timer _before_ calling us, such that failing to cancel it here will 1584 1581 * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. 1585 1582 * Therefore, we _need_ to cancel any outstanding timers before we do 1586 - * kfree_rcu, even though no more timers can be armed. 1583 + * call_rcu, even though no more timers can be armed. 1587 1584 * 1588 1585 * Moreover, we need to schedule work even if timer does not belong to 1589 1586 * the calling callback_fn, as on two different CPUs, we can end up in a ··· 1610 1607 * completion. 1611 1608 */ 1612 1609 if (hrtimer_try_to_cancel(&t->timer) >= 0) 1613 - kfree_rcu(t, cb.rcu); 1610 + call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); 1614 1611 else 1615 1612 queue_work(system_dfl_wq, &t->cb.delete_work); 1616 1613 } else {
+15
kernel/bpf/syscall.c
··· 520 520 return ptr; 521 521 } 522 522 523 + void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, 524 + int node) 525 + { 526 + struct mem_cgroup *memcg, *old_memcg; 527 + void *ptr; 528 + 529 + memcg = bpf_map_get_memcg(map); 530 + old_memcg = set_active_memcg(memcg); 531 + ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node); 532 + set_active_memcg(old_memcg); 533 + mem_cgroup_put(memcg); 534 + 535 + return ptr; 536 + } 537 + 523 538 void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) 524 539 { 525 540 struct mem_cgroup *memcg, *old_memcg;