Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

bpf: Add a hint to allocated objects.

To address the OOM issue that arises when one cpu is allocating and another cpu is freeing, add
a target bpf_mem_cache hint to allocated objects, and when the local cpu's free_llist
overflows, free into that bpf_mem_cache. The hint addresses the OOM while
maintaining the same performance for the common case when alloc/free are done on the
same cpu.
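
As a rough userspace sketch of the idea (not kernel code; all names below are invented,
and the real patch routes whole batches via free_bulk() rather than single objects):
each object remembers the cache that allocated it, and a cache whose own free list is
full can hand overflowing objects back to that owner instead of hoarding them.

    #include <stdio.h>
    #include <stdlib.h>

    struct toy_obj;

    struct toy_cache {
    	const char *name;
    	struct toy_obj *free_list[64];
    	int free_cnt;
    };

    struct toy_obj {
    	struct toy_cache *owner;	/* allocation hint, like c->tgt in the patch */
    	char payload[56];
    };

    static struct toy_obj *toy_alloc(struct toy_cache *c)
    {
    	struct toy_obj *obj;

    	if (c->free_cnt > 0)
    		obj = c->free_list[--c->free_cnt];
    	else
    		obj = malloc(sizeof(*obj));
    	if (obj)
    		obj->owner = c;		/* remember which cache handed it out */
    	return obj;
    }

    static void toy_free(struct toy_cache *local, struct toy_obj *obj)
    {
    	struct toy_cache *tgt = obj->owner;

    	if (local->free_cnt < 64) {
    		local->free_list[local->free_cnt++] = obj;
    	} else if (tgt->free_cnt < 64) {
    		/* local list overflowed: give the object back to its owner */
    		tgt->free_list[tgt->free_cnt++] = obj;
    	} else {
    		free(obj);
    	}
    }

    int main(void)
    {
    	struct toy_cache cpu0 = { .name = "cpu0" };
    	struct toy_cache cpu1 = { .name = "cpu1" };
    	struct toy_obj *obj = toy_alloc(&cpu0);

    	/* Allocated on "cpu0", freed on "cpu1": the hint still points at
    	 * cpu0, so cpu1 can shed overflow back to cpu0 instead of letting
    	 * its free list grow without bound.
    	 */
    	printf("hint points to %s\n", obj->owner->name);
    	toy_free(&cpu1, obj);
    	return 0;
    }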

Note that do_call_rcu_ttrace() now has to check the 'draining' flag in one more case,
since do_call_rcu_ttrace() is no longer called only for the current cpu's cache.
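
A tiny userspace sketch of that extra check (invented names, C11 atomics standing in
for the kernel primitives): the exchange-based guard lets only one caller run the
deferred-free path, and a caller that loses the race while the cache is draining frees
the pending objects immediately instead of leaving them behind.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct toy_cache {
    	atomic_int in_progress;	/* stands in for call_rcu_ttrace_in_progress */
    	bool draining;
    	int pending;		/* stands in for objects on free_by_rcu_ttrace */
    };

    static void toy_flush(struct toy_cache *c)
    {
    	if (atomic_exchange(&c->in_progress, 1)) {
    		/* Another flush already owns the deferred path.  Since any
    		 * cpu may now flush this cache, a draining cache must free
    		 * its pending objects right away or they would be leaked.
    		 */
    		if (c->draining) {
    			printf("draining: freeing %d pending objects now\n",
    			       c->pending);
    			c->pending = 0;
    		}
    		return;
    	}
    	printf("deferring free of %d objects\n", c->pending);
    	c->pending = 0;
    	atomic_store(&c->in_progress, 0);
    }

    int main(void)
    {
    	struct toy_cache c = { .pending = 3 };

    	toy_flush(&c);				/* wins the flag, defers */
    	c.pending = 2;
    	c.draining = true;
    	atomic_store(&c.in_progress, 1);	/* pretend another flush is running */
    	toy_flush(&c);				/* loses the flag, frees immediately */
    	return 0;
    }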

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/bpf/20230706033447.54696-9-alexei.starovoitov@gmail.com

Authored by Alexei Starovoitov, committed by Daniel Borkmann
822fb26b d114dde2

+31 -19
kernel/bpf/memalloc.c
···
 	int low_watermark, high_watermark, batch;
 	int percpu_size;
 	bool draining;
+	struct bpf_mem_cache *tgt;
 
 	/* list of objects to be freed after RCU tasks trace GP */
 	struct llist_head free_by_rcu_ttrace;
···
 
 	for (i = 0; i < cnt; i++) {
 		/*
-		 * free_by_rcu_ttrace is only manipulated by irq work refill_work().
-		 * IRQ works on the same CPU are called sequentially, so it is
-		 * safe to use __llist_del_first() here. If alloc_bulk() is
-		 * invoked by the initial prefill, there will be no running
-		 * refill_work(), so __llist_del_first() is fine as well.
-		 *
-		 * In most cases, objects on free_by_rcu_ttrace are from the same CPU.
-		 * If some objects come from other CPUs, it doesn't incur any
-		 * harm because NUMA_NO_NODE means the preference for current
-		 * numa node and it is not a guarantee.
+		 * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
+		 * done only by one CPU == current CPU. Other CPUs might
+		 * llist_add() and llist_del_all() in parallel.
 		 */
-		obj = __llist_del_first(&c->free_by_rcu_ttrace);
+		obj = llist_del_first(&c->free_by_rcu_ttrace);
 		if (!obj)
 			break;
 		add_obj_to_free_list(c, obj);
···
 	/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
 	 * Nothing races to add to free_by_rcu_ttrace list.
 	 */
-	__llist_add(llnode, &c->free_by_rcu_ttrace);
+	llist_add(llnode, &c->free_by_rcu_ttrace);
 }
 
 static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
 {
 	struct llist_node *llnode, *t;
 
-	if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1))
+	if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
+		if (unlikely(READ_ONCE(c->draining))) {
+			llnode = llist_del_all(&c->free_by_rcu_ttrace);
+			free_all(llnode, !!c->percpu_size);
+		}
 		return;
+	}
 
 	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
-	llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu_ttrace))
+	llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace))
 		/* There is no concurrent __llist_add(waiting_for_gp_ttrace) access.
 		 * It doesn't race with llist_del_all either.
 		 * But there could be two concurrent llist_del_all(waiting_for_gp_ttrace):
···
 
 static void free_bulk(struct bpf_mem_cache *c)
 {
+	struct bpf_mem_cache *tgt = c->tgt;
 	struct llist_node *llnode, *t;
 	unsigned long flags;
 	int cnt;
+
+	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
 
 	do {
 		inc_active(c, &flags);
···
 		cnt = 0;
 		dec_active(c, flags);
 		if (llnode)
-			enque_to_free(c, llnode);
+			enque_to_free(tgt, llnode);
 	} while (cnt > (c->high_watermark + c->low_watermark) / 2);
 
 	/* and drain free_llist_extra */
 	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
-		enque_to_free(c, llnode);
-	do_call_rcu_ttrace(c);
+		enque_to_free(tgt, llnode);
+	do_call_rcu_ttrace(tgt);
 }
 
 static void bpf_mem_refill(struct irq_work *work)
···
 			c->unit_size = unit_size;
 			c->objcg = objcg;
 			c->percpu_size = percpu_size;
+			c->tgt = c;
 			prefill_mem_cache(c, cpu);
 		}
 		ma->cache = pc;
···
 			c = &cc->cache[i];
 			c->unit_size = sizes[i];
 			c->objcg = objcg;
+			c->tgt = c;
 			prefill_mem_cache(c, cpu);
 		}
 	}
···
 	 * Except for waiting_for_gp_ttrace list, there are no concurrent operations
 	 * on these lists, so it is safe to use __llist_del_all().
 	 */
-	free_all(__llist_del_all(&c->free_by_rcu_ttrace), percpu);
+	free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
 	free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
 	free_all(__llist_del_all(&c->free_llist), percpu);
 	free_all(__llist_del_all(&c->free_llist_extra), percpu);
···
 	local_irq_save(flags);
 	if (local_inc_return(&c->active) == 1) {
 		llnode = __llist_del_first(&c->free_llist);
-		if (llnode)
+		if (llnode) {
 			cnt = --c->free_cnt;
+			*(struct bpf_mem_cache **)llnode = c;
+		}
 	}
 	local_dec(&c->active);
 	local_irq_restore(flags);
···
 	int cnt = 0;
 
 	BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+
+	/*
+	 * Remember bpf_mem_cache that allocated this object.
+	 * The hint is not accurate.
+	 */
+	c->tgt = *(struct bpf_mem_cache **)llnode;
 
 	local_irq_save(flags);
 	if (local_inc_return(&c->active) == 1) {
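
For completeness, a small userspace sketch (illustrative names only) of the storage
trick used above: the hidden llist_node word at the start of each object, which the
existing BUILD_BUG_ON(LLIST_NODE_SZ > 8) caps at one machine word, doubles as the
free-list link while the object is cached and as the owner-cache hint while it is
allocated, so the hint costs no extra per-object memory.

    #include <stdio.h>
    #include <stdlib.h>

    struct toy_cache {
    	const char *name;
    };

    int main(void)
    {
    	struct toy_cache cache = { .name = "cache0" };
    	/* The first 8 bytes play the role of the hidden header that precedes
    	 * the user-visible payload in the kernel allocator.
    	 */
    	void *obj = malloc(64);

    	if (!obj)
    		return 1;

    	/* unit_alloc() side: overwrite the now-unused link word with the
    	 * owning cache, like *(struct bpf_mem_cache **)llnode = c;
    	 */
    	*(struct toy_cache **)obj = &cache;

    	/* unit_free() side: read the hint back before the word becomes a
    	 * free-list link again, like c->tgt = *(struct bpf_mem_cache **)llnode;
    	 */
    	struct toy_cache *tgt = *(struct toy_cache **)obj;

    	printf("object was allocated by %s\n", tgt->name);
    	free(obj);
    	return 0;
    }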