Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Allow pre-ordering for bpf cgroup progs

Currently, for bpf progs in a cgroup hierarchy, the effective prog array
is computed from the bottom cgroup up to the upper cgroups (post-ordering). For
example, the following cgroup hierarchy
root cgroup: p1, p2
subcgroup: p3, p4
has BPF_F_ALLOW_MULTI for both cgroup levels.
The effective cgroup array ordering looks like
p3 p4 p1 p2
and at run time, progs will execute based on that order.

But in some cases, it is desirable to have the root prog execute earlier than
the children progs (pre-ordering). For example,
- prog p1 intends to collect original pkt dest addresses.
- prog p3 will modify original pkt dest addresses to a proxy address for
security reasons.
The end result is that prog p1 gets the proxy address, which is not what it
wants. Attaching p1 to every child cgroup is not desirable either, as it
would be duplicated across many child cgroups. This is exactly a use case
we are encountering at Meta.

To fix this issue, let us introduce a flag BPF_F_PREORDER. If the flag
is specified at attachment time, the prog has higher priority and the
ordering with that flag will be from top to bottom (pre-ordering).
For example, in the above example,
root cgroup: p1, p2
subcgroup: p3, p4
Let us say p2 and p4 are marked with BPF_F_PREORDER. The final
effective array ordering will be
p2 p4 p3 p1

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20250224230116.283071-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Yonghong Song and committed by
Alexei Starovoitov
4b82b181 7c2f207a

+30 -9
+1
include/linux/bpf-cgroup.h
··· 111 111 struct bpf_prog *prog; 112 112 struct bpf_cgroup_link *link; 113 113 struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; 114 + u32 flags; 114 115 }; 115 116 116 117 int cgroup_bpf_inherit(struct cgroup *cgrp);
+1
include/uapi/linux/bpf.h
··· 1207 1207 #define BPF_F_BEFORE (1U << 3) 1208 1208 #define BPF_F_AFTER (1U << 4) 1209 1209 #define BPF_F_ID (1U << 5) 1210 + #define BPF_F_PREORDER (1U << 6) 1210 1211 #define BPF_F_LINK BPF_F_LINK /* 1 << 13 */ 1211 1212 1212 1213 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
+25 -8
kernel/bpf/cgroup.c
··· 369 369 /* count number of elements in the list. 370 370 * it's slow but the list cannot be long 371 371 */ 372 - static u32 prog_list_length(struct hlist_head *head) 372 + static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt) 373 373 { 374 374 struct bpf_prog_list *pl; 375 375 u32 cnt = 0; ··· 377 377 hlist_for_each_entry(pl, head, node) { 378 378 if (!prog_list_prog(pl)) 379 379 continue; 380 + if (preorder_cnt && (pl->flags & BPF_F_PREORDER)) 381 + (*preorder_cnt)++; 380 382 cnt++; 381 383 } 382 384 return cnt; ··· 402 400 403 401 if (flags & BPF_F_ALLOW_MULTI) 404 402 return true; 405 - cnt = prog_list_length(&p->bpf.progs[atype]); 403 + cnt = prog_list_length(&p->bpf.progs[atype], NULL); 406 404 WARN_ON_ONCE(cnt > 1); 407 405 if (cnt == 1) 408 406 return !!(flags & BPF_F_ALLOW_OVERRIDE); ··· 425 423 struct bpf_prog_array *progs; 426 424 struct bpf_prog_list *pl; 427 425 struct cgroup *p = cgrp; 428 - int cnt = 0; 426 + int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart; 429 427 430 428 /* count number of effective programs by walking parents */ 431 429 do { 432 430 if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) 433 - cnt += prog_list_length(&p->bpf.progs[atype]); 431 + cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt); 434 432 p = cgroup_parent(p); 435 433 } while (p); 436 434 ··· 441 439 /* populate the array with effective progs */ 442 440 cnt = 0; 443 441 p = cgrp; 442 + fstart = preorder_cnt; 443 + bstart = preorder_cnt - 1; 444 444 do { 445 445 if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) 446 446 continue; 447 447 448 + init_bstart = bstart; 448 449 hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { 449 450 if (!prog_list_prog(pl)) 450 451 continue; 451 452 452 - item = &progs->items[cnt]; 453 + if (pl->flags & BPF_F_PREORDER) { 454 + item = &progs->items[bstart]; 455 + bstart--; 456 + } else { 457 + item = &progs->items[fstart]; 458 + fstart++; 459 + } 453 460 item->prog = 
prog_list_prog(pl); 454 461 bpf_cgroup_storages_assign(item->cgroup_storage, 455 462 pl->storage); 456 463 cnt++; 457 464 } 465 + 466 + /* reverse pre-ordering progs at this cgroup level */ 467 + for (i = bstart + 1, j = init_bstart; i < j; i++, j--) 468 + swap(progs->items[i], progs->items[j]); 469 + 458 470 } while ((p = cgroup_parent(p))); 459 471 460 472 *array = progs; ··· 679 663 */ 680 664 return -EPERM; 681 665 682 - if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) 666 + if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS) 683 667 return -E2BIG; 684 668 685 669 pl = find_attach_entry(progs, prog, link, replace_prog, ··· 714 698 715 699 pl->prog = prog; 716 700 pl->link = link; 701 + pl->flags = flags; 717 702 bpf_cgroup_storages_assign(pl->storage, storage); 718 703 cgrp->bpf.flags[atype] = saved_flags; 719 704 ··· 1090 1073 lockdep_is_held(&cgroup_mutex)); 1091 1074 total_cnt += bpf_prog_array_length(effective); 1092 1075 } else { 1093 - total_cnt += prog_list_length(&cgrp->bpf.progs[atype]); 1076 + total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL); 1094 1077 } 1095 1078 } 1096 1079 ··· 1122 1105 u32 id; 1123 1106 1124 1107 progs = &cgrp->bpf.progs[atype]; 1125 - cnt = min_t(int, prog_list_length(progs), total_cnt); 1108 + cnt = min_t(int, prog_list_length(progs, NULL), total_cnt); 1126 1109 i = 0; 1127 1110 hlist_for_each_entry(pl, progs, node) { 1128 1111 prog = prog_list_prog(pl);
+2 -1
kernel/bpf/syscall.c
··· 4170 4170 #define BPF_F_ATTACH_MASK_BASE \ 4171 4171 (BPF_F_ALLOW_OVERRIDE | \ 4172 4172 BPF_F_ALLOW_MULTI | \ 4173 - BPF_F_REPLACE) 4173 + BPF_F_REPLACE | \ 4174 + BPF_F_PREORDER) 4174 4175 4175 4176 #define BPF_F_ATTACH_MASK_MPROG \ 4176 4177 (BPF_F_REPLACE | \
+1
tools/include/uapi/linux/bpf.h
··· 1207 1207 #define BPF_F_BEFORE (1U << 3) 1208 1208 #define BPF_F_AFTER (1U << 4) 1209 1209 #define BPF_F_ID (1U << 5) 1210 + #define BPF_F_PREORDER (1U << 6) 1210 1211 #define BPF_F_LINK BPF_F_LINK /* 1 << 13 */ 1211 1212 1212 1213 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the