Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: add new BPF_CGROUP_ITER_CHILDREN control option

Currently, the BPF cgroup iterator supports walking descendants in
either pre-order (BPF_CGROUP_ITER_DESCENDANTS_PRE) or post-order
(BPF_CGROUP_ITER_DESCENDANTS_POST). These modes perform an exhaustive
depth-first search (DFS) of the hierarchy. In scenarios where a BPF
program may need to inspect only the direct children of a given parent
cgroup, a full DFS is unnecessarily expensive.

This patch introduces a new BPF cgroup iterator control option,
BPF_CGROUP_ITER_CHILDREN. It restricts the traversal to the immediate
children of a specified parent cgroup, allowing for more targeted and
efficient iteration, particularly when an exhaustive DFS traversal is
not required. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE,
BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP, the
specified parent cgroup itself is not returned as one of the iterator
elements.

Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
Link: https://lore.kernel.org/r/20260127085112.3608687-1-mattbobrowski@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Authored by Matt Bobrowski and committed by Alexei Starovoitov
752b8070 8016abd6

+37 -5
+8
include/uapi/linux/bpf.h
··· 119 119 BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ 120 120 BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ 121 121 BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ 122 + /* 123 + * Walks the immediate children of the specified parent 124 + * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE, 125 + * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP 126 + * the iterator does not include the specified parent as one of the 127 + * returned iterator elements. 128 + */ 129 + BPF_CGROUP_ITER_CHILDREN, 122 130 }; 123 131 124 132 union bpf_iter_link_info {
+21 -5
kernel/bpf/cgroup_iter.c
··· 8 8 9 9 #include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ 10 10 11 - /* cgroup_iter provides four modes of traversal to the cgroup hierarchy. 11 + /* cgroup_iter provides five modes of traversal to the cgroup hierarchy. 12 12 * 13 13 * 1. Walk the descendants of a cgroup in pre-order. 14 14 * 2. Walk the descendants of a cgroup in post-order. 15 15 * 3. Walk the ancestors of a cgroup. 16 16 * 4. Show the given cgroup only. 17 + * 5. Walk the children of a given parent cgroup. 17 18 * 18 19 * For walking descendants, cgroup_iter can walk in either pre-order or 19 20 * post-order. For walking ancestors, the iter walks up from a cgroup to ··· 79 78 return css_next_descendant_pre(NULL, p->start_css); 80 79 else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) 81 80 return css_next_descendant_post(NULL, p->start_css); 81 + else if (p->order == BPF_CGROUP_ITER_CHILDREN) 82 + return css_next_child(NULL, p->start_css); 82 83 else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */ 83 84 return p->start_css; 84 85 } ··· 116 113 return css_next_descendant_post(curr, p->start_css); 117 114 else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) 118 115 return curr->parent; 116 + else if (p->order == BPF_CGROUP_ITER_CHILDREN) 117 + return css_next_child(curr, p->start_css); 119 118 else /* BPF_CGROUP_ITER_SELF_ONLY */ 120 119 return NULL; 121 120 } ··· 205 200 int order = linfo->cgroup.order; 206 201 struct cgroup *cgrp; 207 202 208 - if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && 209 - order != BPF_CGROUP_ITER_DESCENDANTS_POST && 210 - order != BPF_CGROUP_ITER_ANCESTORS_UP && 211 - order != BPF_CGROUP_ITER_SELF_ONLY) 203 + switch (order) { 204 + case BPF_CGROUP_ITER_DESCENDANTS_PRE: 205 + case BPF_CGROUP_ITER_DESCENDANTS_POST: 206 + case BPF_CGROUP_ITER_ANCESTORS_UP: 207 + case BPF_CGROUP_ITER_SELF_ONLY: 208 + case BPF_CGROUP_ITER_CHILDREN: 209 + break; 210 + default: 212 211 return -EINVAL; 212 + } 213 213 214 214 if (fd && id) 215 
215 return -EINVAL; ··· 267 257 seq_puts(seq, "order: descendants_post\n"); 268 258 else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) 269 259 seq_puts(seq, "order: ancestors_up\n"); 260 + else if (aux->cgroup.order == BPF_CGROUP_ITER_CHILDREN) 261 + seq_puts(seq, "order: children\n"); 270 262 else /* BPF_CGROUP_ITER_SELF_ONLY */ 271 263 seq_puts(seq, "order: self_only\n"); 272 264 } ··· 332 320 case BPF_CGROUP_ITER_DESCENDANTS_PRE: 333 321 case BPF_CGROUP_ITER_DESCENDANTS_POST: 334 322 case BPF_CGROUP_ITER_ANCESTORS_UP: 323 + case BPF_CGROUP_ITER_CHILDREN: 335 324 break; 336 325 default: 337 326 return -EINVAL; ··· 357 344 break; 358 345 case BPF_CGROUP_ITER_DESCENDANTS_POST: 359 346 kit->pos = css_next_descendant_post(kit->pos, kit->start); 347 + break; 348 + case BPF_CGROUP_ITER_CHILDREN: 349 + kit->pos = css_next_child(kit->pos, kit->start); 360 350 break; 361 351 case BPF_CGROUP_ITER_ANCESTORS_UP: 362 352 kit->pos = kit->pos ? kit->pos->parent : kit->start;
+8
tools/include/uapi/linux/bpf.h
··· 119 119 BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ 120 120 BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ 121 121 BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ 122 + /* 123 + * Walks the immediate children of the specified parent 124 + * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE, 125 + * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP 126 + * the iterator does not include the specified parent as one of the 127 + * returned iterator elements. 128 + */ 129 + BPF_CGROUP_ITER_CHILDREN, 122 130 }; 123 131 124 132 union bpf_iter_link_info {