
bpf: Adjust BPF stack helper functions to accommodate skip > 0

Let's say that the caller has storage for num_elem stack frames. Then,
the BPF stack helper functions walk the stack for only num_elem frames.
This means that if skip > 0, one keeps only 'num_elem - skip' frames.
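
A worked example may help (the buffer size and skip count below are made up
for illustration, not taken from the patch):

    /* hypothetical caller, e.g. inside a BPF tracing program */
    __u64 buf[4];                        /* num_elem == 4 */
    long ret;

    /* flags == 2 asks to skip the two innermost frames
     * (the skip count lives in BPF_F_SKIP_FIELD_MASK)
     */
    ret = bpf_get_stack(ctx, buf, sizeof(buf), 2);
    /* before this fix: at most 4 - 2 = 2 frames copied; after it: up to 4 */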

This is because the helpers set init_nr in the perf_callchain_entry so that
entries are written at the end of the buffer and only num_elem of them are
saved. I believe this was done because the perf callchain code used to unwind
stack frames until it reached the global maximum size
(sysctl_perf_event_max_stack).
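
In other words, the old code reserved the tail of the globally sized buffer,
as in this sketch of the code removed below:

    /* old scheme (sketch of the code removed below): start writing at
     * offset init_nr so that at most num_elem entries are kept
     */
    init_nr = sysctl_perf_event_max_stack - num_elem;
    entry->nr = init_nr +
        stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
                             sysctl_perf_event_max_stack - init_nr, 0);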

However, the perf code now has perf_callchain_entry_ctx.max_stack to limit
the iteration locally. Using it simplifies the init_nr handling in the BPF
callstack code and removes the confusion with perf_event's
__PERF_SAMPLE_CALLCHAIN_EARLY, which sets init_nr to 0.
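
In essence, the callers below now pass init_nr == 0 and bound the walk by the
number of entries they actually need plus the skipped ones (summarised from
the change below):

    max_depth = num_elem + skip;
    if (max_depth > sysctl_perf_event_max_stack)
        max_depth = sysctl_perf_event_max_stack;

    trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
                               false, false);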

Also change the comment on bpf_get_stack() in the header file to be more
explicit about what the return value means.
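
For example, a BPF program can turn the return value back into a frame count
(a minimal sketch; the buffer size is arbitrary):

    __u64 buf[16];
    long ret = bpf_get_stack(ctx, buf, sizeof(buf), 0);

    if (ret < 0)
        return 0;                        /* negative error */
    /* ret is the copied length in bytes, so: */
    __u32 nr_frames = ret / sizeof(__u64);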

Fixes: c195651e565a ("bpf: add bpf_get_stack helper")
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/30a7b5d5-6726-1cc2-eaee-8da2828a9a9c@oracle.com
Link: https://lore.kernel.org/bpf/20220314182042.71025-1-namhyung@kernel.org

Based-on-patch-by: Eugene Loh <eugene.loh@oracle.com>

authored by Namhyung Kim and committed by Alexei Starovoitov
ee2a0988 ef078600

2 files changed, 28 insertions(+), 36 deletions(-)

include/uapi/linux/bpf.h  (+4 -4)
···
  *
  *                     # sysctl kernel.perf_event_max_stack=<new value>
  *     Return
- *             A non-negative value equal to or less than *size* on success,
- *             or a negative error in case of failure.
+ *             The non-negative copied *buf* length equal to or less than
+ *             *size* on success, or a negative error in case of failure.
  *
  * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header)
  *     Description
···
  *
  *                     # sysctl kernel.perf_event_max_stack=<new value>
  *     Return
- *             A non-negative value equal to or less than *size* on success,
- *             or a negative error in case of failure.
+ *             The non-negative copied *buf* length equal to or less than
+ *             *size* on success, or a negative error in case of failure.
  *
  * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags)
  *     Description
···
kernel/bpf/stackmap.c  (+24 -32)
···
 }

 static struct perf_callchain_entry *
-get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
+get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
 {
 #ifdef CONFIG_STACKTRACE
        struct perf_callchain_entry *entry;
···
        if (!entry)
                return NULL;

-       entry->nr = init_nr +
-               stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
-                                    sysctl_perf_event_max_stack - init_nr, 0);
+       entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
+                                        max_depth, 0);

        /* stack_trace_save_tsk() works on unsigned long array, while
         * perf_callchain_entry uses u64 array. For 32-bit systems, it is
···
                int i;

                /* copy data from the end to avoid using extra buffer */
-               for (i = entry->nr - 1; i >= (int)init_nr; i--)
+               for (i = entry->nr - 1; i >= 0; i--)
                        to[i] = (u64)(from[i]);
        }

···
 {
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
-       u32 max_depth = map->value_size / stack_map_data_size(map);
-       /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
-       u32 init_nr = sysctl_perf_event_max_stack - max_depth;
        u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
        u32 hash, id, trace_nr, trace_len;
        bool user = flags & BPF_F_USER_STACK;
        u64 *ips;
        bool hash_matches;

-       /* get_perf_callchain() guarantees that trace->nr >= init_nr
-        * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
-        */
-       trace_nr = trace->nr - init_nr;
-
-       if (trace_nr <= skip)
+       if (trace->nr <= skip)
                /* skipping more than usable stack trace */
                return -EFAULT;

-       trace_nr -= skip;
+       trace_nr = trace->nr - skip;
        trace_len = trace_nr * sizeof(u64);
-       ips = trace->ip + skip + init_nr;
+       ips = trace->ip + skip;
        hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
        id = hash & (smap->n_buckets - 1);
        bucket = READ_ONCE(smap->buckets[id]);
···
                      u64, flags)
 {
        u32 max_depth = map->value_size / stack_map_data_size(map);
-       /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
-       u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+       u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
        bool user = flags & BPF_F_USER_STACK;
        struct perf_callchain_entry *trace;
        bool kernel = !user;
···
                               BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
                return -EINVAL;

-       trace = get_perf_callchain(regs, init_nr, kernel, user,
-                                  sysctl_perf_event_max_stack, false, false);
+       max_depth += skip;
+       if (max_depth > sysctl_perf_event_max_stack)
+               max_depth = sysctl_perf_event_max_stack;
+
+       trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+                                  false, false);

        if (unlikely(!trace))
                /* couldn't fetch the stack trace */
···
                            struct perf_callchain_entry *trace_in,
                            void *buf, u32 size, u64 flags)
 {
-       u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+       u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
        bool user_build_id = flags & BPF_F_USER_BUILD_ID;
        u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
        bool user = flags & BPF_F_USER_STACK;
···
                goto err_fault;

        num_elem = size / elem_size;
-       if (sysctl_perf_event_max_stack < num_elem)
-               init_nr = 0;
-       else
-               init_nr = sysctl_perf_event_max_stack - num_elem;
+       max_depth = num_elem + skip;
+       if (sysctl_perf_event_max_stack < max_depth)
+               max_depth = sysctl_perf_event_max_stack;

        if (trace_in)
                trace = trace_in;
        else if (kernel && task)
-               trace = get_callchain_entry_for_task(task, init_nr);
+               trace = get_callchain_entry_for_task(task, max_depth);
        else
-               trace = get_perf_callchain(regs, init_nr, kernel, user,
-                                          sysctl_perf_event_max_stack,
+               trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
                                           false, false);
        if (unlikely(!trace))
                goto err_fault;

-       trace_nr = trace->nr - init_nr;
-       if (trace_nr < skip)
+       if (trace->nr < skip)
                goto err_fault;

-       trace_nr -= skip;
+       trace_nr = trace->nr - skip;
        trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
        copy_len = trace_nr * elem_size;
-       ips = trace->ip + skip + init_nr;
+
+       ips = trace->ip + skip;
        if (user && user_build_id)
                stack_map_get_build_id_offset(buf, ips, trace_nr, user);
        else
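
The same consideration applies to bpf_get_stackid()-backed stack maps. A
minimal sketch of a program relying on skip > 0 (the attach point, map sizes
and skip count are illustrative only, not part of the patch):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_STACK_TRACE);
            __uint(max_entries, 1024);
            __uint(key_size, sizeof(__u32));
            __uint(value_size, 8 * sizeof(__u64));  /* room for 8 frames */
    } stack_traces SEC(".maps");

    SEC("kprobe/kfree_skb")                 /* attach point is illustrative */
    int collect_stack(void *ctx)
    {
            /* skip the 3 innermost kernel frames; with this fix the
             * remaining frames can still fill all 8 slots of the map value
             */
            long id = bpf_get_stackid(ctx, &stack_traces, 3);

            return id < 0 ? 1 : 0;
    }

    char LICENSE[] SEC("license") = "GPL";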