selftest/bpf/benchs: Add bpf_loop benchmark

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

Add benchmark to measure the throughput and latency of the bpf_loop
call.

Testing this on my dev machine on 1 thread, the data is as follows:

nr_loops: 10
bpf_loop - throughput: 198.519 ± 0.155 M ops/s, latency: 5.037 ns/op

nr_loops: 100
bpf_loop - throughput: 247.448 ± 0.305 M ops/s, latency: 4.041 ns/op

nr_loops: 500
bpf_loop - throughput: 260.839 ± 0.380 M ops/s, latency: 3.834 ns/op

nr_loops: 1000
bpf_loop - throughput: 262.806 ± 0.629 M ops/s, latency: 3.805 ns/op

nr_loops: 5000
bpf_loop - throughput: 264.211 ± 1.508 M ops/s, latency: 3.785 ns/op

nr_loops: 10000
bpf_loop - throughput: 265.366 ± 3.054 M ops/s, latency: 3.768 ns/op

nr_loops: 50000
bpf_loop - throughput: 235.986 ± 20.205 M ops/s, latency: 4.238 ns/op

nr_loops: 100000
bpf_loop - throughput: 264.482 ± 0.279 M ops/s, latency: 3.781 ns/op

nr_loops: 500000
bpf_loop - throughput: 309.773 ± 87.713 M ops/s, latency: 3.228 ns/op

nr_loops: 1000000
bpf_loop - throughput: 262.818 ± 4.143 M ops/s, latency: 3.805 ns/op

From this data, we can see that the latency per loop decreases as the
number of loops increases. On this particular machine, each loop had an
overhead of about 4 ns, and we were able to run about 250 million loops
per second.

Signed-off-by: Joanne Koong <joannekoong@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211130030622.4131246-5-joannekoong@fb.com

authored by

Joanne Koong and committed by

Alexei Starovoitov 4 years ago ec151037 f6e659b7

+203 -1

7 changed files

expand all

tools

testing

selftests

bpf

Makefile

bench.c

bench.h

benchs

bench_bpf_loop.c

run_bench_bpf_loop.sh

run_common.sh

progs

bpf_loop_bench.c

+3 -1

tools/testing/selftests/bpf/Makefile

··· 531 531 $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \ 532 532 $(OUTPUT)/perfbuf_bench.skel.h 533 533 $(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h 534 + $(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h 534 535 $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) 535 536 $(OUTPUT)/bench: LDLIBS += -lm 536 537 $(OUTPUT)/bench: $(OUTPUT)/bench.o \ ··· 541 540 $(OUTPUT)/bench_rename.o \ 542 541 $(OUTPUT)/bench_trigger.o \ 543 542 $(OUTPUT)/bench_ringbufs.o \ 544 - $(OUTPUT)/bench_bloom_filter_map.o 543 + $(OUTPUT)/bench_bloom_filter_map.o \ 544 + $(OUTPUT)/bench_bpf_loop.o 545 545 $(call msg,BINARY,,$@) 546 546 $(Q)$(CC) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ 547 547

+37

tools/testing/selftests/bpf/bench.c

··· 134 134 total_ops_mean, total_ops_stddev); 135 135 } 136 136 137 + void ops_report_progress(int iter, struct bench_res *res, long delta_ns) 138 + { 139 + double hits_per_sec, hits_per_prod; 140 + 141 + hits_per_sec = res->hits / 1000000.0 / (delta_ns / 1000000000.0); 142 + hits_per_prod = hits_per_sec / env.producer_cnt; 143 + 144 + printf("Iter %3d (%7.3lfus): ", iter, (delta_ns - 1000000000) / 1000.0); 145 + 146 + printf("hits %8.3lfM/s (%7.3lfM/prod)\n", hits_per_sec, hits_per_prod); 147 + } 148 + 149 + void ops_report_final(struct bench_res res[], int res_cnt) 150 + { 151 + double hits_mean = 0.0, hits_stddev = 0.0; 152 + int i; 153 + 154 + for (i = 0; i < res_cnt; i++) 155 + hits_mean += res[i].hits / 1000000.0 / (0.0 + res_cnt); 156 + 157 + if (res_cnt > 1) { 158 + for (i = 0; i < res_cnt; i++) 159 + hits_stddev += (hits_mean - res[i].hits / 1000000.0) * 160 + (hits_mean - res[i].hits / 1000000.0) / 161 + (res_cnt - 1.0); 162 + 163 + hits_stddev = sqrt(hits_stddev); 164 + } 165 + printf("Summary: throughput %8.3lf \u00B1 %5.3lf M ops/s (%7.3lfM ops/prod), ", 166 + hits_mean, hits_stddev, hits_mean / env.producer_cnt); 167 + printf("latency %8.3lf ns/op\n", 1000.0 / hits_mean * env.producer_cnt); 168 + } 169 + 137 170 const char *argp_program_version = "benchmark"; 138 171 const char *argp_program_bug_address = "<bpf@vger.kernel.org>"; 139 172 const char argp_program_doc[] = ··· 204 171 205 172 extern struct argp bench_ringbufs_argp; 206 173 extern struct argp bench_bloom_map_argp; 174 + extern struct argp bench_bpf_loop_argp; 207 175 208 176 static const struct argp_child bench_parsers[] = { 209 177 { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, 210 178 { &bench_bloom_map_argp, 0, "Bloom filter map benchmark", 0 }, 179 + { &bench_bpf_loop_argp, 0, "bpf_loop helper benchmark", 0 }, 211 180 {}, 212 181 }; 213 182 ··· 408 373 extern const struct bench bench_bloom_false_positive; 409 374 extern const struct bench bench_hashmap_without_bloom; 410 
375 extern const struct bench bench_hashmap_with_bloom; 376 + extern const struct bench bench_bpf_loop; 411 377 412 378 static const struct bench *benchs[] = { 413 379 &bench_count_global, ··· 440 404 &bench_bloom_false_positive, 441 405 &bench_hashmap_without_bloom, 442 406 &bench_hashmap_with_bloom, 407 + &bench_bpf_loop, 443 408 }; 444 409 445 410 static void setup_benchmark()

tools/testing/selftests/bpf/bench.h

··· 59 59 void hits_drops_report_final(struct bench_res res[], int res_cnt); 60 60 void false_hits_report_progress(int iter, struct bench_res *res, long delta_ns); 61 61 void false_hits_report_final(struct bench_res res[], int res_cnt); 62 + void ops_report_progress(int iter, struct bench_res *res, long delta_ns); 63 + void ops_report_final(struct bench_res res[], int res_cnt); 62 64 63 65 static inline __u64 get_time_ns() { 64 66 struct timespec t;

+105

tools/testing/selftests/bpf/benchs/bench_bpf_loop.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2021 Facebook */ 3 + 4 + #include <argp.h> 5 + #include "bench.h" 6 + #include "bpf_loop_bench.skel.h" 7 + 8 + /* BPF triggering benchmarks */ 9 + static struct ctx { 10 + struct bpf_loop_bench *skel; 11 + } ctx; 12 + 13 + static struct { 14 + __u32 nr_loops; 15 + } args = { 16 + .nr_loops = 10, 17 + }; 18 + 19 + enum { 20 + ARG_NR_LOOPS = 4000, 21 + }; 22 + 23 + static const struct argp_option opts[] = { 24 + { "nr_loops", ARG_NR_LOOPS, "nr_loops", 0, 25 + "Set number of loops for the bpf_loop helper"}, 26 + {}, 27 + }; 28 + 29 + static error_t parse_arg(int key, char *arg, struct argp_state *state) 30 + { 31 + switch (key) { 32 + case ARG_NR_LOOPS: 33 + args.nr_loops = strtol(arg, NULL, 10); 34 + break; 35 + default: 36 + return ARGP_ERR_UNKNOWN; 37 + } 38 + 39 + return 0; 40 + } 41 + 42 + /* exported into benchmark runner */ 43 + const struct argp bench_bpf_loop_argp = { 44 + .options = opts, 45 + .parser = parse_arg, 46 + }; 47 + 48 + static void validate(void) 49 + { 50 + if (env.consumer_cnt != 1) { 51 + fprintf(stderr, "benchmark doesn't support multi-consumer!\n"); 52 + exit(1); 53 + } 54 + } 55 + 56 + static void *producer(void *input) 57 + { 58 + while (true) 59 + /* trigger the bpf program */ 60 + syscall(__NR_getpgid); 61 + 62 + return NULL; 63 + } 64 + 65 + static void *consumer(void *input) 66 + { 67 + return NULL; 68 + } 69 + 70 + static void measure(struct bench_res *res) 71 + { 72 + res->hits = atomic_swap(&ctx.skel->bss->hits, 0); 73 + } 74 + 75 + static void setup(void) 76 + { 77 + struct bpf_link *link; 78 + 79 + setup_libbpf(); 80 + 81 + ctx.skel = bpf_loop_bench__open_and_load(); 82 + if (!ctx.skel) { 83 + fprintf(stderr, "failed to open skeleton\n"); 84 + exit(1); 85 + } 86 + 87 + link = bpf_program__attach(ctx.skel->progs.benchmark); 88 + if (!link) { 89 + fprintf(stderr, "failed to attach program!\n"); 90 + exit(1); 91 + } 92 + 93 + ctx.skel->bss->nr_loops = args.nr_loops; 94 
+ } 95 + 96 + const struct bench bench_bpf_loop = { 97 + .name = "bpf-loop", 98 + .validate = validate, 99 + .setup = setup, 100 + .producer_thread = producer, 101 + .consumer_thread = consumer, 102 + .measure = measure, 103 + .report_progress = ops_report_progress, 104 + .report_final = ops_report_final, 105 + };

+15

tools/testing/selftests/bpf/benchs/run_bench_bpf_loop.sh

#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

source ./benchs/run_common.sh

set -eufo pipefail

# Producer thread counts and bpf_loop iteration counts to sweep over.
thread_counts=(1 4 8 12 16)
loop_counts=(10 100 500 1000 5000 10000 50000 100000 500000 1000000)

# Run the bpf-loop benchmark once per (threads, loops) pair and print a
# one-line throughput/latency summary for each run.
for nr_threads in "${thread_counts[@]}"; do
	for nr_loops in "${loop_counts[@]}"; do
		subtitle "nr_loops: $nr_loops, nr_threads: $nr_threads"
		summarize_ops "bpf_loop: " \
			"$($RUN_BENCH -p $nr_threads --nr_loops $nr_loops bpf-loop)"
		printf "\n"
	done
done

+15

tools/testing/selftests/bpf/benchs/run_common.sh

··· 33 33 echo "$*" | sed -E "s/.*Percentage\s=\s+([0-9]+\.[0-9]+).*/\1/" 34 34 } 35 35 36 + function ops() 37 + { 38 + echo -n "throughput: " 39 + echo -n "$*" | sed -E "s/.*throughput\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+\sM\sops\/s).*/\1/" 40 + echo -n -e ", latency: " 41 + echo "$*" | sed -E "s/.*latency\s+([0-9]+\.[0-9]+\sns\/op).*/\1/" 42 + } 43 + 36 44 function total() 37 45 { 38 46 echo "$*" | sed -E "s/.*total operations\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/" ··· 58 50 bench="$1" 59 51 summary=$(echo $2 | tail -n1) 60 52 printf "%-20s %s%%\n" "$bench" "$(percentage $summary)" 53 + } 54 + 55 + function summarize_ops() 56 + { 57 + bench="$1" 58 + summary=$(echo $2 | tail -n1) 59 + printf "%-20s %s\n" "$bench" "$(ops $summary)" 61 60 } 62 61 63 62 function summarize_total()

+26

tools/testing/selftests/bpf/progs/bpf_loop_bench.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2021 Facebook */ 3 + 4 + #include "vmlinux.h" 5 + #include <bpf/bpf_helpers.h> 6 + 7 + char _license[] SEC("license") = "GPL"; 8 + 9 + u32 nr_loops; 10 + long hits; 11 + 12 + static int empty_callback(__u32 index, void *data) 13 + { 14 + return 0; 15 + } 16 + 17 + SEC("fentry/__x64_sys_getpgid") 18 + int benchmark(void *ctx) 19 + { 20 + for (int i = 0; i < 1000; i++) { 21 + bpf_loop(nr_loops, empty_callback, NULL, 0); 22 + 23 + __sync_add_and_fetch(&hits, nr_loops); 24 + } 25 + return 0; 26 + }