Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf bench: Add breakpoint benchmarks

Add 2 benchmarks:

1. Performance of thread creation/exiting in presence of breakpoints.
2. Performance of breakpoint modification in presence of threads.

The benchmarks capture use cases that we are interested in:
using inheritable breakpoints in large highly-threaded applications.

The benchmarks show significant slowdown imposed by breakpoints
(even when they don't fire).

Testing on an Intel 8173M with 112 HW threads shows:

perf bench --repeat=56 breakpoint thread --breakpoints=0 --parallelism=56 --threads=20
78.675000 usecs/op
perf bench --repeat=56 breakpoint thread --breakpoints=4 --parallelism=56 --threads=20
12967.135714 usecs/op

That's a 165x slowdown due to the presence of the breakpoints.

perf bench --repeat=20000 breakpoint enable --passive=0 --active=0
1.433250 usecs/op
perf bench --repeat=20000 breakpoint enable --passive=224 --active=0
585.318400 usecs/op
perf bench --repeat=20000 breakpoint enable --passive=0 --active=111
635.953000 usecs/op

Those are 408x and 444x slowdowns due to the presence of threads.

Profiles show some overhead in toggle_bp_slot,
but also very high contention:

90.83% breakpoint-thre [kernel.kallsyms] [k] osq_lock
4.69% breakpoint-thre [kernel.kallsyms] [k] mutex_spin_on_owner
2.06% breakpoint-thre [kernel.kallsyms] [k] __reserve_bp_slot
2.04% breakpoint-thre [kernel.kallsyms] [k] toggle_bp_slot

79.01% breakpoint-enab [kernel.kallsyms] [k] smp_call_function_single
9.94% breakpoint-enab [kernel.kallsyms] [k] llist_add_batch
5.70% breakpoint-enab [kernel.kallsyms] [k] _raw_spin_lock_irq
1.84% breakpoint-enab [kernel.kallsyms] [k] event_function_call
1.12% breakpoint-enab [kernel.kallsyms] [k] send_call_function_single_ipi
0.37% breakpoint-enab [kernel.kallsyms] [k] generic_exec_single
0.24% breakpoint-enab [kernel.kallsyms] [k] __perf_event_disable
0.20% breakpoint-enab [kernel.kallsyms] [k] _perf_event_enable
0.18% breakpoint-enab [kernel.kallsyms] [k] toggle_bp_slot

Committer notes:

Fixup struct init for older compilers:

3 32.90 alpine:3.5 : FAIL clang version 3.8.1 (tags/RELEASE_381/final)
bench/breakpoint.c:49:34: error: missing field 'size' initializer [-Werror,-Wmissing-field-initializers]
struct perf_event_attr attr = {0};
^
1 error generated.
7 37.31 alpine:3.9 : FAIL gcc version 8.3.0 (Alpine 8.3.0)
bench/breakpoint.c:49:34: error: missing field 'size' initializer [-Werror,-Wmissing-field-initializers]
struct perf_event_attr attr = {0};
^
1 error generated.

Signed-off-by: Dmitriy Vyukov <dvyukov@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Marco Elver <elver@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220505155745.1690906-1-dvyukov@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

authored by

Dmitry Vyukov and committed by
Arnaldo Carvalho de Melo
68a6772f 467cd948

+255
+1
tools/perf/bench/Build
··· 14 14 perf-y += find-bit-bench.o 15 15 perf-y += inject-buildid.o 16 16 perf-y += evlist-open-close.o 17 + perf-y += breakpoint.o 17 18 18 19 perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o 19 20 perf-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o
+2
tools/perf/bench/bench.h
··· 49 49 int bench_kallsyms_parse(int argc, const char **argv); 50 50 int bench_inject_build_id(int argc, const char **argv); 51 51 int bench_evlist_open_close(int argc, const char **argv); 52 + int bench_breakpoint_thread(int argc, const char **argv); 53 + int bench_breakpoint_enable(int argc, const char **argv); 52 54 53 55 #define BENCH_FORMAT_DEFAULT_STR "default" 54 56 #define BENCH_FORMAT_DEFAULT 0
+244
tools/perf/bench/breakpoint.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <subcmd/parse-options.h> 4 + #include <linux/hw_breakpoint.h> 5 + #include <linux/perf_event.h> 6 + #include <linux/time64.h> 7 + #include <sys/syscall.h> 8 + #include <sys/ioctl.h> 9 + #include <sys/time.h> 10 + #include <pthread.h> 11 + #include <stddef.h> 12 + #include <stdlib.h> 13 + #include <unistd.h> 14 + #include <stdio.h> 15 + #include <errno.h> 16 + #include "bench.h" 17 + #include "futex.h" 18 + 19 + struct { 20 + unsigned int nbreakpoints; 21 + unsigned int nparallel; 22 + unsigned int nthreads; 23 + } thread_params = { 24 + .nbreakpoints = 1, 25 + .nparallel = 1, 26 + .nthreads = 1, 27 + }; 28 + 29 + static const struct option thread_options[] = { 30 + OPT_UINTEGER('b', "breakpoints", &thread_params.nbreakpoints, 31 + "Specify amount of breakpoints"), 32 + OPT_UINTEGER('p', "parallelism", &thread_params.nparallel, "Specify amount of parallelism"), 33 + OPT_UINTEGER('t', "threads", &thread_params.nthreads, "Specify amount of threads"), 34 + OPT_END() 35 + }; 36 + 37 + static const char * const thread_usage[] = { 38 + "perf bench breakpoint thread <options>", 39 + NULL 40 + }; 41 + 42 + struct breakpoint { 43 + int fd; 44 + char watched; 45 + }; 46 + 47 + static int breakpoint_setup(void *addr) 48 + { 49 + struct perf_event_attr attr = { .size = 0, }; 50 + 51 + attr.type = PERF_TYPE_BREAKPOINT; 52 + attr.size = sizeof(attr); 53 + attr.inherit = 1; 54 + attr.exclude_kernel = 1; 55 + attr.exclude_hv = 1; 56 + attr.bp_addr = (uint64_t)addr; 57 + attr.bp_type = HW_BREAKPOINT_RW; 58 + attr.bp_len = HW_BREAKPOINT_LEN_1; 59 + return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0); 60 + } 61 + 62 + static void *passive_thread(void *arg) 63 + { 64 + unsigned int *done = (unsigned int *)arg; 65 + 66 + while (!__atomic_load_n(done, __ATOMIC_RELAXED)) 67 + futex_wait(done, 0, NULL, 0); 68 + return NULL; 69 + } 70 + 71 + static void *active_thread(void *arg) 72 + { 73 + unsigned int *done = (unsigned int 
*)arg; 74 + 75 + while (!__atomic_load_n(done, __ATOMIC_RELAXED)); 76 + return NULL; 77 + } 78 + 79 + static void *breakpoint_thread(void *arg) 80 + { 81 + unsigned int i, done; 82 + int *repeat = (int *)arg; 83 + pthread_t *threads; 84 + 85 + threads = calloc(thread_params.nthreads, sizeof(threads[0])); 86 + if (!threads) 87 + exit((perror("calloc"), EXIT_FAILURE)); 88 + 89 + while (__atomic_fetch_sub(repeat, 1, __ATOMIC_RELAXED) > 0) { 90 + done = 0; 91 + for (i = 0; i < thread_params.nthreads; i++) { 92 + if (pthread_create(&threads[i], NULL, passive_thread, &done)) 93 + exit((perror("pthread_create"), EXIT_FAILURE)); 94 + } 95 + __atomic_store_n(&done, 1, __ATOMIC_RELAXED); 96 + futex_wake(&done, thread_params.nthreads, 0); 97 + for (i = 0; i < thread_params.nthreads; i++) 98 + pthread_join(threads[i], NULL); 99 + } 100 + free(threads); 101 + return NULL; 102 + } 103 + 104 + // The benchmark creates nbreakpoints inheritable breakpoints, 105 + // then starts nparallel threads which create and join bench_repeat batches of nthreads threads. 
106 + int bench_breakpoint_thread(int argc, const char **argv) 107 + { 108 + unsigned int i, result_usec; 109 + int repeat = bench_repeat; 110 + struct breakpoint *breakpoints; 111 + pthread_t *parallel; 112 + struct timeval start, stop, diff; 113 + 114 + if (parse_options(argc, argv, thread_options, thread_usage, 0)) { 115 + usage_with_options(thread_usage, thread_options); 116 + exit(EXIT_FAILURE); 117 + } 118 + breakpoints = calloc(thread_params.nbreakpoints, sizeof(breakpoints[0])); 119 + parallel = calloc(thread_params.nparallel, sizeof(parallel[0])); 120 + if (!breakpoints || !parallel) 121 + exit((perror("calloc"), EXIT_FAILURE)); 122 + 123 + for (i = 0; i < thread_params.nbreakpoints; i++) { 124 + breakpoints[i].fd = breakpoint_setup(&breakpoints[i].watched); 125 + if (breakpoints[i].fd == -1) 126 + exit((perror("perf_event_open"), EXIT_FAILURE)); 127 + } 128 + gettimeofday(&start, NULL); 129 + for (i = 0; i < thread_params.nparallel; i++) { 130 + if (pthread_create(&parallel[i], NULL, breakpoint_thread, &repeat)) 131 + exit((perror("pthread_create"), EXIT_FAILURE)); 132 + } 133 + for (i = 0; i < thread_params.nparallel; i++) 134 + pthread_join(parallel[i], NULL); 135 + gettimeofday(&stop, NULL); 136 + timersub(&stop, &start, &diff); 137 + for (i = 0; i < thread_params.nbreakpoints; i++) 138 + close(breakpoints[i].fd); 139 + free(parallel); 140 + free(breakpoints); 141 + switch (bench_format) { 142 + case BENCH_FORMAT_DEFAULT: 143 + printf("# Created/joined %d threads with %d breakpoints and %d parallelism\n", 144 + bench_repeat, thread_params.nbreakpoints, thread_params.nparallel); 145 + printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", 146 + (long)diff.tv_sec, (long)(diff.tv_usec / USEC_PER_MSEC)); 147 + result_usec = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; 148 + printf(" %14lf usecs/op\n", 149 + (double)result_usec / bench_repeat / thread_params.nthreads); 150 + printf(" %14lf usecs/op/cpu\n", 151 + (double)result_usec / bench_repeat / 152 + 
thread_params.nthreads * thread_params.nparallel); 153 + break; 154 + case BENCH_FORMAT_SIMPLE: 155 + printf("%lu.%03lu\n", (long)diff.tv_sec, (long)(diff.tv_usec / USEC_PER_MSEC)); 156 + break; 157 + default: 158 + fprintf(stderr, "Unknown format: %d\n", bench_format); 159 + exit(EXIT_FAILURE); 160 + } 161 + return 0; 162 + } 163 + 164 + struct { 165 + unsigned int npassive; 166 + unsigned int nactive; 167 + } enable_params = { 168 + .nactive = 0, 169 + .npassive = 0, 170 + }; 171 + 172 + static const struct option enable_options[] = { 173 + OPT_UINTEGER('p', "passive", &enable_params.npassive, "Specify amount of passive threads"), 174 + OPT_UINTEGER('a', "active", &enable_params.nactive, "Specify amount of active threads"), 175 + OPT_END() 176 + }; 177 + 178 + static const char * const enable_usage[] = { 179 + "perf bench breakpoint enable <options>", 180 + NULL 181 + }; 182 + 183 + // The benchmark creates an inheritable breakpoint, 184 + // then starts npassive threads that block and nactive threads that actively spin 185 + // and then disables and enables the breakpoint bench_repeat times. 186 + int bench_breakpoint_enable(int argc, const char **argv) 187 + { 188 + unsigned int i, nthreads, result_usec, done = 0; 189 + char watched; 190 + int fd; 191 + pthread_t *threads; 192 + struct timeval start, stop, diff; 193 + 194 + if (parse_options(argc, argv, enable_options, enable_usage, 0)) { 195 + usage_with_options(enable_usage, enable_options); 196 + exit(EXIT_FAILURE); 197 + } 198 + fd = breakpoint_setup(&watched); 199 + if (fd == -1) 200 + exit((perror("perf_event_open"), EXIT_FAILURE)); 201 + nthreads = enable_params.npassive + enable_params.nactive; 202 + threads = calloc(nthreads, sizeof(threads[0])); 203 + if (!threads) 204 + exit((perror("calloc"), EXIT_FAILURE)); 205 + 206 + for (i = 0; i < nthreads; i++) { 207 + if (pthread_create(&threads[i], NULL, 208 + i < enable_params.npassive ? 
passive_thread : active_thread, &done)) 209 + exit((perror("pthread_create"), EXIT_FAILURE)); 210 + } 211 + usleep(10000); // let the threads block 212 + gettimeofday(&start, NULL); 213 + for (i = 0; i < bench_repeat; i++) { 214 + if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0)) 215 + exit((perror("ioctl(PERF_EVENT_IOC_DISABLE)"), EXIT_FAILURE)); 216 + if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0)) 217 + exit((perror("ioctl(PERF_EVENT_IOC_ENABLE)"), EXIT_FAILURE)); 218 + } 219 + gettimeofday(&stop, NULL); 220 + timersub(&stop, &start, &diff); 221 + __atomic_store_n(&done, 1, __ATOMIC_RELAXED); 222 + futex_wake(&done, enable_params.npassive, 0); 223 + for (i = 0; i < nthreads; i++) 224 + pthread_join(threads[i], NULL); 225 + free(threads); 226 + close(fd); 227 + switch (bench_format) { 228 + case BENCH_FORMAT_DEFAULT: 229 + printf("# Enabled/disabled breakpoint %d time with %d passive and %d active threads\n", 230 + bench_repeat, enable_params.npassive, enable_params.nactive); 231 + printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", 232 + (long)diff.tv_sec, (long)(diff.tv_usec / USEC_PER_MSEC)); 233 + result_usec = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; 234 + printf(" %14lf usecs/op\n", (double)result_usec / bench_repeat); 235 + break; 236 + case BENCH_FORMAT_SIMPLE: 237 + printf("%lu.%03lu\n", (long)diff.tv_sec, (long)(diff.tv_usec / USEC_PER_MSEC)); 238 + break; 239 + default: 240 + fprintf(stderr, "Unknown format: %d\n", bench_format); 241 + exit(EXIT_FAILURE); 242 + } 243 + return 0; 244 + }
+8
tools/perf/builtin-bench.c
··· 92 92 { NULL, NULL, NULL } 93 93 }; 94 94 95 + static struct bench breakpoint_benchmarks[] = { 96 + { "thread", "Benchmark thread start/finish with breakpoints", bench_breakpoint_thread}, 97 + { "enable", "Benchmark breakpoint enable/disable", bench_breakpoint_enable}, 98 + { "all", "Run all breakpoint benchmarks", NULL}, 99 + { NULL, NULL, NULL }, 100 + }; 101 + 95 102 struct collection { 96 103 const char *name; 97 104 const char *summary; ··· 117 110 {"epoll", "Epoll stressing benchmarks", epoll_benchmarks }, 118 111 #endif 119 112 { "internals", "Perf-internals benchmarks", internals_benchmarks }, 113 + { "breakpoint", "Breakpoint benchmarks", breakpoint_benchmarks }, 120 114 { "all", "All benchmarks", NULL }, 121 115 { NULL, NULL, NULL } 122 116 };