Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf bench futex: Support parallel waker threads

The futex-wake benchmark only measures wakeups done within a single
process. While this has value on its own, it does not really generate
any hb->lock contention.

A new benchmark 'wake-parallel' is added, by extending the futex-wake
code such that we can measure parallel waker threads. The program output
shows the avg per-thread latency in order to complete its share of
wakeups:

Run summary [PID 13474]: blocking on 512 threads (at [private] futex 0xa88668), 8 threads waking up 64 at a time.

[Run 1]: Avg per-thread latency (waking 64/512 threads) in 0.6230 ms (+-15.31%)
[Run 2]: Avg per-thread latency (waking 64/512 threads) in 0.5175 ms (+-29.95%)
[Run 3]: Avg per-thread latency (waking 64/512 threads) in 0.7578 ms (+-18.03%)
[Run 4]: Avg per-thread latency (waking 64/512 threads) in 0.8944 ms (+-12.54%)
[Run 5]: Avg per-thread latency (waking 64/512 threads) in 1.1204 ms (+-23.85%)
Avg per-thread latency (waking 64/512 threads) in 0.7826 ms (+-9.91%)

Naturally, different combinations of numbers of blocking and waker
threads will yield different results.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Davidlohr Bueso <dbueso@suse.de>
Link: http://lkml.kernel.org/r/1431110280-20231-1-git-send-email-dave@stgolabs.net
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

authored by

Davidlohr Bueso and committed by
Arnaldo Carvalho de Melo
d65817b4 b91fc39f

+301
+3
tools/perf/Documentation/perf-bench.txt
··· 210 210 *wake*:: 211 211 Suite for evaluating wake calls. 212 212 213 + *wake-parallel*:: 214 + Suite for evaluating parallel wake calls. 215 + 213 216 *requeue*:: 214 217 Suite for evaluating requeue calls. 215 218
+1
tools/perf/bench/Build
··· 3 3 perf-y += mem-memcpy.o 4 4 perf-y += futex-hash.o 5 5 perf-y += futex-wake.o 6 + perf-y += futex-wake-parallel.o 6 7 perf-y += futex-requeue.o 7 8 8 9 perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o
+2
tools/perf/bench/bench.h
··· 33 33 extern int bench_mem_memset(int argc, const char **argv, const char *prefix); 34 34 extern int bench_futex_hash(int argc, const char **argv, const char *prefix); 35 35 extern int bench_futex_wake(int argc, const char **argv, const char *prefix); 36 + extern int bench_futex_wake_parallel(int argc, const char **argv, 37 + const char *prefix); 36 38 extern int bench_futex_requeue(int argc, const char **argv, const char *prefix); 37 39 38 40 #define BENCH_FORMAT_DEFAULT_STR "default"
+294
tools/perf/bench/futex-wake-parallel.c
··· 1 + /* 2 + * Copyright (C) 2015 Davidlohr Bueso. 3 + * 4 + * Block a bunch of threads and let parallel waker threads wakeup an 5 + * equal amount of them. The program output reflects the avg latency 6 + * for each individual thread to service its share of work. Ultimately 7 + * it can be used to measure futex_wake() changes. 8 + */ 9 + 10 + #include "../perf.h" 11 + #include "../util/util.h" 12 + #include "../util/stat.h" 13 + #include "../util/parse-options.h" 14 + #include "../util/header.h" 15 + #include "bench.h" 16 + #include "futex.h" 17 + 18 + #include <err.h> 19 + #include <stdlib.h> 20 + #include <sys/time.h> 21 + #include <pthread.h> 22 + 23 + struct thread_data { 24 + pthread_t worker; 25 + unsigned int nwoken; 26 + struct timeval runtime; 27 + }; 28 + 29 + static unsigned int nwakes = 1; 30 + 31 + /* all threads will block on the same futex -- hash bucket chaos ;) */ 32 + static u_int32_t futex = 0; 33 + 34 + static pthread_t *blocked_worker; 35 + static bool done = false, silent = false, fshared = false; 36 + static unsigned int nblocked_threads = 0, nwaking_threads = 0; 37 + static pthread_mutex_t thread_lock; 38 + static pthread_cond_t thread_parent, thread_worker; 39 + static struct stats waketime_stats, wakeup_stats; 40 + static unsigned int ncpus, threads_starting; 41 + static int futex_flag = 0; 42 + 43 + static const struct option options[] = { 44 + OPT_UINTEGER('t', "threads", &nblocked_threads, "Specify amount of threads"), 45 + OPT_UINTEGER('w', "nwakers", &nwaking_threads, "Specify amount of waking threads"), 46 + OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"), 47 + OPT_BOOLEAN( 'S', "shared", &fshared, "Use shared futexes instead of private ones"), 48 + OPT_END() 49 + }; 50 + 51 + static const char * const bench_futex_wake_parallel_usage[] = { 52 + "perf bench futex wake-parallel <options>", 53 + NULL 54 + }; 55 + 56 + static void *waking_workerfn(void *arg) 57 + { 58 + struct thread_data *waker = 
(struct thread_data *) arg; 59 + struct timeval start, end; 60 + 61 + gettimeofday(&start, NULL); 62 + 63 + waker->nwoken = futex_wake(&futex, nwakes, futex_flag); 64 + if (waker->nwoken != nwakes) 65 + warnx("couldn't wakeup all tasks (%d/%d)", 66 + waker->nwoken, nwakes); 67 + 68 + gettimeofday(&end, NULL); 69 + timersub(&end, &start, &waker->runtime); 70 + 71 + pthread_exit(NULL); 72 + return NULL; 73 + } 74 + 75 + static void wakeup_threads(struct thread_data *td, pthread_attr_t thread_attr) 76 + { 77 + unsigned int i; 78 + 79 + pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE); 80 + 81 + /* create and block all threads */ 82 + for (i = 0; i < nwaking_threads; i++) { 83 + /* 84 + * Thread creation order will impact per-thread latency 85 + * as it will affect the order to acquire the hb spinlock. 86 + * For now let the scheduler decide. 87 + */ 88 + if (pthread_create(&td[i].worker, &thread_attr, 89 + waking_workerfn, (void *)&td[i])) 90 + err(EXIT_FAILURE, "pthread_create"); 91 + } 92 + 93 + for (i = 0; i < nwaking_threads; i++) 94 + if (pthread_join(td[i].worker, NULL)) 95 + err(EXIT_FAILURE, "pthread_join"); 96 + } 97 + 98 + static void *blocked_workerfn(void *arg __maybe_unused) 99 + { 100 + pthread_mutex_lock(&thread_lock); 101 + threads_starting--; 102 + if (!threads_starting) 103 + pthread_cond_signal(&thread_parent); 104 + pthread_cond_wait(&thread_worker, &thread_lock); 105 + pthread_mutex_unlock(&thread_lock); 106 + 107 + while (1) { /* handle spurious wakeups */ 108 + if (futex_wait(&futex, 0, NULL, futex_flag) != EINTR) 109 + break; 110 + } 111 + 112 + pthread_exit(NULL); 113 + return NULL; 114 + } 115 + 116 + static void block_threads(pthread_t *w, pthread_attr_t thread_attr) 117 + { 118 + cpu_set_t cpu; 119 + unsigned int i; 120 + 121 + threads_starting = nblocked_threads; 122 + 123 + /* create and block all threads */ 124 + for (i = 0; i < nblocked_threads; i++) { 125 + CPU_ZERO(&cpu); 126 + CPU_SET(i % ncpus, &cpu); 127 + 128 + 
if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu)) 129 + err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); 130 + 131 + if (pthread_create(&w[i], &thread_attr, blocked_workerfn, NULL)) 132 + err(EXIT_FAILURE, "pthread_create"); 133 + } 134 + } 135 + 136 + static void print_run(struct thread_data *waking_worker, unsigned int run_num) 137 + { 138 + unsigned int i, wakeup_avg; 139 + double waketime_avg, waketime_stddev; 140 + struct stats __waketime_stats, __wakeup_stats; 141 + 142 + init_stats(&__wakeup_stats); 143 + init_stats(&__waketime_stats); 144 + 145 + for (i = 0; i < nwaking_threads; i++) { 146 + update_stats(&__waketime_stats, waking_worker[i].runtime.tv_usec); 147 + update_stats(&__wakeup_stats, waking_worker[i].nwoken); 148 + } 149 + 150 + waketime_avg = avg_stats(&__waketime_stats); 151 + waketime_stddev = stddev_stats(&__waketime_stats); 152 + wakeup_avg = avg_stats(&__wakeup_stats); 153 + 154 + printf("[Run %d]: Avg per-thread latency (waking %d/%d threads) " 155 + "in %.4f ms (+-%.2f%%)\n", run_num + 1, wakeup_avg, 156 + nblocked_threads, waketime_avg/1e3, 157 + rel_stddev_stats(waketime_stddev, waketime_avg)); 158 + } 159 + 160 + static void print_summary(void) 161 + { 162 + unsigned int wakeup_avg; 163 + double waketime_avg, waketime_stddev; 164 + 165 + waketime_avg = avg_stats(&waketime_stats); 166 + waketime_stddev = stddev_stats(&waketime_stats); 167 + wakeup_avg = avg_stats(&wakeup_stats); 168 + 169 + printf("Avg per-thread latency (waking %d/%d threads) in %.4f ms (+-%.2f%%)\n", 170 + wakeup_avg, 171 + nblocked_threads, 172 + waketime_avg/1e3, 173 + rel_stddev_stats(waketime_stddev, waketime_avg)); 174 + } 175 + 176 + 177 + static void do_run_stats(struct thread_data *waking_worker) 178 + { 179 + unsigned int i; 180 + 181 + for (i = 0; i < nwaking_threads; i++) { 182 + update_stats(&waketime_stats, waking_worker[i].runtime.tv_usec); 183 + update_stats(&wakeup_stats, waking_worker[i].nwoken); 184 + } 185 + 186 + } 187 + 188 + 
static void toggle_done(int sig __maybe_unused, 189 + siginfo_t *info __maybe_unused, 190 + void *uc __maybe_unused) 191 + { 192 + done = true; 193 + } 194 + 195 + int bench_futex_wake_parallel(int argc, const char **argv, 196 + const char *prefix __maybe_unused) 197 + { 198 + int ret = 0; 199 + unsigned int i, j; 200 + struct sigaction act; 201 + pthread_attr_t thread_attr; 202 + struct thread_data *waking_worker; 203 + 204 + argc = parse_options(argc, argv, options, 205 + bench_futex_wake_parallel_usage, 0); 206 + if (argc) { 207 + usage_with_options(bench_futex_wake_parallel_usage, options); 208 + exit(EXIT_FAILURE); 209 + } 210 + 211 + sigfillset(&act.sa_mask); 212 + act.sa_sigaction = toggle_done; 213 + sigaction(SIGINT, &act, NULL); 214 + 215 + ncpus = sysconf(_SC_NPROCESSORS_ONLN); 216 + if (!nblocked_threads) 217 + nblocked_threads = ncpus; 218 + 219 + /* some sanity checks */ 220 + if (nwaking_threads > nblocked_threads || !nwaking_threads) 221 + nwaking_threads = nblocked_threads; 222 + 223 + if (nblocked_threads % nwaking_threads) 224 + errx(EXIT_FAILURE, "Must be perfectly divisible"); 225 + /* 226 + * Each thread will wakeup nwakes tasks in 227 + * a single futex_wait call. 228 + */ 229 + nwakes = nblocked_threads/nwaking_threads; 230 + 231 + blocked_worker = calloc(nblocked_threads, sizeof(*blocked_worker)); 232 + if (!blocked_worker) 233 + err(EXIT_FAILURE, "calloc"); 234 + 235 + if (!fshared) 236 + futex_flag = FUTEX_PRIVATE_FLAG; 237 + 238 + printf("Run summary [PID %d]: blocking on %d threads (at [%s] " 239 + "futex %p), %d threads waking up %d at a time.\n\n", 240 + getpid(), nblocked_threads, fshared ? 
"shared":"private", 241 + &futex, nwaking_threads, nwakes); 242 + 243 + init_stats(&wakeup_stats); 244 + init_stats(&waketime_stats); 245 + 246 + pthread_attr_init(&thread_attr); 247 + pthread_mutex_init(&thread_lock, NULL); 248 + pthread_cond_init(&thread_parent, NULL); 249 + pthread_cond_init(&thread_worker, NULL); 250 + 251 + for (j = 0; j < bench_repeat && !done; j++) { 252 + waking_worker = calloc(nwaking_threads, sizeof(*waking_worker)); 253 + if (!waking_worker) 254 + err(EXIT_FAILURE, "calloc"); 255 + 256 + /* create, launch & block all threads */ 257 + block_threads(blocked_worker, thread_attr); 258 + 259 + /* make sure all threads are already blocked */ 260 + pthread_mutex_lock(&thread_lock); 261 + while (threads_starting) 262 + pthread_cond_wait(&thread_parent, &thread_lock); 263 + pthread_cond_broadcast(&thread_worker); 264 + pthread_mutex_unlock(&thread_lock); 265 + 266 + usleep(100000); 267 + 268 + /* Ok, all threads are patiently blocked, start waking folks up */ 269 + wakeup_threads(waking_worker, thread_attr); 270 + 271 + for (i = 0; i < nblocked_threads; i++) { 272 + ret = pthread_join(blocked_worker[i], NULL); 273 + if (ret) 274 + err(EXIT_FAILURE, "pthread_join"); 275 + } 276 + 277 + do_run_stats(waking_worker); 278 + if (!silent) 279 + print_run(waking_worker, j); 280 + 281 + free(waking_worker); 282 + } 283 + 284 + /* cleanup & report results */ 285 + pthread_cond_destroy(&thread_parent); 286 + pthread_cond_destroy(&thread_worker); 287 + pthread_mutex_destroy(&thread_lock); 288 + pthread_attr_destroy(&thread_attr); 289 + 290 + print_summary(); 291 + 292 + free(blocked_worker); 293 + return ret; 294 + }
+1
tools/perf/builtin-bench.c
··· 58 58 static struct bench futex_benchmarks[] = { 59 59 { "hash", "Benchmark for futex hash table", bench_futex_hash }, 60 60 { "wake", "Benchmark for futex wake calls", bench_futex_wake }, 61 + { "wake-parallel", "Benchmark for parallel futex wake calls", bench_futex_wake_parallel }, 61 62 { "requeue", "Benchmark for futex requeue calls", bench_futex_requeue }, 62 63 { "all", "Test all futex benchmarks", NULL }, 63 64 { NULL, NULL, NULL }