Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf bench: Also allow measuring memset()

This simply clones the respective memcpy() implementation.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/4F16D743020000780006D735@nat28.tlf.novell.com
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

authored by

Jan Beulich and committed by
Arnaldo Carvalho de Melo
be3de80d 800eb014

+331 -2
+3 -1
tools/perf/Makefile
··· 61 61 ifeq (${IS_X86_64}, 1) 62 62 RAW_ARCH := x86_64 63 63 ARCH_CFLAGS := -DARCH_X86_64 64 - ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S 64 + ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memset_64.S 65 65 endif 66 66 endif 67 67 ··· 362 362 BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o 363 363 ifeq ($(RAW_ARCH),x86_64) 364 364 BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o 365 + BUILTIN_OBJS += $(OUTPUT)bench/mem-memset-x86-64-asm.o 365 366 endif 366 367 BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o 368 + BUILTIN_OBJS += $(OUTPUT)bench/mem-memset.o 367 369 368 370 BUILTIN_OBJS += $(OUTPUT)builtin-diff.o 369 371 BUILTIN_OBJS += $(OUTPUT)builtin-evlist.o
+1
tools/perf/bench/bench.h
··· 4 4 extern int bench_sched_messaging(int argc, const char **argv, const char *prefix); 5 5 extern int bench_sched_pipe(int argc, const char **argv, const char *prefix); 6 6 extern int bench_mem_memcpy(int argc, const char **argv, const char *prefix __used); 7 + extern int bench_mem_memset(int argc, const char **argv, const char *prefix); 7 8 8 9 #define BENCH_FORMAT_DEFAULT_STR "default" 9 10 #define BENCH_FORMAT_DEFAULT 0
+12
tools/perf/bench/mem-memset-arch.h
··· 1 + 2 + #ifdef ARCH_X86_64 3 + 4 + #define MEMSET_FN(fn, name, desc) \ 5 + extern void *fn(void *, int, size_t); 6 + 7 + #include "mem-memset-x86-64-asm-def.h" 8 + 9 + #undef MEMSET_FN 10 + 11 + #endif 12 +
+12
tools/perf/bench/mem-memset-x86-64-asm-def.h
··· 1 + 2 + MEMSET_FN(__memset, 3 + "x86-64-unrolled", 4 + "unrolled memset() in arch/x86/lib/memset_64.S") 5 + 6 + MEMSET_FN(memset_c, 7 + "x86-64-stosq", 8 + "movsq-based memset() in arch/x86/lib/memset_64.S") 9 + 10 + MEMSET_FN(memset_c_e, 11 + "x86-64-stosb", 12 + "movsb-based memset() in arch/x86/lib/memset_64.S")
+6
tools/perf/bench/mem-memset-x86-64-asm.S
··· 1 + #define memset MEMSET /* don't hide glibc's memset() */ 2 + #define altinstr_replacement text 3 + #define globl p2align 4; .globl 4 + #define Lmemset_c globl memset_c; memset_c 5 + #define Lmemset_c_e globl memset_c_e; memset_c_e 6 + #include "../../../arch/x86/lib/memset_64.S"
+291
tools/perf/bench/mem-memset.c
··· 1 + /* 2 + * mem-memset.c 3 + * 4 + * memset: Simple memory set in various ways 5 + * 6 + * Trivial clone of mem-memcpy.c. 7 + */ 8 + #include <ctype.h> 9 + 10 + #include "../perf.h" 11 + #include "../util/util.h" 12 + #include "../util/parse-options.h" 13 + #include "../util/header.h" 14 + #include "bench.h" 15 + #include "mem-memset-arch.h" 16 + 17 + #include <stdio.h> 18 + #include <stdlib.h> 19 + #include <string.h> 20 + #include <sys/time.h> 21 + #include <errno.h> 22 + 23 + #define K 1024 24 + 25 + static const char *length_str = "1MB"; 26 + static const char *routine = "default"; 27 + static bool use_clock; 28 + static int clock_fd; 29 + static bool only_prefault; 30 + static bool no_prefault; 31 + 32 + static const struct option options[] = { 33 + OPT_STRING('l', "length", &length_str, "1MB", 34 + "Specify length of memory to copy. " 35 + "available unit: B, MB, GB (upper and lower)"), 36 + OPT_STRING('r', "routine", &routine, "default", 37 + "Specify routine to copy"), 38 + OPT_BOOLEAN('c', "clock", &use_clock, 39 + "Use CPU clock for measuring"), 40 + OPT_BOOLEAN('o', "only-prefault", &only_prefault, 41 + "Show only the result with page faults before memset()"), 42 + OPT_BOOLEAN('n', "no-prefault", &no_prefault, 43 + "Show only the result without page faults before memset()"), 44 + OPT_END() 45 + }; 46 + 47 + typedef void *(*memset_t)(void *, int, size_t); 48 + 49 + struct routine { 50 + const char *name; 51 + const char *desc; 52 + memset_t fn; 53 + }; 54 + 55 + static const struct routine routines[] = { 56 + { "default", 57 + "Default memset() provided by glibc", 58 + memset }, 59 + #ifdef ARCH_X86_64 60 + 61 + #define MEMSET_FN(fn, name, desc) { name, desc, fn }, 62 + #include "mem-memset-x86-64-asm-def.h" 63 + #undef MEMSET_FN 64 + 65 + #endif 66 + 67 + { NULL, 68 + NULL, 69 + NULL } 70 + }; 71 + 72 + static const char * const bench_mem_memset_usage[] = { 73 + "perf bench mem memset <options>", 74 + NULL 75 + }; 76 + 77 + static struct perf_event_attr clock_attr = { 78 + .type = PERF_TYPE_HARDWARE, 79 + .config = PERF_COUNT_HW_CPU_CYCLES 80 + }; 81 + 82 + static void init_clock(void) 83 + { 84 + clock_fd = sys_perf_event_open(&clock_attr, getpid(), -1, -1, 0); 85 + 86 + if (clock_fd < 0 && errno == ENOSYS) 87 + die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); 88 + else 89 + BUG_ON(clock_fd < 0); 90 + } 91 + 92 + static u64 get_clock(void) 93 + { 94 + int ret; 95 + u64 clk; 96 + 97 + ret = read(clock_fd, &clk, sizeof(u64)); 98 + BUG_ON(ret != sizeof(u64)); 99 + 100 + return clk; 101 + } 102 + 103 + static double timeval2double(struct timeval *ts) 104 + { 105 + return (double)ts->tv_sec + 106 + (double)ts->tv_usec / (double)1000000; 107 + } 108 + 109 + static void alloc_mem(void **dst, size_t length) 110 + { 111 + *dst = zalloc(length); 112 + if (!dst) 113 + die("memory allocation failed - maybe length is too large?\n"); 114 + } 115 + 116 + static u64 do_memset_clock(memset_t fn, size_t len, bool prefault) 117 + { 118 + u64 clock_start = 0ULL, clock_end = 0ULL; 119 + void *dst = NULL; 120 + 121 + alloc_mem(&dst, len); 122 + 123 + if (prefault) 124 + fn(dst, -1, len); 125 + 126 + clock_start = get_clock(); 127 + fn(dst, 0, len); 128 + clock_end = get_clock(); 129 + 130 + free(dst); 131 + return clock_end - clock_start; 132 + } 133 + 134 + static double do_memset_gettimeofday(memset_t fn, size_t len, bool prefault) 135 + { 136 + struct timeval tv_start, tv_end, tv_diff; 137 + void *dst = NULL; 138 + 139 + alloc_mem(&dst, len); 140 + 141 + if (prefault) 142 + fn(dst, -1, len); 143 + 144 + BUG_ON(gettimeofday(&tv_start, NULL)); 145 + fn(dst, 0, len); 146 + BUG_ON(gettimeofday(&tv_end, NULL)); 147 + 148 + timersub(&tv_end, &tv_start, &tv_diff); 149 + 150 + free(dst); 151 + return (double)((double)len / timeval2double(&tv_diff)); 152 + } 153 + 154 + #define pf (no_prefault ? 0 : 1) 155 + 156 + #define print_bps(x) do { \ 157 + if (x < K) \ 158 + printf(" %14lf B/Sec", x); \ 159 + else if (x < K * K) \ 160 + printf(" %14lfd KB/Sec", x / K); \ 161 + else if (x < K * K * K) \ 162 + printf(" %14lf MB/Sec", x / K / K); \ 163 + else \ 164 + printf(" %14lf GB/Sec", x / K / K / K); \ 165 + } while (0) 166 + 167 + int bench_mem_memset(int argc, const char **argv, 168 + const char *prefix __used) 169 + { 170 + int i; 171 + size_t len; 172 + double result_bps[2]; 173 + u64 result_clock[2]; 174 + 175 + argc = parse_options(argc, argv, options, 176 + bench_mem_memset_usage, 0); 177 + 178 + if (use_clock) 179 + init_clock(); 180 + 181 + len = (size_t)perf_atoll((char *)length_str); 182 + 183 + result_clock[0] = result_clock[1] = 0ULL; 184 + result_bps[0] = result_bps[1] = 0.0; 185 + 186 + if ((s64)len <= 0) { 187 + fprintf(stderr, "Invalid length:%s\n", length_str); 188 + return 1; 189 + } 190 + 191 + /* same to without specifying either of prefault and no-prefault */ 192 + if (only_prefault && no_prefault) 193 + only_prefault = no_prefault = false; 194 + 195 + for (i = 0; routines[i].name; i++) { 196 + if (!strcmp(routines[i].name, routine)) 197 + break; 198 + } 199 + if (!routines[i].name) { 200 + printf("Unknown routine:%s\n", routine); 201 + printf("Available routines...\n"); 202 + for (i = 0; routines[i].name; i++) { 203 + printf("\t%s ... %s\n", 204 + routines[i].name, routines[i].desc); 205 + } 206 + return 1; 207 + } 208 + 209 + if (bench_format == BENCH_FORMAT_DEFAULT) 210 + printf("# Copying %s Bytes ...\n\n", length_str); 211 + 212 + if (!only_prefault && !no_prefault) { 213 + /* show both of results */ 214 + if (use_clock) { 215 + result_clock[0] = 216 + do_memset_clock(routines[i].fn, len, false); 217 + result_clock[1] = 218 + do_memset_clock(routines[i].fn, len, true); 219 + } else { 220 + result_bps[0] = 221 + do_memset_gettimeofday(routines[i].fn, 222 + len, false); 223 + result_bps[1] = 224 + do_memset_gettimeofday(routines[i].fn, 225 + len, true); 226 + } 227 + } else { 228 + if (use_clock) { 229 + result_clock[pf] = 230 + do_memset_clock(routines[i].fn, 231 + len, only_prefault); 232 + } else { 233 + result_bps[pf] = 234 + do_memset_gettimeofday(routines[i].fn, 235 + len, only_prefault); 236 + } 237 + } 238 + 239 + switch (bench_format) { 240 + case BENCH_FORMAT_DEFAULT: 241 + if (!only_prefault && !no_prefault) { 242 + if (use_clock) { 243 + printf(" %14lf Clock/Byte\n", 244 + (double)result_clock[0] 245 + / (double)len); 246 + printf(" %14lf Clock/Byte (with prefault)\n ", 247 + (double)result_clock[1] 248 + / (double)len); 249 + } else { 250 + print_bps(result_bps[0]); 251 + printf("\n"); 252 + print_bps(result_bps[1]); 253 + printf(" (with prefault)\n"); 254 + } 255 + } else { 256 + if (use_clock) { 257 + printf(" %14lf Clock/Byte", 258 + (double)result_clock[pf] 259 + / (double)len); 260 + } else 261 + print_bps(result_bps[pf]); 262 + 263 + printf("%s\n", only_prefault ? " (with prefault)" : ""); 264 + } 265 + break; 266 + case BENCH_FORMAT_SIMPLE: 267 + if (!only_prefault && !no_prefault) { 268 + if (use_clock) { 269 + printf("%lf %lf\n", 270 + (double)result_clock[0] / (double)len, 271 + (double)result_clock[1] / (double)len); 272 + } else { 273 + printf("%lf %lf\n", 274 + result_bps[0], result_bps[1]); 275 + } 276 + } else { 277 + if (use_clock) { 278 + printf("%lf\n", (double)result_clock[pf] 279 + / (double)len); 280 + } else 281 + printf("%lf\n", result_bps[pf]); 282 + } 283 + break; 284 + default: 285 + /* reaching this means there's some disaster: */ 286 + die("unknown format: %d\n", bench_format); 287 + break; 288 + } 289 + 290 + return 0; 291 + }
+3
tools/perf/builtin-bench.c
··· 52 52 { "memcpy", 53 53 "Simple memory copy in various ways", 54 54 bench_mem_memcpy }, 55 + { "memset", 56 + "Simple memory set in various ways", 57 + bench_mem_memset }, 55 58 suite_all, 56 59 { NULL, 57 60 NULL,
+3 -1
tools/perf/util/include/asm/dwarf2.h
··· 2 2 #ifndef PERF_DWARF2_H 3 3 #define PERF_DWARF2_H 4 4 5 - /* dwarf2.h ... dummy header file for including arch/x86/lib/memcpy_64.S */ 5 + /* dwarf2.h ... dummy header file for including arch/x86/lib/mem{cpy,set}_64.S */ 6 6 7 7 #define CFI_STARTPROC 8 8 #define CFI_ENDPROC 9 + #define CFI_REMEMBER_STATE 10 + #define CFI_RESTORE_STATE 9 11 10 12 #endif /* PERF_DWARF2_H */ 11 13