Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'bpf-perf-hw-sw-events'

Alexei Starovoitov says:

====================
perf, bpf: add support for bpf in sw/hw perf_events

This patch set is a follow-up to the discussion:
https://lkml.kernel.org/r/20160804142853.GO6862@twins.programming.kicks-ass.net
It turned out to be simpler than what we discussed.

Patches 1-3 are bpf-side prep for the main patch 4
that adds a bpf program as an overflow_handler to sw and hw perf_events.

Patches 5 and 6 are examples from myself and Brendan.

Peter,
to implement your suggestion to add ifdef CONFIG_BPF_SYSCALL
inside struct perf_event, I had to shuffle ifdefs in events/core.c
Please double check whether that is what you wanted to see.

v2->v3: fixed a few more minor issues
v1->v2: fixed issues spotted by Peter and Daniel.
====================

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

+737 -6
+4
include/linux/bpf.h
··· 297 297 static inline void bpf_prog_put(struct bpf_prog *prog) 298 298 { 299 299 } 300 + static inline struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) 301 + { 302 + return ERR_PTR(-EOPNOTSUPP); 303 + } 300 304 #endif /* CONFIG_BPF_SYSCALL */ 301 305 302 306 /* verifier prototypes for helper functions called from eBPF programs */
+9
include/linux/perf_event.h
··· 679 679 u64 (*clock)(void); 680 680 perf_overflow_handler_t overflow_handler; 681 681 void *overflow_handler_context; 682 + #ifdef CONFIG_BPF_SYSCALL 683 + perf_overflow_handler_t orig_overflow_handler; 684 + struct bpf_prog *prog; 685 + #endif 682 686 683 687 #ifdef CONFIG_EVENT_TRACING 684 688 struct trace_event_call *tp_event; ··· 790 786 unsigned long head; 791 787 }; 792 788 int page; 789 + }; 790 + 791 + struct bpf_perf_event_data_kern { 792 + struct pt_regs *regs; 793 + struct perf_sample_data *data; 793 794 }; 794 795 795 796 #ifdef CONFIG_CGROUP_PERF
+1
include/uapi/linux/Kbuild
··· 71 71 header-y += blkpg.h 72 72 header-y += blktrace_api.h 73 73 header-y += bpf_common.h 74 + header-y += bpf_perf_event.h 74 75 header-y += bpf.h 75 76 header-y += bpqether.h 76 77 header-y += bsg.h
+1
include/uapi/linux/bpf.h
··· 95 95 BPF_PROG_TYPE_SCHED_ACT, 96 96 BPF_PROG_TYPE_TRACEPOINT, 97 97 BPF_PROG_TYPE_XDP, 98 + BPF_PROG_TYPE_PERF_EVENT, 98 99 }; 99 100 100 101 #define BPF_PSEUDO_MAP_FD 1
+18
include/uapi/linux/bpf_perf_event.h
··· 1 + /* Copyright (c) 2016 Facebook 2 + * 3 + * This program is free software; you can redistribute it and/or 4 + * modify it under the terms of version 2 of the GNU General Public 5 + * License as published by the Free Software Foundation. 6 + */ 7 + #ifndef _UAPI__LINUX_BPF_PERF_EVENT_H__ 8 + #define _UAPI__LINUX_BPF_PERF_EVENT_H__ 9 + 10 + #include <linux/types.h> 11 + #include <linux/ptrace.h> 12 + 13 + struct bpf_perf_event_data { 14 + struct pt_regs regs; 15 + __u64 sample_period; 16 + }; 17 + 18 + #endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */
+27 -4
kernel/bpf/verifier.c
··· 2333 2333 if (err) 2334 2334 return err; 2335 2335 2336 - if (BPF_SIZE(insn->code) != BPF_W) { 2336 + if (BPF_SIZE(insn->code) != BPF_W && 2337 + BPF_SIZE(insn->code) != BPF_DW) { 2337 2338 insn_idx++; 2338 2339 continue; 2339 2340 } ··· 2511 2510 return 0; 2512 2511 } 2513 2512 2513 + static int check_map_prog_compatibility(struct bpf_map *map, 2514 + struct bpf_prog *prog) 2515 + 2516 + { 2517 + if (prog->type == BPF_PROG_TYPE_PERF_EVENT && 2518 + (map->map_type == BPF_MAP_TYPE_HASH || 2519 + map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && 2520 + (map->map_flags & BPF_F_NO_PREALLOC)) { 2521 + verbose("perf_event programs can only use preallocated hash map\n"); 2522 + return -EINVAL; 2523 + } 2524 + return 0; 2525 + } 2526 + 2514 2527 /* look for pseudo eBPF instructions that access map FDs and 2515 2528 * replace them with actual map pointers 2516 2529 */ ··· 2532 2517 { 2533 2518 struct bpf_insn *insn = env->prog->insnsi; 2534 2519 int insn_cnt = env->prog->len; 2535 - int i, j; 2520 + int i, j, err; 2536 2521 2537 2522 for (i = 0; i < insn_cnt; i++, insn++) { 2538 2523 if (BPF_CLASS(insn->code) == BPF_LDX && ··· 2574 2559 verbose("fd %d is not pointing to valid bpf_map\n", 2575 2560 insn->imm); 2576 2561 return PTR_ERR(map); 2562 + } 2563 + 2564 + err = check_map_prog_compatibility(map, env->prog); 2565 + if (err) { 2566 + fdput(f); 2567 + return err; 2577 2568 } 2578 2569 2579 2570 /* store map pointer inside BPF_LD_IMM64 instruction */ ··· 2663 2642 for (i = 0; i < insn_cnt; i++, insn++) { 2664 2643 u32 insn_delta, cnt; 2665 2644 2666 - if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) 2645 + if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || 2646 + insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) 2667 2647 type = BPF_READ; 2668 - else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) 2648 + else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || 2649 + insn->code == (BPF_STX | BPF_MEM | BPF_DW)) 2669 2650 type = BPF_WRITE; 2670 2651 else 2671 2652 continue;
+88 -1
kernel/events/core.c
··· 7022 7022 irq_work_queue(&event->pending); 7023 7023 } 7024 7024 7025 - event->overflow_handler(event, data, regs); 7025 + READ_ONCE(event->overflow_handler)(event, data, regs); 7026 7026 7027 7027 if (*perf_event_fasync(event) && event->pending_kill) { 7028 7028 event->pending_wakeup = 1; ··· 7637 7637 ftrace_profile_free_filter(event); 7638 7638 } 7639 7639 7640 + #ifdef CONFIG_BPF_SYSCALL 7641 + static void bpf_overflow_handler(struct perf_event *event, 7642 + struct perf_sample_data *data, 7643 + struct pt_regs *regs) 7644 + { 7645 + struct bpf_perf_event_data_kern ctx = { 7646 + .data = data, 7647 + .regs = regs, 7648 + }; 7649 + int ret = 0; 7650 + 7651 + preempt_disable(); 7652 + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) 7653 + goto out; 7654 + rcu_read_lock(); 7655 + ret = BPF_PROG_RUN(event->prog, (void *)&ctx); 7656 + rcu_read_unlock(); 7657 + out: 7658 + __this_cpu_dec(bpf_prog_active); 7659 + preempt_enable(); 7660 + if (!ret) 7661 + return; 7662 + 7663 + event->orig_overflow_handler(event, data, regs); 7664 + } 7665 + 7666 + static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) 7667 + { 7668 + struct bpf_prog *prog; 7669 + 7670 + if (event->overflow_handler_context) 7671 + /* hw breakpoint or kernel counter */ 7672 + return -EINVAL; 7673 + 7674 + if (event->prog) 7675 + return -EEXIST; 7676 + 7677 + prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); 7678 + if (IS_ERR(prog)) 7679 + return PTR_ERR(prog); 7680 + 7681 + event->prog = prog; 7682 + event->orig_overflow_handler = READ_ONCE(event->overflow_handler); 7683 + WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); 7684 + return 0; 7685 + } 7686 + 7687 + static void perf_event_free_bpf_handler(struct perf_event *event) 7688 + { 7689 + struct bpf_prog *prog = event->prog; 7690 + 7691 + if (!prog) 7692 + return; 7693 + 7694 + WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); 7695 + event->prog = NULL; 7696 + 
bpf_prog_put(prog); 7697 + } 7698 + #else 7699 + static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) 7700 + { 7701 + return -EOPNOTSUPP; 7702 + } 7703 + static void perf_event_free_bpf_handler(struct perf_event *event) 7704 + { 7705 + } 7706 + #endif 7707 + 7640 7708 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) 7641 7709 { 7642 7710 bool is_kprobe, is_tracepoint; 7643 7711 struct bpf_prog *prog; 7712 + 7713 + if (event->attr.type == PERF_TYPE_HARDWARE || 7714 + event->attr.type == PERF_TYPE_SOFTWARE) 7715 + return perf_event_set_bpf_handler(event, prog_fd); 7644 7716 7645 7717 if (event->attr.type != PERF_TYPE_TRACEPOINT) 7646 7718 return -EINVAL; ··· 7753 7681 static void perf_event_free_bpf_prog(struct perf_event *event) 7754 7682 { 7755 7683 struct bpf_prog *prog; 7684 + 7685 + perf_event_free_bpf_handler(event); 7756 7686 7757 7687 if (!event->tp_event) 7758 7688 return; ··· 9072 8998 if (!overflow_handler && parent_event) { 9073 8999 overflow_handler = parent_event->overflow_handler; 9074 9000 context = parent_event->overflow_handler_context; 9001 + #ifdef CONFIG_BPF_SYSCALL 9002 + if (overflow_handler == bpf_overflow_handler) { 9003 + struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); 9004 + 9005 + if (IS_ERR(prog)) { 9006 + err = PTR_ERR(prog); 9007 + goto err_ns; 9008 + } 9009 + event->prog = prog; 9010 + event->orig_overflow_handler = 9011 + parent_event->orig_overflow_handler; 9012 + } 9013 + #endif 9075 9014 } 9076 9015 9077 9016 if (overflow_handler) {
+61
kernel/trace/bpf_trace.c
··· 1 1 /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com 2 + * Copyright (c) 2016 Facebook 2 3 * 3 4 * This program is free software; you can redistribute it and/or 4 5 * modify it under the terms of version 2 of the GNU General Public ··· 9 8 #include <linux/types.h> 10 9 #include <linux/slab.h> 11 10 #include <linux/bpf.h> 11 + #include <linux/bpf_perf_event.h> 12 12 #include <linux/filter.h> 13 13 #include <linux/uaccess.h> 14 14 #include <linux/ctype.h> ··· 554 552 .type = BPF_PROG_TYPE_TRACEPOINT, 555 553 }; 556 554 555 + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, 556 + enum bpf_reg_type *reg_type) 557 + { 558 + if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) 559 + return false; 560 + if (type != BPF_READ) 561 + return false; 562 + if (off % size != 0) 563 + return false; 564 + if (off == offsetof(struct bpf_perf_event_data, sample_period)) { 565 + if (size != sizeof(u64)) 566 + return false; 567 + } else { 568 + if (size != sizeof(long)) 569 + return false; 570 + } 571 + return true; 572 + } 573 + 574 + static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, 575 + int src_reg, int ctx_off, 576 + struct bpf_insn *insn_buf, 577 + struct bpf_prog *prog) 578 + { 579 + struct bpf_insn *insn = insn_buf; 580 + 581 + switch (ctx_off) { 582 + case offsetof(struct bpf_perf_event_data, sample_period): 583 + BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); 584 + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, data)), 585 + dst_reg, src_reg, 586 + offsetof(struct bpf_perf_event_data_kern, data)); 587 + *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, 588 + offsetof(struct perf_sample_data, period)); 589 + break; 590 + default: 591 + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs)), 592 + dst_reg, src_reg, 593 + offsetof(struct bpf_perf_event_data_kern, regs)); 594 + *insn++ = 
BPF_LDX_MEM(bytes_to_bpf_size(sizeof(long)), 595 + dst_reg, dst_reg, ctx_off); 596 + break; 597 + } 598 + 599 + return insn - insn_buf; 600 + } 601 + 602 + static const struct bpf_verifier_ops perf_event_prog_ops = { 603 + .get_func_proto = tp_prog_func_proto, 604 + .is_valid_access = pe_prog_is_valid_access, 605 + .convert_ctx_access = pe_prog_convert_ctx_access, 606 + }; 607 + 608 + static struct bpf_prog_type_list perf_event_tl = { 609 + .ops = &perf_event_prog_ops, 610 + .type = BPF_PROG_TYPE_PERF_EVENT, 611 + }; 612 + 557 613 static int __init register_kprobe_prog_ops(void) 558 614 { 559 615 bpf_register_prog_type(&kprobe_tl); 560 616 bpf_register_prog_type(&tracepoint_tl); 617 + bpf_register_prog_type(&perf_event_tl); 561 618 return 0; 562 619 } 563 620 late_initcall(register_kprobe_prog_ops);
+8
samples/bpf/Makefile
··· 25 25 hostprogs-y += xdp1 26 26 hostprogs-y += xdp2 27 27 hostprogs-y += test_current_task_under_cgroup 28 + hostprogs-y += trace_event 29 + hostprogs-y += sampleip 28 30 29 31 test_verifier-objs := test_verifier.o libbpf.o 30 32 test_maps-objs := test_maps.o libbpf.o ··· 54 52 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o 55 53 test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \ 56 54 test_current_task_under_cgroup_user.o 55 + trace_event-objs := bpf_load.o libbpf.o trace_event_user.o 56 + sampleip-objs := bpf_load.o libbpf.o sampleip_user.o 57 57 58 58 # Tell kbuild to always build the programs 59 59 always := $(hostprogs-y) ··· 83 79 always += xdp1_kern.o 84 80 always += xdp2_kern.o 85 81 always += test_current_task_under_cgroup_kern.o 82 + always += trace_event_kern.o 83 + always += sampleip_kern.o 86 84 87 85 HOSTCFLAGS += -I$(objtree)/usr/include 88 86 ··· 109 103 HOSTLOADLIBES_xdp1 += -lelf 110 104 HOSTLOADLIBES_xdp2 += -lelf 111 105 HOSTLOADLIBES_test_current_task_under_cgroup += -lelf 106 + HOSTLOADLIBES_trace_event += -lelf 107 + HOSTLOADLIBES_sampleip += -lelf 112 108 113 109 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: 114 110 # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
+2
samples/bpf/bpf_helpers.h
··· 55 55 (void *) BPF_FUNC_skb_get_tunnel_opt; 56 56 static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) = 57 57 (void *) BPF_FUNC_skb_set_tunnel_opt; 58 + static unsigned long long (*bpf_get_prandom_u32)(void) = 59 + (void *) BPF_FUNC_get_prandom_u32; 58 60 59 61 /* llvm builtin functions that eBPF C program may use to 60 62 * emit BPF_LD_ABS and BPF_LD_IND instructions
+6 -1
samples/bpf/bpf_load.c
··· 51 51 bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; 52 52 bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; 53 53 bool is_xdp = strncmp(event, "xdp", 3) == 0; 54 + bool is_perf_event = strncmp(event, "perf_event", 10) == 0; 54 55 enum bpf_prog_type prog_type; 55 56 char buf[256]; 56 57 int fd, efd, err, id; ··· 70 69 prog_type = BPF_PROG_TYPE_TRACEPOINT; 71 70 } else if (is_xdp) { 72 71 prog_type = BPF_PROG_TYPE_XDP; 72 + } else if (is_perf_event) { 73 + prog_type = BPF_PROG_TYPE_PERF_EVENT; 73 74 } else { 74 75 printf("Unknown event '%s'\n", event); 75 76 return -1; ··· 85 82 86 83 prog_fd[prog_cnt++] = fd; 87 84 88 - if (is_xdp) 85 + if (is_xdp || is_perf_event) 89 86 return 0; 90 87 91 88 if (is_socket) { ··· 329 326 memcmp(shname_prog, "kretprobe/", 10) == 0 || 330 327 memcmp(shname_prog, "tracepoint/", 11) == 0 || 331 328 memcmp(shname_prog, "xdp", 3) == 0 || 329 + memcmp(shname_prog, "perf_event", 10) == 0 || 332 330 memcmp(shname_prog, "socket", 6) == 0) 333 331 load_and_attach(shname_prog, insns, data_prog->d_size); 334 332 } ··· 348 344 memcmp(shname, "kretprobe/", 10) == 0 || 349 345 memcmp(shname, "tracepoint/", 11) == 0 || 350 346 memcmp(shname, "xdp", 3) == 0 || 347 + memcmp(shname, "perf_event", 10) == 0 || 351 348 memcmp(shname, "socket", 6) == 0) 352 349 load_and_attach(shname, data->d_buf, data->d_size); 353 350 }
+38
samples/bpf/sampleip_kern.c
··· 1 + /* Copyright 2016 Netflix, Inc. 2 + * 3 + * This program is free software; you can redistribute it and/or 4 + * modify it under the terms of version 2 of the GNU General Public 5 + * License as published by the Free Software Foundation. 6 + */ 7 + #include <linux/version.h> 8 + #include <linux/ptrace.h> 9 + #include <uapi/linux/bpf.h> 10 + #include <uapi/linux/bpf_perf_event.h> 11 + #include "bpf_helpers.h" 12 + 13 + #define MAX_IPS 8192 14 + 15 + struct bpf_map_def SEC("maps") ip_map = { 16 + .type = BPF_MAP_TYPE_HASH, 17 + .key_size = sizeof(u64), 18 + .value_size = sizeof(u32), 19 + .max_entries = MAX_IPS, 20 + }; 21 + 22 + SEC("perf_event") 23 + int do_sample(struct bpf_perf_event_data *ctx) 24 + { 25 + u64 ip; 26 + u32 *value, init_val = 1; 27 + 28 + ip = ctx->regs.ip; 29 + value = bpf_map_lookup_elem(&ip_map, &ip); 30 + if (value) 31 + *value += 1; 32 + else 33 + /* E2BIG not tested for this example only */ 34 + bpf_map_update_elem(&ip_map, &ip, &init_val, BPF_NOEXIST); 35 + 36 + return 0; 37 + } 38 + char _license[] SEC("license") = "GPL";
+196
samples/bpf/sampleip_user.c
··· 1 + /* 2 + * sampleip: sample instruction pointer and frequency count in a BPF map. 3 + * 4 + * Copyright 2016 Netflix, Inc. 5 + * 6 + * This program is free software; you can redistribute it and/or 7 + * modify it under the terms of version 2 of the GNU General Public 8 + * License as published by the Free Software Foundation. 9 + */ 10 + #include <stdio.h> 11 + #include <stdlib.h> 12 + #include <stdio.h> 13 + #include <unistd.h> 14 + #include <errno.h> 15 + #include <signal.h> 16 + #include <string.h> 17 + #include <assert.h> 18 + #include <linux/perf_event.h> 19 + #include <linux/ptrace.h> 20 + #include <linux/bpf.h> 21 + #include <sys/ioctl.h> 22 + #include "libbpf.h" 23 + #include "bpf_load.h" 24 + 25 + #define DEFAULT_FREQ 99 26 + #define DEFAULT_SECS 5 27 + #define MAX_IPS 8192 28 + #define PAGE_OFFSET 0xffff880000000000 29 + 30 + static int nr_cpus; 31 + 32 + static void usage(void) 33 + { 34 + printf("USAGE: sampleip [-F freq] [duration]\n"); 35 + printf(" -F freq # sample frequency (Hertz), default 99\n"); 36 + printf(" duration # sampling duration (seconds), default 5\n"); 37 + } 38 + 39 + static int sampling_start(int *pmu_fd, int freq) 40 + { 41 + int i; 42 + 43 + struct perf_event_attr pe_sample_attr = { 44 + .type = PERF_TYPE_SOFTWARE, 45 + .freq = 1, 46 + .sample_period = freq, 47 + .config = PERF_COUNT_SW_CPU_CLOCK, 48 + .inherit = 1, 49 + }; 50 + 51 + for (i = 0; i < nr_cpus; i++) { 52 + pmu_fd[i] = perf_event_open(&pe_sample_attr, -1 /* pid */, i, 53 + -1 /* group_fd */, 0 /* flags */); 54 + if (pmu_fd[i] < 0) { 55 + fprintf(stderr, "ERROR: Initializing perf sampling\n"); 56 + return 1; 57 + } 58 + assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, 59 + prog_fd[0]) == 0); 60 + assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0) == 0); 61 + } 62 + 63 + return 0; 64 + } 65 + 66 + static void sampling_end(int *pmu_fd) 67 + { 68 + int i; 69 + 70 + for (i = 0; i < nr_cpus; i++) 71 + close(pmu_fd[i]); 72 + } 73 + 74 + struct ipcount { 75 + __u64 ip; 76 
+ __u32 count; 77 + }; 78 + 79 + /* used for sorting */ 80 + struct ipcount counts[MAX_IPS]; 81 + 82 + static int count_cmp(const void *p1, const void *p2) 83 + { 84 + return ((struct ipcount *)p1)->count - ((struct ipcount *)p2)->count; 85 + } 86 + 87 + static void print_ip_map(int fd) 88 + { 89 + struct ksym *sym; 90 + __u64 key, next_key; 91 + __u32 value; 92 + int i, max; 93 + 94 + printf("%-19s %-32s %s\n", "ADDR", "KSYM", "COUNT"); 95 + 96 + /* fetch IPs and counts */ 97 + key = 0, i = 0; 98 + while (bpf_get_next_key(fd, &key, &next_key) == 0) { 99 + bpf_lookup_elem(fd, &next_key, &value); 100 + counts[i].ip = next_key; 101 + counts[i++].count = value; 102 + key = next_key; 103 + } 104 + max = i; 105 + 106 + /* sort and print */ 107 + qsort(counts, max, sizeof(struct ipcount), count_cmp); 108 + for (i = 0; i < max; i++) { 109 + if (counts[i].ip > PAGE_OFFSET) { 110 + sym = ksym_search(counts[i].ip); 111 + printf("0x%-17llx %-32s %u\n", counts[i].ip, sym->name, 112 + counts[i].count); 113 + } else { 114 + printf("0x%-17llx %-32s %u\n", counts[i].ip, "(user)", 115 + counts[i].count); 116 + } 117 + } 118 + 119 + if (max == MAX_IPS) { 120 + printf("WARNING: IP hash was full (max %d entries); ", max); 121 + printf("may have dropped samples\n"); 122 + } 123 + } 124 + 125 + static void int_exit(int sig) 126 + { 127 + printf("\n"); 128 + print_ip_map(map_fd[0]); 129 + exit(0); 130 + } 131 + 132 + int main(int argc, char **argv) 133 + { 134 + char filename[256]; 135 + int *pmu_fd, opt, freq = DEFAULT_FREQ, secs = DEFAULT_SECS; 136 + 137 + /* process arguments */ 138 + while ((opt = getopt(argc, argv, "F:h")) != -1) { 139 + switch (opt) { 140 + case 'F': 141 + freq = atoi(optarg); 142 + break; 143 + case 'h': 144 + default: 145 + usage(); 146 + return 0; 147 + } 148 + } 149 + if (argc - optind == 1) 150 + secs = atoi(argv[optind]); 151 + if (freq == 0 || secs == 0) { 152 + usage(); 153 + return 1; 154 + } 155 + 156 + /* initialize kernel symbol translation */ 157 + if 
(load_kallsyms()) { 158 + fprintf(stderr, "ERROR: loading /proc/kallsyms\n"); 159 + return 2; 160 + } 161 + 162 + /* create perf FDs for each CPU */ 163 + nr_cpus = sysconf(_SC_NPROCESSORS_CONF); 164 + pmu_fd = malloc(nr_cpus * sizeof(int)); 165 + if (pmu_fd == NULL) { 166 + fprintf(stderr, "ERROR: malloc of pmu_fd\n"); 167 + return 1; 168 + } 169 + 170 + /* load BPF program */ 171 + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 172 + if (load_bpf_file(filename)) { 173 + fprintf(stderr, "ERROR: loading BPF program (errno %d):\n", 174 + errno); 175 + if (strcmp(bpf_log_buf, "") == 0) 176 + fprintf(stderr, "Try: ulimit -l unlimited\n"); 177 + else 178 + fprintf(stderr, "%s", bpf_log_buf); 179 + return 1; 180 + } 181 + signal(SIGINT, int_exit); 182 + 183 + /* do sampling */ 184 + printf("Sampling at %d Hertz for %d seconds. Ctrl-C also ends.\n", 185 + freq, secs); 186 + if (sampling_start(pmu_fd, freq) != 0) 187 + return 1; 188 + sleep(secs); 189 + sampling_end(pmu_fd); 190 + free(pmu_fd); 191 + 192 + /* output sample counts */ 193 + print_ip_map(map_fd[0]); 194 + 195 + return 0; 196 + }
+65
samples/bpf/trace_event_kern.c
··· 1 + /* Copyright (c) 2016 Facebook 2 + * 3 + * This program is free software; you can redistribute it and/or 4 + * modify it under the terms of version 2 of the GNU General Public 5 + * License as published by the Free Software Foundation. 6 + */ 7 + #include <linux/ptrace.h> 8 + #include <linux/version.h> 9 + #include <uapi/linux/bpf.h> 10 + #include <uapi/linux/bpf_perf_event.h> 11 + #include <uapi/linux/perf_event.h> 12 + #include "bpf_helpers.h" 13 + 14 + struct key_t { 15 + char comm[TASK_COMM_LEN]; 16 + u32 kernstack; 17 + u32 userstack; 18 + }; 19 + 20 + struct bpf_map_def SEC("maps") counts = { 21 + .type = BPF_MAP_TYPE_HASH, 22 + .key_size = sizeof(struct key_t), 23 + .value_size = sizeof(u64), 24 + .max_entries = 10000, 25 + }; 26 + 27 + struct bpf_map_def SEC("maps") stackmap = { 28 + .type = BPF_MAP_TYPE_STACK_TRACE, 29 + .key_size = sizeof(u32), 30 + .value_size = PERF_MAX_STACK_DEPTH * sizeof(u64), 31 + .max_entries = 10000, 32 + }; 33 + 34 + #define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) 35 + #define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK) 36 + 37 + SEC("perf_event") 38 + int bpf_prog1(struct bpf_perf_event_data *ctx) 39 + { 40 + char fmt[] = "CPU-%d period %lld ip %llx"; 41 + u32 cpu = bpf_get_smp_processor_id(); 42 + struct key_t key; 43 + u64 *val, one = 1; 44 + 45 + if (ctx->sample_period < 10000) 46 + /* ignore warmup */ 47 + return 0; 48 + bpf_get_current_comm(&key.comm, sizeof(key.comm)); 49 + key.kernstack = bpf_get_stackid(ctx, &stackmap, KERN_STACKID_FLAGS); 50 + key.userstack = bpf_get_stackid(ctx, &stackmap, USER_STACKID_FLAGS); 51 + if ((int)key.kernstack < 0 && (int)key.userstack < 0) { 52 + bpf_trace_printk(fmt, sizeof(fmt), cpu, ctx->sample_period, 53 + ctx->regs.ip); 54 + return 0; 55 + } 56 + 57 + val = bpf_map_lookup_elem(&counts, &key); 58 + if (val) 59 + (*val)++; 60 + else 61 + bpf_map_update_elem(&counts, &key, &one, BPF_NOEXIST); 62 + return 0; 63 + } 64 + 65 + char _license[] SEC("license") 
= "GPL";
+213
samples/bpf/trace_event_user.c
··· 1 + /* Copyright (c) 2016 Facebook 2 + * 3 + * This program is free software; you can redistribute it and/or 4 + * modify it under the terms of version 2 of the GNU General Public 5 + * License as published by the Free Software Foundation. 6 + */ 7 + #include <stdio.h> 8 + #include <unistd.h> 9 + #include <stdlib.h> 10 + #include <stdbool.h> 11 + #include <string.h> 12 + #include <fcntl.h> 13 + #include <poll.h> 14 + #include <sys/ioctl.h> 15 + #include <linux/perf_event.h> 16 + #include <linux/bpf.h> 17 + #include <signal.h> 18 + #include <assert.h> 19 + #include <errno.h> 20 + #include <sys/resource.h> 21 + #include "libbpf.h" 22 + #include "bpf_load.h" 23 + 24 + #define SAMPLE_FREQ 50 25 + 26 + static bool sys_read_seen, sys_write_seen; 27 + 28 + static void print_ksym(__u64 addr) 29 + { 30 + struct ksym *sym; 31 + 32 + if (!addr) 33 + return; 34 + sym = ksym_search(addr); 35 + printf("%s;", sym->name); 36 + if (!strcmp(sym->name, "sys_read")) 37 + sys_read_seen = true; 38 + else if (!strcmp(sym->name, "sys_write")) 39 + sys_write_seen = true; 40 + } 41 + 42 + static void print_addr(__u64 addr) 43 + { 44 + if (!addr) 45 + return; 46 + printf("%llx;", addr); 47 + } 48 + 49 + #define TASK_COMM_LEN 16 50 + 51 + struct key_t { 52 + char comm[TASK_COMM_LEN]; 53 + __u32 kernstack; 54 + __u32 userstack; 55 + }; 56 + 57 + static void print_stack(struct key_t *key, __u64 count) 58 + { 59 + __u64 ip[PERF_MAX_STACK_DEPTH] = {}; 60 + static bool warned; 61 + int i; 62 + 63 + printf("%3lld %s;", count, key->comm); 64 + if (bpf_lookup_elem(map_fd[1], &key->kernstack, ip) != 0) { 65 + printf("---;"); 66 + } else { 67 + for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--) 68 + print_ksym(ip[i]); 69 + } 70 + printf("-;"); 71 + if (bpf_lookup_elem(map_fd[1], &key->userstack, ip) != 0) { 72 + printf("---;"); 73 + } else { 74 + for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--) 75 + print_addr(ip[i]); 76 + } 77 + printf("\n"); 78 + 79 + if (key->kernstack == -EEXIST && !warned) { 80 + 
printf("stackmap collisions seen. Consider increasing size\n"); 81 + warned = true; 82 + } else if ((int)key->kernstack < 0 && (int)key->userstack < 0) { 83 + printf("err stackid %d %d\n", key->kernstack, key->userstack); 84 + } 85 + } 86 + 87 + static void int_exit(int sig) 88 + { 89 + kill(0, SIGKILL); 90 + exit(0); 91 + } 92 + 93 + static void print_stacks(void) 94 + { 95 + struct key_t key = {}, next_key; 96 + __u64 value; 97 + __u32 stackid = 0, next_id; 98 + int fd = map_fd[0], stack_map = map_fd[1]; 99 + 100 + sys_read_seen = sys_write_seen = false; 101 + while (bpf_get_next_key(fd, &key, &next_key) == 0) { 102 + bpf_lookup_elem(fd, &next_key, &value); 103 + print_stack(&next_key, value); 104 + bpf_delete_elem(fd, &next_key); 105 + key = next_key; 106 + } 107 + 108 + if (!sys_read_seen || !sys_write_seen) { 109 + printf("BUG kernel stack doesn't contain sys_read() and sys_write()\n"); 110 + int_exit(0); 111 + } 112 + 113 + /* clear stack map */ 114 + while (bpf_get_next_key(stack_map, &stackid, &next_id) == 0) { 115 + bpf_delete_elem(stack_map, &next_id); 116 + stackid = next_id; 117 + } 118 + } 119 + 120 + static void test_perf_event_all_cpu(struct perf_event_attr *attr) 121 + { 122 + int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); 123 + int *pmu_fd = malloc(nr_cpus * sizeof(int)); 124 + int i; 125 + 126 + /* open perf_event on all cpus */ 127 + for (i = 0; i < nr_cpus; i++) { 128 + pmu_fd[i] = perf_event_open(attr, -1, i, -1, 0); 129 + if (pmu_fd[i] < 0) { 130 + printf("perf_event_open failed\n"); 131 + goto all_cpu_err; 132 + } 133 + assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, prog_fd[0]) == 0); 134 + assert(ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0) == 0); 135 + } 136 + system("dd if=/dev/zero of=/dev/null count=5000k"); 137 + print_stacks(); 138 + all_cpu_err: 139 + for (i--; i >= 0; i--) 140 + close(pmu_fd[i]); 141 + free(pmu_fd); 142 + } 143 + 144 + static void test_perf_event_task(struct perf_event_attr *attr) 145 + { 146 + int pmu_fd; 147 + 148 + 
/* open task bound event */ 149 + pmu_fd = perf_event_open(attr, 0, -1, -1, 0); 150 + if (pmu_fd < 0) { 151 + printf("perf_event_open failed\n"); 152 + return; 153 + } 154 + assert(ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) == 0); 155 + assert(ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0) == 0); 156 + system("dd if=/dev/zero of=/dev/null count=5000k"); 157 + print_stacks(); 158 + close(pmu_fd); 159 + } 160 + 161 + static void test_bpf_perf_event(void) 162 + { 163 + struct perf_event_attr attr_type_hw = { 164 + .sample_freq = SAMPLE_FREQ, 165 + .freq = 1, 166 + .type = PERF_TYPE_HARDWARE, 167 + .config = PERF_COUNT_HW_CPU_CYCLES, 168 + .inherit = 1, 169 + }; 170 + struct perf_event_attr attr_type_sw = { 171 + .sample_freq = SAMPLE_FREQ, 172 + .freq = 1, 173 + .type = PERF_TYPE_SOFTWARE, 174 + .config = PERF_COUNT_SW_CPU_CLOCK, 175 + .inherit = 1, 176 + }; 177 + 178 + test_perf_event_all_cpu(&attr_type_hw); 179 + test_perf_event_task(&attr_type_hw); 180 + test_perf_event_all_cpu(&attr_type_sw); 181 + test_perf_event_task(&attr_type_sw); 182 + } 183 + 184 + 185 + int main(int argc, char **argv) 186 + { 187 + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; 188 + char filename[256]; 189 + 190 + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 191 + setrlimit(RLIMIT_MEMLOCK, &r); 192 + 193 + signal(SIGINT, int_exit); 194 + 195 + if (load_kallsyms()) { 196 + printf("failed to process /proc/kallsyms\n"); 197 + return 1; 198 + } 199 + 200 + if (load_bpf_file(filename)) { 201 + printf("%s", bpf_log_buf); 202 + return 2; 203 + } 204 + 205 + if (fork() == 0) { 206 + read_trace_pipe(); 207 + return 0; 208 + } 209 + test_bpf_perf_event(); 210 + 211 + int_exit(0); 212 + return 0; 213 + }