perf_counter: Fix/complete ftrace event records sampling

This patch implements the kernel-side support for ftrace event
record sampling.

A new counter sampling attribute is added:

PERF_SAMPLE_TP_RECORD

which requests sampling of the raw ftrace event record. In this
case, when a PERF_TYPE_TRACEPOINT counter is active and its
tracepoint fires, we emit the tracepoint binary record to the
perf counter event buffer as part of the sample.
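
For reference, here is a minimal userspace sketch (not part of this
patch) of how such a counter could be opened. It assumes the
perf_counter.h from this tree is on the include path, that
__NR_perf_counter_open is defined for the target architecture, and
that the config value is the tracepoint id read from debugfs
(e.g. /sys/kernel/debug/tracing/events/workqueue/workqueue_execution/id):

/*
 * Sketch only: open a per-cpu tracepoint counter that also samples
 * the raw ftrace record via PERF_SAMPLE_TP_RECORD.
 */
#include <linux/perf_counter.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int open_tp_counter(__u64 tp_id, int cpu)
{
        struct perf_counter_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.type          = PERF_TYPE_TRACEPOINT;
        attr.size          = sizeof(attr);
        attr.config        = tp_id;  /* .../events/<subsys>/<event>/id */
        attr.sample_period = 1;
        attr.sample_type   = PERF_SAMPLE_IP | PERF_SAMPLE_TID |
                             PERF_SAMPLE_PERIOD | PERF_SAMPLE_TP_RECORD;

        /* pid == -1, cpu >= 0: count system-wide on that cpu */
        return syscall(__NR_perf_counter_open, &attr, -1, cpu, -1, 0UL);
}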

The result, after setting the PERF_SAMPLE_TP_RECORD attribute from
perf record:

perf record -f -F 1 -a -e workqueue:workqueue_execution
perf report -D

0x21e18 [0x48]: event: 9
.
. ... raw event: size 72 bytes
. 0000: 09 00 00 00 01 00 48 00 d0 c7 00 81 ff ff ff ff ......H........
. 0010: 0a 00 00 00 0a 00 00 00 21 00 00 00 00 00 00 00 ........!......
. 0020: 2b 00 01 02 0a 00 00 00 0a 00 00 00 65 76 65 6e +...........eve
. 0030: 74 73 2f 31 00 00 00 00 00 00 00 00 0a 00 00 00 ts/1...........
. 0040: e0 b1 31 81 ff ff ff ff .......
.
0x21e18 [0x48]: PERF_EVENT_SAMPLE (IP, 1): 10: 0xffffffff8100c7d0 period: 33

The raw ftrace binary record starts at offset 0020.

Translation:

struct trace_entry {
type = 0x2b = 43;
flags = 1;
preempt_count = 2;
pid = 0xa = 10;
tgid = 0xa = 10;
}

thread_comm = "events/1"
thread_pid = 0xa = 10;
func = 0xffffffff8131b1e0 = flush_to_ldisc()
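
Put differently, the 72-byte sample above can be read with a layout
like the following. This is a sketch only, assuming the sample_type
used in this run (IP | TID | PERIOD | TP_RECORD) and x86-64; the
tracepoint part is simply the ftrace binary format of
workqueue_execution (field names taken from the translation above)
and is not a fixed ABI:

/*
 * Sketch of the on-buffer sample layout dumped above.  It needs
 * <linux/perf_counter.h> (perf_event_header) and <linux/ftrace_event.h>
 * (trace_entry); the tail mirrors the workqueue_execution event format.
 */
struct workqueue_execution_sample {
        struct perf_event_header header;  /* 0x00: type 9, misc 1, size 0x48 */
        __u64   ip;                       /* 0x08: PERF_SAMPLE_IP            */
        __u32   pid, tid;                 /* 0x10: PERF_SAMPLE_TID           */
        __u64   period;                   /* 0x18: PERF_SAMPLE_PERIOD        */

        /* PERF_SAMPLE_TP_RECORD: raw record copied from the callback */
        struct trace_entry ent;           /* 0x20: type/flags/preempt_count/pid/tgid */
        char    thread_comm[16];          /* 0x2c: "events/1"                */
        __u32   thread_pid;               /* 0x3c: 10                        */
        __u64   func;                     /* 0x40: flush_to_ldisc            */
};

Note that the copied record carries no length field of its own: its
size is implied by header.size and by the ftrace event format.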

What will come next?

- Userspace support ('perf trace'), 'flight data recorder' mode
for perf trace, etc.

- The unconditional copy from the profiling callback has a cost
even when no one wants such sampling to occur; that needs to be
fixed in the future. For that we need instant access to the perf
counter attributes. This is a matter of a flag to add in the
struct ftrace_event.

- Take care of event recursion! Don't ever try to record a lock
event, for example: some locking is used in the profiling fast
path and leads to tracing recursion. That will be fixed using
raw spinlocks or recursion protection.

- [...]

- Profit! :-)

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Authored by Frederic Weisbecker, committed by Ingo Molnar [f413cdb8 3a659305]

7 files changed, +126 -41

include/linux/ftrace_event.h (+3 -1)

···
 	TRACE_TYPE_NO_CONSUME	= 3	/* Handled but ask to not consume */
 };
 
-
+void tracing_generic_entry_update(struct trace_entry *entry,
+				  unsigned long flags,
+				  int pc);
 struct ring_buffer_event *
 trace_current_buffer_lock_reserve(int type, unsigned long len,
 				  unsigned long flags, int pc);
include/linux/perf_counter.h (+8 -1)

···
 	PERF_SAMPLE_CPU			= 1U << 7,
 	PERF_SAMPLE_PERIOD		= 1U << 8,
 	PERF_SAMPLE_STREAM_ID		= 1U << 9,
+	PERF_SAMPLE_TP_RECORD		= 1U << 10,
 
-	PERF_SAMPLE_MAX = 1U << 10,	/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 11,	/* non-ABI */
 };
 
 /*
···
 	__u64	ip[PERF_MAX_STACK_DEPTH];
 };
 
+struct perf_tracepoint_record {
+	int			size;
+	char			*record;
+};
+
 struct task_struct;
 
 /**
···
 	struct pt_regs		*regs;
 	u64			addr;
 	u64			period;
+	void			*private;
 };
 
 extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
include/trace/ftrace.h (+96 -34)

···
 /*
  * Generate the functions needed for tracepoint perf_counter support.
  *
- * static void ftrace_profile_<call>(proto)
- * {
- *	extern void perf_tpcounter_event(int, u64, u64);
- *	u64 __addr = 0, __count = 1;
- *
- *	<assign> <-- here we expand the TP_perf_assign() macro
- *
- *	perf_tpcounter_event(event_<call>.id, __addr, __count);
- * }
+ * NOTE: The insertion profile callback (ftrace_profile_<call>) is defined later
  *
  * static int ftrace_profile_enable_<call>(struct ftrace_event_call *event_call)
  * {
···
  *
  */
 
-#undef TP_fast_assign
-#define TP_fast_assign(args...)
-
-#undef TP_perf_assign
-#define TP_perf_assign(args...) args
-
-#undef __perf_addr
-#define __perf_addr(a) __addr = (a)
-
-#undef __perf_count
-#define __perf_count(c) __count = (c)
-
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
 									\
-static void ftrace_profile_##call(proto)				\
-{									\
-	extern void perf_tpcounter_event(int, u64, u64);		\
-	u64 __addr = 0, __count = 1;					\
-	{ assign; }							\
-	perf_tpcounter_event(event_##call.id, __addr, __count);	\
-}									\
+static void ftrace_profile_##call(proto);				\
 									\
 static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \
 {									\
···
 }
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
-
-#undef TP_fast_assign
-#define TP_fast_assign(args...) args
-
-#undef TP_perf_assign
-#define TP_perf_assign(args...)
 
 #endif
 
···
 }
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Define the insertion callback to profile events
+ *
+ * The job is very similar to ftrace_raw_event_<call> except that we don't
+ * insert in the ring buffer but in a perf counter.
+ *
+ * static void ftrace_profile_<call>(proto)
+ * {
+ *	struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
+ *	struct ftrace_event_call *event_call = &event_<call>;
+ *	extern void perf_tpcounter_event(int, u64, u64, void *, int);
+ *	struct ftrace_raw_##call *entry;
+ *	u64 __addr = 0, __count = 1;
+ *	unsigned long irq_flags;
+ *	int __entry_size;
+ *	int __data_size;
+ *	int pc;
+ *
+ *	local_save_flags(irq_flags);
+ *	pc = preempt_count();
+ *
+ *	__data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
+ *	__entry_size = __data_size + sizeof(*entry);
+ *
+ *	do {
+ *		char raw_data[__entry_size]; <- allocate our sample in the stack
+ *		struct trace_entry *ent;
+ *
+ *		entry = (struct ftrace_raw_<call> *)raw_data;
+ *		ent = &entry->ent;
+ *		tracing_generic_entry_update(ent, irq_flags, pc);
+ *		ent->type = event_call->id;
+ *
+ *		<tstruct> <- do some jobs with dynamic arrays
+ *
+ *		<assign>  <- affect our values
+ *
+ *		perf_tpcounter_event(event_call->id, __addr, __count, entry,
+ *			     __entry_size);  <- submit them to perf counter
+ *	} while (0);
+ *
+ * }
+ */
+
+#ifdef CONFIG_EVENT_PROFILE
+
+#undef __perf_addr
+#define __perf_addr(a) __addr = (a)
+
+#undef __perf_count
+#define __perf_count(c) __count = (c)
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+static void ftrace_profile_##call(proto)				\
+{									\
+	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
+	struct ftrace_event_call *event_call = &event_##call;		\
+	extern void perf_tpcounter_event(int, u64, u64, void *, int);	\
+	struct ftrace_raw_##call *entry;				\
+	u64 __addr = 0, __count = 1;					\
+	unsigned long irq_flags;					\
+	int __entry_size;						\
+	int __data_size;						\
+	int pc;								\
+									\
+	local_save_flags(irq_flags);					\
+	pc = preempt_count();						\
+									\
+	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
+	__entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\
+									\
+	do {								\
+		char raw_data[__entry_size];				\
+		struct trace_entry *ent;				\
+									\
+		entry = (struct ftrace_raw_##call *)raw_data;		\
+		ent = &entry->ent;					\
+		tracing_generic_entry_update(ent, irq_flags, pc);	\
+		ent->type = event_call->id;				\
+									\
+		tstruct							\
+									\
+		{ assign; }						\
+									\
+		perf_tpcounter_event(event_call->id, __addr, __count, entry,\
+			     __entry_size);				\
+	} while (0);							\
+									\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+#endif /* CONFIG_EVENT_PROFILE */
 
 #undef _TRACE_PROFILE_INIT
 
kernel/perf_counter.c (+17 -1)

···
 		u64			 counter;
 	} group_entry;
 	struct perf_callchain_entry	*callchain = NULL;
+	struct perf_tracepoint_record	*tp;
 	int				 callchain_size = 0;
 	u64				 time;
 	struct {
···
 			header.size += sizeof(u64);
 	}
 
+	if (sample_type & PERF_SAMPLE_TP_RECORD) {
+		tp = data->private;
+		header.size += tp->size;
+	}
+
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
 	if (ret)
 		return;
···
 			perf_output_put(&handle, nr);
 		}
 	}
+
+	if (sample_type & PERF_SAMPLE_TP_RECORD)
+		perf_output_copy(&handle, tp->record, tp->size);
 
 	perf_output_end(&handle);
 }
···
 };
 
 #ifdef CONFIG_EVENT_PROFILE
-void perf_tpcounter_event(int event_id, u64 addr, u64 count)
+void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
+			  int entry_size)
 {
+	struct perf_tracepoint_record tp = {
+		.size = entry_size,
+		.record = record,
+	};
+
 	struct perf_sample_data data = {
 		.regs = get_irq_regs(),
 		.addr = addr,
+		.private = &tp,
 	};
 
 	if (!data.regs)
kernel/trace/trace.c (+1)

···
 		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
 		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
 }
+EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
 
 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
 						    int type,
kernel/trace/trace.h (-4)

···
 struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
 					   int *ent_cpu, u64 *ent_ts);
 
-void tracing_generic_entry_update(struct trace_entry *entry,
-				  unsigned long flags,
-				  int pc);
-
 void default_wait_pipe(struct trace_iterator *iter);
 void poll_wait_pipe(struct trace_iterator *iter);
 
tools/perf/builtin-record.c (+1)

···
 	if (call_graph)
 		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;
 
+
 	attr->mmap		= track;
 	attr->comm		= track;
 	attr->inherit		= (cpu < 0) && inherit;