perf_counter: Fix/complete ftrace event records sampling

This patch implements the kernel side support for ftrace event
record sampling.

A new counter sampling attribute is added:

PERF_SAMPLE_TP_RECORD

which requests ftrace event record sampling. In this case,
if a PERF_TYPE_TRACEPOINT counter is active and a tracepoint
fires, we emit the tracepoint binary record to the
perfcounter event buffer, as a sample.

Result, after setting PERF_SAMPLE_TP_RECORD attribute from perf
record:

perf record -f -F 1 -a -e workqueue:workqueue_execution
perf report -D

0x21e18 [0x48]: event: 9
.
. ... raw event: size 72 bytes
. 0000: 09 00 00 00 01 00 48 00 d0 c7 00 81 ff ff ff ff ......H........
. 0010: 0a 00 00 00 0a 00 00 00 21 00 00 00 00 00 00 00 ........!......
. 0020: 2b 00 01 02 0a 00 00 00 0a 00 00 00 65 76 65 6e +...........eve
. 0030: 74 73 2f 31 00 00 00 00 00 00 00 00 0a 00 00 00 ts/1...........
. 0040: e0 b1 31 81 ff ff ff ff .......
.
0x21e18 [0x48]: PERF_EVENT_SAMPLE (IP, 1): 10: 0xffffffff8100c7d0 period: 33

The raw ftrace binary record starts at offset 0020.

Translation:

struct trace_entry {
type = 0x2b = 43;
flags = 1;
preempt_count = 2;
pid = 0xa = 10;
tgid = 0xa = 10;
}

thread_comm = "events/1"
thread_pid = 0xa = 10;
func = 0xffffffff8131b1e0 = flush_to_ldisc()

What will come next?

- Userspace support ('perf trace'), 'flight data recorder' mode
for perf trace, etc.

- The unconditional copy from the profiling callback has a cost
even when nobody has requested such sampling, and this needs to
be fixed in the future. For that we need to have instant access
to the perf counter attribute.
This is a matter of adding a flag to the struct ftrace_event.

- Take care of event recursion! Don't ever try to record
a lock event, for example: it seems some locking is used in
the profiling fast path, which leads to tracing recursion.
That will be fixed using a raw spinlock or recursion
protection.

- [...]

- Profit! :-)

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

authored by Frederic Weisbecker and committed by Ingo Molnar f413cdb8 3a659305

+126 -41
+3 -1
include/linux/ftrace_event.h
··· 89 89 TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ 90 90 }; 91 91 92 - 92 + void tracing_generic_entry_update(struct trace_entry *entry, 93 + unsigned long flags, 94 + int pc); 93 95 struct ring_buffer_event * 94 96 trace_current_buffer_lock_reserve(int type, unsigned long len, 95 97 unsigned long flags, int pc);
+8 -1
include/linux/perf_counter.h
··· 121 121 PERF_SAMPLE_CPU = 1U << 7, 122 122 PERF_SAMPLE_PERIOD = 1U << 8, 123 123 PERF_SAMPLE_STREAM_ID = 1U << 9, 124 + PERF_SAMPLE_TP_RECORD = 1U << 10, 124 125 125 - PERF_SAMPLE_MAX = 1U << 10, /* non-ABI */ 126 + PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ 126 127 }; 127 128 128 129 /* ··· 414 413 __u64 ip[PERF_MAX_STACK_DEPTH]; 415 414 }; 416 415 416 + struct perf_tracepoint_record { 417 + int size; 418 + char *record; 419 + }; 420 + 417 421 struct task_struct; 418 422 419 423 /** ··· 687 681 struct pt_regs *regs; 688 682 u64 addr; 689 683 u64 period; 684 + void *private; 690 685 }; 691 686 692 687 extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
+96 -34
include/trace/ftrace.h
··· 353 353 /* 354 354 * Generate the functions needed for tracepoint perf_counter support. 355 355 * 356 - * static void ftrace_profile_<call>(proto) 357 - * { 358 - * extern void perf_tpcounter_event(int, u64, u64); 359 - * u64 __addr = 0, __count = 1; 360 - * 361 - * <assign> <-- here we expand the TP_perf_assign() macro 362 - * 363 - * perf_tpcounter_event(event_<call>.id, __addr, __count); 364 - * } 356 + * NOTE: The insertion profile callback (ftrace_profile_<call>) is defined later 365 357 * 366 358 * static int ftrace_profile_enable_<call>(struct ftrace_event_call *event_call) 367 359 * { ··· 373 381 * 374 382 */ 375 383 376 - #undef TP_fast_assign 377 - #define TP_fast_assign(args...) 378 - 379 - #undef TP_perf_assign 380 - #define TP_perf_assign(args...) args 381 - 382 - #undef __perf_addr 383 - #define __perf_addr(a) __addr = (a) 384 - 385 - #undef __perf_count 386 - #define __perf_count(c) __count = (c) 387 - 388 384 #undef TRACE_EVENT 389 385 #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ 390 386 \ 391 - static void ftrace_profile_##call(proto) \ 392 - { \ 393 - extern void perf_tpcounter_event(int, u64, u64); \ 394 - u64 __addr = 0, __count = 1; \ 395 - { assign; } \ 396 - perf_tpcounter_event(event_##call.id, __addr, __count); \ 397 - } \ 387 + static void ftrace_profile_##call(proto); \ 398 388 \ 399 389 static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ 400 390 { \ ··· 395 421 } 396 422 397 423 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 398 - 399 - #undef TP_fast_assign 400 - #define TP_fast_assign(args...) args 401 - 402 - #undef TP_perf_assign 403 - #define TP_perf_assign(args...) 404 424 405 425 #endif 406 426 ··· 614 646 } 615 647 616 648 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 649 + 650 + /* 651 + * Define the insertion callback to profile events 652 + * 653 + * The job is very similar to ftrace_raw_event_<call> except that we don't 654 + * insert in the ring buffer but in a perf counter. 
655 + * 656 + * static void ftrace_profile_<call>(proto) 657 + * { 658 + * struct ftrace_data_offsets_<call> __maybe_unused __data_offsets; 659 + * struct ftrace_event_call *event_call = &event_<call>; 660 + * extern void perf_tpcounter_event(int, u64, u64, void *, int); 661 + * struct ftrace_raw_##call *entry; 662 + * u64 __addr = 0, __count = 1; 663 + * unsigned long irq_flags; 664 + * int __entry_size; 665 + * int __data_size; 666 + * int pc; 667 + * 668 + * local_save_flags(irq_flags); 669 + * pc = preempt_count(); 670 + * 671 + * __data_size = ftrace_get_offsets_<call>(&__data_offsets, args); 672 + * __entry_size = __data_size + sizeof(*entry); 673 + * 674 + * do { 675 + * char raw_data[__entry_size]; <- allocate our sample in the stack 676 + * struct trace_entry *ent; 677 + * 678 + * entry = (struct ftrace_raw_<call> *)raw_data; 679 + * ent = &entry->ent; 680 + * tracing_generic_entry_update(ent, irq_flags, pc); 681 + * ent->type = event_call->id; 682 + * 683 + * <tstruct> <- do some jobs with dynamic arrays 684 + * 685 + * <assign> <- affect our values 686 + * 687 + * perf_tpcounter_event(event_call->id, __addr, __count, entry, 688 + * __entry_size); <- submit them to perf counter 689 + * } while (0); 690 + * 691 + * } 692 + */ 693 + 694 + #ifdef CONFIG_EVENT_PROFILE 695 + 696 + #undef __perf_addr 697 + #define __perf_addr(a) __addr = (a) 698 + 699 + #undef __perf_count 700 + #define __perf_count(c) __count = (c) 701 + 702 + #undef TRACE_EVENT 703 + #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ 704 + static void ftrace_profile_##call(proto) \ 705 + { \ 706 + struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ 707 + struct ftrace_event_call *event_call = &event_##call; \ 708 + extern void perf_tpcounter_event(int, u64, u64, void *, int); \ 709 + struct ftrace_raw_##call *entry; \ 710 + u64 __addr = 0, __count = 1; \ 711 + unsigned long irq_flags; \ 712 + int __entry_size; \ 713 + int __data_size; \ 714 + int pc; \ 715 + \ 716 
+ local_save_flags(irq_flags); \ 717 + pc = preempt_count(); \ 718 + \ 719 + __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ 720 + __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\ 721 + \ 722 + do { \ 723 + char raw_data[__entry_size]; \ 724 + struct trace_entry *ent; \ 725 + \ 726 + entry = (struct ftrace_raw_##call *)raw_data; \ 727 + ent = &entry->ent; \ 728 + tracing_generic_entry_update(ent, irq_flags, pc); \ 729 + ent->type = event_call->id; \ 730 + \ 731 + tstruct \ 732 + \ 733 + { assign; } \ 734 + \ 735 + perf_tpcounter_event(event_call->id, __addr, __count, entry,\ 736 + __entry_size); \ 737 + } while (0); \ 738 + \ 739 + } 740 + 741 + #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 742 + #endif /* CONFIG_EVENT_PROFILE */ 617 743 618 744 #undef _TRACE_PROFILE_INIT 619 745
+17 -1
kernel/perf_counter.c
··· 2646 2646 u64 counter; 2647 2647 } group_entry; 2648 2648 struct perf_callchain_entry *callchain = NULL; 2649 + struct perf_tracepoint_record *tp; 2649 2650 int callchain_size = 0; 2650 2651 u64 time; 2651 2652 struct { ··· 2715 2714 header.size += sizeof(u64); 2716 2715 } 2717 2716 2717 + if (sample_type & PERF_SAMPLE_TP_RECORD) { 2718 + tp = data->private; 2719 + header.size += tp->size; 2720 + } 2721 + 2718 2722 ret = perf_output_begin(&handle, counter, header.size, nmi, 1); 2719 2723 if (ret) 2720 2724 return; ··· 2782 2776 perf_output_put(&handle, nr); 2783 2777 } 2784 2778 } 2779 + 2780 + if (sample_type & PERF_SAMPLE_TP_RECORD) 2781 + perf_output_copy(&handle, tp->record, tp->size); 2785 2782 2786 2783 perf_output_end(&handle); 2787 2784 } ··· 3712 3703 }; 3713 3704 3714 3705 #ifdef CONFIG_EVENT_PROFILE 3715 - void perf_tpcounter_event(int event_id, u64 addr, u64 count) 3706 + void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, 3707 + int entry_size) 3716 3708 { 3709 + struct perf_tracepoint_record tp = { 3710 + .size = entry_size, 3711 + .record = record, 3712 + }; 3713 + 3717 3714 struct perf_sample_data data = { 3718 3715 .regs = get_irq_regs(), 3719 3716 .addr = addr, 3717 + .private = &tp, 3720 3718 }; 3721 3719 3722 3720 if (!data.regs)
+1
kernel/trace/trace.c
··· 848 848 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 849 849 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 850 850 } 851 + EXPORT_SYMBOL_GPL(tracing_generic_entry_update); 851 852 852 853 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 853 854 int type,
-4
kernel/trace/trace.h
··· 438 438 struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 439 439 int *ent_cpu, u64 *ent_ts); 440 440 441 - void tracing_generic_entry_update(struct trace_entry *entry, 442 - unsigned long flags, 443 - int pc); 444 - 445 441 void default_wait_pipe(struct trace_iterator *iter); 446 442 void poll_wait_pipe(struct trace_iterator *iter); 447 443
+1
tools/perf/builtin-record.c
··· 412 412 if (call_graph) 413 413 attr->sample_type |= PERF_SAMPLE_CALLCHAIN; 414 414 415 + 415 416 attr->mmap = track; 416 417 attr->comm = track; 417 418 attr->inherit = (cpu < 0) && inherit;