Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf/x86/intel: Record branch type

Perf already has support for disassembling the branch instruction
and using the branch type for filtering. The patch just records
the branch type in perf_branch_entry.

Before recording, the patch converts the x86 branch type to
common branch type.

Change log:

v10: Set the branch_map array to be static. The previous version
had it on the stack, which made the compiler create it every
time the function was called.

v9: Use __ffs() to find the first set bit of 'type' in common_branch_type().
This makes the code clearer.

v8: Change PERF_BR_NONE to PERF_BR_UNKNOWN.

v7: Just convert the following x86 branch types to common branch types.

X86_BR_CALL -> PERF_BR_CALL
X86_BR_RET -> PERF_BR_RET
X86_BR_JCC -> PERF_BR_COND
X86_BR_JMP -> PERF_BR_UNCOND
X86_BR_IND_CALL -> PERF_BR_IND_CALL
X86_BR_ZERO_CALL -> PERF_BR_CALL
X86_BR_IND_JMP -> PERF_BR_IND
X86_BR_SYSCALL -> PERF_BR_SYSCALL
X86_BR_SYSRET -> PERF_BR_SYSRET

Others are set to PERF_BR_NONE

v6: Not changed.

v5: Just fix the merge error. No other update.

v4: Compared to the previous version, the major changes are:

1. Uses a lookup table to convert x86 branch type to common branch
type.

2. Moved the JCC forward/JCC backward and cross-page computation to
user space.

3. Initialize branch type to 0 in intel_pmu_lbr_read_32 and
intel_pmu_lbr_read_64

Signed-off-by: Yao Jin <yao.jin@linux.intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Link: http://lkml.kernel.org/r/1500379995-6449-3-git-send-email-yao.jin@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

authored by

Jin Yao and committed by
Arnaldo Carvalho de Melo
d5c7f9dc eb0baf8a

+51 -1
+51 -1
arch/x86/events/intel/lbr.c
··· 109 109 X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ 110 110 X86_BR_CALL_STACK = 1 << 16,/* call stack */ 111 111 X86_BR_IND_JMP = 1 << 17,/* indirect jump */ 112 + 113 + X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ 114 + 112 115 }; 113 116 114 117 #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) ··· 513 510 cpuc->lbr_entries[i].in_tx = 0; 514 511 cpuc->lbr_entries[i].abort = 0; 515 512 cpuc->lbr_entries[i].cycles = 0; 513 + cpuc->lbr_entries[i].type = 0; 516 514 cpuc->lbr_entries[i].reserved = 0; 517 515 } 518 516 cpuc->lbr_stack.nr = i; ··· 600 596 cpuc->lbr_entries[out].in_tx = in_tx; 601 597 cpuc->lbr_entries[out].abort = abort; 602 598 cpuc->lbr_entries[out].cycles = cycles; 599 + cpuc->lbr_entries[out].type = 0; 603 600 cpuc->lbr_entries[out].reserved = 0; 604 601 out++; 605 602 } ··· 678 673 679 674 if (br_type & PERF_SAMPLE_BRANCH_CALL) 680 675 mask |= X86_BR_CALL | X86_BR_ZERO_CALL; 676 + 677 + if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE) 678 + mask |= X86_BR_TYPE_SAVE; 679 + 681 680 /* 682 681 * stash actual user request into reg, it may 683 682 * be used by fixup code for some CPU ··· 935 926 return ret; 936 927 } 937 928 929 + #define X86_BR_TYPE_MAP_MAX 16 930 + 931 + static int branch_map[X86_BR_TYPE_MAP_MAX] = { 932 + PERF_BR_CALL, /* X86_BR_CALL */ 933 + PERF_BR_RET, /* X86_BR_RET */ 934 + PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ 935 + PERF_BR_SYSRET, /* X86_BR_SYSRET */ 936 + PERF_BR_UNKNOWN, /* X86_BR_INT */ 937 + PERF_BR_UNKNOWN, /* X86_BR_IRET */ 938 + PERF_BR_COND, /* X86_BR_JCC */ 939 + PERF_BR_UNCOND, /* X86_BR_JMP */ 940 + PERF_BR_UNKNOWN, /* X86_BR_IRQ */ 941 + PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ 942 + PERF_BR_UNKNOWN, /* X86_BR_ABORT */ 943 + PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ 944 + PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ 945 + PERF_BR_CALL, /* X86_BR_ZERO_CALL */ 946 + PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ 947 + PERF_BR_IND, /* X86_BR_IND_JMP */ 948 + }; 949 + 950 + static int 951 + common_branch_type(int type) 
952 + { 953 + int i; 954 + 955 + type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ 956 + 957 + if (type) { 958 + i = __ffs(type); 959 + if (i < X86_BR_TYPE_MAP_MAX) 960 + return branch_map[i]; 961 + } 962 + 963 + return PERF_BR_UNKNOWN; 964 + } 965 + 938 966 /* 939 967 * implement actual branch filter based on user demand. 940 968 * Hardware may not exactly satisfy that request, thus ··· 988 942 bool compress = false; 989 943 990 944 /* if sampling all branches, then nothing to filter */ 991 - if ((br_sel & X86_BR_ALL) == X86_BR_ALL) 945 + if (((br_sel & X86_BR_ALL) == X86_BR_ALL) && 946 + ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE)) 992 947 return; 993 948 994 949 for (i = 0; i < cpuc->lbr_stack.nr; i++) { ··· 1010 963 cpuc->lbr_entries[i].from = 0; 1011 964 compress = true; 1012 965 } 966 + 967 + if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) 968 + cpuc->lbr_entries[i].type = common_branch_type(type); 1013 969 } 1014 970 1015 971 if (!compress)