Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf inject: Add support for injecting guest sideband events

Inject events from a perf.data file recorded in a virtual machine into
a perf.data file recorded on the host at the same time.

Only side band events (e.g. mmap, comm, fork, exit etc) and build IDs are
injected. Additionally, the guest kcore_dir is copied as kcore_dir__
with the machine PID appended.

This is non-trivial because:
o It is not possible to process 2 sessions simultaneously, so instead
guest events are first written to a temporary file.
o To avoid conflict, guest sample IDs are replaced with new unused sample
IDs.
o Guest event's CPU is changed to be the host CPU because it is more
useful for reporting and analysis.
o Sample ID is mapped to machine PID which is recorded with VCPU in the
id index. This is important to allow guest events to be related to the
guest machine and VCPU.
o Guest timestamps must be converted to host timestamps.
o Events are inserted to obey finished-round ordering.

The anticipated use-case is:
- start recording sideband events in a guest machine
- start recording an AUX area trace on the host which can also trace the
guest (e.g. Intel PT)
- run test case on the guest
- stop recording on the host
- stop recording on the guest
- copy the guest perf.data file to the host
- inject the guest perf.data file sideband events into the host perf.data
file using perf inject
- the resulting perf.data file can now be used

Subsequent patches provide Intel PT support for this.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: kvm@vger.kernel.org
Link: https://lore.kernel.org/r/20220711093218.10967-25-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

authored by

Adrian Hunter and committed by
Arnaldo Carvalho de Melo
97406a7e 10d34700

+1059 -1
+17
tools/perf/Documentation/perf-inject.txt
··· 85 85 without updating it. Currently this option is supported only by 86 86 Intel PT, refer linkperf:perf-intel-pt[1] 87 87 88 + --guest-data=<path>,<pid>[,<time offset>[,<time scale>]]:: 89 + Insert events from a perf.data file recorded in a virtual machine at 90 + the same time as the input perf.data file was recorded on the host. 91 + The Process ID (PID) of the QEMU hypervisor process must be provided, 92 + and the time offset and time scale (multiplier) will likely be needed 93 + to convert guest time stamps into host time stamps. For example, for 94 + x86 the TSC Offset and Multiplier could be provided for a virtual machine 95 + using Linux command line option no-kvmclock. 96 + Currently only mmap, mmap2, comm, task, context_switch, ksymbol, 97 + and text_poke events are inserted, as well as build ID information. 98 + The QEMU option -name debug-threads=on is needed so that thread names 99 + can be used to determine which thread is running which VCPU. Note 100 + libvirt seems to use this by default. 101 + When using perf record in the guest, option --sample-identifier 102 + should be used, and also --buildid-all and --switch-events may be 103 + useful. 104 + 88 105 SEE ALSO 89 106 -------- 90 107 linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1],
+1042 -1
tools/perf/builtin-inject.c
··· 26 26 #include "util/thread.h" 27 27 #include "util/namespaces.h" 28 28 #include "util/util.h" 29 + #include "util/tsc.h" 29 30 30 31 #include <internal/lib.h> 31 32 ··· 36 35 37 36 #include <linux/list.h> 38 37 #include <linux/string.h> 38 + #include <linux/zalloc.h> 39 + #include <linux/hash.h> 39 40 #include <errno.h> 40 41 #include <signal.h> 42 + #include <inttypes.h> 43 + 44 + struct guest_event { 45 + struct perf_sample sample; 46 + union perf_event *event; 47 + char event_buf[PERF_SAMPLE_MAX_SIZE]; 48 + }; 49 + 50 + struct guest_id { 51 + /* hlist_node must be first, see free_hlist() */ 52 + struct hlist_node node; 53 + u64 id; 54 + u64 host_id; 55 + u32 vcpu; 56 + }; 57 + 58 + struct guest_tid { 59 + /* hlist_node must be first, see free_hlist() */ 60 + struct hlist_node node; 61 + /* Thread ID of QEMU thread */ 62 + u32 tid; 63 + u32 vcpu; 64 + }; 65 + 66 + struct guest_vcpu { 67 + /* Current host CPU */ 68 + u32 cpu; 69 + /* Thread ID of QEMU thread */ 70 + u32 tid; 71 + }; 72 + 73 + struct guest_session { 74 + char *perf_data_file; 75 + u32 machine_pid; 76 + u64 time_offset; 77 + double time_scale; 78 + struct perf_tool tool; 79 + struct perf_data data; 80 + struct perf_session *session; 81 + char *tmp_file_name; 82 + int tmp_fd; 83 + struct perf_tsc_conversion host_tc; 84 + struct perf_tsc_conversion guest_tc; 85 + bool copy_kcore_dir; 86 + bool have_tc; 87 + bool fetched; 88 + bool ready; 89 + u16 dflt_id_hdr_size; 90 + u64 dflt_id; 91 + u64 highest_id; 92 + /* Array of guest_vcpu */ 93 + struct guest_vcpu *vcpu; 94 + size_t vcpu_cnt; 95 + /* Hash table for guest_id */ 96 + struct hlist_head heads[PERF_EVLIST__HLIST_SIZE]; 97 + /* Hash table for guest_tid */ 98 + struct hlist_head tids[PERF_EVLIST__HLIST_SIZE]; 99 + /* Place to stash next guest event */ 100 + struct guest_event ev; 101 + }; 41 102 42 103 struct perf_inject { 43 104 struct perf_tool tool; ··· 122 59 struct itrace_synth_opts itrace_synth_opts; 123 60 char 
event_copy[PERF_SAMPLE_MAX_SIZE]; 124 61 struct perf_file_section secs[HEADER_FEAT_BITS]; 62 + struct guest_session guest_session; 125 63 }; 126 64 127 65 struct event_entry { ··· 762 698 return perf_event__repipe(tool, event_sw, &sample_sw, machine); 763 699 } 764 700 701 + static struct guest_vcpu *guest_session__vcpu(struct guest_session *gs, u32 vcpu) 702 + { 703 + if (realloc_array_as_needed(gs->vcpu, gs->vcpu_cnt, vcpu, NULL)) 704 + return NULL; 705 + return &gs->vcpu[vcpu]; 706 + } 707 + 708 + static int guest_session__output_bytes(struct guest_session *gs, void *buf, size_t sz) 709 + { 710 + ssize_t ret = writen(gs->tmp_fd, buf, sz); 711 + 712 + return ret < 0 ? ret : 0; 713 + } 714 + 715 + static int guest_session__repipe(struct perf_tool *tool, 716 + union perf_event *event, 717 + struct perf_sample *sample __maybe_unused, 718 + struct machine *machine __maybe_unused) 719 + { 720 + struct guest_session *gs = container_of(tool, struct guest_session, tool); 721 + 722 + return guest_session__output_bytes(gs, event, event->header.size); 723 + } 724 + 725 + static int guest_session__map_tid(struct guest_session *gs, u32 tid, u32 vcpu) 726 + { 727 + struct guest_tid *guest_tid = zalloc(sizeof(*guest_tid)); 728 + int hash; 729 + 730 + if (!guest_tid) 731 + return -ENOMEM; 732 + 733 + guest_tid->tid = tid; 734 + guest_tid->vcpu = vcpu; 735 + hash = hash_32(guest_tid->tid, PERF_EVLIST__HLIST_BITS); 736 + hlist_add_head(&guest_tid->node, &gs->tids[hash]); 737 + 738 + return 0; 739 + } 740 + 741 + static int host_peek_vm_comms_cb(struct perf_session *session __maybe_unused, 742 + union perf_event *event, 743 + u64 offset __maybe_unused, void *data) 744 + { 745 + struct guest_session *gs = data; 746 + unsigned int vcpu; 747 + struct guest_vcpu *guest_vcpu; 748 + int ret; 749 + 750 + if (event->header.type != PERF_RECORD_COMM || 751 + event->comm.pid != gs->machine_pid) 752 + return 0; 753 + 754 + /* 755 + * QEMU option -name debug-threads=on, causes thread names 
formatted as 756 + * below, although it is not an ABI. Also libvirt seems to use this by 757 + * default. Here we rely on it to tell us which thread is which VCPU. 758 + */ 759 + ret = sscanf(event->comm.comm, "CPU %u/KVM", &vcpu); 760 + if (ret <= 0) 761 + return ret; 762 + pr_debug("Found VCPU: tid %u comm %s vcpu %u\n", 763 + event->comm.tid, event->comm.comm, vcpu); 764 + if (vcpu > INT_MAX) { 765 + pr_err("Invalid VCPU %u\n", vcpu); 766 + return -EINVAL; 767 + } 768 + guest_vcpu = guest_session__vcpu(gs, vcpu); 769 + if (!guest_vcpu) 770 + return -ENOMEM; 771 + if (guest_vcpu->tid && guest_vcpu->tid != event->comm.tid) { 772 + pr_err("Fatal error: Two threads found with the same VCPU\n"); 773 + return -EINVAL; 774 + } 775 + guest_vcpu->tid = event->comm.tid; 776 + 777 + return guest_session__map_tid(gs, event->comm.tid, vcpu); 778 + } 779 + 780 + static int host_peek_vm_comms(struct perf_session *session, struct guest_session *gs) 781 + { 782 + return perf_session__peek_events(session, session->header.data_offset, 783 + session->header.data_size, 784 + host_peek_vm_comms_cb, gs); 785 + } 786 + 787 + static bool evlist__is_id_used(struct evlist *evlist, u64 id) 788 + { 789 + return evlist__id2sid(evlist, id); 790 + } 791 + 792 + static u64 guest_session__allocate_new_id(struct guest_session *gs, struct evlist *host_evlist) 793 + { 794 + do { 795 + gs->highest_id += 1; 796 + } while (!gs->highest_id || evlist__is_id_used(host_evlist, gs->highest_id)); 797 + 798 + return gs->highest_id; 799 + } 800 + 801 + static int guest_session__map_id(struct guest_session *gs, u64 id, u64 host_id, u32 vcpu) 802 + { 803 + struct guest_id *guest_id = zalloc(sizeof(*guest_id)); 804 + int hash; 805 + 806 + if (!guest_id) 807 + return -ENOMEM; 808 + 809 + guest_id->id = id; 810 + guest_id->host_id = host_id; 811 + guest_id->vcpu = vcpu; 812 + hash = hash_64(guest_id->id, PERF_EVLIST__HLIST_BITS); 813 + hlist_add_head(&guest_id->node, &gs->heads[hash]); 814 + 815 + return 0; 816 + 
} 817 + 818 + static u64 evlist__find_highest_id(struct evlist *evlist) 819 + { 820 + struct evsel *evsel; 821 + u64 highest_id = 1; 822 + 823 + evlist__for_each_entry(evlist, evsel) { 824 + u32 j; 825 + 826 + for (j = 0; j < evsel->core.ids; j++) { 827 + u64 id = evsel->core.id[j]; 828 + 829 + if (id > highest_id) 830 + highest_id = id; 831 + } 832 + } 833 + 834 + return highest_id; 835 + } 836 + 837 + static int guest_session__map_ids(struct guest_session *gs, struct evlist *host_evlist) 838 + { 839 + struct evlist *evlist = gs->session->evlist; 840 + struct evsel *evsel; 841 + int ret; 842 + 843 + evlist__for_each_entry(evlist, evsel) { 844 + u32 j; 845 + 846 + for (j = 0; j < evsel->core.ids; j++) { 847 + struct perf_sample_id *sid; 848 + u64 host_id; 849 + u64 id; 850 + 851 + id = evsel->core.id[j]; 852 + sid = evlist__id2sid(evlist, id); 853 + if (!sid || sid->cpu.cpu == -1) 854 + continue; 855 + host_id = guest_session__allocate_new_id(gs, host_evlist); 856 + ret = guest_session__map_id(gs, id, host_id, sid->cpu.cpu); 857 + if (ret) 858 + return ret; 859 + } 860 + } 861 + 862 + return 0; 863 + } 864 + 865 + static struct guest_id *guest_session__lookup_id(struct guest_session *gs, u64 id) 866 + { 867 + struct hlist_head *head; 868 + struct guest_id *guest_id; 869 + int hash; 870 + 871 + hash = hash_64(id, PERF_EVLIST__HLIST_BITS); 872 + head = &gs->heads[hash]; 873 + 874 + hlist_for_each_entry(guest_id, head, node) 875 + if (guest_id->id == id) 876 + return guest_id; 877 + 878 + return NULL; 879 + } 880 + 881 + static int process_attr(struct perf_tool *tool, union perf_event *event, 882 + struct perf_sample *sample __maybe_unused, 883 + struct machine *machine __maybe_unused) 884 + { 885 + struct perf_inject *inject = container_of(tool, struct perf_inject, tool); 886 + 887 + return perf_event__process_attr(tool, event, &inject->session->evlist); 888 + } 889 + 890 + static int guest_session__add_attr(struct guest_session *gs, struct evsel *evsel) 891 + { 892 
+ struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session); 893 + struct perf_event_attr attr = evsel->core.attr; 894 + u64 *id_array; 895 + u32 *vcpu_array; 896 + int ret = -ENOMEM; 897 + u32 i; 898 + 899 + id_array = calloc(evsel->core.ids, sizeof(*id_array)); 900 + if (!id_array) 901 + return -ENOMEM; 902 + 903 + vcpu_array = calloc(evsel->core.ids, sizeof(*vcpu_array)); 904 + if (!vcpu_array) 905 + goto out; 906 + 907 + for (i = 0; i < evsel->core.ids; i++) { 908 + u64 id = evsel->core.id[i]; 909 + struct guest_id *guest_id = guest_session__lookup_id(gs, id); 910 + 911 + if (!guest_id) { 912 + pr_err("Failed to find guest id %"PRIu64"\n", id); 913 + ret = -EINVAL; 914 + goto out; 915 + } 916 + id_array[i] = guest_id->host_id; 917 + vcpu_array[i] = guest_id->vcpu; 918 + } 919 + 920 + attr.sample_type |= PERF_SAMPLE_IDENTIFIER; 921 + attr.exclude_host = 1; 922 + attr.exclude_guest = 0; 923 + 924 + ret = perf_event__synthesize_attr(&inject->tool, &attr, evsel->core.ids, 925 + id_array, process_attr); 926 + if (ret) 927 + pr_err("Failed to add guest attr.\n"); 928 + 929 + for (i = 0; i < evsel->core.ids; i++) { 930 + struct perf_sample_id *sid; 931 + u32 vcpu = vcpu_array[i]; 932 + 933 + sid = evlist__id2sid(inject->session->evlist, id_array[i]); 934 + /* Guest event is per-thread from the host point of view */ 935 + sid->cpu.cpu = -1; 936 + sid->tid = gs->vcpu[vcpu].tid; 937 + sid->machine_pid = gs->machine_pid; 938 + sid->vcpu.cpu = vcpu; 939 + } 940 + out: 941 + free(vcpu_array); 942 + free(id_array); 943 + return ret; 944 + } 945 + 946 + static int guest_session__add_attrs(struct guest_session *gs) 947 + { 948 + struct evlist *evlist = gs->session->evlist; 949 + struct evsel *evsel; 950 + int ret; 951 + 952 + evlist__for_each_entry(evlist, evsel) { 953 + ret = guest_session__add_attr(gs, evsel); 954 + if (ret) 955 + return ret; 956 + } 957 + 958 + return 0; 959 + } 960 + 961 + static int synthesize_id_index(struct perf_inject *inject, 
size_t new_cnt) 962 + { 963 + struct perf_session *session = inject->session; 964 + struct evlist *evlist = session->evlist; 965 + struct machine *machine = &session->machines.host; 966 + size_t from = evlist->core.nr_entries - new_cnt; 967 + 968 + return __perf_event__synthesize_id_index(&inject->tool, perf_event__repipe, 969 + evlist, machine, from); 970 + } 971 + 972 + static struct guest_tid *guest_session__lookup_tid(struct guest_session *gs, u32 tid) 973 + { 974 + struct hlist_head *head; 975 + struct guest_tid *guest_tid; 976 + int hash; 977 + 978 + hash = hash_32(tid, PERF_EVLIST__HLIST_BITS); 979 + head = &gs->tids[hash]; 980 + 981 + hlist_for_each_entry(guest_tid, head, node) 982 + if (guest_tid->tid == tid) 983 + return guest_tid; 984 + 985 + return NULL; 986 + } 987 + 988 + static bool dso__is_in_kernel_space(struct dso *dso) 989 + { 990 + if (dso__is_vdso(dso)) 991 + return false; 992 + 993 + return dso__is_kcore(dso) || 994 + dso->kernel || 995 + is_kernel_module(dso->long_name, PERF_RECORD_MISC_CPUMODE_UNKNOWN); 996 + } 997 + 998 + static u64 evlist__first_id(struct evlist *evlist) 999 + { 1000 + struct evsel *evsel; 1001 + 1002 + evlist__for_each_entry(evlist, evsel) { 1003 + if (evsel->core.ids) 1004 + return evsel->core.id[0]; 1005 + } 1006 + return 0; 1007 + } 1008 + 1009 + static int process_build_id(struct perf_tool *tool, 1010 + union perf_event *event, 1011 + struct perf_sample *sample __maybe_unused, 1012 + struct machine *machine __maybe_unused) 1013 + { 1014 + struct perf_inject *inject = container_of(tool, struct perf_inject, tool); 1015 + 1016 + return perf_event__process_build_id(inject->session, event); 1017 + } 1018 + 1019 + static int synthesize_build_id(struct perf_inject *inject, struct dso *dso, pid_t machine_pid) 1020 + { 1021 + struct machine *machine = perf_session__findnew_machine(inject->session, machine_pid); 1022 + u8 cpumode = dso__is_in_kernel_space(dso) ? 
1023 + PERF_RECORD_MISC_GUEST_KERNEL : 1024 + PERF_RECORD_MISC_GUEST_USER; 1025 + 1026 + if (!machine) 1027 + return -ENOMEM; 1028 + 1029 + dso->hit = 1; 1030 + 1031 + return perf_event__synthesize_build_id(&inject->tool, dso, cpumode, 1032 + process_build_id, machine); 1033 + } 1034 + 1035 + static int guest_session__add_build_ids(struct guest_session *gs) 1036 + { 1037 + struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session); 1038 + struct machine *machine = &gs->session->machines.host; 1039 + struct dso *dso; 1040 + int ret; 1041 + 1042 + /* Build IDs will be put in the Build ID feature section */ 1043 + perf_header__set_feat(&inject->session->header, HEADER_BUILD_ID); 1044 + 1045 + dsos__for_each_with_build_id(dso, &machine->dsos.head) { 1046 + ret = synthesize_build_id(inject, dso, gs->machine_pid); 1047 + if (ret) 1048 + return ret; 1049 + } 1050 + 1051 + return 0; 1052 + } 1053 + 1054 + static int guest_session__ksymbol_event(struct perf_tool *tool, 1055 + union perf_event *event, 1056 + struct perf_sample *sample __maybe_unused, 1057 + struct machine *machine __maybe_unused) 1058 + { 1059 + struct guest_session *gs = container_of(tool, struct guest_session, tool); 1060 + 1061 + /* Only support out-of-line i.e. 
no BPF support */ 1062 + if (event->ksymbol.ksym_type != PERF_RECORD_KSYMBOL_TYPE_OOL) 1063 + return 0; 1064 + 1065 + return guest_session__output_bytes(gs, event, event->header.size); 1066 + } 1067 + 1068 + static int guest_session__start(struct guest_session *gs, const char *name, bool force) 1069 + { 1070 + char tmp_file_name[] = "/tmp/perf-inject-guest_session-XXXXXX"; 1071 + struct perf_session *session; 1072 + int ret; 1073 + 1074 + /* Only these events will be injected */ 1075 + gs->tool.mmap = guest_session__repipe; 1076 + gs->tool.mmap2 = guest_session__repipe; 1077 + gs->tool.comm = guest_session__repipe; 1078 + gs->tool.fork = guest_session__repipe; 1079 + gs->tool.exit = guest_session__repipe; 1080 + gs->tool.lost = guest_session__repipe; 1081 + gs->tool.context_switch = guest_session__repipe; 1082 + gs->tool.ksymbol = guest_session__ksymbol_event; 1083 + gs->tool.text_poke = guest_session__repipe; 1084 + /* 1085 + * Processing a build ID creates a struct dso with that build ID. Later, 1086 + * all guest dsos are iterated and the build IDs processed into the host 1087 + * session where they will be output to the Build ID feature section 1088 + * when the perf.data file header is written. 1089 + */ 1090 + gs->tool.build_id = perf_event__process_build_id; 1091 + /* Process the id index to know what VCPU an ID belongs to */ 1092 + gs->tool.id_index = perf_event__process_id_index; 1093 + 1094 + gs->tool.ordered_events = true; 1095 + gs->tool.ordering_requires_timestamps = true; 1096 + 1097 + gs->data.path = name; 1098 + gs->data.force = force; 1099 + gs->data.mode = PERF_DATA_MODE_READ; 1100 + 1101 + session = perf_session__new(&gs->data, &gs->tool); 1102 + if (IS_ERR(session)) 1103 + return PTR_ERR(session); 1104 + gs->session = session; 1105 + 1106 + /* 1107 + * Initial events have zero'd ID samples. Get default ID sample size 1108 + * used for removing them. 
1109 + */ 1110 + gs->dflt_id_hdr_size = session->machines.host.id_hdr_size; 1111 + /* And default ID for adding back a host-compatible ID sample */ 1112 + gs->dflt_id = evlist__first_id(session->evlist); 1113 + if (!gs->dflt_id) { 1114 + pr_err("Guest data has no sample IDs"); 1115 + return -EINVAL; 1116 + } 1117 + 1118 + /* Temporary file for guest events */ 1119 + gs->tmp_file_name = strdup(tmp_file_name); 1120 + if (!gs->tmp_file_name) 1121 + return -ENOMEM; 1122 + gs->tmp_fd = mkstemp(gs->tmp_file_name); 1123 + if (gs->tmp_fd < 0) 1124 + return -errno; 1125 + 1126 + if (zstd_init(&gs->session->zstd_data, 0) < 0) 1127 + pr_warning("Guest session decompression initialization failed.\n"); 1128 + 1129 + /* 1130 + * perf does not support processing 2 sessions simultaneously, so output 1131 + * guest events to a temporary file. 1132 + */ 1133 + ret = perf_session__process_events(gs->session); 1134 + if (ret) 1135 + return ret; 1136 + 1137 + if (lseek(gs->tmp_fd, 0, SEEK_SET)) 1138 + return -errno; 1139 + 1140 + return 0; 1141 + } 1142 + 1143 + /* Free hlist nodes assuming hlist_node is the first member of hlist entries */ 1144 + static void free_hlist(struct hlist_head *heads, size_t hlist_sz) 1145 + { 1146 + struct hlist_node *pos, *n; 1147 + size_t i; 1148 + 1149 + for (i = 0; i < hlist_sz; ++i) { 1150 + hlist_for_each_safe(pos, n, &heads[i]) { 1151 + hlist_del(pos); 1152 + free(pos); 1153 + } 1154 + } 1155 + } 1156 + 1157 + static void guest_session__exit(struct guest_session *gs) 1158 + { 1159 + if (gs->session) { 1160 + perf_session__delete(gs->session); 1161 + free_hlist(gs->heads, PERF_EVLIST__HLIST_SIZE); 1162 + free_hlist(gs->tids, PERF_EVLIST__HLIST_SIZE); 1163 + } 1164 + if (gs->tmp_file_name) { 1165 + if (gs->tmp_fd >= 0) 1166 + close(gs->tmp_fd); 1167 + unlink(gs->tmp_file_name); 1168 + free(gs->tmp_file_name); 1169 + } 1170 + free(gs->vcpu); 1171 + free(gs->perf_data_file); 1172 + } 1173 + 1174 + static void get_tsc_conv(struct perf_tsc_conversion *tc, 
struct perf_record_time_conv *time_conv) 1175 + { 1176 + tc->time_shift = time_conv->time_shift; 1177 + tc->time_mult = time_conv->time_mult; 1178 + tc->time_zero = time_conv->time_zero; 1179 + tc->time_cycles = time_conv->time_cycles; 1180 + tc->time_mask = time_conv->time_mask; 1181 + tc->cap_user_time_zero = time_conv->cap_user_time_zero; 1182 + tc->cap_user_time_short = time_conv->cap_user_time_short; 1183 + } 1184 + 1185 + static void guest_session__get_tc(struct guest_session *gs) 1186 + { 1187 + struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session); 1188 + 1189 + get_tsc_conv(&gs->host_tc, &inject->session->time_conv); 1190 + get_tsc_conv(&gs->guest_tc, &gs->session->time_conv); 1191 + } 1192 + 1193 + static void guest_session__convert_time(struct guest_session *gs, u64 guest_time, u64 *host_time) 1194 + { 1195 + u64 tsc; 1196 + 1197 + if (!guest_time) { 1198 + *host_time = 0; 1199 + return; 1200 + } 1201 + 1202 + if (gs->guest_tc.cap_user_time_zero) 1203 + tsc = perf_time_to_tsc(guest_time, &gs->guest_tc); 1204 + else 1205 + tsc = guest_time; 1206 + 1207 + /* 1208 + * This is the correct order of operations for x86 if the TSC Offset and 1209 + * Multiplier values are used. 
1210 + */ 1211 + tsc -= gs->time_offset; 1212 + tsc /= gs->time_scale; 1213 + 1214 + if (gs->host_tc.cap_user_time_zero) 1215 + *host_time = tsc_to_perf_time(tsc, &gs->host_tc); 1216 + else 1217 + *host_time = tsc; 1218 + } 1219 + 1220 + static int guest_session__fetch(struct guest_session *gs) 1221 + { 1222 + void *buf = gs->ev.event_buf; 1223 + struct perf_event_header *hdr = buf; 1224 + size_t hdr_sz = sizeof(*hdr); 1225 + ssize_t ret; 1226 + 1227 + ret = readn(gs->tmp_fd, buf, hdr_sz); 1228 + if (ret < 0) 1229 + return ret; 1230 + 1231 + if (!ret) { 1232 + /* Zero size means EOF */ 1233 + hdr->size = 0; 1234 + return 0; 1235 + } 1236 + 1237 + buf += hdr_sz; 1238 + 1239 + ret = readn(gs->tmp_fd, buf, hdr->size - hdr_sz); 1240 + if (ret < 0) 1241 + return ret; 1242 + 1243 + gs->ev.event = (union perf_event *)gs->ev.event_buf; 1244 + gs->ev.sample.time = 0; 1245 + 1246 + if (hdr->type >= PERF_RECORD_USER_TYPE_START) { 1247 + pr_err("Unexpected type fetching guest event"); 1248 + return 0; 1249 + } 1250 + 1251 + ret = evlist__parse_sample(gs->session->evlist, gs->ev.event, &gs->ev.sample); 1252 + if (ret) { 1253 + pr_err("Parse failed fetching guest event"); 1254 + return ret; 1255 + } 1256 + 1257 + if (!gs->have_tc) { 1258 + guest_session__get_tc(gs); 1259 + gs->have_tc = true; 1260 + } 1261 + 1262 + guest_session__convert_time(gs, gs->ev.sample.time, &gs->ev.sample.time); 1263 + 1264 + return 0; 1265 + } 1266 + 1267 + static int evlist__append_id_sample(struct evlist *evlist, union perf_event *ev, 1268 + const struct perf_sample *sample) 1269 + { 1270 + struct evsel *evsel; 1271 + void *array; 1272 + int ret; 1273 + 1274 + evsel = evlist__id2evsel(evlist, sample->id); 1275 + array = ev; 1276 + 1277 + if (!evsel) { 1278 + pr_err("No evsel for id %"PRIu64"\n", sample->id); 1279 + return -EINVAL; 1280 + } 1281 + 1282 + array += ev->header.size; 1283 + ret = perf_event__synthesize_id_sample(array, evsel->core.attr.sample_type, sample); 1284 + if (ret < 0) 1285 + 
return ret; 1286 + 1287 + if (ret & 7) { 1288 + pr_err("Bad id sample size %d\n", ret); 1289 + return -EINVAL; 1290 + } 1291 + 1292 + ev->header.size += ret; 1293 + 1294 + return 0; 1295 + } 1296 + 1297 + static int guest_session__inject_events(struct guest_session *gs, u64 timestamp) 1298 + { 1299 + struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session); 1300 + int ret; 1301 + 1302 + if (!gs->ready) 1303 + return 0; 1304 + 1305 + while (1) { 1306 + struct perf_sample *sample; 1307 + struct guest_id *guest_id; 1308 + union perf_event *ev; 1309 + u16 id_hdr_size; 1310 + u8 cpumode; 1311 + u64 id; 1312 + 1313 + if (!gs->fetched) { 1314 + ret = guest_session__fetch(gs); 1315 + if (ret) 1316 + return ret; 1317 + gs->fetched = true; 1318 + } 1319 + 1320 + ev = gs->ev.event; 1321 + sample = &gs->ev.sample; 1322 + 1323 + if (!ev->header.size) 1324 + return 0; /* EOF */ 1325 + 1326 + if (sample->time > timestamp) 1327 + return 0; 1328 + 1329 + /* Change cpumode to guest */ 1330 + cpumode = ev->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; 1331 + if (cpumode & PERF_RECORD_MISC_USER) 1332 + cpumode = PERF_RECORD_MISC_GUEST_USER; 1333 + else 1334 + cpumode = PERF_RECORD_MISC_GUEST_KERNEL; 1335 + ev->header.misc &= ~PERF_RECORD_MISC_CPUMODE_MASK; 1336 + ev->header.misc |= cpumode; 1337 + 1338 + id = sample->id; 1339 + if (!id) { 1340 + id = gs->dflt_id; 1341 + id_hdr_size = gs->dflt_id_hdr_size; 1342 + } else { 1343 + struct evsel *evsel = evlist__id2evsel(gs->session->evlist, id); 1344 + 1345 + id_hdr_size = evsel__id_hdr_size(evsel); 1346 + } 1347 + 1348 + if (id_hdr_size & 7) { 1349 + pr_err("Bad id_hdr_size %u\n", id_hdr_size); 1350 + return -EINVAL; 1351 + } 1352 + 1353 + if (ev->header.size & 7) { 1354 + pr_err("Bad event size %u\n", ev->header.size); 1355 + return -EINVAL; 1356 + } 1357 + 1358 + /* Remove guest id sample */ 1359 + ev->header.size -= id_hdr_size; 1360 + 1361 + if (ev->header.size & 7) { 1362 + pr_err("Bad raw event size %u\n", 
ev->header.size); 1363 + return -EINVAL; 1364 + } 1365 + 1366 + guest_id = guest_session__lookup_id(gs, id); 1367 + if (!guest_id) { 1368 + pr_err("Guest event with unknown id %llu\n", 1369 + (unsigned long long)id); 1370 + return -EINVAL; 1371 + } 1372 + 1373 + /* Change to host ID to avoid conflicting ID values */ 1374 + sample->id = guest_id->host_id; 1375 + sample->stream_id = guest_id->host_id; 1376 + 1377 + if (sample->cpu != (u32)-1) { 1378 + if (sample->cpu >= gs->vcpu_cnt) { 1379 + pr_err("Guest event with unknown VCPU %u\n", 1380 + sample->cpu); 1381 + return -EINVAL; 1382 + } 1383 + /* Change to host CPU instead of guest VCPU */ 1384 + sample->cpu = gs->vcpu[sample->cpu].cpu; 1385 + } 1386 + 1387 + /* New id sample with new ID and CPU */ 1388 + ret = evlist__append_id_sample(inject->session->evlist, ev, sample); 1389 + if (ret) 1390 + return ret; 1391 + 1392 + if (ev->header.size & 7) { 1393 + pr_err("Bad new event size %u\n", ev->header.size); 1394 + return -EINVAL; 1395 + } 1396 + 1397 + gs->fetched = false; 1398 + 1399 + ret = output_bytes(inject, ev, ev->header.size); 1400 + if (ret) 1401 + return ret; 1402 + } 1403 + } 1404 + 1405 + static int guest_session__flush_events(struct guest_session *gs) 1406 + { 1407 + return guest_session__inject_events(gs, -1); 1408 + } 1409 + 1410 + static int host__repipe(struct perf_tool *tool, 1411 + union perf_event *event, 1412 + struct perf_sample *sample, 1413 + struct machine *machine) 1414 + { 1415 + struct perf_inject *inject = container_of(tool, struct perf_inject, tool); 1416 + int ret; 1417 + 1418 + ret = guest_session__inject_events(&inject->guest_session, sample->time); 1419 + if (ret) 1420 + return ret; 1421 + 1422 + return perf_event__repipe(tool, event, sample, machine); 1423 + } 1424 + 1425 + static int host__finished_init(struct perf_session *session, union perf_event *event) 1426 + { 1427 + struct perf_inject *inject = container_of(session->tool, struct perf_inject, tool); 1428 + struct 
guest_session *gs = &inject->guest_session; 1429 + int ret; 1430 + 1431 + /* 1432 + * Peek through host COMM events to find QEMU threads and the VCPU they 1433 + * are running. 1434 + */ 1435 + ret = host_peek_vm_comms(session, gs); 1436 + if (ret) 1437 + return ret; 1438 + 1439 + if (!gs->vcpu_cnt) { 1440 + pr_err("No VCPU theads found for pid %u\n", gs->machine_pid); 1441 + return -EINVAL; 1442 + } 1443 + 1444 + /* 1445 + * Allocate new (unused) host sample IDs and map them to the guest IDs. 1446 + */ 1447 + gs->highest_id = evlist__find_highest_id(session->evlist); 1448 + ret = guest_session__map_ids(gs, session->evlist); 1449 + if (ret) 1450 + return ret; 1451 + 1452 + ret = guest_session__add_attrs(gs); 1453 + if (ret) 1454 + return ret; 1455 + 1456 + ret = synthesize_id_index(inject, gs->session->evlist->core.nr_entries); 1457 + if (ret) { 1458 + pr_err("Failed to synthesize id_index\n"); 1459 + return ret; 1460 + } 1461 + 1462 + ret = guest_session__add_build_ids(gs); 1463 + if (ret) { 1464 + pr_err("Failed to add guest build IDs\n"); 1465 + return ret; 1466 + } 1467 + 1468 + gs->ready = true; 1469 + 1470 + ret = guest_session__inject_events(gs, 0); 1471 + if (ret) 1472 + return ret; 1473 + 1474 + return perf_event__repipe_op2_synth(session, event); 1475 + } 1476 + 1477 + /* 1478 + * Obey finished-round ordering. The FINISHED_ROUND event is first processed 1479 + * which flushes host events to file up until the last flush time. Then inject 1480 + * guest events up to the same time. Finally write out the FINISHED_ROUND event 1481 + * itself. 
1482 + */ 1483 + static int host__finished_round(struct perf_tool *tool, 1484 + union perf_event *event, 1485 + struct ordered_events *oe) 1486 + { 1487 + struct perf_inject *inject = container_of(tool, struct perf_inject, tool); 1488 + int ret = perf_event__process_finished_round(tool, event, oe); 1489 + u64 timestamp = ordered_events__last_flush_time(oe); 1490 + 1491 + if (ret) 1492 + return ret; 1493 + 1494 + ret = guest_session__inject_events(&inject->guest_session, timestamp); 1495 + if (ret) 1496 + return ret; 1497 + 1498 + return perf_event__repipe_oe_synth(tool, event, oe); 1499 + } 1500 + 1501 + static int host__context_switch(struct perf_tool *tool, 1502 + union perf_event *event, 1503 + struct perf_sample *sample, 1504 + struct machine *machine) 1505 + { 1506 + struct perf_inject *inject = container_of(tool, struct perf_inject, tool); 1507 + bool out = event->header.misc & PERF_RECORD_MISC_SWITCH_OUT; 1508 + struct guest_session *gs = &inject->guest_session; 1509 + u32 pid = event->context_switch.next_prev_pid; 1510 + u32 tid = event->context_switch.next_prev_tid; 1511 + struct guest_tid *guest_tid; 1512 + u32 vcpu; 1513 + 1514 + if (out || pid != gs->machine_pid) 1515 + goto out; 1516 + 1517 + guest_tid = guest_session__lookup_tid(gs, tid); 1518 + if (!guest_tid) 1519 + goto out; 1520 + 1521 + if (sample->cpu == (u32)-1) { 1522 + pr_err("Switch event does not have CPU\n"); 1523 + return -EINVAL; 1524 + } 1525 + 1526 + vcpu = guest_tid->vcpu; 1527 + if (vcpu >= gs->vcpu_cnt) 1528 + return -EINVAL; 1529 + 1530 + /* Guest is switching in, record which CPU the VCPU is now running on */ 1531 + gs->vcpu[vcpu].cpu = sample->cpu; 1532 + out: 1533 + return host__repipe(tool, event, sample, machine); 1534 + } 1535 + 765 1536 static void sig_handler(int sig __maybe_unused) 766 1537 { 767 1538 session_done = 1; ··· 1664 765 inject->itrace_synth_opts.vm_tm_corr_args = strdup(args); 1665 766 1666 767 return inject->itrace_synth_opts.vm_tm_corr_args ? 
0 : -ENOMEM; 768 + } 769 + 770 + static int parse_guest_data(const struct option *opt, const char *str, int unset) 771 + { 772 + struct perf_inject *inject = opt->value; 773 + struct guest_session *gs = &inject->guest_session; 774 + char *tok; 775 + char *s; 776 + 777 + if (unset) 778 + return 0; 779 + 780 + if (!str) 781 + goto bad_args; 782 + 783 + s = strdup(str); 784 + if (!s) 785 + return -ENOMEM; 786 + 787 + gs->perf_data_file = strsep(&s, ","); 788 + if (!gs->perf_data_file) 789 + goto bad_args; 790 + 791 + gs->copy_kcore_dir = has_kcore_dir(gs->perf_data_file); 792 + if (gs->copy_kcore_dir) 793 + inject->output.is_dir = true; 794 + 795 + tok = strsep(&s, ","); 796 + if (!tok) 797 + goto bad_args; 798 + gs->machine_pid = strtoul(tok, NULL, 0); 799 + if (!inject->guest_session.machine_pid) 800 + goto bad_args; 801 + 802 + gs->time_scale = 1; 803 + 804 + tok = strsep(&s, ","); 805 + if (!tok) 806 + goto out; 807 + gs->time_offset = strtoull(tok, NULL, 0); 808 + 809 + tok = strsep(&s, ","); 810 + if (!tok) 811 + goto out; 812 + gs->time_scale = strtod(tok, NULL); 813 + if (!gs->time_scale) 814 + goto bad_args; 815 + out: 816 + return 0; 817 + 818 + bad_args: 819 + pr_err("--guest-data option requires guest perf.data file name, " 820 + "guest machine PID, and optionally guest timestamp offset, " 821 + "and guest timestamp scale factor, separated by commas.\n"); 822 + return -1; 1667 823 } 1668 824 1669 825 static int save_section_info_cb(struct perf_file_section *section, ··· 1850 896 return ret; 1851 897 } 1852 898 899 + static int guest_session__copy_kcore_dir(struct guest_session *gs) 900 + { 901 + struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session); 902 + char *cmd; 903 + int ret; 904 + 905 + ret = asprintf(&cmd, "cp -r -n %s/kcore_dir %s/kcore_dir__%u >/dev/null 2>&1", 906 + gs->perf_data_file, inject->output.path, gs->machine_pid); 907 + if (ret < 0) 908 + return ret; 909 + pr_debug("%s\n", cmd); 910 + ret = system(cmd); 911 + 
free(cmd); 912 + return ret; 913 + } 914 + 1853 915 static int output_fd(struct perf_inject *inject) 1854 916 { 1855 917 return inject->in_place_update ? -1 : perf_data__fd(&inject->output); ··· 1874 904 static int __cmd_inject(struct perf_inject *inject) 1875 905 { 1876 906 int ret = -EINVAL; 907 + struct guest_session *gs = &inject->guest_session; 1877 908 struct perf_session *session = inject->session; 1878 909 int fd = output_fd(inject); 1879 910 u64 output_data_offset; ··· 1939 968 output_data_offset = roundup(8192 + session->header.data_offset, 4096); 1940 969 if (inject->strip) 1941 970 strip_init(inject); 971 + } else if (gs->perf_data_file) { 972 + char *name = gs->perf_data_file; 973 + 974 + /* 975 + * Not strictly necessary, but keep these events in order wrt 976 + * guest events. 977 + */ 978 + inject->tool.mmap = host__repipe; 979 + inject->tool.mmap2 = host__repipe; 980 + inject->tool.comm = host__repipe; 981 + inject->tool.fork = host__repipe; 982 + inject->tool.exit = host__repipe; 983 + inject->tool.lost = host__repipe; 984 + inject->tool.context_switch = host__repipe; 985 + inject->tool.ksymbol = host__repipe; 986 + inject->tool.text_poke = host__repipe; 987 + /* 988 + * Once the host session has initialized, set up sample ID 989 + * mapping and feed in guest attrs, build IDs and initial 990 + * events. 991 + */ 992 + inject->tool.finished_init = host__finished_init; 993 + /* Obey finished round ordering */ 994 + inject->tool.finished_round = host__finished_round, 995 + /* Keep track of which CPU a VCPU is running on */ 996 + inject->tool.context_switch = host__context_switch; 997 + /* 998 + * Must order events to be able to obey finished round 999 + * ordering. 
1000 + */ 1001 + inject->tool.ordered_events = true; 1002 + inject->tool.ordering_requires_timestamps = true; 1003 + /* Set up a separate session to process guest perf.data file */ 1004 + ret = guest_session__start(gs, name, session->data->force); 1005 + if (ret) { 1006 + pr_err("Failed to process %s, error %d\n", name, ret); 1007 + return ret; 1008 + } 1009 + /* Allow space in the header for guest attributes */ 1010 + output_data_offset += gs->session->header.data_offset; 1011 + output_data_offset = roundup(output_data_offset, 4096); 1942 1012 } 1943 1013 1944 1014 if (!inject->itrace_synth_opts.set) ··· 1991 979 ret = perf_session__process_events(session); 1992 980 if (ret) 1993 981 return ret; 982 + 983 + if (gs->session) { 984 + /* 985 + * Remaining guest events have later timestamps. Flush them 986 + * out to file. 987 + */ 988 + ret = guest_session__flush_events(gs); 989 + if (ret) { 990 + pr_err("Failed to flush guest events\n"); 991 + return ret; 992 + } 993 + } 1994 994 1995 995 if (!inject->is_pipe && !inject->in_place_update) { 1996 996 struct inject_fc inj_fc = { ··· 2038 1014 2039 1015 if (inject->copy_kcore_dir) { 2040 1016 ret = copy_kcore_dir(inject); 2041 - if (ret) 1017 + if (ret) { 1018 + pr_err("Failed to copy kcore\n"); 2042 1019 return ret; 1020 + } 1021 + } 1022 + if (gs->copy_kcore_dir) { 1023 + ret = guest_session__copy_kcore_dir(gs); 1024 + if (ret) { 1025 + pr_err("Failed to copy guest kcore\n"); 1026 + return ret; 1027 + } 2043 1028 } 2044 1029 } 2045 1030 ··· 2146 1113 OPT_CALLBACK_OPTARG(0, "vm-time-correlation", &inject, NULL, "opts", 2147 1114 "correlate time between VM guests and the host", 2148 1115 parse_vm_time_correlation), 1116 + OPT_CALLBACK_OPTARG(0, "guest-data", &inject, NULL, "opts", 1117 + "inject events from a guest perf.data file", 1118 + parse_guest_data), 1119 + OPT_STRING(0, "guestmount", &symbol_conf.guestmount, "directory", 1120 + "guest mount directory under which every guest os" 1121 + " instance has a subdir"), 
2149 1122 OPT_END() 2150 1123 }; 2151 1124 const char * const inject_usage[] = { ··· 2281 1242 goto out_delete; 2282 1243 2283 1244 ret = __cmd_inject(&inject); 1245 + 1246 + guest_session__exit(&inject.guest_session); 2284 1247 2285 1248 out_delete: 2286 1249 zstd_fini(&(inject.session->zstd_data));