Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf symbols: Add Rust demangling

Rust demangling is another step after bfd demangling. Add a diagnosis to
identify mangled Rust symbols based on the hash that the Rust mangler appends
as the last path component, as well as other characteristics. Add a demangler
to reconstruct the original symbol.

Committer notes:

How I tested it:

Enabled COPR on Fedora 24 and then installed the 'rust-binary' package,
with it:

$ cat src/main.rs
fn main() {
    println!("Hello, world!");
}
$ cat Cargo.toml
[package]

name = "hello_world"
version = "0.0.1"
authors = [ "Arnaldo Carvalho de Melo <acme@kernel.org>" ]

$ perf record cargo bench
Compiling hello_world v0.0.1 (file:///home/acme/projects/hello_world)
Running target/release/hello_world-d4b9dab4b2a47d75

running 0 tests

test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured

[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.096 MB perf.data (1457 samples) ]
$

Before this patch:

$ perf report --stdio --dsos librbml-e8edd0fd.so
# dso: librbml-e8edd0fd.so
#
# Total Lost Samples: 0
#
# Samples: 1K of event 'cycles:u'
# Event count (approx.): 979599126
#
# Overhead Command Symbol
# ........ ....... .............................................................................................................
#
1.78% rustc [.] rbml::reader::maybe_get_doc::hb9d387df6024b15b
1.50% rustc [.] _$LT$reader..DocsIterator$LT$$u27$a$GT$$u20$as$u20$std..iter..Iterator$GT$::next::hd9af9e60d79a35c8
1.20% rustc [.] rbml::reader::doc_at::hc88107fba445af31
0.46% rustc [.] _$LT$reader..TaggedDocsIterator$LT$$u27$a$GT$$u20$as$u20$std..iter..Iterator$GT$::next::h0cb40e696e4bb489
0.35% rustc [.] rbml::reader::Decoder::_next_int::h66eef7825a398bc3
0.29% rustc [.] rbml::reader::Decoder::_next_sub::h8e5266005580b836
0.15% rustc [.] rbml::reader::get_doc::h094521c645459139
0.14% rustc [.] _$LT$reader..Decoder$LT$$u27$doc$GT$$u20$as$u20$serialize..Decoder$GT$::read_u32::h0acea2fff9669327
0.07% rustc [.] rbml::reader::Decoder::next_doc::h6714d469c9dfaf91
0.07% rustc [.] _ZN4rbml6reader10doc_as_u6417h930b740aa94f1d3aE@plt
0.06% rustc [.] _fini
$

After:

$ perf report --stdio --dsos librbml-e8edd0fd.so
# dso: librbml-e8edd0fd.so
#
# Total Lost Samples: 0
#
# Samples: 1K of event 'cycles:u'
# Event count (approx.): 979599126
#
# Overhead Command Symbol
# ........ ....... .................................................................
#
1.78% rustc [.] rbml::reader::maybe_get_doc
1.50% rustc [.] <reader::DocsIterator<'a> as std::iter::Iterator>::next
1.20% rustc [.] rbml::reader::doc_at
0.46% rustc [.] <reader::TaggedDocsIterator<'a> as std::iter::Iterator>::next
0.35% rustc [.] rbml::reader::Decoder::_next_int
0.29% rustc [.] rbml::reader::Decoder::_next_sub
0.15% rustc [.] rbml::reader::get_doc
0.14% rustc [.] <reader::Decoder<'doc> as serialize::Decoder>::read_u32
0.07% rustc [.] rbml::reader::Decoder::next_doc
0.07% rustc [.] _ZN4rbml6reader10doc_as_u6417h930b740aa94f1d3aE@plt
0.06% rustc [.] _fini
$

Signed-off-by: David Tolnay <dtolnay@gmail.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/5780B7FA.3030602@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

authored by dtolnay.tngl.sh and committed by

Arnaldo Carvalho de Melo cae15db7 1c1a3a47

+285
+1
tools/perf/util/Build
··· 113 113 libperf-$(CONFIG_ZLIB) += zlib.o 114 114 libperf-$(CONFIG_LZMA) += lzma.o 115 115 libperf-y += demangle-java.o 116 + libperf-y += demangle-rust.o 116 117 117 118 ifdef CONFIG_JITDUMP 118 119 libperf-$(CONFIG_LIBELF) += jitdump.o
+269
tools/perf/util/demangle-rust.c
··· 1 + #include <string.h> 2 + #include "util.h" 3 + #include "debug.h" 4 + 5 + #include "demangle-rust.h" 6 + 7 + /* 8 + * Mangled Rust symbols look like this: 9 + * 10 + * _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a 11 + * 12 + * The original symbol is: 13 + * 14 + * <std::sys::fd::FileDesc as core::ops::Drop>::drop 15 + * 16 + * The last component of the path is a 64-bit hash in lowercase hex, prefixed 17 + * with "h". Rust does not have a global namespace between crates, an illusion 18 + * which Rust maintains by using the hash to distinguish things that would 19 + * otherwise have the same symbol. 20 + * 21 + * Any path component not starting with a XID_Start character is prefixed with 22 + * "_". 23 + * 24 + * The following escape sequences are used: 25 + * 26 + * "," => $C$ 27 + * "@" => $SP$ 28 + * "*" => $BP$ 29 + * "&" => $RF$ 30 + * "<" => $LT$ 31 + * ">" => $GT$ 32 + * "(" => $LP$ 33 + * ")" => $RP$ 34 + * " " => $u20$ 35 + * "'" => $u27$ 36 + * "[" => $u5b$ 37 + * "]" => $u5d$ 38 + * "~" => $u7e$ 39 + * 40 + * A double ".." means "::" and a single "." means "-". 41 + * 42 + * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$ 43 + */ 44 + 45 + static const char *hash_prefix = "::h"; 46 + static const size_t hash_prefix_len = 3; 47 + static const size_t hash_len = 16; 48 + 49 + static bool is_prefixed_hash(const char *start); 50 + static bool looks_like_rust(const char *sym, size_t len); 51 + static bool unescape(const char **in, char **out, const char *seq, char value); 52 + 53 + /* 54 + * INPUT: 55 + * sym: symbol that has been through BFD-demangling 56 + * 57 + * This function looks for the following indicators: 58 + * 59 + * 1. The hash must consist of "h" followed by 16 lowercase hex digits. 60 + * 61 + * 2. As a sanity check, the hash must use between 5 and 15 of the 16 possible 62 + * hex digits. 
This is true of 99.9998% of hashes so once in your life you 63 + * may see a false negative. The point is to notice path components that 64 + * could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In 65 + * this case a false positive (non-Rust symbol has an important path 66 + * component removed because it looks like a Rust hash) is worse than a 67 + * false negative (the rare Rust symbol is not demangled) so this sets the 68 + * balance in favor of false negatives. 69 + * 70 + * 3. There must be no characters other than a-zA-Z0-9 and _.:$ 71 + * 72 + * 4. There must be no unrecognized $-sign sequences. 73 + * 74 + * 5. There must be no sequence of three or more dots in a row ("..."). 75 + */ 76 + bool 77 + rust_is_mangled(const char *sym) 78 + { 79 + size_t len, len_without_hash; 80 + 81 + if (!sym) 82 + return false; 83 + 84 + len = strlen(sym); 85 + if (len <= hash_prefix_len + hash_len) 86 + /* Not long enough to contain "::h" + hash + something else */ 87 + return false; 88 + 89 + len_without_hash = len - (hash_prefix_len + hash_len); 90 + if (!is_prefixed_hash(sym + len_without_hash)) 91 + return false; 92 + 93 + return looks_like_rust(sym, len_without_hash); 94 + } 95 + 96 + /* 97 + * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex 98 + * digits must comprise between 5 and 15 (inclusive) distinct digits. 
99 + */ 100 + static bool is_prefixed_hash(const char *str) 101 + { 102 + const char *end; 103 + bool seen[16]; 104 + size_t i; 105 + int count; 106 + 107 + if (strncmp(str, hash_prefix, hash_prefix_len)) 108 + return false; 109 + str += hash_prefix_len; 110 + 111 + memset(seen, false, sizeof(seen)); 112 + for (end = str + hash_len; str < end; str++) 113 + if (*str >= '0' && *str <= '9') 114 + seen[*str - '0'] = true; 115 + else if (*str >= 'a' && *str <= 'f') 116 + seen[*str - 'a' + 10] = true; 117 + else 118 + return false; 119 + 120 + /* Count how many distinct digits seen */ 121 + count = 0; 122 + for (i = 0; i < 16; i++) 123 + if (seen[i]) 124 + count++; 125 + 126 + return count >= 5 && count <= 15; 127 + } 128 + 129 + static bool looks_like_rust(const char *str, size_t len) 130 + { 131 + const char *end = str + len; 132 + 133 + while (str < end) 134 + switch (*str) { 135 + case '$': 136 + if (!strncmp(str, "$C$", 3)) 137 + str += 3; 138 + else if (!strncmp(str, "$SP$", 4) 139 + || !strncmp(str, "$BP$", 4) 140 + || !strncmp(str, "$RF$", 4) 141 + || !strncmp(str, "$LT$", 4) 142 + || !strncmp(str, "$GT$", 4) 143 + || !strncmp(str, "$LP$", 4) 144 + || !strncmp(str, "$RP$", 4)) 145 + str += 4; 146 + else if (!strncmp(str, "$u20$", 5) 147 + || !strncmp(str, "$u27$", 5) 148 + || !strncmp(str, "$u5b$", 5) 149 + || !strncmp(str, "$u5d$", 5) 150 + || !strncmp(str, "$u7e$", 5)) 151 + str += 5; 152 + else 153 + return false; 154 + break; 155 + case '.': 156 + /* Do not allow three or more consecutive dots */ 157 + if (!strncmp(str, "...", 3)) 158 + return false; 159 + /* Fall through */ 160 + case 'a' ... 'z': 161 + case 'A' ... 'Z': 162 + case '0' ... 
'9': 163 + case '_': 164 + case ':': 165 + str++; 166 + break; 167 + default: 168 + return false; 169 + } 170 + 171 + return true; 172 + } 173 + 174 + /* 175 + * INPUT: 176 + * sym: symbol for which rust_is_mangled(sym) returns true 177 + * 178 + * The input is demangled in-place because the mangled name is always longer 179 + * than the demangled one. 180 + */ 181 + void 182 + rust_demangle_sym(char *sym) 183 + { 184 + const char *in; 185 + char *out; 186 + const char *end; 187 + 188 + if (!sym) 189 + return; 190 + 191 + in = sym; 192 + out = sym; 193 + end = sym + strlen(sym) - (hash_prefix_len + hash_len); 194 + 195 + while (in < end) 196 + switch (*in) { 197 + case '$': 198 + if (!(unescape(&in, &out, "$C$", ',') 199 + || unescape(&in, &out, "$SP$", '@') 200 + || unescape(&in, &out, "$BP$", '*') 201 + || unescape(&in, &out, "$RF$", '&') 202 + || unescape(&in, &out, "$LT$", '<') 203 + || unescape(&in, &out, "$GT$", '>') 204 + || unescape(&in, &out, "$LP$", '(') 205 + || unescape(&in, &out, "$RP$", ')') 206 + || unescape(&in, &out, "$u20$", ' ') 207 + || unescape(&in, &out, "$u27$", '\'') 208 + || unescape(&in, &out, "$u5b$", '[') 209 + || unescape(&in, &out, "$u5d$", ']') 210 + || unescape(&in, &out, "$u7e$", '~'))) { 211 + pr_err("demangle-rust: unexpected escape sequence"); 212 + goto done; 213 + } 214 + break; 215 + case '_': 216 + /* 217 + * If this is the start of a path component and the next 218 + * character is an escape sequence, ignore the 219 + * underscore. The mangler inserts an underscore to make 220 + * sure the path component begins with a XID_Start 221 + * character. 222 + */ 223 + if ((in == sym || in[-1] == ':') && in[1] == '$') 224 + in++; 225 + else 226 + *out++ = *in++; 227 + break; 228 + case '.': 229 + if (in[1] == '.') { 230 + /* ".." becomes "::" */ 231 + *out++ = ':'; 232 + *out++ = ':'; 233 + in += 2; 234 + } else { 235 + /* "." becomes "-" */ 236 + *out++ = '-'; 237 + in++; 238 + } 239 + break; 240 + case 'a' ... 
'z': 241 + case 'A' ... 'Z': 242 + case '0' ... '9': 243 + case ':': 244 + *out++ = *in++; 245 + break; 246 + default: 247 + pr_err("demangle-rust: unexpected character '%c' in symbol\n", 248 + *in); 249 + goto done; 250 + } 251 + 252 + done: 253 + *out = '\0'; 254 + } 255 + 256 + static bool unescape(const char **in, char **out, const char *seq, char value) 257 + { 258 + size_t len = strlen(seq); 259 + 260 + if (strncmp(*in, seq, len)) 261 + return false; 262 + 263 + **out = value; 264 + 265 + *in += len; 266 + *out += 1; 267 + 268 + return true; 269 + }
+7
tools/perf/util/demangle-rust.h
#ifndef __PERF_DEMANGLE_RUST
#define __PERF_DEMANGLE_RUST 1

/* The prototypes below use bool; include it so this header is self-contained. */
#include <stdbool.h>

/*
 * Return true when @str (a symbol that has already been through BFD
 * demangling) looks like a mangled Rust symbol.
 */
bool rust_is_mangled(const char *str);

/*
 * Demangle @str in place. @str should be a symbol for which
 * rust_is_mangled() returned true.
 */
void rust_demangle_sym(char *str);

#endif /* __PERF_DEMANGLE_RUST */
+8
tools/perf/util/symbol-elf.c
··· 7 7 8 8 #include "symbol.h" 9 9 #include "demangle-java.h" 10 + #include "demangle-rust.h" 10 11 #include "machine.h" 11 12 #include "vdso.h" 12 13 #include <symbol/kallsyms.h> ··· 1082 1081 demangled = bfd_demangle(NULL, elf_name, demangle_flags); 1083 1082 if (demangled == NULL) 1084 1083 demangled = java_demangle_sym(elf_name, JAVA_DEMANGLE_NORET); 1084 + else if (rust_is_mangled(demangled)) 1085 + /* 1086 + * Input to Rust demangling is the BFD-demangled 1087 + * name which it Rust-demangles in place. 1088 + */ 1089 + rust_demangle_sym(demangled); 1090 + 1085 1091 if (demangled != NULL) 1086 1092 elf_name = demangled; 1087 1093 }