Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0
2#include <ctype.h>
3#include <stdio.h>
4#include <stdlib.h>
5#include <string.h>
6#include <assert.h>
7#include <errno.h>
8#include <fcntl.h>
9#include <poll.h>
10#include <pthread.h>
11#include <unistd.h>
12#include <linux/perf_event.h>
13#include <linux/fs.h>
14#include <sys/ioctl.h>
15#include <sys/mman.h>
16#include "trace_helpers.h"
17#include <linux/limits.h>
18#include <libelf.h>
19#include <gelf.h>
20#include "bpf/hashmap.h"
21#include "bpf/libbpf_internal.h"
22#include "bpf_util.h"
23
24#define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe"
25#define DEBUGFS_PIPE "/sys/kernel/debug/tracing/trace_pipe"
26
/* In-memory copy of /proc/kallsyms, built by load_kallsyms_local_common()
 * and sorted with the comparator supplied by the loader.
 */
struct ksyms {
	struct ksym *syms;	/* dynamically grown array of symbols */
	size_t sym_cap;		/* allocated capacity, in elements */
	size_t sym_cnt;		/* number of valid entries in syms */
};
32
/* Process-wide kallsyms cache, lazily populated by load_kallsyms(). */
static struct ksyms *ksyms;
/* Serializes the one-time initialization of the cache above. */
static pthread_mutex_t ksyms_mutex = PTHREAD_MUTEX_INITIALIZER;
35
36static int ksyms__add_symbol(struct ksyms *ksyms, const char *name,
37 unsigned long addr)
38{
39 void *tmp;
40
41 tmp = strdup(name);
42 if (!tmp)
43 return -ENOMEM;
44 ksyms->syms[ksyms->sym_cnt].addr = addr;
45 ksyms->syms[ksyms->sym_cnt].name = tmp;
46 ksyms->sym_cnt++;
47 return 0;
48}
49
50void free_kallsyms_local(struct ksyms *ksyms)
51{
52 unsigned int i;
53
54 if (!ksyms)
55 return;
56
57 if (!ksyms->syms) {
58 free(ksyms);
59 return;
60 }
61
62 for (i = 0; i < ksyms->sym_cnt; i++)
63 free(ksyms->syms[i].name);
64 free(ksyms->syms);
65 free(ksyms);
66}
67
/* Parse /proc/kallsyms into a freshly allocated struct ksyms and sort
 * it with @cmp_cb via qsort(). Returns NULL on open/alloc/parse
 * failure; the caller owns the result and releases it with
 * free_kallsyms_local().
 */
static struct ksyms *load_kallsyms_local_common(ksym_cmp_t cmp_cb)
{
	FILE *f;
	char func[256], buf[256];
	char symbol;
	void *addr;
	int ret;
	struct ksyms *ksyms;

	f = fopen("/proc/kallsyms", "r");
	if (!f)
		return NULL;

	ksyms = calloc(1, sizeof(struct ksyms));
	if (!ksyms) {
		fclose(f);
		return NULL;
	}

	while (fgets(buf, sizeof(buf), f)) {
		/* each line looks like "<addr> <type> <name>[ [module]]" */
		if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
			break;
		/* skip zeroed addresses — presumably hidden by
		 * kptr_restrict; TODO confirm
		 */
		if (!addr)
			continue;

		/* grow the array so one more entry always fits */
		ret = libbpf_ensure_mem((void **) &ksyms->syms, &ksyms->sym_cap,
					sizeof(struct ksym), ksyms->sym_cnt + 1);
		if (ret)
			goto error;
		ret = ksyms__add_symbol(ksyms, func, (unsigned long)addr);
		if (ret)
			goto error;
	}
	fclose(f);
	qsort(ksyms->syms, ksyms->sym_cnt, sizeof(struct ksym), cmp_cb);
	return ksyms;

error:
	fclose(f);
	free_kallsyms_local(ksyms);
	return NULL;
}
110
111static int ksym_cmp(const void *p1, const void *p2)
112{
113 return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
114}
115
116struct ksyms *load_kallsyms_local(void)
117{
118 return load_kallsyms_local_common(ksym_cmp);
119}
120
121struct ksyms *load_kallsyms_custom_local(ksym_cmp_t cmp_cb)
122{
123 return load_kallsyms_local_common(cmp_cb);
124}
125
126int load_kallsyms(void)
127{
128 pthread_mutex_lock(&ksyms_mutex);
129 if (!ksyms)
130 ksyms = load_kallsyms_local();
131 pthread_mutex_unlock(&ksyms_mutex);
132 return ksyms ? 0 : 1;
133}
134
135struct ksym *ksym_search_local(struct ksyms *ksyms, long key)
136{
137 int start = 0, end = ksyms->sym_cnt;
138 int result;
139
140 /* kallsyms not loaded. return NULL */
141 if (ksyms->sym_cnt <= 0)
142 return NULL;
143
144 while (start < end) {
145 size_t mid = start + (end - start) / 2;
146
147 result = key - ksyms->syms[mid].addr;
148 if (result < 0)
149 end = mid;
150 else if (result > 0)
151 start = mid + 1;
152 else
153 return &ksyms->syms[mid];
154 }
155
156 if (start >= 1 && ksyms->syms[start - 1].addr < key &&
157 key < ksyms->syms[start].addr)
158 /* valid ksym */
159 return &ksyms->syms[start - 1];
160
161 /* out of range. return _stext */
162 return &ksyms->syms[0];
163}
164
165struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p,
166 ksym_search_cmp_t cmp_cb)
167{
168 int start = 0, mid, end = ksyms->sym_cnt;
169 struct ksym *ks;
170 int result;
171
172 while (start < end) {
173 mid = start + (end - start) / 2;
174 ks = &ksyms->syms[mid];
175 result = cmp_cb(p, ks);
176 if (result < 0)
177 end = mid;
178 else if (result > 0)
179 start = mid + 1;
180 else
181 return ks;
182 }
183
184 return NULL;
185}
186
187struct ksym *ksym_search(long key)
188{
189 if (!ksyms)
190 return NULL;
191 return ksym_search_local(ksyms, key);
192}
193
194long ksym_get_addr_local(struct ksyms *ksyms, const char *name)
195{
196 int i;
197
198 for (i = 0; i < ksyms->sym_cnt; i++) {
199 if (strcmp(ksyms->syms[i].name, name) == 0)
200 return ksyms->syms[i].addr;
201 }
202
203 return 0;
204}
205
206long ksym_get_addr(const char *name)
207{
208 if (!ksyms)
209 return 0;
210 return ksym_get_addr_local(ksyms, name);
211}
212
/* open kallsyms and read symbol addresses on the fly. Without caching all symbols,
 * this is faster than load + find.
 *
 * Returns 0 with *addr filled on success, -ENOENT when the symbol is
 * absent, -EINVAL when /proc/kallsyms cannot be opened.
 */
int kallsyms_find(const char *sym, unsigned long long *addr)
{
	char type, name[500], *match;
	unsigned long long value;
	int err = 0;
	FILE *f;

	f = fopen("/proc/kallsyms", "r");
	if (!f)
		return -EINVAL;

	while (fscanf(f, "%llx %c %499s%*[^\n]\n", &value, &type, name) > 0) {
		/* If CONFIG_LTO_CLANG_THIN is enabled, static variable/function
		 * symbols could be promoted to global due to cross-file inlining.
		 * For such cases, clang compiler will add .llvm.<hash> suffix
		 * to those symbols to avoid potential naming conflict.
		 * Let us ignore .llvm.<hash> suffix during symbol comparison.
		 * Strip it from every entry: the promotion affects functions
		 * and variables alike, so the stripping must not be limited
		 * to one symbol type (the old "type == 'd'" guard skipped
		 * suffixed functions entirely).
		 */
		match = strstr(name, ".llvm.");
		if (match)
			*match = '\0';
		if (strcmp(name, sym) == 0) {
			*addr = value;
			goto out;
		}
	}
	err = -ENOENT;

out:
	fclose(f);
	return err;
}
250
#ifdef PROCMAP_QUERY
/* Verbosity knob; __weak so the test binary can provide its own. */
int env_verbosity __weak = 0;

/* Ask the kernel which VMA contains @addr, via the PROCMAP_QUERY ioctl
 * on an open /proc/<pid>/maps fd. On success fills *start (VMA start),
 * *offset (file offset) and *flags (PROCMAP_QUERY_VMA_* bits) and
 * returns 0. Returns -EOPNOTSUPP when the kernel lacks the ioctl,
 * -ESRCH when no VMA covers @addr, another -errno otherwise.
 */
static int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags)
{
	char path_buf[PATH_MAX], build_id_buf[20];
	struct procmap_query q;
	int err;

	memset(&q, 0, sizeof(q));
	q.size = sizeof(q);
	q.query_flags = query_flags;
	q.query_addr = (__u64)addr;
	q.vma_name_addr = (__u64)path_buf;
	q.vma_name_size = sizeof(path_buf);
	q.build_id_addr = (__u64)build_id_buf;
	q.build_id_size = sizeof(build_id_buf);

	err = ioctl(fd, PROCMAP_QUERY, &q);
	if (err < 0) {
		err = -errno;
		if (err == -ENOTTY)
			return -EOPNOTSUPP; /* ioctl() not implemented yet */
		if (err == -ENOENT)
			return -ESRCH; /* vma not found */
		return err;
	}

	if (env_verbosity >= 1) {
		printf("VMA FOUND (addr %08lx): %08lx-%08lx %c%c%c%c %08lx %02x:%02x %ld %s (build ID: %s, %d bytes)\n",
		       (long)addr, (long)q.vma_start, (long)q.vma_end,
		       (q.vma_flags & PROCMAP_QUERY_VMA_READABLE) ? 'r' : '-',
		       (q.vma_flags & PROCMAP_QUERY_VMA_WRITABLE) ? 'w' : '-',
		       (q.vma_flags & PROCMAP_QUERY_VMA_EXECUTABLE) ? 'x' : '-',
		       (q.vma_flags & PROCMAP_QUERY_VMA_SHARED) ? 's' : 'p',
		       (long)q.vma_offset, q.dev_major, q.dev_minor, (long)q.inode,
		       q.vma_name_size ? path_buf : "",
		       q.build_id_size ? "YES" : "NO",
		       q.build_id_size);
	}

	*start = q.vma_start;
	*offset = q.vma_offset;
	*flags = q.vma_flags;
	return 0;
}
#else
# ifndef PROCMAP_QUERY_VMA_EXECUTABLE
# define PROCMAP_QUERY_VMA_EXECUTABLE 0x04
# endif

/* Stub used when the headers lack PROCMAP_QUERY: callers fall back to
 * text parsing of /proc/<pid>/maps.
 */
static int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags)
{
	return -EOPNOTSUPP;
}
#endif
307
/* Translate a code address in the current process into the file-relative
 * offset (addr - vma_start + vma_offset) used for uprobe attachment.
 * Tries the PROCMAP_QUERY ioctl first (restricted to executable VMAs),
 * falling back to parsing the text of /proc/self/maps when the ioctl is
 * unsupported. Returns the offset or a negative errno.
 */
ssize_t get_uprobe_offset(const void *addr)
{
	size_t start, base, end;
	FILE *f;
	char buf[256];
	int err, flags;

	f = fopen("/proc/self/maps", "r");
	if (!f)
		return -errno;

	/* requested executable VMA only */
	err = procmap_query(fileno(f), addr, PROCMAP_QUERY_VMA_EXECUTABLE, &start, &base, &flags);
	if (err == -EOPNOTSUPP) {
		bool found = false;

		/* fallback: scan maps lines for an executable mapping
		 * (third permission character is 'x') covering addr
		 */
		while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) {
			if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) {
				found = true;
				break;
			}
		}
		if (!found) {
			fclose(f);
			return -ESRCH;
		}
	} else if (err) {
		fclose(f);
		return err;
	}
	fclose(f);

#if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2

#define OP_RT_RA_MASK 0xffff0000UL
#define LIS_R2 0x3c400000UL
#define ADDIS_R2_R12 0x3c4c0000UL
#define ADDI_R2_R2 0x38420000UL

	/*
	 * A PPC64 ABIv2 function may have a local and a global entry
	 * point. We need to use the local entry point when patching
	 * functions, so identify and step over the global entry point
	 * sequence.
	 *
	 * The global entry point sequence is always of the form:
	 *
	 * addis r2,r12,XXXX
	 * addi r2,r2,XXXX
	 *
	 * A linker optimisation may convert the addis to lis:
	 *
	 * lis r2,XXXX
	 * addi r2,r2,XXXX
	 */
	{
		const __u32 *insn = (const __u32 *)(uintptr_t)addr;

		if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
		     ((*insn & OP_RT_RA_MASK) == LIS_R2)) &&
		    ((*(insn + 1) & OP_RT_RA_MASK) == ADDI_R2_R2))
			return (uintptr_t)(insn + 2) - start + base;
	}
#endif
	return (uintptr_t)addr - start + base;
}
374
/* Translate a virtual address in the current process into a relative
 * file offset (addr - vma_start + vma_offset). Prefers the
 * PROCMAP_QUERY ioctl and falls back to parsing /proc/self/maps text
 * when the ioctl is unsupported.
 * Returns the offset, a negative errno from the ioctl, or -EINVAL
 * when no mapping covers @addr.
 */
ssize_t get_rel_offset(uintptr_t addr)
{
	size_t start, end, offset;
	char buf[256];
	FILE *f;
	int err, flags;

	f = fopen("/proc/self/maps", "r");
	if (!f)
		return -errno;

	err = procmap_query(fileno(f), (const void *)addr, 0, &start, &offset, &flags);
	if (err == 0) {
		fclose(f);
		return (size_t)addr - start + offset;
	}
	if (err != -EOPNOTSUPP) {
		/* real ioctl failure */
		fclose(f);
		return err;
	}

	/* ioctl unsupported: scan the text form of the maps file.
	 * The previous trailing "else if (err)" guard here was always
	 * true (err is -EOPNOTSUPP at this point), so it was dropped.
	 */
	while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &offset) == 4) {
		if (addr >= start && addr < end) {
			fclose(f);
			return (size_t)addr - start + offset;
		}
	}

	fclose(f);
	return -EINVAL;
}
405
/* Scan an ELF note buffer for the GNU build-id note (n_type == 3,
 * i.e. NT_GNU_BUILD_ID). On success copies the descriptor into
 * @build_id, zero-pads up to BPF_BUILD_ID_SIZE, and returns the
 * descriptor's real length. Returns -ENOENT when no such note exists.
 */
static int
parse_build_id_buf(const void *note_start, Elf32_Word note_size, char *build_id)
{
	Elf32_Word note_offs = 0;

	/* walk 4-byte-aligned note records until the buffer ends */
	while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
		Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);

		/* owner name must be exactly "GNU" (namesz includes NUL) */
		if (nhdr->n_type == 3 && nhdr->n_namesz == sizeof("GNU") &&
		    !strcmp((char *)(nhdr + 1), "GNU") && nhdr->n_descsz > 0 &&
		    nhdr->n_descsz <= BPF_BUILD_ID_SIZE) {
			/* descriptor follows the header and padded name */
			memcpy(build_id, note_start + note_offs +
			       ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), nhdr->n_descsz);
			memset(build_id + nhdr->n_descsz, 0, BPF_BUILD_ID_SIZE - nhdr->n_descsz);
			return (int) nhdr->n_descsz;
		}

		/* advance past the header, padded name, padded descriptor */
		note_offs = note_offs + sizeof(Elf32_Nhdr) +
			ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
	}

	return -ENOENT;
}
429
/* Reads binary from *path* file and returns it in the *build_id* buffer
 * with *size* which is expected to be at least BPF_BUILD_ID_SIZE bytes.
 * Returns size of build id on success. On error the error value is
 * returned.
 */
int read_build_id(const char *path, char *build_id, size_t size)
{
	int fd, err = -EINVAL;
	Elf *elf = NULL;
	GElf_Ehdr ehdr;
	size_t max, i;

	if (size < BPF_BUILD_ID_SIZE)
		return -EINVAL;

	fd = open(path, O_RDONLY | O_CLOEXEC);
	if (fd < 0)
		return -errno;

	/* result ignored: any incompatibility surfaces via elf_begin() */
	(void)elf_version(EV_CURRENT);

	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
	if (!elf)
		goto out;
	if (elf_kind(elf) != ELF_K_ELF)
		goto out;
	if (!gelf_getehdr(elf, &ehdr))
		goto out;

	/* probe every PT_NOTE program header for a build-id note */
	for (i = 0; i < ehdr.e_phnum; i++) {
		GElf_Phdr mem, *phdr;
		char *data;

		phdr = gelf_getphdr(elf, i, &mem);
		if (!phdr)
			goto out;
		if (phdr->p_type != PT_NOTE)
			continue;
		data = elf_rawfile(elf, &max);
		if (!data)
			goto out;
		/* bounds check before touching the raw file image */
		if (phdr->p_offset + phdr->p_memsz > max)
			goto out;
		err = parse_build_id_buf(data + phdr->p_offset, phdr->p_memsz, build_id);
		if (err > 0)
			break;
	}

out:
	if (elf)
		elf_end(elf);
	close(fd);
	return err;
}
484
485int read_trace_pipe_iter(void (*cb)(const char *str, void *data), void *data, int iter)
486{
487 size_t buflen, n;
488 char *buf = NULL;
489 FILE *fp = NULL;
490
491 if (access(TRACEFS_PIPE, F_OK) == 0)
492 fp = fopen(TRACEFS_PIPE, "r");
493 else
494 fp = fopen(DEBUGFS_PIPE, "r");
495 if (!fp)
496 return -1;
497
498 /* We do not want to wait forever when iter is specified. */
499 if (iter)
500 fcntl(fileno(fp), F_SETFL, O_NONBLOCK);
501
502 while ((n = getline(&buf, &buflen, fp) >= 0) || errno == EAGAIN) {
503 if (n > 0)
504 cb(buf, data);
505 if (iter && !(--iter))
506 break;
507 }
508
509 free(buf);
510 if (fp)
511 fclose(fp);
512 return 0;
513}
514
/* read_trace_pipe() callback: echo each trace line to stdout. */
static void trace_pipe_cb(const char *str, void *data)
{
	fputs(str, stdout);
}
519
520void read_trace_pipe(void)
521{
522 read_trace_pipe_iter(trace_pipe_cb, NULL, 0);
523}
524
525static size_t symbol_hash(long key, void *ctx __maybe_unused)
526{
527 return str_hash((const char *) key);
528}
529
530static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused)
531{
532 return strcmp((const char *) key1, (const char *) key2) == 0;
533}
534
/* A filter-functions line is usable only when its module annotation
 * ("[module]") matches what the caller asked for: kernel symbols must
 * carry no bracket, module symbols must carry one.
 */
static bool is_invalid_entry(char *buf, bool kernel)
{
	bool in_module = strchr(buf, '[') != NULL;

	return kernel ? in_module : !in_module;
}
543
/* Symbols filtered out by skip_entry(); attaching to these low-level
 * helpers is avoided (see the comment in skip_entry()).
 */
static const char * const trace_blacklist[] = {
	"migrate_disable",
	"migrate_enable",
	"rcu_read_unlock_strict",
	"preempt_count_add",
	"preempt_count_sub",
	"__rcu_read_lock",
	"__rcu_read_unlock",
	"bpf_get_numa_node_id",
};
554
555static bool skip_entry(char *name)
556{
557 int i;
558
559 /*
560 * We attach to almost all kernel functions and some of them
561 * will cause 'suspicious RCU usage' when fprobe is attached
562 * to them. Filter out the current culprits - arch_cpu_idle
563 * default_idle and rcu_* functions.
564 */
565 if (!strcmp(name, "arch_cpu_idle"))
566 return true;
567 if (!strcmp(name, "default_idle"))
568 return true;
569 if (!strncmp(name, "rcu_", 4))
570 return true;
571 if (!strcmp(name, "bpf_dispatcher_xdp_func"))
572 return true;
573 if (!strncmp(name, "__ftrace_invalid_address__",
574 sizeof("__ftrace_invalid_address__") - 1))
575 return true;
576
577 for (i = 0; i < ARRAY_SIZE(trace_blacklist); i++) {
578 if (!strcmp(name, trace_blacklist[i]))
579 return true;
580 }
581
582 return false;
583}
584
/* Do comparison by ignoring '.llvm.<hash>' suffixes (appended by clang
 * thin-LTO). Returns <0, 0 or >0 like strcmp().
 */
static int compare_name(const char *name1, const char *name2)
{
	const char *suffix1 = strstr(name1, ".llvm.");
	const char *suffix2 = strstr(name2, ".llvm.");
	int base1 = suffix1 ? (int)(suffix1 - name1) : (int)strlen(name1);
	int base2 = suffix2 ? (int)(suffix2 - name2) : (int)strlen(name2);

	if (base1 == base2)
		return strncmp(name1, name2, base1);

	/* different base lengths: on a tied common prefix the shorter
	 * name sorts first
	 */
	if (base1 < base2)
		return strncmp(name1, name2, base1) <= 0 ? -1 : 1;
	return strncmp(name1, name2, base2) >= 0 ? 1 : -1;
}
602
603static int load_kallsyms_compare(const void *p1, const void *p2)
604{
605 return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name);
606}
607
608static int search_kallsyms_compare(const void *p1, const struct ksym *p2)
609{
610 return compare_name(p1, p2->name);
611}
612
613int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel)
614{
615 size_t cap = 0, cnt = 0;
616 char *name = NULL, *ksym_name, **syms = NULL;
617 struct hashmap *map;
618 struct ksyms *ksyms;
619 struct ksym *ks;
620 char buf[256];
621 FILE *f;
622 int err = 0;
623
624 ksyms = load_kallsyms_custom_local(load_kallsyms_compare);
625 if (!ksyms)
626 return -EINVAL;
627
628 /*
629 * The available_filter_functions contains many duplicates,
630 * but other than that all symbols are usable to trace.
631 * Filtering out duplicates by using hashmap__add, which won't
632 * add existing entry.
633 */
634
635 if (access("/sys/kernel/tracing/trace", F_OK) == 0)
636 f = fopen("/sys/kernel/tracing/available_filter_functions", "r");
637 else
638 f = fopen("/sys/kernel/debug/tracing/available_filter_functions", "r");
639
640 if (!f)
641 return -EINVAL;
642
643 map = hashmap__new(symbol_hash, symbol_equal, NULL);
644 if (IS_ERR(map)) {
645 err = libbpf_get_error(map);
646 goto error;
647 }
648
649 while (fgets(buf, sizeof(buf), f)) {
650 if (is_invalid_entry(buf, kernel))
651 continue;
652
653 free(name);
654 if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1)
655 continue;
656 if (skip_entry(name))
657 continue;
658
659 ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare);
660 if (!ks) {
661 err = -EINVAL;
662 goto error;
663 }
664
665 ksym_name = ks->name;
666 err = hashmap__add(map, ksym_name, 0);
667 if (err == -EEXIST) {
668 err = 0;
669 continue;
670 }
671 if (err)
672 goto error;
673
674 err = libbpf_ensure_mem((void **) &syms, &cap,
675 sizeof(*syms), cnt + 1);
676 if (err)
677 goto error;
678
679 syms[cnt++] = ksym_name;
680 }
681
682 *symsp = syms;
683 *cntp = cnt;
684
685error:
686 free(name);
687 fclose(f);
688 hashmap__free(map);
689 if (err)
690 free(syms);
691 return err;
692}
693
/* Collect addresses of traceable functions from
 * available_filter_functions_addrs (tracefs, falling back to debugfs).
 * @kernel selects kernel (true) vs module (false) entries. On success
 * *addrsp/*cntp receive a malloc'ed array owned by the caller.
 * Returns 0 on success or a negative errno.
 */
int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel)
{
	unsigned long *addr, *addrs, *tmp_addrs;
	int err = 0, max_cnt, inc_cnt;
	char *name = NULL;
	size_t cnt = 0;
	char buf[256];
	FILE *f;

	if (access("/sys/kernel/tracing/trace", F_OK) == 0)
		f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r");
	else
		f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r");

	if (!f)
		return -ENOENT;

	/* In my local setup, the number of entries is 50k+ so Let us initially
	 * allocate space to hold 64k entries. If 64k is not enough, incrementally
	 * increase 1k each time.
	 */
	max_cnt = 65536;
	inc_cnt = 1024;
	addrs = malloc(max_cnt * sizeof(long));
	if (addrs == NULL) {
		err = -ENOMEM;
		goto error;
	}

	while (fgets(buf, sizeof(buf), f)) {
		if (is_invalid_entry(buf, kernel))
			continue;

		/* release the string %ms allocated on the previous round */
		free(name);
		if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2)
			continue;
		if (skip_entry(name))
			continue;

		/* array full: grow via a temp so addrs stays valid on failure */
		if (cnt == max_cnt) {
			max_cnt += inc_cnt;
			tmp_addrs = realloc(addrs, max_cnt * sizeof(long));
			if (!tmp_addrs) {
				err = -ENOMEM;
				goto error;
			}
			addrs = tmp_addrs;
		}

		addrs[cnt++] = (unsigned long)addr;
	}

	*addrsp = addrs;
	*cntp = cnt;

	/* success also flows through here with err == 0 */
error:
	free(name);
	fclose(f);
	if (err)
		free(addrs);
	return err;
}