at v4.16-rc3 697 lines 17 kB view raw
1// SPDX-License-Identifier: GPL-2.0 2#include <stdio.h> 3#include <sys/types.h> 4#include <sys/stat.h> 5#include <fcntl.h> 6#include <libelf.h> 7#include <gelf.h> 8#include <errno.h> 9#include <unistd.h> 10#include <string.h> 11#include <stdbool.h> 12#include <stdlib.h> 13#include <linux/bpf.h> 14#include <linux/filter.h> 15#include <linux/perf_event.h> 16#include <linux/netlink.h> 17#include <linux/rtnetlink.h> 18#include <linux/types.h> 19#include <sys/types.h> 20#include <sys/socket.h> 21#include <sys/syscall.h> 22#include <sys/ioctl.h> 23#include <sys/mman.h> 24#include <poll.h> 25#include <ctype.h> 26#include <assert.h> 27#include "libbpf.h" 28#include "bpf_load.h" 29#include "perf-sys.h" 30 31#define DEBUGFS "/sys/kernel/debug/tracing/" 32 33static char license[128]; 34static int kern_version; 35static bool processed_sec[128]; 36char bpf_log_buf[BPF_LOG_BUF_SIZE]; 37int map_fd[MAX_MAPS]; 38int prog_fd[MAX_PROGS]; 39int event_fd[MAX_PROGS]; 40int prog_cnt; 41int prog_array_fd = -1; 42 43struct bpf_map_data map_data[MAX_MAPS]; 44int map_data_count = 0; 45 46static int populate_prog_array(const char *event, int prog_fd) 47{ 48 int ind = atoi(event), err; 49 50 err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY); 51 if (err < 0) { 52 printf("failed to store prog_fd in prog_array\n"); 53 return -1; 54 } 55 return 0; 56} 57 58static int load_and_attach(const char *event, struct bpf_insn *prog, int size) 59{ 60 bool is_socket = strncmp(event, "socket", 6) == 0; 61 bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; 62 bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; 63 bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; 64 bool is_xdp = strncmp(event, "xdp", 3) == 0; 65 bool is_perf_event = strncmp(event, "perf_event", 10) == 0; 66 bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; 67 bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; 68 bool is_sockops = strncmp(event, "sockops", 7) == 0; 69 bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0; 70 size_t insns_cnt = size / sizeof(struct bpf_insn); 71 enum bpf_prog_type prog_type; 72 char buf[256]; 73 int fd, efd, err, id; 74 struct perf_event_attr attr = {}; 75 76 attr.type = PERF_TYPE_TRACEPOINT; 77 attr.sample_type = PERF_SAMPLE_RAW; 78 attr.sample_period = 1; 79 attr.wakeup_events = 1; 80 81 if (is_socket) { 82 prog_type = BPF_PROG_TYPE_SOCKET_FILTER; 83 } else if (is_kprobe || is_kretprobe) { 84 prog_type = BPF_PROG_TYPE_KPROBE; 85 } else if (is_tracepoint) { 86 prog_type = BPF_PROG_TYPE_TRACEPOINT; 87 } else if (is_xdp) { 88 prog_type = BPF_PROG_TYPE_XDP; 89 } else if (is_perf_event) { 90 prog_type = BPF_PROG_TYPE_PERF_EVENT; 91 } else if (is_cgroup_skb) { 92 prog_type = BPF_PROG_TYPE_CGROUP_SKB; 93 } else if (is_cgroup_sk) { 94 prog_type = BPF_PROG_TYPE_CGROUP_SOCK; 95 } else if (is_sockops) { 96 prog_type = BPF_PROG_TYPE_SOCK_OPS; 97 } else if (is_sk_skb) { 98 prog_type = BPF_PROG_TYPE_SK_SKB; 99 } else { 100 printf("Unknown event '%s'\n", event); 101 return -1; 102 } 103 104 fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version, 105 bpf_log_buf, BPF_LOG_BUF_SIZE); 106 if (fd < 0) { 107 printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf); 108 return -1; 109 } 110 111 prog_fd[prog_cnt++] = fd; 112 113 if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) 114 return 0; 115 116 if (is_socket || is_sockops || is_sk_skb) { 117 if (is_socket) 118 event += 6; 119 else 120 event += 7; 121 if (*event != '/') 122 return 0; 123 event++; 124 if (!isdigit(*event)) { 125 printf("invalid prog number\n"); 126 return -1; 127 } 128 return populate_prog_array(event, fd); 129 } 130 131 if (is_kprobe || is_kretprobe) { 132 if (is_kprobe) 133 event += 7; 134 else 135 event += 10; 136 137 if (*event == 0) { 138 printf("event name cannot be empty\n"); 139 return -1; 140 } 141 142 if (isdigit(*event)) 143 return populate_prog_array(event, fd); 144 145 snprintf(buf, sizeof(buf), 146 "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events", 147 is_kprobe ? 'p' : 'r', event, event); 148 err = system(buf); 149 if (err < 0) { 150 printf("failed to create kprobe '%s' error '%s'\n", 151 event, strerror(errno)); 152 return -1; 153 } 154 155 strcpy(buf, DEBUGFS); 156 strcat(buf, "events/kprobes/"); 157 strcat(buf, event); 158 strcat(buf, "/id"); 159 } else if (is_tracepoint) { 160 event += 11; 161 162 if (*event == 0) { 163 printf("event name cannot be empty\n"); 164 return -1; 165 } 166 strcpy(buf, DEBUGFS); 167 strcat(buf, "events/"); 168 strcat(buf, event); 169 strcat(buf, "/id"); 170 } 171 172 efd = open(buf, O_RDONLY, 0); 173 if (efd < 0) { 174 printf("failed to open event %s\n", event); 175 return -1; 176 } 177 178 err = read(efd, buf, sizeof(buf)); 179 if (err < 0 || err >= sizeof(buf)) { 180 printf("read from '%s' failed '%s'\n", event, strerror(errno)); 181 return -1; 182 } 183 184 close(efd); 185 186 buf[err] = 0; 187 id = atoi(buf); 188 attr.config = id; 189 190 efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0); 191 if (efd < 0) { 192 printf("event %d fd %d err %s\n", id, efd, strerror(errno)); 193 return -1; 194 } 195 event_fd[prog_cnt - 1] = efd; 196 err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0); 197 if (err < 0) { 198 printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n", 199 strerror(errno)); 200 return -1; 201 } 202 err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd); 203 if (err < 0) { 204 printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n", 205 strerror(errno)); 206 return -1; 207 } 208 209 return 0; 210} 211 212static int load_maps(struct bpf_map_data *maps, int nr_maps, 213 fixup_map_cb fixup_map) 214{ 215 int i, numa_node; 216 217 for (i = 0; i < nr_maps; i++) { 218 if (fixup_map) { 219 fixup_map(&maps[i], i); 220 /* Allow userspace to assign map FD prior to creation */ 221 if (maps[i].fd != -1) { 222 map_fd[i] = maps[i].fd; 223 continue; 224 } 225 } 226 227 numa_node = maps[i].def.map_flags & BPF_F_NUMA_NODE ? 228 maps[i].def.numa_node : -1; 229 230 if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || 231 maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) { 232 int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; 233 234 map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type, 235 maps[i].name, 236 maps[i].def.key_size, 237 inner_map_fd, 238 maps[i].def.max_entries, 239 maps[i].def.map_flags, 240 numa_node); 241 } else { 242 map_fd[i] = bpf_create_map_node(maps[i].def.type, 243 maps[i].name, 244 maps[i].def.key_size, 245 maps[i].def.value_size, 246 maps[i].def.max_entries, 247 maps[i].def.map_flags, 248 numa_node); 249 } 250 if (map_fd[i] < 0) { 251 printf("failed to create a map: %d %s\n", 252 errno, strerror(errno)); 253 return 1; 254 } 255 maps[i].fd = map_fd[i]; 256 257 if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY) 258 prog_array_fd = map_fd[i]; 259 } 260 return 0; 261} 262 263static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, 264 GElf_Shdr *shdr, Elf_Data **data) 265{ 266 Elf_Scn *scn; 267 268 scn = elf_getscn(elf, i); 269 if (!scn) 270 return 1; 271 272 if (gelf_getshdr(scn, shdr) != shdr) 273 return 2; 274 275 *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name); 276 if (!*shname || !shdr->sh_size) 277 return 3; 278 279 *data = elf_getdata(scn, 0); 280 if (!*data || elf_getdata(scn, *data) != NULL) 281 return 4; 282 283 return 0; 284} 285 286static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, 287 GElf_Shdr *shdr, struct bpf_insn *insn, 288 struct bpf_map_data *maps, int nr_maps) 289{ 290 int i, nrels; 291 292 nrels = shdr->sh_size / shdr->sh_entsize; 293 294 for (i = 0; i < nrels; i++) { 295 GElf_Sym sym; 296 GElf_Rel rel; 297 unsigned int insn_idx; 298 bool match = false; 299 int j, map_idx; 300 301 gelf_getrel(data, i, &rel); 302 303 insn_idx = rel.r_offset / sizeof(struct bpf_insn); 304 305 gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym); 306 307 if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { 308 printf("invalid relo for insn[%d].code 0x%x\n", 309 insn_idx, insn[insn_idx].code); 310 return 1; 311 } 312 insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; 313 314 /* Match FD relocation against recorded map_data[] offset */ 315 for (map_idx = 0; map_idx < nr_maps; map_idx++) { 316 if (maps[map_idx].elf_offset == sym.st_value) { 317 match = true; 318 break; 319 } 320 } 321 if (match) { 322 insn[insn_idx].imm = maps[map_idx].fd; 323 } else { 324 printf("invalid relo for insn[%d] no map_data match\n", 325 insn_idx); 326 return 1; 327 } 328 } 329 330 return 0; 331} 332 333static int cmp_symbols(const void *l, const void *r) 334{ 335 const GElf_Sym *lsym = (const GElf_Sym *)l; 336 const GElf_Sym *rsym = (const GElf_Sym *)r; 337 338 if (lsym->st_value < rsym->st_value) 339 return -1; 340 else if (lsym->st_value > rsym->st_value) 341 return 1; 342 else 343 return 0; 344} 345 346static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, 347 Elf *elf, Elf_Data *symbols, int strtabidx) 348{ 349 int map_sz_elf, map_sz_copy; 350 bool validate_zero = false; 351 Elf_Data *data_maps; 352 int i, nr_maps; 353 GElf_Sym *sym; 354 Elf_Scn *scn; 355 int copy_sz; 356 357 if (maps_shndx < 0) 358 return -EINVAL; 359 if (!symbols) 360 return -EINVAL; 361 362 /* Get data for maps section via elf index */ 363 scn = elf_getscn(elf, maps_shndx); 364 if (scn) 365 data_maps = elf_getdata(scn, NULL); 366 if (!scn || !data_maps) { 367 printf("Failed to get Elf_Data from maps section %d\n", 368 maps_shndx); 369 return -EINVAL; 370 } 371 372 /* For each map get corrosponding symbol table entry */ 373 sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym)); 374 for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { 375 assert(nr_maps < MAX_MAPS+1); 376 if (!gelf_getsym(symbols, i, &sym[nr_maps])) 377 continue; 378 if (sym[nr_maps].st_shndx != maps_shndx) 379 continue; 380 /* Only increment iif maps section */ 381 nr_maps++; 382 } 383 384 /* Align to map_fd[] order, via sort on offset in sym.st_value */ 385 qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols); 386 387 /* Keeping compatible with ELF maps section changes 388 * ------------------------------------------------ 389 * The program size of struct bpf_map_def is known by loader 390 * code, but struct stored in ELF file can be different. 391 * 392 * Unfortunately sym[i].st_size is zero. To calculate the 393 * struct size stored in the ELF file, assume all struct have 394 * the same size, and simply divide with number of map 395 * symbols. 396 */ 397 map_sz_elf = data_maps->d_size / nr_maps; 398 map_sz_copy = sizeof(struct bpf_map_def); 399 if (map_sz_elf < map_sz_copy) { 400 /* 401 * Backward compat, loading older ELF file with 402 * smaller struct, keeping remaining bytes zero. 403 */ 404 map_sz_copy = map_sz_elf; 405 } else if (map_sz_elf > map_sz_copy) { 406 /* 407 * Forward compat, loading newer ELF file with larger 408 * struct with unknown features. Assume zero means 409 * feature not used. Thus, validate rest of struct 410 * data is zero. 411 */ 412 validate_zero = true; 413 } 414 415 /* Memcpy relevant part of ELF maps data to loader maps */ 416 for (i = 0; i < nr_maps; i++) { 417 unsigned char *addr, *end; 418 struct bpf_map_def *def; 419 const char *map_name; 420 size_t offset; 421 422 map_name = elf_strptr(elf, strtabidx, sym[i].st_name); 423 maps[i].name = strdup(map_name); 424 if (!maps[i].name) { 425 printf("strdup(%s): %s(%d)\n", map_name, 426 strerror(errno), errno); 427 free(sym); 428 return -errno; 429 } 430 431 /* Symbol value is offset into ELF maps section data area */ 432 offset = sym[i].st_value; 433 def = (struct bpf_map_def *)(data_maps->d_buf + offset); 434 maps[i].elf_offset = offset; 435 memset(&maps[i].def, 0, sizeof(struct bpf_map_def)); 436 memcpy(&maps[i].def, def, map_sz_copy); 437 438 /* Verify no newer features were requested */ 439 if (validate_zero) { 440 addr = (unsigned char*) def + map_sz_copy; 441 end = (unsigned char*) def + map_sz_elf; 442 for (; addr < end; addr++) { 443 if (*addr != 0) { 444 free(sym); 445 return -EFBIG; 446 } 447 } 448 } 449 } 450 451 free(sym); 452 return nr_maps; 453} 454 455static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) 456{ 457 int fd, i, ret, maps_shndx = -1, strtabidx = -1; 458 Elf *elf; 459 GElf_Ehdr ehdr; 460 GElf_Shdr shdr, shdr_prog; 461 Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL; 462 char *shname, *shname_prog; 463 int nr_maps = 0; 464 465 /* reset global variables */ 466 kern_version = 0; 467 memset(license, 0, sizeof(license)); 468 memset(processed_sec, 0, sizeof(processed_sec)); 469 470 if (elf_version(EV_CURRENT) == EV_NONE) 471 return 1; 472 473 fd = open(path, O_RDONLY, 0); 474 if (fd < 0) 475 return 1; 476 477 elf = elf_begin(fd, ELF_C_READ, NULL); 478 479 if (!elf) 480 return 1; 481 482 if (gelf_getehdr(elf, &ehdr) != &ehdr) 483 return 1; 484 485 /* clear all kprobes */ 486 i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events"); 487 488 /* scan over all elf sections to get license and map info */ 489 for (i = 1; i < ehdr.e_shnum; i++) { 490 491 if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) 492 continue; 493 494 if (0) /* helpful for llvm debugging */ 495 printf("section %d:%s data %p size %zd link %d flags %d\n", 496 i, shname, data->d_buf, data->d_size, 497 shdr.sh_link, (int) shdr.sh_flags); 498 499 if (strcmp(shname, "license") == 0) { 500 processed_sec[i] = true; 501 memcpy(license, data->d_buf, data->d_size); 502 } else if (strcmp(shname, "version") == 0) { 503 processed_sec[i] = true; 504 if (data->d_size != sizeof(int)) { 505 printf("invalid size of version section %zd\n", 506 data->d_size); 507 return 1; 508 } 509 memcpy(&kern_version, data->d_buf, sizeof(int)); 510 } else if (strcmp(shname, "maps") == 0) { 511 int j; 512 513 maps_shndx = i; 514 data_maps = data; 515 for (j = 0; j < MAX_MAPS; j++) 516 map_data[j].fd = -1; 517 } else if (shdr.sh_type == SHT_SYMTAB) { 518 strtabidx = shdr.sh_link; 519 symbols = data; 520 } 521 } 522 523 ret = 1; 524 525 if (!symbols) { 526 printf("missing SHT_SYMTAB section\n"); 527 goto done; 528 } 529 530 if (data_maps) { 531 nr_maps = load_elf_maps_section(map_data, maps_shndx, 532 elf, symbols, strtabidx); 533 if (nr_maps < 0) { 534 printf("Error: Failed loading ELF maps (errno:%d):%s\n", 535 nr_maps, strerror(-nr_maps)); 536 ret = 1; 537 goto done; 538 } 539 if (load_maps(map_data, nr_maps, fixup_map)) 540 goto done; 541 map_data_count = nr_maps; 542 543 processed_sec[maps_shndx] = true; 544 } 545 546 /* process all relo sections, and rewrite bpf insns for maps */ 547 for (i = 1; i < ehdr.e_shnum; i++) { 548 if (processed_sec[i]) 549 continue; 550 551 if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) 552 continue; 553 554 if (shdr.sh_type == SHT_REL) { 555 struct bpf_insn *insns; 556 557 /* locate prog sec that need map fixup (relocations) */ 558 if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog, 559 &shdr_prog, &data_prog)) 560 continue; 561 562 if (shdr_prog.sh_type != SHT_PROGBITS || 563 !(shdr_prog.sh_flags & SHF_EXECINSTR)) 564 continue; 565 566 insns = (struct bpf_insn *) data_prog->d_buf; 567 processed_sec[i] = true; /* relo section */ 568 569 if (parse_relo_and_apply(data, symbols, &shdr, insns, 570 map_data, nr_maps)) 571 continue; 572 } 573 } 574 575 /* load programs */ 576 for (i = 1; i < ehdr.e_shnum; i++) { 577 578 if (processed_sec[i]) 579 continue; 580 581 if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) 582 continue; 583 584 if (memcmp(shname, "kprobe/", 7) == 0 || 585 memcmp(shname, "kretprobe/", 10) == 0 || 586 memcmp(shname, "tracepoint/", 11) == 0 || 587 memcmp(shname, "xdp", 3) == 0 || 588 memcmp(shname, "perf_event", 10) == 0 || 589 memcmp(shname, "socket", 6) == 0 || 590 memcmp(shname, "cgroup/", 7) == 0 || 591 memcmp(shname, "sockops", 7) == 0 || 592 memcmp(shname, "sk_skb", 6) == 0) { 593 ret = load_and_attach(shname, data->d_buf, 594 data->d_size); 595 if (ret != 0) 596 goto done; 597 } 598 } 599 600 ret = 0; 601done: 602 close(fd); 603 return ret; 604} 605 606int load_bpf_file(char *path) 607{ 608 return do_load_bpf_file(path, NULL); 609} 610 611int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map) 612{ 613 return do_load_bpf_file(path, fixup_map); 614} 615 616void read_trace_pipe(void) 617{ 618 int trace_fd; 619 620 trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); 621 if (trace_fd < 0) 622 return; 623 624 while (1) { 625 static char buf[4096]; 626 ssize_t sz; 627 628 sz = read(trace_fd, buf, sizeof(buf)); 629 if (sz > 0) { 630 buf[sz] = 0; 631 puts(buf); 632 } 633 } 634} 635 636#define MAX_SYMS 300000 637static struct ksym syms[MAX_SYMS]; 638static int sym_cnt; 639 640static int ksym_cmp(const void *p1, const void *p2) 641{ 642 return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; 643} 644 645int load_kallsyms(void) 646{ 647 FILE *f = fopen("/proc/kallsyms", "r"); 648 char func[256], buf[256]; 649 char symbol; 650 void *addr; 651 int i = 0; 652 653 if (!f) 654 return -ENOENT; 655 656 while (!feof(f)) { 657 if (!fgets(buf, sizeof(buf), f)) 658 break; 659 if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3) 660 break; 661 if (!addr) 662 continue; 663 syms[i].addr = (long) addr; 664 syms[i].name = strdup(func); 665 i++; 666 } 667 sym_cnt = i; 668 qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp); 669 return 0; 670} 671 672struct ksym *ksym_search(long key) 673{ 674 int start = 0, end = sym_cnt; 675 int result; 676 677 while (start < end) { 678 size_t mid = start + (end - start) / 2; 679 680 result = key - syms[mid].addr; 681 if (result < 0) 682 end = mid; 683 else if (result > 0) 684 start = mid + 1; 685 else 686 return &syms[mid]; 687 } 688 689 if (start >= 1 && syms[start - 1].addr < key && 690 key < syms[start].addr) 691 /* valid ksym */ 692 return &syms[start - 1]; 693 694 /* out of range. return _stext */ 695 return &syms[0]; 696} 697