Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Add support for BTF pointers to x86 JIT

A pointer to a BTF object is a pointer to a kernel object or NULL.
Such pointers can only be used by BPF_LDX instructions.
The verifier changed their opcode from LDX|MEM|size
to LDX|PROBE_MEM|size to make JITing easier.
The number of entries in extable is the number of BPF_LDX insns
that access kernel memory via "pointer to BTF type".
Only these load instructions can fault.
Since x86 extable is relative it has to be allocated in the same
memory region as JITed code.
Allocate it prior to last pass of JITing and let the last pass populate it.
Pointer to extable in bpf_prog_aux is necessary to make page fault
handling fast.
Page fault handling is done in two steps:
1. bpf_prog_kallsyms_find() finds the BPF program that page faulted.
It is done by walking the rb tree.
2. then the extable for the given bpf program is binary searched.
This process is similar to how page faulting is done for kernel modules.
The exception handler skips over the faulting x86 instruction and
initializes the destination register with zero. This mimics the exact
behavior of bpf_probe_read (when probe_kernel_read faults, dest is zeroed).

JITs for other architectures can add support in similar way.
Until then they will reject the unknown opcode and fall back to the interpreter.

Since extable should be aligned and placed near JITed code
make bpf_jit_binary_alloc() return 4 byte aligned image offset,
so that extable aligning formula in bpf_int_jit_compile() doesn't need
to rely on internal implementation of bpf_jit_binary_alloc().
On x86 gcc defaults to 16-byte alignment for regular kernel functions
due to better performance. JITed code may be aligned to 16 in the future,
but it will use 4 in the meantime.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20191016032505.2089704-10-ast@kernel.org

authored by

Alexei Starovoitov and committed by
Daniel Borkmann
3dec541b 2a02759e

+128 -5
+93 -4
arch/x86/net/bpf_jit_comp.c
··· 9 9 #include <linux/filter.h> 10 10 #include <linux/if_vlan.h> 11 11 #include <linux/bpf.h> 12 - 12 + #include <asm/extable.h> 13 13 #include <asm/set_memory.h> 14 14 #include <asm/nospec-branch.h> 15 15 ··· 121 121 [BPF_REG_FP] = 5, /* RBP readonly */ 122 122 [BPF_REG_AX] = 2, /* R10 temp register */ 123 123 [AUX_REG] = 3, /* R11 temp register */ 124 + }; 125 + 126 + static const int reg2pt_regs[] = { 127 + [BPF_REG_0] = offsetof(struct pt_regs, ax), 128 + [BPF_REG_1] = offsetof(struct pt_regs, di), 129 + [BPF_REG_2] = offsetof(struct pt_regs, si), 130 + [BPF_REG_3] = offsetof(struct pt_regs, dx), 131 + [BPF_REG_4] = offsetof(struct pt_regs, cx), 132 + [BPF_REG_5] = offsetof(struct pt_regs, r8), 133 + [BPF_REG_6] = offsetof(struct pt_regs, bx), 134 + [BPF_REG_7] = offsetof(struct pt_regs, r13), 135 + [BPF_REG_8] = offsetof(struct pt_regs, r14), 136 + [BPF_REG_9] = offsetof(struct pt_regs, r15), 124 137 }; 125 138 126 139 /* ··· 390 377 *pprog = prog; 391 378 } 392 379 380 + 381 + static bool ex_handler_bpf(const struct exception_table_entry *x, 382 + struct pt_regs *regs, int trapnr, 383 + unsigned long error_code, unsigned long fault_addr) 384 + { 385 + u32 reg = x->fixup >> 8; 386 + 387 + /* jump over faulting load and clear dest register */ 388 + *(unsigned long *)((void *)regs + reg) = 0; 389 + regs->ip += x->fixup & 0xff; 390 + return true; 391 + } 392 + 393 393 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, 394 394 int oldproglen, struct jit_context *ctx) 395 395 { ··· 410 384 int insn_cnt = bpf_prog->len; 411 385 bool seen_exit = false; 412 386 u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; 413 - int i, cnt = 0; 387 + int i, cnt = 0, excnt = 0; 414 388 int proglen = 0; 415 389 u8 *prog = temp; 416 390 ··· 804 778 805 779 /* LDX: dst_reg = *(u8*)(src_reg + off) */ 806 780 case BPF_LDX | BPF_MEM | BPF_B: 781 + case BPF_LDX | BPF_PROBE_MEM | BPF_B: 807 782 /* Emit 'movzx rax, byte ptr [rax + off]' */ 808 783 EMIT3(add_2mod(0x48, 
src_reg, dst_reg), 0x0F, 0xB6); 809 784 goto ldx; 810 785 case BPF_LDX | BPF_MEM | BPF_H: 786 + case BPF_LDX | BPF_PROBE_MEM | BPF_H: 811 787 /* Emit 'movzx rax, word ptr [rax + off]' */ 812 788 EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7); 813 789 goto ldx; 814 790 case BPF_LDX | BPF_MEM | BPF_W: 791 + case BPF_LDX | BPF_PROBE_MEM | BPF_W: 815 792 /* Emit 'mov eax, dword ptr [rax+0x14]' */ 816 793 if (is_ereg(dst_reg) || is_ereg(src_reg)) 817 794 EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B); ··· 822 793 EMIT1(0x8B); 823 794 goto ldx; 824 795 case BPF_LDX | BPF_MEM | BPF_DW: 796 + case BPF_LDX | BPF_PROBE_MEM | BPF_DW: 825 797 /* Emit 'mov rax, qword ptr [rax+0x14]' */ 826 798 EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); 827 799 ldx: /* ··· 835 805 else 836 806 EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), 837 807 insn->off); 808 + if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { 809 + struct exception_table_entry *ex; 810 + u8 *_insn = image + proglen; 811 + s64 delta; 812 + 813 + if (!bpf_prog->aux->extable) 814 + break; 815 + 816 + if (excnt >= bpf_prog->aux->num_exentries) { 817 + pr_err("ex gen bug\n"); 818 + return -EFAULT; 819 + } 820 + ex = &bpf_prog->aux->extable[excnt++]; 821 + 822 + delta = _insn - (u8 *)&ex->insn; 823 + if (!is_simm32(delta)) { 824 + pr_err("extable->insn doesn't fit into 32-bit\n"); 825 + return -EFAULT; 826 + } 827 + ex->insn = delta; 828 + 829 + delta = (u8 *)ex_handler_bpf - (u8 *)&ex->handler; 830 + if (!is_simm32(delta)) { 831 + pr_err("extable->handler doesn't fit into 32-bit\n"); 832 + return -EFAULT; 833 + } 834 + ex->handler = delta; 835 + 836 + if (dst_reg > BPF_REG_9) { 837 + pr_err("verifier error\n"); 838 + return -EFAULT; 839 + } 840 + /* 841 + * Compute size of x86 insn and its target dest x86 register. 842 + * ex_handler_bpf() will use lower 8 bits to adjust 843 + * pt_regs->ip to jump over this x86 instruction 844 + * and upper bits to figure out which pt_regs to zero out. 
845 + * End result: x86 insn "mov rbx, qword ptr [rax+0x14]" 846 + * of 4 bytes will be ignored and rbx will be zero inited. 847 + */ 848 + ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8); 849 + } 838 850 break; 839 851 840 852 /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */ ··· 1130 1058 addrs[i] = proglen; 1131 1059 prog = temp; 1132 1060 } 1061 + 1062 + if (image && excnt != bpf_prog->aux->num_exentries) { 1063 + pr_err("extable is not populated\n"); 1064 + return -EFAULT; 1065 + } 1133 1066 return proglen; 1134 1067 } 1135 1068 ··· 1235 1158 break; 1236 1159 } 1237 1160 if (proglen == oldproglen) { 1238 - header = bpf_jit_binary_alloc(proglen, &image, 1239 - 1, jit_fill_hole); 1161 + /* 1162 + * The number of entries in extable is the number of BPF_LDX 1163 + * insns that access kernel memory via "pointer to BTF type". 1164 + * The verifier changed their opcode from LDX|MEM|size 1165 + * to LDX|PROBE_MEM|size to make JITing easier. 1166 + */ 1167 + u32 align = __alignof__(struct exception_table_entry); 1168 + u32 extable_size = prog->aux->num_exentries * 1169 + sizeof(struct exception_table_entry); 1170 + 1171 + /* allocate module memory for x86 insns and extable */ 1172 + header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size, 1173 + &image, align, jit_fill_hole); 1240 1174 if (!header) { 1241 1175 prog = orig_prog; 1242 1176 goto out_addrs; 1243 1177 } 1178 + prog->aux->extable = (void *) image + roundup(proglen, align); 1244 1179 } 1245 1180 oldproglen = proglen; 1246 1181 cond_resched();
+3
include/linux/bpf.h
··· 24 24 struct seq_file; 25 25 struct btf; 26 26 struct btf_type; 27 + struct exception_table_entry; 27 28 28 29 extern struct idr btf_idr; 29 30 extern spinlock_t btf_idr_lock; ··· 424 423 * main prog always has linfo_idx == 0 425 424 */ 426 425 u32 linfo_idx; 426 + u32 num_exentries; 427 + struct exception_table_entry *extable; 427 428 struct bpf_prog_stats __percpu *stats; 428 429 union { 429 430 struct work_struct work;
+10
include/linux/extable.h
··· 33 33 } 34 34 #endif /*CONFIG_MODULES*/ 35 35 36 + #ifdef CONFIG_BPF_JIT 37 + const struct exception_table_entry *search_bpf_extables(unsigned long addr); 38 + #else 39 + static inline const struct exception_table_entry * 40 + search_bpf_extables(unsigned long addr) 41 + { 42 + return NULL; 43 + } 44 + #endif 45 + 36 46 #endif /* _LINUX_EXTABLE_H */
+19 -1
kernel/bpf/core.c
··· 30 30 #include <linux/kallsyms.h> 31 31 #include <linux/rcupdate.h> 32 32 #include <linux/perf_event.h> 33 - 33 + #include <linux/extable.h> 34 34 #include <asm/unaligned.h> 35 35 36 36 /* Registers */ ··· 710 710 rcu_read_unlock(); 711 711 712 712 return ret; 713 + } 714 + 715 + const struct exception_table_entry *search_bpf_extables(unsigned long addr) 716 + { 717 + const struct exception_table_entry *e = NULL; 718 + struct bpf_prog *prog; 719 + 720 + rcu_read_lock(); 721 + prog = bpf_prog_kallsyms_find(addr); 722 + if (!prog) 723 + goto out; 724 + if (!prog->aux->num_exentries) 725 + goto out; 726 + 727 + e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr); 728 + out: 729 + rcu_read_unlock(); 730 + return e; 713 731 } 714 732 715 733 int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+1
kernel/bpf/verifier.c
··· 8729 8729 return -EINVAL; 8730 8730 } 8731 8731 insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); 8732 + env->prog->aux->num_exentries++; 8732 8733 continue; 8733 8734 default: 8734 8735 continue;
+2
kernel/extable.c
··· 56 56 e = search_kernel_exception_table(addr); 57 57 if (!e) 58 58 e = search_module_extables(addr); 59 + if (!e) 60 + e = search_bpf_extables(addr); 59 61 return e; 60 62 } 61 63