
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Daniel Borkmann says:

====================
pull-request: bpf-next 2017-12-18

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Allow arbitrary function calls from one BPF function to another BPF function.
Until now, writing BPF programs required marking all functions in the BPF C
code as __always_inline, unnecessarily causing LLVM to inflate code size.
Handle this more naturally with support for BPF-to-BPF calls such that the
__always_inline restriction can be lifted. As a result, it allows for better
optimized code and finally enables the introduction of core BPF libraries in
the future that can be reused across different projects. x86 and arm64 JIT
support was added as well, from Alexei.

2) Add infrastructure for tagging functions as error injectable and allow
BPF to return arbitrary error values when attached via kprobes to those
functions. Injecting errors generically this way eases testing and debugging
without having to recompile or restart the kernel. Functions opt in to this
facility with the BPF_ALLOW_ERROR_INJECTION() tag, from Josef.

3) For BPF offload via nfp JIT, add support for bpf_xdp_adjust_head() helper
call for XDP programs. First part of this work adds handling of BPF
capabilities included in the firmware, and the later patches add support
to the nfp verifier part and JIT as well as some small optimizations,
from Jakub.

4) The bpftool now also gets support for basic cgroup BPF operations such
as attaching, detaching and listing current BPF programs. As a requirement
for the attach part, bpftool can now also load object files through
'bpftool prog load'. This reuses libbpf which we have in the kernel tree
as well. bpftool-cgroup man page is added along with it, from Roman.

5) Commit e87c6bc3852b ("bpf: permit multiple bpf attachments for
a single perf event") added support for attaching multiple BPF programs
to a single perf event. Since these are configured through perf's ioctl()
interface, this work extends that interface with a PERF_EVENT_IOC_QUERY_BPF
command in order to return the array of BPF prog ids that are currently
attached, from Yonghong.

6) Various minor fixes and cleanups to the bpftool's Makefile as well
as a new 'uninstall' and 'doc-uninstall' target for removing bpftool
itself or prior installed documentation related to it, from Quentin.

7) Add CONFIG_CGROUP_BPF=y to the BPF kernel selftest config file which is
required for the test_dev_cgroup test case to run, from Naresh.

8) Fix reporting of XDP prog_flags for nfp driver, from Jakub.

9) Fix libbpf's exit code from the Makefile when libelf was not found in
the system, also from Jakub.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+6074 -355
+3
arch/Kconfig
···
 config HAVE_KPROBES_ON_FTRACE
 	bool
 
+config HAVE_KPROBE_OVERRIDE
+	bool
+
 config HAVE_NMI
 	bool
 
+1 -1
arch/arm/net/bpf_jit_32.c
···
 	/* If BPF JIT was not enabled then we must fall back to
 	 * the interpreter.
 	 */
-	if (!bpf_jit_enable)
+	if (!prog->jit_requested)
 		return orig_prog;
 
 	/* If constant blinding was enabled and we failed during blinding
+65 -5
arch/arm64/net/bpf_jit_comp.c
···
 	}
 }
 
+static inline void emit_addr_mov_i64(const int reg, const u64 val,
+				     struct jit_ctx *ctx)
+{
+	u64 tmp = val;
+	int shift = 0;
+
+	emit(A64_MOVZ(1, reg, tmp & 0xffff, shift), ctx);
+	for (;shift < 48;) {
+		tmp >>= 16;
+		shift += 16;
+		emit(A64_MOVK(1, reg, tmp & 0xffff, shift), ctx);
+	}
+}
+
 static inline void emit_a64_mov_i(const int is64, const int reg,
 				  const s32 val, struct jit_ctx *ctx)
 {
···
 		const u8 r0 = bpf2a64[BPF_REG_0];
 		const u64 func = (u64)__bpf_call_base + imm;
 
-		emit_a64_mov_i64(tmp, func, ctx);
+		if (ctx->prog->is_func)
+			emit_addr_mov_i64(tmp, func, ctx);
+		else
+			emit_a64_mov_i64(tmp, func, ctx);
 		emit(A64_BLR(tmp), ctx);
 		emit(A64_MOV(1, r0, A64_R(0)), ctx);
 		break;
···
 	flush_icache_range((unsigned long)start, (unsigned long)end);
 }
 
+struct arm64_jit_data {
+	struct bpf_binary_header *header;
+	u8 *image;
+	struct jit_ctx ctx;
+};
+
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
 	struct bpf_prog *tmp, *orig_prog = prog;
 	struct bpf_binary_header *header;
+	struct arm64_jit_data *jit_data;
 	bool tmp_blinded = false;
+	bool extra_pass = false;
 	struct jit_ctx ctx;
 	int image_size;
 	u8 *image_ptr;
 
-	if (!bpf_jit_enable)
+	if (!prog->jit_requested)
 		return orig_prog;
 
 	tmp = bpf_jit_blind_constants(prog);
···
 		prog = tmp;
 	}
 
+	jit_data = prog->aux->jit_data;
+	if (!jit_data) {
+		jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
+		if (!jit_data) {
+			prog = orig_prog;
+			goto out;
+		}
+		prog->aux->jit_data = jit_data;
+	}
+	if (jit_data->ctx.offset) {
+		ctx = jit_data->ctx;
+		image_ptr = jit_data->image;
+		header = jit_data->header;
+		extra_pass = true;
+		goto skip_init_ctx;
+	}
 	memset(&ctx, 0, sizeof(ctx));
 	ctx.prog = prog;
 
 	ctx.offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
 	if (ctx.offset == NULL) {
 		prog = orig_prog;
-		goto out;
+		goto out_off;
 	}
 
 	/* 1. Initial fake pass to compute ctx->idx. */
···
 	/* 2. Now, the actual pass. */
 
 	ctx.image = (__le32 *)image_ptr;
+skip_init_ctx:
 	ctx.idx = 0;
 
 	build_prologue(&ctx);
···
 
 	bpf_flush_icache(header, ctx.image + ctx.idx);
 
-	bpf_jit_binary_lock_ro(header);
+	if (!prog->is_func || extra_pass) {
+		if (extra_pass && ctx.idx != jit_data->ctx.idx) {
+			pr_err_once("multi-func JIT bug %d != %d\n",
+				    ctx.idx, jit_data->ctx.idx);
+			bpf_jit_binary_free(header);
+			prog->bpf_func = NULL;
+			prog->jited = 0;
+			goto out_off;
+		}
+		bpf_jit_binary_lock_ro(header);
+	} else {
+		jit_data->ctx = ctx;
+		jit_data->image = image_ptr;
+		jit_data->header = header;
+	}
 	prog->bpf_func = (void *)ctx.image;
 	prog->jited = 1;
 	prog->jited_len = image_size;
 
+	if (!prog->is_func || extra_pass) {
 out_off:
-	kfree(ctx.offset);
+		kfree(ctx.offset);
+		kfree(jit_data);
+		prog->aux->jit_data = NULL;
+	}
 out:
 	if (tmp_blinded)
 		bpf_jit_prog_release_other(prog, prog == orig_prog ?
+1 -1
arch/mips/net/ebpf_jit.c
···
 	unsigned int image_size;
 	u8 *image_ptr;
 
-	if (!bpf_jit_enable || !cpu_has_mips64r2)
+	if (!prog->jit_requested || !cpu_has_mips64r2)
 		return prog;
 
 	tmp = bpf_jit_blind_constants(prog);
+1 -1
arch/powerpc/net/bpf_jit_comp64.c
···
 	struct bpf_prog *tmp_fp;
 	bool bpf_blinded = false;
 
-	if (!bpf_jit_enable)
+	if (!fp->jit_requested)
 		return org_fp;
 
 	tmp_fp = bpf_jit_blind_constants(org_fp);
+1 -1
arch/s390/net/bpf_jit_comp.c
···
 	struct bpf_jit jit;
 	int pass;
 
-	if (!bpf_jit_enable)
+	if (!fp->jit_requested)
 		return orig_fp;
 
 	tmp = bpf_jit_blind_constants(fp);
+1 -1
arch/sparc/net/bpf_jit_comp_64.c
···
 	u8 *image_ptr;
 	int pass;
 
-	if (!bpf_jit_enable)
+	if (!prog->jit_requested)
 		return orig_prog;
 
 	tmp = bpf_jit_blind_constants(prog);
+1
arch/x86/Kconfig
···
 	select HAVE_KERNEL_XZ
 	select HAVE_KPROBES
 	select HAVE_KPROBES_ON_FTRACE
+	select HAVE_KPROBE_OVERRIDE
 	select HAVE_KRETPROBES
 	select HAVE_KVM
 	select HAVE_LIVEPATCH			if X86_64
+4
arch/x86/include/asm/kprobes.h
···
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
+#endif
+
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
 	/* copy of the original instruction */
+5
arch/x86/include/asm/ptrace.h
···
 	return regs->ax;
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
+{
+	regs->ax = rc;
+}
+
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode. On x86_32, this is true if V8086 mode was enabled OR if the
+14
arch/x86/kernel/kprobes/ftrace.c
···
 	p->ainsn.boostable = false;
 	return 0;
 }
+
+asmlinkage void override_func(void);
+asm(
+	".type override_func, @function\n"
+	"override_func:\n"
+	"	ret\n"
+	".size override_func, .-override_func\n"
+);
+
+void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
+{
+	regs->ip = (unsigned long)&override_func;
+}
+NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
+45 -4
arch/x86/net/bpf_jit_comp.c
···
 	return proglen;
 }
 
+struct x64_jit_data {
+	struct bpf_binary_header *header;
+	int *addrs;
+	u8 *image;
+	int proglen;
+	struct jit_context ctx;
+};
+
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
 	struct bpf_binary_header *header = NULL;
 	struct bpf_prog *tmp, *orig_prog = prog;
+	struct x64_jit_data *jit_data;
 	int proglen, oldproglen = 0;
 	struct jit_context ctx = {};
 	bool tmp_blinded = false;
+	bool extra_pass = false;
 	u8 *image = NULL;
 	int *addrs;
 	int pass;
 	int i;
 
-	if (!bpf_jit_enable)
+	if (!prog->jit_requested)
 		return orig_prog;
 
 	tmp = bpf_jit_blind_constants(prog);
···
 		prog = tmp;
 	}
 
+	jit_data = prog->aux->jit_data;
+	if (!jit_data) {
+		jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
+		if (!jit_data) {
+			prog = orig_prog;
+			goto out;
+		}
+		prog->aux->jit_data = jit_data;
+	}
+	addrs = jit_data->addrs;
+	if (addrs) {
+		ctx = jit_data->ctx;
+		oldproglen = jit_data->proglen;
+		image = jit_data->image;
+		header = jit_data->header;
+		extra_pass = true;
+		goto skip_init_addrs;
+	}
 	addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
 	if (!addrs) {
 		prog = orig_prog;
-		goto out;
+		goto out_addrs;
 	}
 
 	/* Before first pass, make a rough estimation of addrs[]
···
 		addrs[i] = proglen;
 	}
 	ctx.cleanup_addr = proglen;
+skip_init_addrs:
 
 	/* JITed image shrinks with every pass and the loop iterates
 	 * until the image stops shrinking. Very large bpf programs
···
 
 	if (image) {
 		bpf_flush_icache(header, image + proglen);
-		bpf_jit_binary_lock_ro(header);
+		if (!prog->is_func || extra_pass) {
+			bpf_jit_binary_lock_ro(header);
+		} else {
+			jit_data->addrs = addrs;
+			jit_data->ctx = ctx;
+			jit_data->proglen = proglen;
+			jit_data->image = image;
+			jit_data->header = header;
+		}
 		prog->bpf_func = (void *)image;
 		prog->jited = 1;
 		prog->jited_len = proglen;
···
 		prog = orig_prog;
 	}
 
+	if (!prog->is_func || extra_pass) {
 out_addrs:
-	kfree(addrs);
+		kfree(addrs);
+		kfree(jit_data);
+		prog->aux->jit_data = NULL;
+	}
 out:
 	if (tmp_blinded)
 		bpf_jit_prog_release_other(prog, prog == orig_prog ?
+54
drivers/net/ethernet/netronome/nfp/bpf/fw.h
+/*
+ * Copyright (C) 2017 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below. You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      1. Redistributions of source code must retain the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer.
+ *
+ *      2. Redistributions in binary form must reproduce the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer in the documentation and/or other materials
+ *         provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef NFP_BPF_FW_H
+#define NFP_BPF_FW_H 1
+
+#include <linux/bitops.h>
+#include <linux/types.h>
+
+enum bpf_cap_tlv_type {
+	NFP_BPF_CAP_TYPE_ADJUST_HEAD	= 2,
+};
+
+struct nfp_bpf_cap_tlv_adjust_head {
+	__le32 flags;
+	__le32 off_min;
+	__le32 off_max;
+	__le32 guaranteed_sub;
+	__le32 guaranteed_add;
+};
+
+#define NFP_BPF_ADJUST_HEAD_NO_META	BIT(0)
+
+#endif
+107
drivers/net/ethernet/netronome/nfp/bpf/jit.c
···
 
 #define pr_fmt(fmt)	"NFP net bpf: " fmt
 
+#include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/bpf.h>
 #include <linux/filter.h>
···
 static unsigned int nfp_prog_current_offset(struct nfp_prog *nfp_prog)
 {
 	return nfp_prog->start_off + nfp_prog->prog_len;
+}
+
+static bool
+nfp_prog_confirm_current_offset(struct nfp_prog *nfp_prog, unsigned int off)
+{
+	/* If there is a recorded error we may have dropped instructions;
+	 * that doesn't have to be due to translator bug, and the translation
+	 * will fail anyway, so just return OK.
+	 */
+	if (nfp_prog->error)
+		return true;
+	return !WARN_ON_ONCE(nfp_prog_current_offset(nfp_prog) != off);
 }
 
 static unsigned int
···
 			  SHF_SC_R_ROT, 16);
 }
 
+static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	swreg tmp = imm_a(nfp_prog), tmp_len = imm_b(nfp_prog);
+	struct nfp_bpf_cap_adjust_head *adjust_head;
+	u32 ret_einval, end;
+
+	adjust_head = &nfp_prog->bpf->adjust_head;
+
+	/* Optimized version - 5 vs 14 cycles */
+	if (nfp_prog->adjust_head_location != UINT_MAX) {
+		if (WARN_ON_ONCE(nfp_prog->adjust_head_location != meta->n))
+			return -EINVAL;
+
+		emit_alu(nfp_prog, pptr_reg(nfp_prog),
+			 reg_a(2 * 2), ALU_OP_ADD, pptr_reg(nfp_prog));
+		emit_alu(nfp_prog, plen_reg(nfp_prog),
+			 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
+		emit_alu(nfp_prog, pv_len(nfp_prog),
+			 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
+
+		wrp_immed(nfp_prog, reg_both(0), 0);
+		wrp_immed(nfp_prog, reg_both(1), 0);
+
+		/* TODO: when adjust head is guaranteed to succeed we can
+		 * also eliminate the following if (r0 == 0) branch.
+		 */
+
+		return 0;
+	}
+
+	ret_einval = nfp_prog_current_offset(nfp_prog) + 14;
+	end = ret_einval + 2;
+
+	/* We need to use a temp because offset is just a part of the pkt ptr */
+	emit_alu(nfp_prog, tmp,
+		 reg_a(2 * 2), ALU_OP_ADD_2B, pptr_reg(nfp_prog));
+
+	/* Validate result will fit within FW datapath constraints */
+	emit_alu(nfp_prog, reg_none(),
+		 tmp, ALU_OP_SUB, reg_imm(adjust_head->off_min));
+	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
+	emit_alu(nfp_prog, reg_none(),
+		 reg_imm(adjust_head->off_max), ALU_OP_SUB, tmp);
+	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
+
+	/* Validate the length is at least ETH_HLEN */
+	emit_alu(nfp_prog, tmp_len,
+		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
+	emit_alu(nfp_prog, reg_none(),
+		 tmp_len, ALU_OP_SUB, reg_imm(ETH_HLEN));
+	emit_br(nfp_prog, BR_BMI, ret_einval, 0);
+
+	/* Load the ret code */
+	wrp_immed(nfp_prog, reg_both(0), 0);
+	wrp_immed(nfp_prog, reg_both(1), 0);
+
+	/* Modify the packet metadata */
+	emit_ld_field(nfp_prog, pptr_reg(nfp_prog), 0x3, tmp, SHF_SC_NONE, 0);
+
+	/* Skip over the -EINVAL ret code (defer 2) */
+	emit_br_def(nfp_prog, end, 2);
+
+	emit_alu(nfp_prog, plen_reg(nfp_prog),
+		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
+	emit_alu(nfp_prog, pv_len(nfp_prog),
+		 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
+
+	/* return -EINVAL target */
+	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
+		return -EINVAL;
+
+	wrp_immed(nfp_prog, reg_both(0), -22);
+	wrp_immed(nfp_prog, reg_both(1), ~0);
+
+	if (!nfp_prog_confirm_current_offset(nfp_prog, end))
+		return -EINVAL;
+
+	return 0;
+}
+
 /* --- Callbacks --- */
 static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
···
 	return wrp_test_reg(nfp_prog, meta, ALU_OP_XOR, BR_BNE);
 }
 
+static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	switch (meta->insn.imm) {
+	case BPF_FUNC_xdp_adjust_head:
+		return adjust_head(nfp_prog, meta);
+	default:
+		WARN_ONCE(1, "verifier allowed unsupported function\n");
+		return -EOPNOTSUPP;
+	}
+}
+
 static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	wrp_br_special(nfp_prog, BR_UNC, OP_BR_GO_OUT);
···
 	[BPF_JMP | BPF_JLE | BPF_X] =	jle_reg,
 	[BPF_JMP | BPF_JSET | BPF_X] =	jset_reg,
 	[BPF_JMP | BPF_JNE | BPF_X] =	jne_reg,
+	[BPF_JMP | BPF_CALL] =		call,
 	[BPF_JMP | BPF_EXIT] =		goto_out,
 };
···
 
 	list_for_each_entry(meta, &nfp_prog->insns, l) {
 		if (meta->skip)
 			continue;
+		if (meta->insn.code == (BPF_JMP | BPF_CALL))
+			continue;
 		if (BPF_CLASS(meta->insn.code) != BPF_JMP)
 			continue;
+115
drivers/net/ethernet/netronome/nfp/bpf/main.c
···
 #include <net/pkt_cls.h>
 
 #include "../nfpcore/nfp_cpp.h"
+#include "../nfpcore/nfp_nffw.h"
 #include "../nfp_app.h"
 #include "../nfp_main.h"
 #include "../nfp_net.h"
 #include "../nfp_port.h"
+#include "fw.h"
 #include "main.h"
 
 static bool nfp_net_ebpf_capable(struct nfp_net *nn)
···
 	return nn->dp.ctrl & NFP_NET_CFG_CTRL_BPF;
 }
 
+static int
+nfp_bpf_parse_cap_adjust_head(struct nfp_app_bpf *bpf, void __iomem *value,
+			      u32 length)
+{
+	struct nfp_bpf_cap_tlv_adjust_head __iomem *cap = value;
+	struct nfp_cpp *cpp = bpf->app->pf->cpp;
+
+	if (length < sizeof(*cap)) {
+		nfp_err(cpp, "truncated adjust_head TLV: %d\n", length);
+		return -EINVAL;
+	}
+
+	bpf->adjust_head.flags = readl(&cap->flags);
+	bpf->adjust_head.off_min = readl(&cap->off_min);
+	bpf->adjust_head.off_max = readl(&cap->off_max);
+	bpf->adjust_head.guaranteed_sub = readl(&cap->guaranteed_sub);
+	bpf->adjust_head.guaranteed_add = readl(&cap->guaranteed_add);
+
+	if (bpf->adjust_head.off_min > bpf->adjust_head.off_max) {
+		nfp_err(cpp, "invalid adjust_head TLV: min > max\n");
+		return -EINVAL;
+	}
+	if (!FIELD_FIT(UR_REG_IMM_MAX, bpf->adjust_head.off_min) ||
+	    !FIELD_FIT(UR_REG_IMM_MAX, bpf->adjust_head.off_max)) {
+		nfp_warn(cpp, "disabling adjust_head - driver expects min/max to fit in as immediates\n");
+		memset(&bpf->adjust_head, 0, sizeof(bpf->adjust_head));
+		return 0;
+	}
+
+	return 0;
+}
+
+static int nfp_bpf_parse_capabilities(struct nfp_app *app)
+{
+	struct nfp_cpp *cpp = app->pf->cpp;
+	struct nfp_cpp_area *area;
+	u8 __iomem *mem, *start;
+
+	mem = nfp_rtsym_map(app->pf->rtbl, "_abi_bpf_capabilities", "bpf.cap",
+			    8, &area);
+	if (IS_ERR(mem))
+		return PTR_ERR(mem) == -ENOENT ? 0 : PTR_ERR(mem);
+
+	start = mem;
+	while (mem - start + 8 < nfp_cpp_area_size(area)) {
+		u8 __iomem *value;
+		u32 type, length;
+
+		type = readl(mem);
+		length = readl(mem + 4);
+		value = mem + 8;
+
+		mem += 8 + length;
+		if (mem - start > nfp_cpp_area_size(area))
+			goto err_release_free;
+
+		switch (type) {
+		case NFP_BPF_CAP_TYPE_ADJUST_HEAD:
+			if (nfp_bpf_parse_cap_adjust_head(app->priv, value,
+							  length))
+				goto err_release_free;
+			break;
+		default:
+			nfp_dbg(cpp, "unknown BPF capability: %d\n", type);
+			break;
+		}
+	}
+	if (mem - start != nfp_cpp_area_size(area)) {
+		nfp_err(cpp, "BPF capabilities left after parsing, parsed:%zd total length:%zu\n",
+			mem - start, nfp_cpp_area_size(area));
+		goto err_release_free;
+	}
+
+	nfp_cpp_area_release_free(area);
+
+	return 0;
+
+err_release_free:
+	nfp_err(cpp, "invalid BPF capabilities at offset:%zd\n", mem - start);
+	nfp_cpp_area_release_free(area);
+	return -EINVAL;
+}
+
+static int nfp_bpf_init(struct nfp_app *app)
+{
+	struct nfp_app_bpf *bpf;
+	int err;
+
+	bpf = kzalloc(sizeof(*bpf), GFP_KERNEL);
+	if (!bpf)
+		return -ENOMEM;
+	bpf->app = app;
+	app->priv = bpf;
+
+	err = nfp_bpf_parse_capabilities(app);
+	if (err)
+		goto err_free_bpf;
+
+	return 0;
+
+err_free_bpf:
+	kfree(bpf);
+	return err;
+}
+
+static void nfp_bpf_clean(struct nfp_app *app)
+{
+	kfree(app->priv);
+}
+
 const struct nfp_app_type app_bpf = {
 	.id		= NFP_APP_BPF_NIC,
 	.name		= "ebpf",
+
+	.init		= nfp_bpf_init,
+	.clean		= nfp_bpf_clean,
 
 	.extra_cap	= nfp_bpf_extra_cap,
+30
drivers/net/ethernet/netronome/nfp/bpf/main.h
···
 #define NFP_BPF_ABI_FLAGS	reg_imm(0)
 #define   NFP_BPF_ABI_FLAG_MARK	1
 
+/**
+ * struct nfp_app_bpf - bpf app priv structure
+ * @app:		backpointer to the app
+ *
+ * @adjust_head:	adjust head capability
+ * @flags:		extra flags for adjust head
+ * @off_min:		minimal packet offset within buffer required
+ * @off_max:		maximum packet offset within buffer required
+ * @guaranteed_sub:	amount of negative adjustment guaranteed possible
+ * @guaranteed_add:	amount of positive adjustment guaranteed possible
+ */
+struct nfp_app_bpf {
+	struct nfp_app *app;
+
+	struct nfp_bpf_cap_adjust_head {
+		u32 flags;
+		int off_min;
+		int off_max;
+		int guaranteed_sub;
+		int guaranteed_add;
+	} adjust_head;
+};
+
 struct nfp_prog;
 struct nfp_insn_meta;
 typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *);
···
  * @ptr: pointer type for memory operations
  * @ldst_gather_len: memcpy length gathered from load/store sequence
  * @paired_st: the paired store insn at the head of the sequence
+ * @arg2: arg2 for call instructions
  * @ptr_not_const: pointer is not always constant
  * @jmp_dst: destination info for jump instructions
  * @off: index of first generated machine instruction (in nfp_prog.prog)
···
 			bool ptr_not_const;
 		};
 		struct nfp_insn_meta *jmp_dst;
+		struct bpf_reg_state arg2;
 	};
 	unsigned int off;
 	unsigned short n;
···
 
 /**
  * struct nfp_prog - nfp BPF program
+ * @bpf: backpointer to the bpf app priv structure
  * @prog: machine code
  * @prog_len: number of valid instructions in @prog array
  * @__prog_alloc_len: alloc size of @prog array
···
  * @n_translated: number of successfully translated instructions (for errors)
  * @error: error code if something went wrong
  * @stack_depth: max stack depth from the verifier
+ * @adjust_head_location: if program has single adjust head call - the insn no.
  * @insns: list of BPF instruction wrappers (struct nfp_insn_meta)
  */
 struct nfp_prog {
+	struct nfp_app_bpf *bpf;
+
 	u64 *prog;
 	unsigned int prog_len;
 	unsigned int __prog_alloc_len;
···
 	int error;
 
 	unsigned int stack_depth;
+	unsigned int adjust_head_location;
 
 	struct list_head insns;
 };
+2
drivers/net/ethernet/netronome/nfp/bpf/offload.c
···
 #include <net/tc_act/tc_mirred.h>
 
 #include "main.h"
+#include "../nfp_app.h"
 #include "../nfp_net_ctrl.h"
 #include "../nfp_net.h"
 
···
 
 	INIT_LIST_HEAD(&nfp_prog->insns);
 	nfp_prog->type = prog->type;
+	nfp_prog->bpf = app->priv;
 
 	ret = nfp_prog_prepare(nfp_prog, prog->insnsi, prog->len);
 	if (ret)
+70
drivers/net/ethernet/netronome/nfp/bpf/verifier.c
···
 #include <linux/kernel.h>
 #include <linux/pkt_cls.h>
 
+#include "fw.h"
 #include "main.h"
 
 struct nfp_insn_meta *
···
 		meta = nfp_meta_prev(meta);
 
 	return meta;
+}
+
+static void
+nfp_record_adjust_head(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog,
+		       struct nfp_insn_meta *meta,
+		       const struct bpf_reg_state *reg2)
+{
+	unsigned int location = UINT_MAX;
+	int imm;
+
+	/* Datapath usually can give us guarantees on how much adjust head
+	 * can be done without the need for any checks.  Optimize the simple
+	 * case where there is only one adjust head by a constant.
+	 */
+	if (reg2->type != SCALAR_VALUE || !tnum_is_const(reg2->var_off))
+		goto exit_set_location;
+	imm = reg2->var_off.value;
+	/* Translator will skip all checks, we need to guarantee min pkt len */
+	if (imm > ETH_ZLEN - ETH_HLEN)
+		goto exit_set_location;
+	if (imm > (int)bpf->adjust_head.guaranteed_add ||
+	    imm < -bpf->adjust_head.guaranteed_sub)
+		goto exit_set_location;
+
+	if (nfp_prog->adjust_head_location) {
+		/* Only one call per program allowed */
+		if (nfp_prog->adjust_head_location != meta->n)
+			goto exit_set_location;
+
+		if (meta->arg2.var_off.value != imm)
+			goto exit_set_location;
+	}
+
+	location = meta->n;
+exit_set_location:
+	nfp_prog->adjust_head_location = location;
+}
+
+static int
+nfp_bpf_check_call(struct nfp_prog *nfp_prog, struct bpf_verifier_env *env,
+		   struct nfp_insn_meta *meta)
+{
+	const struct bpf_reg_state *reg2 = cur_regs(env) + BPF_REG_2;
+	struct nfp_app_bpf *bpf = nfp_prog->bpf;
+	u32 func_id = meta->insn.imm;
+
+	switch (func_id) {
+	case BPF_FUNC_xdp_adjust_head:
+		if (!bpf->adjust_head.off_max) {
+			pr_warn("adjust_head not supported by FW\n");
+			return -EOPNOTSUPP;
+		}
+		if (!(bpf->adjust_head.flags & NFP_BPF_ADJUST_HEAD_NO_META)) {
+			pr_warn("adjust_head: FW requires shifting metadata, not supported by the driver\n");
+			return -EOPNOTSUPP;
+		}
+
+		nfp_record_adjust_head(bpf, nfp_prog, meta, reg2);
+		break;
+	default:
+		pr_warn("unsupported function id: %d\n", func_id);
+		return -EOPNOTSUPP;
+	}
+
+	meta->arg2 = *reg2;
+
+	return 0;
 }
 
 static int
···
 		return -EINVAL;
 	}
 
+	if (meta->insn.code == (BPF_JMP | BPF_CALL))
+		return nfp_bpf_check_call(nfp_prog, env, meta);
 	if (meta->insn.code == (BPF_JMP | BPF_EXIT))
 		return nfp_bpf_check_exit(nfp_prog, env);
+2
drivers/net/ethernet/netronome/nfp/nfp_asm.h
···
 enum br_mask {
 	BR_BEQ = 0x00,
 	BR_BNE = 0x01,
+	BR_BMI = 0x02,
 	BR_BHS = 0x04,
 	BR_BLO = 0x05,
 	BR_BGE = 0x08,
···
 	ALU_OP_NONE	= 0x00,
 	ALU_OP_ADD	= 0x01,
 	ALU_OP_NOT	= 0x04,
+	ALU_OP_ADD_2B	= 0x05,
 	ALU_OP_AND	= 0x08,
 	ALU_OP_SUB_C	= 0x0d,
 	ALU_OP_ADD_C	= 0x11,
+1 -1
drivers/net/ethernet/netronome/nfp/nfp_net_common.c
···
 		if (nn->dp.bpf_offload_xdp)
 			xdp->prog_attached = XDP_ATTACHED_HW;
 		xdp->prog_id = nn->xdp_prog ? nn->xdp_prog->aux->id : 0;
-		xdp->flags = nn->xdp_prog ? nn->xdp_flags : 0;
+		xdp->prog_flags = nn->xdp_prog ? nn->xdp_flags : 0;
 		return 0;
 	case BPF_OFFLOAD_VERIFIER_PREP:
 		return nfp_app_bpf_verifier_prep(nn->app, nn, xdp);
+1
drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
···
 		      void *buffer, size_t length);
 int nfp_cpp_area_write(struct nfp_cpp_area *area, unsigned long offset,
 		       const void *buffer, size_t length);
+size_t nfp_cpp_area_size(struct nfp_cpp_area *area);
 const char *nfp_cpp_area_name(struct nfp_cpp_area *cpp_area);
 void *nfp_cpp_area_priv(struct nfp_cpp_area *cpp_area);
 struct nfp_cpp *nfp_cpp_area_cpp(struct nfp_cpp_area *cpp_area);
+11
drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
···
 }
 
 /**
+ * nfp_cpp_area_size() - return size of a CPP area
+ * @cpp_area:	CPP area handle
+ *
+ * Return: Size of the area
+ */
+size_t nfp_cpp_area_size(struct nfp_cpp_area *cpp_area)
+{
+	return cpp_area->size;
+}
+
+/**
  * nfp_cpp_area_name() - return name of a CPP area
  * @cpp_area:	CPP area handle
  *
+2
fs/btrfs/disk-io.c
···
 #include <linux/ratelimit.h>
 #include <linux/uuid.h>
 #include <linux/semaphore.h>
+#include <linux/bpf.h>
 #include <asm/unaligned.h>
 #include "ctree.h"
 #include "disk-io.h"
···
 		goto fail_block_groups;
 	goto retry_root_backup;
 }
+BPF_ALLOW_ERROR_INJECTION(open_ctree);
 
 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
+2
fs/btrfs/free-space-cache.c
···
 #include <linux/slab.h>
 #include <linux/math64.h>
 #include <linux/ratelimit.h>
+#include <linux/bpf.h>
 #include "ctree.h"
 #include "free-space-cache.h"
 #include "transaction.h"
···
 
 	return 0;
 }
+BPF_ALLOW_ERROR_INJECTION(io_ctl_init);
 
 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
 {
+10
include/asm-generic/vmlinux.lds.h
···
 #define KPROBE_BLACKLIST()
 #endif
 
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+#define ERROR_INJECT_LIST()	. = ALIGN(8);				       \
+			VMLINUX_SYMBOL(__start_kprobe_error_inject_list) = .;  \
+			KEEP(*(_kprobe_error_inject_list))		       \
+			VMLINUX_SYMBOL(__stop_kprobe_error_inject_list) = .;
+#else
+#define ERROR_INJECT_LIST()
+#endif
+
 #ifdef CONFIG_EVENT_TRACING
 #define FTRACE_EVENTS()	. = ALIGN(8);					\
 			VMLINUX_SYMBOL(__start_ftrace_events) = .;	\
···
 	FTRACE_EVENTS()							\
 	TRACE_SYSCALLS()						\
 	KPROBE_BLACKLIST()						\
+	ERROR_INJECT_LIST()						\
 	MEM_DISCARD(init.rodata)					\
 	CLK_OF_TABLES()							\
 	RESERVEDMEM_OF_TABLES()						\
+18
include/linux/bpf.h
··· 200 200 u32 max_ctx_offset; 201 201 u32 stack_depth; 202 202 u32 id; 203 + u32 func_cnt; 204 + struct bpf_prog **func; 205 + void *jit_data; /* JIT specific data. arch dependent */ 203 206 struct latch_tree_node ksym_tnode; 204 207 struct list_head ksym_lnode; 205 208 const struct bpf_prog_ops *ops; ··· 288 285 289 286 void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, 290 287 struct bpf_prog *old_prog); 288 + int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, 289 + __u32 __user *prog_ids, u32 request_cnt, 290 + __u32 __user *prog_cnt); 291 291 int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, 292 292 struct bpf_prog *exclude_prog, 293 293 struct bpf_prog *include_prog, ··· 405 399 406 400 /* verify correctness of eBPF program */ 407 401 int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); 402 + void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); 408 403 409 404 /* Map specifics */ 410 405 struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); ··· 582 575 /* Shared helpers among cBPF and eBPF. */ 583 576 void bpf_user_rnd_init_once(void); 584 577 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); 578 + 579 + #if defined(__KERNEL__) && !defined(__ASSEMBLY__) 580 + #ifdef CONFIG_BPF_KPROBE_OVERRIDE 581 + #define BPF_ALLOW_ERROR_INJECTION(fname) \ 582 + static unsigned long __used \ 583 + __attribute__((__section__("_kprobe_error_inject_list"))) \ 584 + _eil_addr_##fname = (unsigned long)fname; 585 + #else 586 + #define BPF_ALLOW_ERROR_INJECTION(fname) 587 + #endif 588 + #endif 585 589 586 590 #endif /* _LINUX_BPF_H */
+41 -4
include/linux/bpf_verifier.h
··· 76 76 s64 smax_value; /* maximum possible (s64)value */ 77 77 u64 umin_value; /* minimum possible (u64)value */ 78 78 u64 umax_value; /* maximum possible (u64)value */ 79 + /* Inside the callee two registers can be both PTR_TO_STACK like 80 + * R1=fp-8 and R2=fp-8, but one of them points to this function stack 81 + * while another to the caller's stack. To differentiate them 'frameno' 82 + * is used which is an index in bpf_verifier_state->frame[] array 83 + * pointing to bpf_func_state. 84 + * This field must be second to last, for states_equal() reasons. 85 + */ 86 + u32 frameno; 79 87 /* This field must be last, for states_equal() reasons. */ 80 88 enum bpf_reg_liveness live; 81 89 }; ··· 91 83 enum bpf_stack_slot_type { 92 84 STACK_INVALID, /* nothing was stored in this stack slot */ 93 85 STACK_SPILL, /* register spilled into stack */ 94 - STACK_MISC /* BPF program wrote some data into this slot */ 86 + STACK_MISC, /* BPF program wrote some data into this slot */ 87 + STACK_ZERO, /* BPF program wrote constant zero */ 95 88 }; 96 89 97 90 #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ ··· 105 96 /* state of the program: 106 97 * type of all registers and stack info 107 98 */ 108 - struct bpf_verifier_state { 99 + struct bpf_func_state { 109 100 struct bpf_reg_state regs[MAX_BPF_REG]; 110 101 struct bpf_verifier_state *parent; 102 + /* index of call instruction that called into this func */ 103 + int callsite; 104 + /* stack frame number of this function state from pov of 105 + * enclosing bpf_verifier_state. 106 + * 0 = main function, 1 = first callee. 107 + */ 108 + u32 frameno; 109 + /* subprog number == index within subprog_stack_depth 110 + * zero == main subprog 111 + */ 112 + u32 subprogno; 113 + 114 + /* should be second to last. 
See copy_func_state() */ 111 115 int allocated_stack; 112 116 struct bpf_stack_state *stack; 117 + }; 118 + 119 + #define MAX_CALL_FRAMES 8 120 + struct bpf_verifier_state { 121 + /* call stack tracking */ 122 + struct bpf_func_state *frame[MAX_CALL_FRAMES]; 123 + struct bpf_verifier_state *parent; 124 + u32 curframe; 113 125 }; 114 126 115 127 /* linked list of verifier states used to prune search */ ··· 143 113 union { 144 114 enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ 145 115 struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ 116 + s32 call_imm; /* saved imm field of call insn */ 146 117 }; 147 118 int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ 148 119 bool seen; /* this insn was processed by the verifier */ ··· 172 141 int insn_idx, int prev_insn_idx); 173 142 }; 174 143 144 + #define BPF_MAX_SUBPROGS 256 145 + 175 146 /* single container for all structs 176 147 * one verifier_env per bpf_check() call 177 148 */ ··· 192 159 bool allow_ptr_leaks; 193 160 bool seen_direct_write; 194 161 struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ 195 - 196 162 struct bpf_verifer_log log; 163 + u32 subprog_starts[BPF_MAX_SUBPROGS]; 164 + u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1]; 165 + u32 subprog_cnt; 197 166 }; 198 167 199 168 static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) 200 169 { 201 - return env->cur_state->regs; 170 + struct bpf_verifier_state *cur = env->cur_state; 171 + 172 + return cur->frame[cur->curframe]->regs; 202 173 } 203 174 204 175 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
+13 -3
include/linux/filter.h
··· 58 58 /* unused opcode to mark special call to bpf_tail_call() helper */ 59 59 #define BPF_TAIL_CALL 0xf0 60 60 61 + /* unused opcode to mark call to interpreter with arguments */ 62 + #define BPF_CALL_ARGS 0xe0 63 + 61 64 /* As per nm, we expose JITed images as text (code) section for 62 65 * kallsyms. That way, tools like perf can find it to match 63 66 * addresses. ··· 458 455 struct bpf_prog { 459 456 u16 pages; /* Number of allocated pages */ 460 457 u16 jited:1, /* Is our filter JIT'ed? */ 458 + jit_requested:1,/* archs need to JIT the prog */ 461 459 locked:1, /* Program image locked? */ 462 460 gpl_compatible:1, /* Is filter GPL compatible? */ 463 461 cb_access:1, /* Is control block accessed? */ 464 - dst_needed:1; /* Do we need dst entry? */ 462 + dst_needed:1, /* Do we need dst entry? */ 463 + blinded:1, /* Was blinded */ 464 + is_func:1, /* program is a bpf function */ 465 + kprobe_override:1; /* Do we override a kprobe? */ 465 466 enum bpf_prog_type type; /* Type of BPF program */ 466 467 u32 len; /* Number of filter blocks */ 467 468 u32 jited_len; /* Size of jited insns in bytes */ ··· 716 709 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); 717 710 718 711 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); 712 + #define __bpf_call_base_args \ 713 + ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ 714 + __bpf_call_base) 719 715 720 716 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); 721 717 void bpf_jit_compile(struct bpf_prog *prog); ··· 807 797 return fp->jited && bpf_jit_is_ebpf(); 808 798 } 809 799 810 - static inline bool bpf_jit_blinding_enabled(void) 800 + static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) 811 801 { 812 802 /* These are the prerequisites, should someone ever have the 813 803 * idea to call blinding outside of them, we make sure to ··· 815 805 */ 816 806 if (!bpf_jit_is_ebpf()) 817 807 return false; 818 - if (!bpf_jit_enable) 808 + if (!prog->jit_requested) 
819 809 return false; 820 810 if (!bpf_jit_harden) 821 811 return false;
+1
include/linux/kprobes.h
··· 271 271 extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset); 272 272 273 273 extern bool within_kprobe_blacklist(unsigned long addr); 274 + extern bool within_kprobe_error_injection_list(unsigned long addr); 274 275 275 276 struct kprobe_insn_cache { 276 277 struct mutex mutex;
+5
include/linux/module.h
··· 475 475 ctor_fn_t *ctors; 476 476 unsigned int num_ctors; 477 477 #endif 478 + 479 + #ifdef CONFIG_BPF_KPROBE_OVERRIDE 480 + unsigned int num_kprobe_ei_funcs; 481 + unsigned long *kprobe_ei_funcs; 482 + #endif 478 483 } ____cacheline_aligned __randomize_layout; 479 484 #ifndef MODULE_ARCH_INIT 480 485 #define MODULE_ARCH_INIT {}
+7
include/linux/trace_events.h
··· 467 467 unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); 468 468 int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); 469 469 void perf_event_detach_bpf_prog(struct perf_event *event); 470 + int perf_event_query_prog_array(struct perf_event *event, void __user *info); 470 471 #else 471 472 static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) 472 473 { ··· 482 481 483 482 static inline void perf_event_detach_bpf_prog(struct perf_event *event) { } 484 483 484 + static inline int 485 + perf_event_query_prog_array(struct perf_event *event, void __user *info) 486 + { 487 + return -EOPNOTSUPP; 488 + } 485 489 #endif 486 490 487 491 enum { ··· 534 528 struct perf_event; 535 529 536 530 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); 531 + DECLARE_PER_CPU(int, bpf_kprobe_override); 537 532 538 533 extern int perf_trace_init(struct perf_event *event); 539 534 extern void perf_trace_destroy(struct perf_event *event);
+12 -1
include/uapi/linux/bpf.h
··· 197 197 */ 198 198 #define BPF_F_STRICT_ALIGNMENT (1U << 0) 199 199 200 + /* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ 200 201 #define BPF_PSEUDO_MAP_FD 1 202 + 203 + /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative 204 + * offset to another bpf function 205 + */ 206 + #define BPF_PSEUDO_CALL 1 201 207 202 208 /* flags for BPF_MAP_UPDATE_ELEM command */ 203 209 #define BPF_ANY 0 /* create new element or update existing */ ··· 683 677 * @buf: buf to fill 684 678 * @buf_size: size of the buf 685 679 * Return : 0 on success or negative error code 680 + * 681 + * int bpf_override_return(pt_regs, rc) 682 + * @pt_regs: pointer to struct pt_regs 683 + * @rc: the return value to set 686 684 */ 687 685 #define __BPF_FUNC_MAPPER(FN) \ 688 686 FN(unspec), \ ··· 746 736 FN(xdp_adjust_meta), \ 747 737 FN(perf_event_read_value), \ 748 738 FN(perf_prog_read_value), \ 749 - FN(getsockopt), 739 + FN(getsockopt), \ 740 + FN(override_return), 750 741 751 742 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 752 743 * function eBPF program intends to call
+22
include/uapi/linux/perf_event.h
··· 418 418 __u16 __reserved_2; /* align to __u64 */ 419 419 }; 420 420 421 + /* 422 + * Structure used by below PERF_EVENT_IOC_QUERY_BPF command 423 + * to query bpf programs attached to the same perf tracepoint 424 + * as the given perf event. 425 + */ 426 + struct perf_event_query_bpf { 427 + /* 428 + * The below ids array length 429 + */ 430 + __u32 ids_len; 431 + /* 432 + * Set by the kernel to indicate the number of 433 + * available programs 434 + */ 435 + __u32 prog_cnt; 436 + /* 437 + * User provided buffer to store program ids 438 + */ 439 + __u32 ids[0]; 440 + }; 441 + 421 442 #define perf_flags(attr) (*(&(attr)->read_format + 1)) 422 443 423 444 /* ··· 454 433 #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) 455 434 #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) 456 435 #define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) 436 + #define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) 457 437 458 438 enum perf_event_ioc_flags { 459 439 PERF_IOC_FLAG_GROUP = 1U << 0,
+109 -19
kernel/bpf/core.c
··· 94 94 fp->pages = size / PAGE_SIZE; 95 95 fp->aux = aux; 96 96 fp->aux->prog = fp; 97 + fp->jit_requested = ebpf_jit_enabled(); 97 98 98 99 INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); 99 100 ··· 218 217 return 0; 219 218 } 220 219 221 - static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) 222 - { 223 - return BPF_CLASS(insn->code) == BPF_JMP && 224 - /* Call and Exit are both special jumps with no 225 - * target inside the BPF instruction image. 226 - */ 227 - BPF_OP(insn->code) != BPF_CALL && 228 - BPF_OP(insn->code) != BPF_EXIT; 229 - } 230 - 231 220 static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta) 232 221 { 233 222 struct bpf_insn *insn = prog->insnsi; 234 223 u32 i, insn_cnt = prog->len; 224 + bool pseudo_call; 225 + u8 code; 226 + int off; 235 227 236 228 for (i = 0; i < insn_cnt; i++, insn++) { 237 - if (!bpf_is_jmp_and_has_target(insn)) 229 + code = insn->code; 230 + if (BPF_CLASS(code) != BPF_JMP) 238 231 continue; 232 + if (BPF_OP(code) == BPF_EXIT) 233 + continue; 234 + if (BPF_OP(code) == BPF_CALL) { 235 + if (insn->src_reg == BPF_PSEUDO_CALL) 236 + pseudo_call = true; 237 + else 238 + continue; 239 + } else { 240 + pseudo_call = false; 241 + } 242 + off = pseudo_call ? insn->imm : insn->off; 239 243 240 244 /* Adjust offset of jmps if we cross boundaries. 
*/ 241 - if (i < pos && i + insn->off + 1 > pos) 242 - insn->off += delta; 243 - else if (i > pos + delta && i + insn->off + 1 <= pos + delta) 244 - insn->off -= delta; 245 + if (i < pos && i + off + 1 > pos) 246 + off += delta; 247 + else if (i > pos + delta && i + off + 1 <= pos + delta) 248 + off -= delta; 249 + 250 + if (pseudo_call) 251 + insn->imm = off; 252 + else 253 + insn->off = off; 245 254 } 246 255 } 247 256 ··· 722 711 struct bpf_insn *insn; 723 712 int i, rewritten; 724 713 725 - if (!bpf_jit_blinding_enabled()) 714 + if (!bpf_jit_blinding_enabled(prog) || prog->blinded) 726 715 return prog; 727 716 728 717 clone = bpf_prog_clone_create(prog, GFP_USER); ··· 764 753 i += insn_delta; 765 754 } 766 755 756 + clone->blinded = 1; 767 757 return clone; 768 758 } 769 759 #endif /* CONFIG_BPF_JIT */ ··· 786 774 * 787 775 * Decode and execute eBPF instructions. 788 776 */ 789 - static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, 790 - u64 *stack) 777 + static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) 791 778 { 792 779 u64 tmp; 793 780 static const void *jumptable[256] = { ··· 846 835 [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, 847 836 /* Call instruction */ 848 837 [BPF_JMP | BPF_CALL] = &&JMP_CALL, 838 + [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, 849 839 [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, 850 840 /* Jumps */ 851 841 [BPF_JMP | BPF_JA] = &&JMP_JA, ··· 1035 1023 */ 1036 1024 BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, 1037 1025 BPF_R4, BPF_R5); 1026 + CONT; 1027 + 1028 + JMP_CALL_ARGS: 1029 + BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, 1030 + BPF_R3, BPF_R4, 1031 + BPF_R5, 1032 + insn + insn->off + 1); 1038 1033 CONT; 1039 1034 1040 1035 JMP_TAIL_CALL: { ··· 1316 1297 return ___bpf_prog_run(regs, insn, stack); \ 1317 1298 } 1318 1299 1300 + #define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size 1301 + #define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \ 1302 + 
static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ 1303 + const struct bpf_insn *insn) \ 1304 + { \ 1305 + u64 stack[stack_size / sizeof(u64)]; \ 1306 + u64 regs[MAX_BPF_REG]; \ 1307 + \ 1308 + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ 1309 + BPF_R1 = r1; \ 1310 + BPF_R2 = r2; \ 1311 + BPF_R3 = r3; \ 1312 + BPF_R4 = r4; \ 1313 + BPF_R5 = r5; \ 1314 + return ___bpf_prog_run(regs, insn, stack); \ 1315 + } 1316 + 1319 1317 #define EVAL1(FN, X) FN(X) 1320 1318 #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) 1321 1319 #define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) ··· 1344 1308 EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); 1345 1309 EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); 1346 1310 1311 + EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192); 1312 + EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384); 1313 + EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512); 1314 + 1347 1315 #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), 1348 1316 1349 1317 static unsigned int (*interpreters[])(const void *ctx, ··· 1356 1316 EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) 1357 1317 EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) 1358 1318 }; 1319 + #undef PROG_NAME_LIST 1320 + #define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), 1321 + static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, 1322 + const struct bpf_insn *insn) = { 1323 + EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) 1324 + EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) 1325 + EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) 1326 + }; 1327 + #undef PROG_NAME_LIST 1328 + 1329 + void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) 1330 + { 1331 + stack_depth = max_t(u32, stack_depth, 1); 1332 + insn->off = (s16) insn->imm; 1333 + insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - 1334 + __bpf_call_base_args; 1335 + insn->code = BPF_JMP | BPF_CALL_ARGS; 1336 + } 1359 1337 1360 1338 
bool bpf_prog_array_compatible(struct bpf_array *array, 1361 1339 const struct bpf_prog *fp) 1362 1340 { 1341 + if (fp->kprobe_override) 1342 + return false; 1343 + 1363 1344 if (!array->owner_prog_type) { 1364 1345 /* There's no owner yet where we could check for 1365 1346 * compatibility. ··· 1523 1462 rcu_read_lock(); 1524 1463 prog = rcu_dereference(progs)->progs; 1525 1464 for (; *prog; prog++) { 1465 + if (*prog == &dummy_bpf_prog.prog) 1466 + continue; 1526 1467 id = (*prog)->aux->id; 1527 1468 if (copy_to_user(prog_ids + i, &id, sizeof(id))) { 1528 1469 rcu_read_unlock(); ··· 1608 1545 return 0; 1609 1546 } 1610 1547 1548 + int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, 1549 + __u32 __user *prog_ids, u32 request_cnt, 1550 + __u32 __user *prog_cnt) 1551 + { 1552 + u32 cnt = 0; 1553 + 1554 + if (array) 1555 + cnt = bpf_prog_array_length(array); 1556 + 1557 + if (copy_to_user(prog_cnt, &cnt, sizeof(cnt))) 1558 + return -EFAULT; 1559 + 1560 + /* return early if user requested only program count or nothing to copy */ 1561 + if (!request_cnt || !cnt) 1562 + return 0; 1563 + 1564 + return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt); 1565 + } 1566 + 1611 1567 static void bpf_prog_free_deferred(struct work_struct *work) 1612 1568 { 1613 1569 struct bpf_prog_aux *aux; 1570 + int i; 1614 1571 1615 1572 aux = container_of(work, struct bpf_prog_aux, work); 1616 1573 if (bpf_prog_is_dev_bound(aux)) 1617 1574 bpf_prog_offload_destroy(aux->prog); 1618 - bpf_jit_free(aux->prog); 1575 + for (i = 0; i < aux->func_cnt; i++) 1576 + bpf_jit_free(aux->func[i]); 1577 + if (aux->func_cnt) { 1578 + kfree(aux->func); 1579 + bpf_prog_unlock_free(aux->prog); 1580 + } else { 1581 + bpf_jit_free(aux->prog); 1582 + } 1619 1583 } 1620 1584 1621 1585 /* Free internal BPF program */
+6 -2
kernel/bpf/disasm.c
··· 189 189 u8 opcode = BPF_OP(insn->code); 190 190 191 191 if (opcode == BPF_CALL) { 192 - verbose(env, "(%02x) call %s#%d\n", insn->code, 193 - func_id_name(insn->imm), insn->imm); 192 + if (insn->src_reg == BPF_PSEUDO_CALL) 193 + verbose(env, "(%02x) call pc%+d\n", insn->code, 194 + insn->imm); 195 + else 196 + verbose(env, "(%02x) call %s#%d\n", insn->code, 197 + func_id_name(insn->imm), insn->imm); 194 198 } else if (insn->code == (BPF_JMP | BPF_JA)) { 195 199 verbose(env, "(%02x) goto pc%+d\n", 196 200 insn->code, insn->off);
+2 -1
kernel/bpf/syscall.c
··· 1194 1194 goto free_used_maps; 1195 1195 1196 1196 /* eBPF program is ready to be JITed */ 1197 - prog = bpf_prog_select_runtime(prog, &err); 1197 + if (!prog->bpf_func) 1198 + prog = bpf_prog_select_runtime(prog, &err); 1198 1199 if (err < 0) 1199 1200 goto free_used_maps; 1200 1201
+951 -177
kernel/bpf/verifier.c
··· 20 20 #include <linux/file.h> 21 21 #include <linux/vmalloc.h> 22 22 #include <linux/stringify.h> 23 + #include <linux/bsearch.h> 24 + #include <linux/sort.h> 23 25 24 26 #include "disasm.h" 25 27 ··· 229 227 verbose(env, "w"); 230 228 } 231 229 232 - static void print_verifier_state(struct bpf_verifier_env *env, 233 - struct bpf_verifier_state *state) 230 + static struct bpf_func_state *func(struct bpf_verifier_env *env, 231 + const struct bpf_reg_state *reg) 234 232 { 235 - struct bpf_reg_state *reg; 233 + struct bpf_verifier_state *cur = env->cur_state; 234 + 235 + return cur->frame[reg->frameno]; 236 + } 237 + 238 + static void print_verifier_state(struct bpf_verifier_env *env, 239 + const struct bpf_func_state *state) 240 + { 241 + const struct bpf_reg_state *reg; 236 242 enum bpf_reg_type t; 237 243 int i; 238 244 245 + if (state->frameno) 246 + verbose(env, " frame%d:", state->frameno); 239 247 for (i = 0; i < MAX_BPF_REG; i++) { 240 248 reg = &state->regs[i]; 241 249 t = reg->type; ··· 258 246 tnum_is_const(reg->var_off)) { 259 247 /* reg->off should be 0 for SCALAR_VALUE */ 260 248 verbose(env, "%lld", reg->var_off.value + reg->off); 249 + if (t == PTR_TO_STACK) 250 + verbose(env, ",call_%d", func(env, reg)->callsite); 261 251 } else { 262 252 verbose(env, "(id=%d", reg->id); 263 253 if (t != SCALAR_VALUE) ··· 311 297 verbose(env, "=%s", 312 298 reg_type_str[state->stack[i].spilled_ptr.type]); 313 299 } 300 + if (state->stack[i].slot_type[0] == STACK_ZERO) 301 + verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE); 314 302 } 315 303 verbose(env, "\n"); 316 304 } 317 305 318 - static int copy_stack_state(struct bpf_verifier_state *dst, 319 - const struct bpf_verifier_state *src) 306 + static int copy_stack_state(struct bpf_func_state *dst, 307 + const struct bpf_func_state *src) 320 308 { 321 309 if (!src->stack) 322 310 return 0; ··· 334 318 335 319 /* do_check() starts with zero-sized stack in struct bpf_verifier_state to 336 320 * make it consume minimal 
amount of memory. check_stack_write() access from 337 - * the program calls into realloc_verifier_state() to grow the stack size. 321 + * the program calls into realloc_func_state() to grow the stack size. 338 322 * Note there is a non-zero 'parent' pointer inside bpf_verifier_state 339 323 * which this function copies over. It points to previous bpf_verifier_state 340 324 * which is never reallocated 341 325 */ 342 - static int realloc_verifier_state(struct bpf_verifier_state *state, int size, 343 - bool copy_old) 326 + static int realloc_func_state(struct bpf_func_state *state, int size, 327 + bool copy_old) 344 328 { 345 329 u32 old_size = state->allocated_stack; 346 330 struct bpf_stack_state *new_stack; ··· 373 357 return 0; 374 358 } 375 359 360 + static void free_func_state(struct bpf_func_state *state) 361 + { 362 + kfree(state->stack); 363 + kfree(state); 364 + } 365 + 376 366 static void free_verifier_state(struct bpf_verifier_state *state, 377 367 bool free_self) 378 368 { 379 - kfree(state->stack); 369 + int i; 370 + 371 + for (i = 0; i <= state->curframe; i++) { 372 + free_func_state(state->frame[i]); 373 + state->frame[i] = NULL; 374 + } 380 375 if (free_self) 381 376 kfree(state); 382 377 } ··· 395 368 /* copy verifier state from src to dst growing dst stack space 396 369 * when necessary to accommodate larger src stack 397 370 */ 398 - static int copy_verifier_state(struct bpf_verifier_state *dst, 399 - const struct bpf_verifier_state *src) 371 + static int copy_func_state(struct bpf_func_state *dst, 372 + const struct bpf_func_state *src) 400 373 { 401 374 int err; 402 375 403 - err = realloc_verifier_state(dst, src->allocated_stack, false); 376 + err = realloc_func_state(dst, src->allocated_stack, false); 404 377 if (err) 405 378 return err; 406 - memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); 379 + memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack)); 407 380 return copy_stack_state(dst, src); 381 + } 382 + 
383 + static int copy_verifier_state(struct bpf_verifier_state *dst_state, 384 + const struct bpf_verifier_state *src) 385 + { 386 + struct bpf_func_state *dst; 387 + int i, err; 388 + 389 + /* if dst has more stack frames then src frame, free them */ 390 + for (i = src->curframe + 1; i <= dst_state->curframe; i++) { 391 + free_func_state(dst_state->frame[i]); 392 + dst_state->frame[i] = NULL; 393 + } 394 + dst_state->curframe = src->curframe; 395 + dst_state->parent = src->parent; 396 + for (i = 0; i <= src->curframe; i++) { 397 + dst = dst_state->frame[i]; 398 + if (!dst) { 399 + dst = kzalloc(sizeof(*dst), GFP_KERNEL); 400 + if (!dst) 401 + return -ENOMEM; 402 + dst_state->frame[i] = dst; 403 + } 404 + err = copy_func_state(dst, src->frame[i]); 405 + if (err) 406 + return err; 407 + } 408 + return 0; 408 409 } 409 410 410 411 static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, ··· 496 441 static const int caller_saved[CALLER_SAVED_REGS] = { 497 442 BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 498 443 }; 444 + #define CALLEE_SAVED_REGS 5 445 + static const int callee_saved[CALLEE_SAVED_REGS] = { 446 + BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9 447 + }; 499 448 500 449 static void __mark_reg_not_init(struct bpf_reg_state *reg); 501 450 ··· 522 463 static void __mark_reg_known_zero(struct bpf_reg_state *reg) 523 464 { 524 465 __mark_reg_known(reg, 0); 466 + } 467 + 468 + static void __mark_reg_const_zero(struct bpf_reg_state *reg) 469 + { 470 + __mark_reg_known(reg, 0); 471 + reg->off = 0; 472 + reg->type = SCALAR_VALUE; 525 473 } 526 474 527 475 static void mark_reg_known_zero(struct bpf_verifier_env *env, ··· 642 576 reg->id = 0; 643 577 reg->off = 0; 644 578 reg->var_off = tnum_unknown; 579 + reg->frameno = 0; 645 580 __mark_reg_unbounded(reg); 646 581 } 647 582 ··· 679 612 } 680 613 681 614 static void init_reg_state(struct bpf_verifier_env *env, 682 - struct bpf_reg_state *regs) 615 + struct bpf_func_state *state) 683 616 
{ 617 + struct bpf_reg_state *regs = state->regs; 684 618 int i; 685 619 686 620 for (i = 0; i < MAX_BPF_REG; i++) { ··· 692 624 /* frame pointer */ 693 625 regs[BPF_REG_FP].type = PTR_TO_STACK; 694 626 mark_reg_known_zero(env, regs, BPF_REG_FP); 627 + regs[BPF_REG_FP].frameno = state->frameno; 695 628 696 629 /* 1st arg to a function */ 697 630 regs[BPF_REG_1].type = PTR_TO_CTX; 698 631 mark_reg_known_zero(env, regs, BPF_REG_1); 632 + } 633 + 634 + #define BPF_MAIN_FUNC (-1) 635 + static void init_func_state(struct bpf_verifier_env *env, 636 + struct bpf_func_state *state, 637 + int callsite, int frameno, int subprogno) 638 + { 639 + state->callsite = callsite; 640 + state->frameno = frameno; 641 + state->subprogno = subprogno; 642 + init_reg_state(env, state); 699 643 } 700 644 701 645 enum reg_arg_type { ··· 716 636 DST_OP_NO_MARK /* same as above, check only, don't mark */ 717 637 }; 718 638 719 - static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) 639 + static int cmp_subprogs(const void *a, const void *b) 720 640 { 721 - struct bpf_verifier_state *parent = state->parent; 641 + return *(int *)a - *(int *)b; 642 + } 643 + 644 + static int find_subprog(struct bpf_verifier_env *env, int off) 645 + { 646 + u32 *p; 647 + 648 + p = bsearch(&off, env->subprog_starts, env->subprog_cnt, 649 + sizeof(env->subprog_starts[0]), cmp_subprogs); 650 + if (!p) 651 + return -ENOENT; 652 + return p - env->subprog_starts; 653 + 654 + } 655 + 656 + static int add_subprog(struct bpf_verifier_env *env, int off) 657 + { 658 + int insn_cnt = env->prog->len; 659 + int ret; 660 + 661 + if (off >= insn_cnt || off < 0) { 662 + verbose(env, "call to invalid destination\n"); 663 + return -EINVAL; 664 + } 665 + ret = find_subprog(env, off); 666 + if (ret >= 0) 667 + return 0; 668 + if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { 669 + verbose(env, "too many subprograms\n"); 670 + return -E2BIG; 671 + } 672 + env->subprog_starts[env->subprog_cnt++] = off; 673 + 
sort(env->subprog_starts, env->subprog_cnt, 674 + sizeof(env->subprog_starts[0]), cmp_subprogs, NULL); 675 + return 0; 676 + } 677 + 678 + static int check_subprogs(struct bpf_verifier_env *env) 679 + { 680 + int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; 681 + struct bpf_insn *insn = env->prog->insnsi; 682 + int insn_cnt = env->prog->len; 683 + 684 + /* determine subprog starts. The end is one before the next starts */ 685 + for (i = 0; i < insn_cnt; i++) { 686 + if (insn[i].code != (BPF_JMP | BPF_CALL)) 687 + continue; 688 + if (insn[i].src_reg != BPF_PSEUDO_CALL) 689 + continue; 690 + if (!env->allow_ptr_leaks) { 691 + verbose(env, "function calls to other bpf functions are allowed for root only\n"); 692 + return -EPERM; 693 + } 694 + if (bpf_prog_is_dev_bound(env->prog->aux)) { 695 + verbose(env, "function calls in offloaded programs are not supported yet\n"); 696 + return -EINVAL; 697 + } 698 + ret = add_subprog(env, i + insn[i].imm + 1); 699 + if (ret < 0) 700 + return ret; 701 + } 702 + 703 + if (env->log.level > 1) 704 + for (i = 0; i < env->subprog_cnt; i++) 705 + verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]); 706 + 707 + /* now check that all jumps are within the same subprog */ 708 + subprog_start = 0; 709 + if (env->subprog_cnt == cur_subprog) 710 + subprog_end = insn_cnt; 711 + else 712 + subprog_end = env->subprog_starts[cur_subprog++]; 713 + for (i = 0; i < insn_cnt; i++) { 714 + u8 code = insn[i].code; 715 + 716 + if (BPF_CLASS(code) != BPF_JMP) 717 + goto next; 718 + if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) 719 + goto next; 720 + off = i + insn[i].off + 1; 721 + if (off < subprog_start || off >= subprog_end) { 722 + verbose(env, "jump out of range from insn %d to %d\n", i, off); 723 + return -EINVAL; 724 + } 725 + next: 726 + if (i == subprog_end - 1) { 727 + /* to avoid fall-through from one subprog into another 728 + * the last insn of the subprog should be either exit 729 + * or unconditional jump back
+	 */
+			if (code != (BPF_JMP | BPF_EXIT) &&
+			    code != (BPF_JMP | BPF_JA)) {
+				verbose(env, "last insn is not an exit or jmp\n");
+				return -EINVAL;
+			}
+			subprog_start = subprog_end;
+			if (env->subprog_cnt == cur_subprog)
+				subprog_end = insn_cnt;
+			else
+				subprog_end = env->subprog_starts[cur_subprog++];
+		}
+	}
+	return 0;
+}
+
+struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env,
+				       const struct bpf_verifier_state *state,
+				       struct bpf_verifier_state *parent,
+				       u32 regno)
+{
+	struct bpf_verifier_state *tmp = NULL;
+
+	/* 'parent' could be a state of caller and
+	 * 'state' could be a state of callee. In such case
+	 * parent->curframe < state->curframe
+	 * and it's ok for r1 - r5 registers
+	 *
+	 * 'parent' could be a callee's state after it bpf_exit-ed.
+	 * In such case parent->curframe > state->curframe
+	 * and it's ok for r0 only
+	 */
+	if (parent->curframe == state->curframe ||
+	    (parent->curframe < state->curframe &&
+	     regno >= BPF_REG_1 && regno <= BPF_REG_5) ||
+	    (parent->curframe > state->curframe &&
+	     regno == BPF_REG_0))
+		return parent;
+
+	if (parent->curframe > state->curframe &&
+	    regno >= BPF_REG_6) {
+		/* for callee saved regs we have to skip the whole chain
+		 * of states that belong to callee and mark as LIVE_READ
+		 * the registers before the call
+		 */
+		tmp = parent;
+		while (tmp && tmp->curframe != state->curframe) {
+			tmp = tmp->parent;
+		}
+		if (!tmp)
+			goto bug;
+		parent = tmp;
+	} else {
+		goto bug;
+	}
+	return parent;
+bug:
+	verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp);
+	verbose(env, "regno %d parent frame %d current frame %d\n",
+		regno, parent->curframe, state->curframe);
+	return 0;
+}
+
+static int mark_reg_read(struct bpf_verifier_env *env,
+			 const struct bpf_verifier_state *state,
+			 struct bpf_verifier_state *parent,
+			 u32 regno)
+{
+	bool writes = parent == state->parent; /* Observe write marks */
 
 	if (regno == BPF_REG_FP)
 		/* We don't need to worry about FP liveness because it's read-only */
-		return;
+		return 0;
 
 	while (parent) {
 		/* if read wasn't screened by an earlier write ... */
-		if (state->regs[regno].live & REG_LIVE_WRITTEN)
+		if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN)
 			break;
+		parent = skip_callee(env, state, parent, regno);
+		if (!parent)
+			return -EFAULT;
 		/* ... then we depend on parent's value */
-		parent->regs[regno].live |= REG_LIVE_READ;
+		parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ;
 		state = parent;
 		parent = state->parent;
+		writes = true;
 	}
+	return 0;
 }
 
 static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 			 enum reg_arg_type t)
 {
-	struct bpf_reg_state *regs = env->cur_state->regs;
+	struct bpf_verifier_state *vstate = env->cur_state;
+	struct bpf_func_state *state = vstate->frame[vstate->curframe];
+	struct bpf_reg_state *regs = state->regs;
 
 	if (regno >= MAX_BPF_REG) {
 		verbose(env, "R%d is invalid\n", regno);
···
 			verbose(env, "R%d !read_ok\n", regno);
 			return -EACCES;
 		}
-		mark_reg_read(env->cur_state, regno);
+		return mark_reg_read(env, vstate, vstate->parent, regno);
 	} else {
 		/* check whether register used as dest operand can be written to */
 		if (regno == BPF_REG_FP) {
···
 	}
 }
 
+/* Does this register contain a constant zero? */
+static bool register_is_null(struct bpf_reg_state *reg)
+{
+	return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
+}
+
 /* check_stack_read/write functions track spill/fill of registers,
  * stack boundary and alignment are checked in check_mem_access()
  */
 static int check_stack_write(struct bpf_verifier_env *env,
-			     struct bpf_verifier_state *state, int off,
-			     int size, int value_regno)
+			     struct bpf_func_state *state, /* func where register points to */
+			     int off, int size, int value_regno)
 {
+	struct bpf_func_state *cur; /* state of the current function */
 	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
+	enum bpf_reg_type type;
 
-	err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE),
-				     true);
+	err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
+				 true);
 	if (err)
 		return err;
 	/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
···
 		return -EACCES;
 	}
 
+	cur = env->cur_state->frame[env->cur_state->curframe];
 	if (value_regno >= 0 &&
-	    is_spillable_regtype(state->regs[value_regno].type)) {
+	    is_spillable_regtype((type = cur->regs[value_regno].type))) {
 
 		/* register containing pointer is being spilled into stack */
 		if (size != BPF_REG_SIZE) {
···
 			return -EACCES;
 		}
 
+		if (state != cur && type == PTR_TO_STACK) {
+			verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
+			return -EINVAL;
+		}
+
 		/* save register state */
-		state->stack[spi].spilled_ptr = state->regs[value_regno];
+		state->stack[spi].spilled_ptr = cur->regs[value_regno];
 		state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
 
 		for (i = 0; i < BPF_REG_SIZE; i++)
 			state->stack[spi].slot_type[i] = STACK_SPILL;
 	} else {
+		u8 type = STACK_MISC;
+
 		/* regular write of data into stack */
 		state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
 
+		/* only mark the slot as written if all 8 bytes were written
+		 * otherwise read propagation may incorrectly stop too soon
+		 * when stack slots are partially written.
+		 * This heuristic means that read propagation will be
+		 * conservative, since it will add reg_live_read marks
+		 * to stack slots all the way to first state when programs
+		 * writes+reads less than 8 bytes
+		 */
+		if (size == BPF_REG_SIZE)
+			state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
+		/* when we zero initialize stack slots mark them as such */
+		if (value_regno >= 0 &&
+		    register_is_null(&cur->regs[value_regno]))
+			type = STACK_ZERO;
+
 		for (i = 0; i < size; i++)
 			state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
-				STACK_MISC;
+				type;
 	}
 	return 0;
 }
 
-static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot)
+/* registers of every function are unique and mark_reg_read() propagates
+ * the liveness in the following cases:
+ * - from callee into caller for R1 - R5 that were used as arguments
+ * - from caller into callee for R0 that used as result of the call
+ * - from caller to the same caller skipping states of the callee for R6 - R9,
+ *   since R6 - R9 are callee saved by implicit function prologue and
+ *   caller's R6 != callee's R6, so when we propagate liveness up to
+ *   parent states we need to skip callee states for R6 - R9.
+ *
+ * stack slot marking is different, since stacks of caller and callee are
+ * accessible in both (since caller can pass a pointer to caller's stack to
+ * callee which can pass it to another function), hence mark_stack_slot_read()
+ * has to propagate the stack liveness to all parent states at given frame number.
+ * Consider code:
+ * f1() {
+ *   ptr = fp - 8;
+ *   *ptr = ctx;
+ *   call f2 {
+ *      .. = *ptr;
+ *   }
+ *   .. = *ptr;
+ * }
+ * First *ptr is reading from f1's stack and mark_stack_slot_read() has
+ * to mark liveness at the f1's frame and not f2's frame.
+ * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has
+ * to propagate liveness to f2 states at f1's frame level and further into
+ * f1 states at f1's frame level until write into that stack slot
+ */
+static void mark_stack_slot_read(struct bpf_verifier_env *env,
+				 const struct bpf_verifier_state *state,
+				 struct bpf_verifier_state *parent,
+				 int slot, int frameno)
 {
-	struct bpf_verifier_state *parent = state->parent;
+	bool writes = parent == state->parent; /* Observe write marks */
 
 	while (parent) {
+		if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE)
+			/* since LIVE_WRITTEN mark is only done for full 8-byte
+			 * write the read marks are conservative and parent
+			 * state may not even have the stack allocated. In such case
+			 * end the propagation, since the loop reached beginning
+			 * of the function
+			 */
+			break;
 		/* if read wasn't screened by an earlier write ... */
-		if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
+		if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
 			break;
 		/* ... then we depend on parent's value */
-		parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
+		parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
 		state = parent;
 		parent = state->parent;
+		writes = true;
 	}
 }
 
 static int check_stack_read(struct bpf_verifier_env *env,
-			    struct bpf_verifier_state *state, int off, int size,
-			    int value_regno)
+			    struct bpf_func_state *reg_state /* func where register points to */,
+			    int off, int size, int value_regno)
 {
+	struct bpf_verifier_state *vstate = env->cur_state;
+	struct bpf_func_state *state = vstate->frame[vstate->curframe];
 	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
 	u8 *stype;
 
-	if (state->allocated_stack <= slot) {
+	if (reg_state->allocated_stack <= slot) {
 		verbose(env, "invalid read from stack off %d+0 size %d\n",
 			off, size);
 		return -EACCES;
 	}
-	stype = state->stack[spi].slot_type;
+	stype = reg_state->stack[spi].slot_type;
 
 	if (stype[0] == STACK_SPILL) {
 		if (size != BPF_REG_SIZE) {
···
 
 		if (value_regno >= 0) {
 			/* restore register state from stack */
-			state->regs[value_regno] = state->stack[spi].spilled_ptr;
+			state->regs[value_regno] = reg_state->stack[spi].spilled_ptr;
 			/* mark reg as written since spilled pointer state likely
 			 * has its liveness marks cleared by is_state_visited()
 			 * which resets stack/reg liveness for state transitions
 			 */
 			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
-			mark_stack_slot_read(state, spi);
 		}
+		mark_stack_slot_read(env, vstate, vstate->parent, spi,
+				     reg_state->frameno);
 		return 0;
 	} else {
+		int zeros = 0;
+
 		for (i = 0; i < size; i++) {
-			if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) {
-				verbose(env,
-					"invalid read from stack off %d+%d size %d\n",
-					off, i, size);
-				return -EACCES;
+			if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC)
+				continue;
+			if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) {
+				zeros++;
+				continue;
 			}
+			verbose(env, "invalid read from stack off %d+%d size %d\n",
+				off, i, size);
+			return -EACCES;
 		}
-		if (value_regno >= 0)
-			/* have read misc data from the stack */
-			mark_reg_unknown(env, state->regs, value_regno);
+		mark_stack_slot_read(env, vstate, vstate->parent, spi,
+				     reg_state->frameno);
+		if (value_regno >= 0) {
+			if (zeros == size) {
+				/* any size read into register is zero extended,
+				 * so the whole register == const_zero
+				 */
+				__mark_reg_const_zero(&state->regs[value_regno]);
+			} else {
+				/* have read misc data from the stack */
+				mark_reg_unknown(env, state->regs, value_regno);
+			}
+			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+		}
 		return 0;
 	}
 }
···
 static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 			    int off, int size, bool zero_size_allowed)
 {
-	struct bpf_verifier_state *state = env->cur_state;
+	struct bpf_verifier_state *vstate = env->cur_state;
+	struct bpf_func_state *state = vstate->frame[vstate->curframe];
 	struct bpf_reg_state *reg = &state->regs[regno];
 	int err;
 
···
 					   strict);
 }
 
+static int update_stack_depth(struct bpf_verifier_env *env,
+			      const struct bpf_func_state *func,
+			      int off)
+{
+	u16 stack = env->subprog_stack_depth[func->subprogno], total = 0;
+	struct bpf_verifier_state *cur = env->cur_state;
+	int i;
+
+	if (stack >= -off)
+		return 0;
+
+	/* update known max for given subprogram */
+	env->subprog_stack_depth[func->subprogno] = -off;
+
+	/* compute the total for current call chain */
+	for (i = 0; i <= cur->curframe; i++) {
+		u32 depth = env->subprog_stack_depth[cur->frame[i]->subprogno];
+
+		/* round up to 32-bytes, since this is granularity
+		 * of interpreter stack sizes
+		 */
+		depth = round_up(depth, 32);
+		total += depth;
+	}
+
+	if (total > MAX_BPF_STACK) {
+		verbose(env, "combined stack size of %d calls is %d. Too large\n",
+			cur->curframe, total);
+		return -EACCES;
+	}
+	return 0;
+}
+
+static int get_callee_stack_depth(struct bpf_verifier_env *env,
+				  const struct bpf_insn *insn, int idx)
+{
+	int start = idx + insn->imm + 1, subprog;
+
+	subprog = find_subprog(env, start);
+	if (subprog < 0) {
+		WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
+			  start);
+		return -EFAULT;
+	}
+	subprog++;
+	return env->subprog_stack_depth[subprog];
+}
+
 /* check whether memory at (regno + off) is accessible for t = (read | write)
  * if t==write, value_regno is a register which value is stored into memory
  * if t==read, value_regno is a register which will receive the value from memory
···
 			    int bpf_size, enum bpf_access_type t,
 			    int value_regno)
 {
-	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = regs + regno;
+	struct bpf_func_state *state;
 	int size, err = 0;
 
 	size = bpf_size_to_bytes(bpf_size);
···
 			return -EACCES;
 		}
 
-		if (env->prog->aux->stack_depth < -off)
-			env->prog->aux->stack_depth = -off;
+		state = func(env, reg);
+		err = update_stack_depth(env, state, off);
+		if (err)
+			return err;
 
 		if (t == BPF_WRITE)
 			err = check_stack_write(env, state, off, size,
···
 				   BPF_SIZE(insn->code), BPF_WRITE, -1);
 }
 
-/* Does this register contain a constant zero? */
-static bool register_is_null(struct bpf_reg_state *reg)
-{
-	return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
-}
-
 /* when register 'regno' is passed into function that will read 'access_size'
  * bytes from that pointer, make sure that it's within stack boundary
  * and all elements of stack are initialized.
···
 				struct bpf_call_arg_meta *meta)
 {
 	struct bpf_reg_state *reg = cur_regs(env) + regno;
-	struct bpf_verifier_state *state = env->cur_state;
+	struct bpf_func_state *state = func(env, reg);
 	int off, i, slot, spi;
 
 	if (reg->type != PTR_TO_STACK) {
···
 		return -EACCES;
 	}
 
-	if (env->prog->aux->stack_depth < -off)
-		env->prog->aux->stack_depth = -off;
-
 	if (meta && meta->raw_mode) {
 		meta->access_size = access_size;
 		meta->regno = regno;
···
 	}
 
 	for (i = 0; i < access_size; i++) {
+		u8 *stype;
+
 		slot = -(off + i) - 1;
 		spi = slot / BPF_REG_SIZE;
-		if (state->allocated_stack <= slot ||
-		    state->stack[spi].slot_type[slot % BPF_REG_SIZE] !=
-		    STACK_MISC) {
-			verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
-				off, i, access_size);
-			return -EACCES;
+		if (state->allocated_stack <= slot)
+			goto err;
+		stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
+		if (*stype == STACK_MISC)
+			goto mark;
+		if (*stype == STACK_ZERO) {
+			/* helper can write anything into the stack */
+			*stype = STACK_MISC;
+			goto mark;
 		}
+err:
+		verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
+			off, i, access_size);
+		return -EACCES;
+mark:
+		/* reading any byte out of 8-byte 'spill_slot' will cause
+		 * the whole slot to be marked as 'read'
+		 */
+		mark_stack_slot_read(env, env->cur_state, env->cur_state->parent,
+				     spi, state->frameno);
 	}
-	return 0;
+	return update_stack_depth(env, state, off);
 }
 
 static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
···
 	case BPF_FUNC_tail_call:
 		if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
 			goto error;
+		if (env->subprog_cnt) {
+			verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n");
+			return -EINVAL;
+		}
 		break;
 	case BPF_FUNC_perf_event_read:
 	case BPF_FUNC_perf_event_output:
···
 /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
  * are now invalid, so turn them into unknown SCALAR_VALUE.
  */
-static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
+static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
+				     struct bpf_func_state *state)
 {
-	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_reg_state *regs = state->regs, *reg;
 	int i;
 
···
 	}
 }
 
-static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
+static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
+{
+	struct bpf_verifier_state *vstate = env->cur_state;
+	int i;
+
+	for (i = 0; i <= vstate->curframe; i++)
+		__clear_all_pkt_pointers(env, vstate->frame[i]);
+}
+
+static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+			   int *insn_idx)
+{
+	struct bpf_verifier_state *state = env->cur_state;
+	struct bpf_func_state *caller, *callee;
+	int i, subprog, target_insn;
+
+	if (state->curframe >= MAX_CALL_FRAMES) {
+		verbose(env, "the call stack of %d frames is too deep\n",
+			state->curframe);
+		return -E2BIG;
+	}
+
+	target_insn = *insn_idx + insn->imm;
+	subprog = find_subprog(env, target_insn + 1);
+	if (subprog < 0) {
+		verbose(env, "verifier bug. No program starts at insn %d\n",
+			target_insn + 1);
+		return -EFAULT;
+	}
+
+	caller = state->frame[state->curframe];
+	if (state->frame[state->curframe + 1]) {
+		verbose(env, "verifier bug. Frame %d already allocated\n",
+			state->curframe + 1);
+		return -EFAULT;
+	}
+
+	callee = kzalloc(sizeof(*callee), GFP_KERNEL);
+	if (!callee)
+		return -ENOMEM;
+	state->frame[state->curframe + 1] = callee;
+
+	/* callee cannot access r0, r6 - r9 for reading and has to write
+	 * into its own stack before reading from it.
+	 * callee can read/write into caller's stack
+	 */
+	init_func_state(env, callee,
+			/* remember the callsite, it will be used by bpf_exit */
+			*insn_idx /* callsite */,
+			state->curframe + 1 /* frameno within this callchain */,
+			subprog + 1 /* subprog number within this prog */);
+
+	/* copy r1 - r5 args that callee can access */
+	for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+		callee->regs[i] = caller->regs[i];
+
+	/* after the call regsiters r0 - r5 were scratched */
+	for (i = 0; i < CALLER_SAVED_REGS; i++) {
+		mark_reg_not_init(env, caller->regs, caller_saved[i]);
+		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+	}
+
+	/* only increment it after check_reg_arg() finished */
+	state->curframe++;
+
+	/* and go analyze first insn of the callee */
+	*insn_idx = target_insn;
+
+	if (env->log.level) {
+		verbose(env, "caller:\n");
+		print_verifier_state(env, caller);
+		verbose(env, "callee:\n");
+		print_verifier_state(env, callee);
+	}
+	return 0;
+}
+
+static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
+{
+	struct bpf_verifier_state *state = env->cur_state;
+	struct bpf_func_state *caller, *callee;
+	struct bpf_reg_state *r0;
+
+	callee = state->frame[state->curframe];
+	r0 = &callee->regs[BPF_REG_0];
+	if (r0->type == PTR_TO_STACK) {
+		/* technically it's ok to return caller's stack pointer
+		 * (or caller's caller's pointer) back to the caller,
+		 * since these pointers are valid. Only current stack
+		 * pointer will be invalid as soon as function exits,
+		 * but let's be conservative
+		 */
+		verbose(env, "cannot return stack pointer to the caller\n");
+		return -EINVAL;
+	}
+
+	state->curframe--;
+	caller = state->frame[state->curframe];
+	/* return to the caller whatever r0 had in the callee */
+	caller->regs[BPF_REG_0] = *r0;
+
+	*insn_idx = callee->callsite + 1;
+	if (env->log.level) {
+		verbose(env, "returning from callee:\n");
+		print_verifier_state(env, callee);
+		verbose(env, "to caller at %d:\n", *insn_idx);
+		print_verifier_state(env, caller);
+	}
+	/* clear everything in the callee */
+	free_func_state(callee);
+	state->frame[state->curframe + 1] = NULL;
+	return 0;
+}
+
+static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 {
 	const struct bpf_func_proto *fn = NULL;
 	struct bpf_reg_state *regs;
···
 			const struct bpf_reg_state *ptr_reg,
 			const struct bpf_reg_state *off_reg)
 {
-	struct bpf_reg_state *regs = cur_regs(env), *dst_reg;
+	struct bpf_verifier_state *vstate = env->cur_state;
+	struct bpf_func_state *state = vstate->frame[vstate->curframe];
+	struct bpf_reg_state *regs = state->regs, *dst_reg;
 	bool known = tnum_is_const(off_reg->var_off);
 	s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
 	    smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
···
 	dst_reg = &regs[dst];
 
 	if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
-		print_verifier_state(env, env->cur_state);
+		print_verifier_state(env, state);
 		verbose(env,
 			"verifier internal error: known but bad sbounds\n");
 		return -EINVAL;
 	}
 	if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
-		print_verifier_state(env, env->cur_state);
+		print_verifier_state(env, state);
 		verbose(env,
 			"verifier internal error: known but bad ubounds\n");
 		return -EINVAL;
···
 static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 				   struct bpf_insn *insn)
 {
-	struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
+	struct bpf_verifier_state *vstate = env->cur_state;
+	struct bpf_func_state *state = vstate->frame[vstate->curframe];
+	struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
 	struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
 	u8 opcode = BPF_OP(insn->code);
 	int rc;
···
 
 	/* Got here implies adding two SCALAR_VALUEs */
 	if (WARN_ON_ONCE(ptr_reg)) {
-		print_verifier_state(env, env->cur_state);
+		print_verifier_state(env, state);
 		verbose(env, "verifier internal error: unexpected ptr_reg\n");
 		return -EINVAL;
 	}
 	if (WARN_ON(!src_reg)) {
-		print_verifier_state(env, env->cur_state);
+		print_verifier_state(env, state);
 		verbose(env, "verifier internal error: no src_reg\n");
 		return -EINVAL;
 	}
···
 	return 0;
 }
 
-static void find_good_pkt_pointers(struct bpf_verifier_state *state,
+static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 				   struct bpf_reg_state *dst_reg,
 				   enum bpf_reg_type type,
 				   bool range_right_open)
 {
+	struct bpf_func_state *state = vstate->frame[vstate->curframe];
 	struct bpf_reg_state *regs = state->regs, *reg;
 	u16 new_range;
-	int i;
+	int i, j;
 
 	if (dst_reg->off < 0 ||
 	    (dst_reg->off == 0 && range_right_open))
···
 		/* keep the maximum range already checked */
 		regs[i].range = max(regs[i].range, new_range);
 
-	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-		if (state->stack[i].slot_type[0] != STACK_SPILL)
-			continue;
-		reg = &state->stack[i].spilled_ptr;
-		if (reg->type == type && reg->id == dst_reg->id)
-			reg->range = max(reg->range, new_range);
+	for (j = 0; j <= vstate->curframe; j++) {
+		state = vstate->frame[j];
+		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+			if (state->stack[i].slot_type[0] != STACK_SPILL)
+				continue;
+			reg = &state->stack[i].spilled_ptr;
+			if (reg->type == type && reg->id == dst_reg->id)
+				reg->range = max(reg->range, new_range);
+		}
 	}
 }
 
···
 /* The logic is similar to find_good_pkt_pointers(), both could eventually
  * be folded together at some point.
  */
-static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
+static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
 			  bool is_null)
 {
+	struct bpf_func_state *state = vstate->frame[vstate->curframe];
 	struct bpf_reg_state *regs = state->regs;
 	u32 id = regs[regno].id;
-	int i;
+	int i, j;
 
 	for (i = 0; i < MAX_BPF_REG; i++)
 		mark_map_reg(regs, i, id, is_null);
 
-	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-		if (state->stack[i].slot_type[0] != STACK_SPILL)
-			continue;
-		mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
+	for (j = 0; j <= vstate->curframe; j++) {
+		state = vstate->frame[j];
+		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+			if (state->stack[i].slot_type[0] != STACK_SPILL)
+				continue;
+			mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
+		}
 	}
 }
 
···
 static int check_cond_jmp_op(struct bpf_verifier_env *env,
 			     struct bpf_insn *insn, int *insn_idx)
 {
-	struct bpf_verifier_state *other_branch, *this_branch = env->cur_state;
-	struct bpf_reg_state *regs = this_branch->regs, *dst_reg;
+	struct bpf_verifier_state *this_branch = env->cur_state;
+	struct bpf_verifier_state *other_branch;
+	struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
+	struct bpf_reg_state *dst_reg, *other_branch_regs;
 	u8 opcode = BPF_OP(insn->code);
 	int err;
 
···
 	other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
 	if (!other_branch)
 		return -EFAULT;
+	other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
 
 	/* detect if we are comparing against a constant value so we can adjust
 	 * our min/max values for our dst register.
···
 		if (dst_reg->type == SCALAR_VALUE &&
 		    regs[insn->src_reg].type == SCALAR_VALUE) {
 			if (tnum_is_const(regs[insn->src_reg].var_off))
-				reg_set_min_max(&other_branch->regs[insn->dst_reg],
+				reg_set_min_max(&other_branch_regs[insn->dst_reg],
 						dst_reg, regs[insn->src_reg].var_off.value,
 						opcode);
 			else if (tnum_is_const(dst_reg->var_off))
-				reg_set_min_max_inv(&other_branch->regs[insn->src_reg],
+				reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
 						    &regs[insn->src_reg],
 						    dst_reg->var_off.value, opcode);
 			else if (opcode == BPF_JEQ || opcode == BPF_JNE)
 				/* Comparing for equality, we can combine knowledge */
-				reg_combine_min_max(&other_branch->regs[insn->src_reg],
-						    &other_branch->regs[insn->dst_reg],
+				reg_combine_min_max(&other_branch_regs[insn->src_reg],
+						    &other_branch_regs[insn->dst_reg],
 						    &regs[insn->src_reg],
 						    &regs[insn->dst_reg], opcode);
 		}
 	} else if (dst_reg->type == SCALAR_VALUE) {
-		reg_set_min_max(&other_branch->regs[insn->dst_reg],
+		reg_set_min_max(&other_branch_regs[insn->dst_reg],
 					dst_reg, insn->imm, opcode);
 	}
 
···
 		return -EACCES;
 	}
 	if (env->log.level)
-		print_verifier_state(env, this_branch);
+		print_verifier_state(env, this_branch->frame[this_branch->curframe]);
 	return 0;
 }
 
···
 
 	if (!may_access_skb(env->prog->type)) {
 		verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
+		return -EINVAL;
+	}
+
+	if (env->subprog_cnt) {
+		/* when program has LD_ABS insn JITs and interpreter assume
+		 * that r1 == ctx == skb which is not the case for callees
+		 * that can have arbitrary arguments. It's problematic
+		 * for main prog as well since JITs would need to analyze
+		 * all functions in order to make proper register save/restore
+		 * decisions in the main prog. Hence disallow LD_ABS with calls
+		 */
+		verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n");
 		return -EINVAL;
 	}
 
···
 	int ret = 0;
 	int i, t;
 
+	ret = check_subprogs(env);
+	if (ret < 0)
+		return ret;
+
 	insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
 	if (!insn_state)
 		return -ENOMEM;
···
 				goto err_free;
 			if (t + 1 < insn_cnt)
 				env->explored_states[t + 1] = STATE_LIST_MARK;
+			if (insns[t].src_reg == BPF_PSEUDO_CALL) {
+				env->explored_states[t] = STATE_LIST_MARK;
+				ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
+				if (ret == 1)
+					goto peek_stack;
+				else if (ret < 0)
+					goto err_free;
+			}
 		} else if (opcode == BPF_JA) {
 			if (BPF_SRC(insns[t].code) != BPF_K) {
 				ret = -EINVAL;
···
 static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 		    struct idpair *idmap)
 {
+	bool equal;
+
 	if (!(rold->live & REG_LIVE_READ))
 		/* explored state didn't use this */
 		return true;
 
-	if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0)
+	equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0;
+
+	if (rold->type == PTR_TO_STACK)
+		/* two stack pointers are equal only if they're pointing to
+		 * the same stack frame, since fp-8 in foo != fp-8 in bar
+		 */
+		return equal && rold->frameno == rcur->frameno;
+
+	if (equal)
 		return true;
 
 	if (rold->type == NOT_INIT)
···
 		       tnum_in(rold->var_off, rcur->var_off);
 	case PTR_TO_CTX:
 	case CONST_PTR_TO_MAP:
-	case PTR_TO_STACK:
 	case PTR_TO_PACKET_END:
 		/* Only valid matches are exact, which memcmp() above
 		 * would have accepted
···
 		return false;
 	}
 
-static bool stacksafe(struct bpf_verifier_state *old,
-		      struct bpf_verifier_state *cur,
+static bool stacksafe(struct bpf_func_state *old,
+		      struct bpf_func_state *cur,
 		      struct idpair *idmap)
 {
 	int i, spi;
···
 	for (i = 0; i < old->allocated_stack; i++) {
 		spi = i / BPF_REG_SIZE;
 
+		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ))
+			/* explored state didn't use this */
+			return true;
+
 		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
+			continue;
+		/* if old state was safe with misc data in the stack
+		 * it will be safe with zero-initialized stack.
+		 * The opposite is not true
+		 */
+		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC &&
+		    cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO)
 			continue;
 		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
 		    cur->stack[spi].slot_type[i % BPF_REG_SIZE])
···
  * whereas register type in current state is meaningful, it means that
  * the current state will reach 'bpf_exit' instruction safely
  */
-static bool states_equal(struct bpf_verifier_env *env,
-			 struct bpf_verifier_state *old,
-			 struct bpf_verifier_state *cur)
+static bool func_states_equal(struct bpf_func_state *old,
+			      struct bpf_func_state *cur)
 {
 	struct idpair *idmap;
 	bool ret = false;
···
 	return ret;
 }
 
-/* A write screens off any subsequent reads; but write marks come from the
- * straight-line code between a state and its parent. When we arrive at a
- * jump target (in the first iteration of the propagate_liveness() loop),
- * we didn't arrive by the straight-line code, so read marks in state must
- * propagate to parent regardless of state's write marks.
- */
-static bool do_propagate_liveness(const struct bpf_verifier_state *state,
-				  struct bpf_verifier_state *parent)
+static bool states_equal(struct bpf_verifier_env *env,
+			 struct bpf_verifier_state *old,
+			 struct bpf_verifier_state *cur)
 {
-	bool writes = parent == state->parent; /* Observe write marks */
-	bool touched = false; /* any changes made? */
 	int i;
 
-	if (!parent)
-		return touched;
+	if (old->curframe != cur->curframe)
+		return false;
+
+	/* for states to be equal callsites have to be the same
+	 * and all frame states need to be equivalent
+	 */
+	for (i = 0; i <= old->curframe; i++) {
+		if (old->frame[i]->callsite != cur->frame[i]->callsite)
+			return false;
+		if (!func_states_equal(old->frame[i], cur->frame[i]))
+			return false;
+	}
+	return true;
+}
+
+/* A write screens off any subsequent reads; but write marks come from the
+ * straight-line code between a state and its parent. When we arrive at an
+ * equivalent state (jump target or such) we didn't arrive by the straight-line
+ * code, so read marks in the state must propagate to the parent regardless
+ * of the state's write marks. That's what 'parent == state->parent' comparison
+ * in mark_reg_read() and mark_stack_slot_read() is for.
+ */
+static int propagate_liveness(struct bpf_verifier_env *env,
+			      const struct bpf_verifier_state *vstate,
+			      struct bpf_verifier_state *vparent)
+{
+	int i, frame, err = 0;
+	struct bpf_func_state *state, *parent;
+
+	if (vparent->curframe != vstate->curframe) {
+		WARN(1, "propagate_live: parent frame %d current frame %d\n",
+		     vparent->curframe, vstate->curframe);
+		return -EFAULT;
+	}
 	/* Propagate read liveness of registers... */
 	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
 	/* We don't need to worry about FP liveness because it's read-only */
 	for (i = 0; i < BPF_REG_FP; i++) {
-		if (parent->regs[i].live & REG_LIVE_READ)
+		if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)
 			continue;
-		if (writes && (state->regs[i].live & REG_LIVE_WRITTEN))
-			continue;
-		if (state->regs[i].live & REG_LIVE_READ) {
-			parent->regs[i].live |= REG_LIVE_READ;
-			touched = true;
+		if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) {
+			err = mark_reg_read(env, vstate, vparent, i);
+			if (err)
+				return err;
 		}
 	}
-	/* ... and stack slots */
-	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
-	     i < parent->allocated_stack / BPF_REG_SIZE; i++) {
-		if (parent->stack[i].slot_type[0] != STACK_SPILL)
-			continue;
-		if (state->stack[i].slot_type[0] != STACK_SPILL)
-			continue;
-		if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
-			continue;
-		if (writes &&
-		    (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN))
-			continue;
-		if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) {
-			parent->stack[i].spilled_ptr.live |= REG_LIVE_READ;
-			touched = true;
-		}
-	}
-	return touched;
-}
 
-/* "parent" is "a state from which we reach the current state", but initially
- * it is not the state->parent (i.e. "the state whose straight-line code leads
- * to the current state"), instead it is the state that happened to arrive at
- * a (prunable) equivalent of the current state. See comment above
- * do_propagate_liveness() for consequences of this.
- * This function is just a more efficient way of calling mark_reg_read() or
- * mark_stack_slot_read() on each reg in "parent" that is read in "state",
- * though it requires that parent != state->parent in the call arguments.
- */
-static void propagate_liveness(const struct bpf_verifier_state *state,
-			       struct bpf_verifier_state *parent)
-{
-	while (do_propagate_liveness(state, parent)) {
-		/* Something changed, so we need to feed those changes onward */
-		state = parent;
-		parent = state->parent;
+	/* ...
and stack slots */ 3686 + for (frame = 0; frame <= vstate->curframe; frame++) { 3687 + state = vstate->frame[frame]; 3688 + parent = vparent->frame[frame]; 3689 + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && 3690 + i < parent->allocated_stack / BPF_REG_SIZE; i++) { 3691 + if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) 3692 + continue; 3693 + if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) 3694 + mark_stack_slot_read(env, vstate, vparent, i, frame); 3695 + } 4265 3696 } 3697 + return err; 4266 3698 } 4267 3699 4268 3700 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) ··· 4271 3701 struct bpf_verifier_state_list *new_sl; 4272 3702 struct bpf_verifier_state_list *sl; 4273 3703 struct bpf_verifier_state *cur = env->cur_state; 4274 - int i, err; 3704 + int i, j, err; 4275 3705 4276 3706 sl = env->explored_states[insn_idx]; 4277 3707 if (!sl) ··· 4292 3722 * they'll be immediately forgotten as we're pruning 4293 3723 * this state and will pop a new one. 4294 3724 */ 4295 - propagate_liveness(&sl->state, cur); 3725 + err = propagate_liveness(env, &sl->state, cur); 3726 + if (err) 3727 + return err; 4296 3728 return 1; 4297 3729 } 4298 3730 sl = sl->next; ··· 4302 3730 4303 3731 /* there were no equivalent states, remember current one. 4304 3732 * technically the current state is not proven to be safe yet, 4305 - * but it will either reach bpf_exit (which means it's safe) or 4306 - * it will be rejected. Since there are no loops, we won't be 4307 - * seeing this 'insn_idx' instruction again on the way to bpf_exit 3733 + * but it will either reach outer most bpf_exit (which means it's safe) 3734 + * or it will be rejected. Since there are no loops, we won't be 3735 + * seeing this tuple (frame[0].callsite, frame[1].callsite, .. 
insn_idx) 3736 + * again on the way to bpf_exit 4308 3737 */ 4309 3738 new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); 4310 3739 if (!new_sl) ··· 4329 3756 * explored_states can get read marks.) 4330 3757 */ 4331 3758 for (i = 0; i < BPF_REG_FP; i++) 4332 - cur->regs[i].live = REG_LIVE_NONE; 4333 - for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++) 4334 - if (cur->stack[i].slot_type[0] == STACK_SPILL) 4335 - cur->stack[i].spilled_ptr.live = REG_LIVE_NONE; 3759 + cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE; 3760 + 3761 + /* all stack frames are accessible from callee, clear them all */ 3762 + for (j = 0; j <= cur->curframe; j++) { 3763 + struct bpf_func_state *frame = cur->frame[j]; 3764 + 3765 + for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) 3766 + frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; 3767 + } 4336 3768 return 0; 4337 3769 } 4338 3770 ··· 4355 3777 struct bpf_verifier_state *state; 4356 3778 struct bpf_insn *insns = env->prog->insnsi; 4357 3779 struct bpf_reg_state *regs; 4358 - int insn_cnt = env->prog->len; 3780 + int insn_cnt = env->prog->len, i; 4359 3781 int insn_idx, prev_insn_idx = 0; 4360 3782 int insn_processed = 0; 4361 3783 bool do_print_state = false; ··· 4363 3785 state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); 4364 3786 if (!state) 4365 3787 return -ENOMEM; 4366 - env->cur_state = state; 4367 - init_reg_state(env, state->regs); 3788 + state->curframe = 0; 4368 3789 state->parent = NULL; 3790 + state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); 3791 + if (!state->frame[0]) { 3792 + kfree(state); 3793 + return -ENOMEM; 3794 + } 3795 + env->cur_state = state; 3796 + init_func_state(env, state->frame[0], 3797 + BPF_MAIN_FUNC /* callsite */, 3798 + 0 /* frameno */, 3799 + 0 /* subprogno, zero == main subprog */); 4369 3800 insn_idx = 0; 4370 3801 for (;;) { 4371 3802 struct bpf_insn *insn; ··· 4421 3834 else 4422 3835 verbose(env, "\nfrom %d to %d:", 4423 
3836 prev_insn_idx, insn_idx); 4424 - print_verifier_state(env, state); 3837 + print_verifier_state(env, state->frame[state->curframe]); 4425 3838 do_print_state = false; 4426 3839 } 4427 3840 ··· 4554 3967 if (opcode == BPF_CALL) { 4555 3968 if (BPF_SRC(insn->code) != BPF_K || 4556 3969 insn->off != 0 || 4557 - insn->src_reg != BPF_REG_0 || 3970 + (insn->src_reg != BPF_REG_0 && 3971 + insn->src_reg != BPF_PSEUDO_CALL) || 4558 3972 insn->dst_reg != BPF_REG_0) { 4559 3973 verbose(env, "BPF_CALL uses reserved fields\n"); 4560 3974 return -EINVAL; 4561 3975 } 4562 3976 4563 - err = check_call(env, insn->imm, insn_idx); 3977 + if (insn->src_reg == BPF_PSEUDO_CALL) 3978 + err = check_func_call(env, insn, &insn_idx); 3979 + else 3980 + err = check_helper_call(env, insn->imm, insn_idx); 4564 3981 if (err) 4565 3982 return err; 4566 3983 ··· 4587 3996 insn->dst_reg != BPF_REG_0) { 4588 3997 verbose(env, "BPF_EXIT uses reserved fields\n"); 4589 3998 return -EINVAL; 3999 + } 4000 + 4001 + if (state->curframe) { 4002 + /* exit from nested function */ 4003 + prev_insn_idx = insn_idx; 4004 + err = prepare_func_exit(env, &insn_idx); 4005 + if (err) 4006 + return err; 4007 + do_print_state = true; 4008 + continue; 4590 4009 } 4591 4010 4592 4011 /* eBPF calling convetion is such that R0 is used ··· 4659 4058 insn_idx++; 4660 4059 } 4661 4060 4662 - verbose(env, "processed %d insns, stack depth %d\n", insn_processed, 4663 - env->prog->aux->stack_depth); 4061 + verbose(env, "processed %d insns, stack depth ", insn_processed); 4062 + for (i = 0; i < env->subprog_cnt + 1; i++) { 4063 + u32 depth = env->subprog_stack_depth[i]; 4064 + 4065 + verbose(env, "%d", depth); 4066 + if (i + 1 < env->subprog_cnt + 1) 4067 + verbose(env, "+"); 4068 + } 4069 + verbose(env, "\n"); 4070 + env->prog->aux->stack_depth = env->subprog_stack_depth[0]; 4664 4071 return 0; 4665 4072 } 4666 4073 ··· 4854 4245 return 0; 4855 4246 } 4856 4247 4248 + static void adjust_subprog_starts(struct bpf_verifier_env 
*env, u32 off, u32 len) 4249 + { 4250 + int i; 4251 + 4252 + if (len == 1) 4253 + return; 4254 + for (i = 0; i < env->subprog_cnt; i++) { 4255 + if (env->subprog_starts[i] < off) 4256 + continue; 4257 + env->subprog_starts[i] += len - 1; 4258 + } 4259 + } 4260 + 4857 4261 static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, 4858 4262 const struct bpf_insn *patch, u32 len) 4859 4263 { ··· 4877 4255 return NULL; 4878 4256 if (adjust_insn_aux_data(env, new_prog->len, off, len)) 4879 4257 return NULL; 4258 + adjust_subprog_starts(env, off, len); 4880 4259 return new_prog; 4881 4260 } 4882 4261 ··· 5012 4389 return 0; 5013 4390 } 5014 4391 4392 + static int jit_subprogs(struct bpf_verifier_env *env) 4393 + { 4394 + struct bpf_prog *prog = env->prog, **func, *tmp; 4395 + int i, j, subprog_start, subprog_end = 0, len, subprog; 4396 + struct bpf_insn *insn = prog->insnsi; 4397 + void *old_bpf_func; 4398 + int err = -ENOMEM; 4399 + 4400 + if (env->subprog_cnt == 0) 4401 + return 0; 4402 + 4403 + for (i = 0; i < prog->len; i++, insn++) { 4404 + if (insn->code != (BPF_JMP | BPF_CALL) || 4405 + insn->src_reg != BPF_PSEUDO_CALL) 4406 + continue; 4407 + subprog = find_subprog(env, i + insn->imm + 1); 4408 + if (subprog < 0) { 4409 + WARN_ONCE(1, "verifier bug. 
No program starts at insn %d\n", 4410 + i + insn->imm + 1); 4411 + return -EFAULT; 4412 + } 4413 + /* temporarily remember subprog id inside insn instead of 4414 + * aux_data, since next loop will split up all insns into funcs 4415 + */ 4416 + insn->off = subprog + 1; 4417 + /* remember original imm in case JIT fails and fallback 4418 + * to interpreter will be needed 4419 + */ 4420 + env->insn_aux_data[i].call_imm = insn->imm; 4421 + /* point imm to __bpf_call_base+1 from JITs point of view */ 4422 + insn->imm = 1; 4423 + } 4424 + 4425 + func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL); 4426 + if (!func) 4427 + return -ENOMEM; 4428 + 4429 + for (i = 0; i <= env->subprog_cnt; i++) { 4430 + subprog_start = subprog_end; 4431 + if (env->subprog_cnt == i) 4432 + subprog_end = prog->len; 4433 + else 4434 + subprog_end = env->subprog_starts[i]; 4435 + 4436 + len = subprog_end - subprog_start; 4437 + func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); 4438 + if (!func[i]) 4439 + goto out_free; 4440 + memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], 4441 + len * sizeof(struct bpf_insn)); 4442 + func[i]->len = len; 4443 + func[i]->is_func = 1; 4444 + /* Use bpf_prog_F_tag to indicate functions in stack traces. 
4445 + * Long term would need debug info to populate names 4446 + */ 4447 + func[i]->aux->name[0] = 'F'; 4448 + func[i]->aux->stack_depth = env->subprog_stack_depth[i]; 4449 + func[i]->jit_requested = 1; 4450 + func[i] = bpf_int_jit_compile(func[i]); 4451 + if (!func[i]->jited) { 4452 + err = -ENOTSUPP; 4453 + goto out_free; 4454 + } 4455 + cond_resched(); 4456 + } 4457 + /* at this point all bpf functions were successfully JITed 4458 + * now populate all bpf_calls with correct addresses and 4459 + * run last pass of JIT 4460 + */ 4461 + for (i = 0; i <= env->subprog_cnt; i++) { 4462 + insn = func[i]->insnsi; 4463 + for (j = 0; j < func[i]->len; j++, insn++) { 4464 + if (insn->code != (BPF_JMP | BPF_CALL) || 4465 + insn->src_reg != BPF_PSEUDO_CALL) 4466 + continue; 4467 + subprog = insn->off; 4468 + insn->off = 0; 4469 + insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) 4470 + func[subprog]->bpf_func - 4471 + __bpf_call_base; 4472 + } 4473 + } 4474 + for (i = 0; i <= env->subprog_cnt; i++) { 4475 + old_bpf_func = func[i]->bpf_func; 4476 + tmp = bpf_int_jit_compile(func[i]); 4477 + if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { 4478 + verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); 4479 + err = -EFAULT; 4480 + goto out_free; 4481 + } 4482 + cond_resched(); 4483 + } 4484 + 4485 + /* finally lock prog and jit images for all functions and 4486 + * populate kallsysm 4487 + */ 4488 + for (i = 0; i <= env->subprog_cnt; i++) { 4489 + bpf_prog_lock_ro(func[i]); 4490 + bpf_prog_kallsyms_add(func[i]); 4491 + } 4492 + prog->jited = 1; 4493 + prog->bpf_func = func[0]->bpf_func; 4494 + prog->aux->func = func; 4495 + prog->aux->func_cnt = env->subprog_cnt + 1; 4496 + return 0; 4497 + out_free: 4498 + for (i = 0; i <= env->subprog_cnt; i++) 4499 + if (func[i]) 4500 + bpf_jit_free(func[i]); 4501 + kfree(func); 4502 + /* cleanup main prog to be interpreted */ 4503 + prog->jit_requested = 0; 4504 + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { 4505 + 
if (insn->code != (BPF_JMP | BPF_CALL) || 4506 + insn->src_reg != BPF_PSEUDO_CALL) 4507 + continue; 4508 + insn->off = 0; 4509 + insn->imm = env->insn_aux_data[i].call_imm; 4510 + } 4511 + return err; 4512 + } 4513 + 4514 + static int fixup_call_args(struct bpf_verifier_env *env) 4515 + { 4516 + struct bpf_prog *prog = env->prog; 4517 + struct bpf_insn *insn = prog->insnsi; 4518 + int i, depth; 4519 + 4520 + if (env->prog->jit_requested) 4521 + if (jit_subprogs(env) == 0) 4522 + return 0; 4523 + 4524 + for (i = 0; i < prog->len; i++, insn++) { 4525 + if (insn->code != (BPF_JMP | BPF_CALL) || 4526 + insn->src_reg != BPF_PSEUDO_CALL) 4527 + continue; 4528 + depth = get_callee_stack_depth(env, insn, i); 4529 + if (depth < 0) 4530 + return depth; 4531 + bpf_patch_call_args(insn, depth); 4532 + } 4533 + return 0; 4534 + } 4535 + 5015 4536 /* fixup insn->imm field of bpf_call instructions 5016 4537 * and inline eligible helpers as explicit sequence of BPF instructions 5017 4538 * ··· 5175 4408 for (i = 0; i < insn_cnt; i++, insn++) { 5176 4409 if (insn->code != (BPF_JMP | BPF_CALL)) 5177 4410 continue; 4411 + if (insn->src_reg == BPF_PSEUDO_CALL) 4412 + continue; 5178 4413 5179 4414 if (insn->imm == BPF_FUNC_get_route_realm) 5180 4415 prog->dst_needed = 1; 5181 4416 if (insn->imm == BPF_FUNC_get_prandom_u32) 5182 4417 bpf_user_rnd_init_once(); 4418 + if (insn->imm == BPF_FUNC_override_return) 4419 + prog->kprobe_override = 1; 5183 4420 if (insn->imm == BPF_FUNC_tail_call) { 5184 4421 /* If we tail call into other programs, we 5185 4422 * cannot make any assumptions since they can ··· 5206 4435 /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup 5207 4436 * handlers are currently limited to 64 bit only. 
5208 4437 */ 5209 - if (ebpf_jit_enabled() && BITS_PER_LONG == 64 && 4438 + if (prog->jit_requested && BITS_PER_LONG == 64 && 5210 4439 insn->imm == BPF_FUNC_map_lookup_elem) { 5211 4440 map_ptr = env->insn_aux_data[i + delta].map_ptr; 5212 4441 if (map_ptr == BPF_MAP_PTR_POISON || ··· 5358 4587 if (!env->explored_states) 5359 4588 goto skip_full_check; 5360 4589 4590 + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); 4591 + 5361 4592 ret = check_cfg(env); 5362 4593 if (ret < 0) 5363 4594 goto skip_full_check; 5364 - 5365 - env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); 5366 4595 5367 4596 ret = do_check(env); 5368 4597 if (env->cur_state) { ··· 5383 4612 5384 4613 if (ret == 0) 5385 4614 ret = fixup_bpf_calls(env); 4615 + 4616 + if (ret == 0) 4617 + ret = fixup_call_args(env); 5386 4618 5387 4619 if (log->level && bpf_verifier_log_full(log)) 5388 4620 ret = -ENOSPC;
+10
kernel/events/core.c
··· 4723 4723 rcu_read_unlock(); 4724 4724 return 0; 4725 4725 } 4726 + 4727 + case PERF_EVENT_IOC_QUERY_BPF: 4728 + return perf_event_query_prog_array(event, (void __user *)arg); 4726 4729 default: 4727 4730 return -ENOTTY; 4728 4731 } ··· 8079 8076 (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || 8080 8077 (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { 8081 8078 /* valid fd, but invalid bpf program type */ 8079 + bpf_prog_put(prog); 8080 + return -EINVAL; 8081 + } 8082 + 8083 + /* Kprobe override only works for kprobes, not uprobes. */ 8084 + if (prog->kprobe_override && 8085 + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { 8082 8086 bpf_prog_put(prog); 8083 8087 return -EINVAL; 8084 8088 }
+163
kernel/kprobes.c
··· 83 83 return &(kretprobe_table_locks[hash].lock); 84 84 } 85 85 86 + /* List of symbols that can be overriden for error injection. */ 87 + static LIST_HEAD(kprobe_error_injection_list); 88 + static DEFINE_MUTEX(kprobe_ei_mutex); 89 + struct kprobe_ei_entry { 90 + struct list_head list; 91 + unsigned long start_addr; 92 + unsigned long end_addr; 93 + void *priv; 94 + }; 95 + 86 96 /* Blacklist -- list of struct kprobe_blacklist_entry */ 87 97 static LIST_HEAD(kprobe_blacklist); 88 98 ··· 1404 1394 return false; 1405 1395 } 1406 1396 1397 + bool within_kprobe_error_injection_list(unsigned long addr) 1398 + { 1399 + struct kprobe_ei_entry *ent; 1400 + 1401 + list_for_each_entry(ent, &kprobe_error_injection_list, list) { 1402 + if (addr >= ent->start_addr && addr < ent->end_addr) 1403 + return true; 1404 + } 1405 + return false; 1406 + } 1407 + 1407 1408 /* 1408 1409 * If we have a symbol_name argument, look it up and add the offset field 1409 1410 * to it. This way, we can specify a relative address to a symbol. ··· 2189 2168 return 0; 2190 2169 } 2191 2170 2171 + #ifdef CONFIG_BPF_KPROBE_OVERRIDE 2172 + /* Markers of the _kprobe_error_inject_list section */ 2173 + extern unsigned long __start_kprobe_error_inject_list[]; 2174 + extern unsigned long __stop_kprobe_error_inject_list[]; 2175 + 2176 + /* 2177 + * Lookup and populate the kprobe_error_injection_list. 2178 + * 2179 + * For safety reasons we only allow certain functions to be overriden with 2180 + * bpf_error_injection, so we need to populate the list of the symbols that have 2181 + * been marked as safe for overriding. 
2182 + */ 2183 + static void populate_kprobe_error_injection_list(unsigned long *start, 2184 + unsigned long *end, 2185 + void *priv) 2186 + { 2187 + unsigned long *iter; 2188 + struct kprobe_ei_entry *ent; 2189 + unsigned long entry, offset = 0, size = 0; 2190 + 2191 + mutex_lock(&kprobe_ei_mutex); 2192 + for (iter = start; iter < end; iter++) { 2193 + entry = arch_deref_entry_point((void *)*iter); 2194 + 2195 + if (!kernel_text_address(entry) || 2196 + !kallsyms_lookup_size_offset(entry, &size, &offset)) { 2197 + pr_err("Failed to find error inject entry at %p\n", 2198 + (void *)entry); 2199 + continue; 2200 + } 2201 + 2202 + ent = kmalloc(sizeof(*ent), GFP_KERNEL); 2203 + if (!ent) 2204 + break; 2205 + ent->start_addr = entry; 2206 + ent->end_addr = entry + size; 2207 + ent->priv = priv; 2208 + INIT_LIST_HEAD(&ent->list); 2209 + list_add_tail(&ent->list, &kprobe_error_injection_list); 2210 + } 2211 + mutex_unlock(&kprobe_ei_mutex); 2212 + } 2213 + 2214 + static void __init populate_kernel_kprobe_ei_list(void) 2215 + { 2216 + populate_kprobe_error_injection_list(__start_kprobe_error_inject_list, 2217 + __stop_kprobe_error_inject_list, 2218 + NULL); 2219 + } 2220 + 2221 + static void module_load_kprobe_ei_list(struct module *mod) 2222 + { 2223 + if (!mod->num_kprobe_ei_funcs) 2224 + return; 2225 + populate_kprobe_error_injection_list(mod->kprobe_ei_funcs, 2226 + mod->kprobe_ei_funcs + 2227 + mod->num_kprobe_ei_funcs, mod); 2228 + } 2229 + 2230 + static void module_unload_kprobe_ei_list(struct module *mod) 2231 + { 2232 + struct kprobe_ei_entry *ent, *n; 2233 + if (!mod->num_kprobe_ei_funcs) 2234 + return; 2235 + 2236 + mutex_lock(&kprobe_ei_mutex); 2237 + list_for_each_entry_safe(ent, n, &kprobe_error_injection_list, list) { 2238 + if (ent->priv == mod) { 2239 + list_del_init(&ent->list); 2240 + kfree(ent); 2241 + } 2242 + } 2243 + mutex_unlock(&kprobe_ei_mutex); 2244 + } 2245 + #else 2246 + static inline void __init populate_kernel_kprobe_ei_list(void) {} 2247 + 
static inline void module_load_kprobe_ei_list(struct module *m) {} 2248 + static inline void module_unload_kprobe_ei_list(struct module *m) {} 2249 + #endif 2250 + 2192 2251 /* Module notifier call back, checking kprobes on the module */ 2193 2252 static int kprobes_module_callback(struct notifier_block *nb, 2194 2253 unsigned long val, void *data) ··· 2278 2177 struct kprobe *p; 2279 2178 unsigned int i; 2280 2179 int checkcore = (val == MODULE_STATE_GOING); 2180 + 2181 + if (val == MODULE_STATE_COMING) 2182 + module_load_kprobe_ei_list(mod); 2183 + else if (val == MODULE_STATE_GOING) 2184 + module_unload_kprobe_ei_list(mod); 2281 2185 2282 2186 if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) 2283 2187 return NOTIFY_DONE; ··· 2345 2239 pr_err("kprobes: failed to populate blacklist: %d\n", err); 2346 2240 pr_err("Please take care of using kprobes.\n"); 2347 2241 } 2242 + 2243 + populate_kernel_kprobe_ei_list(); 2348 2244 2349 2245 if (kretprobe_blacklist_size) { 2350 2246 /* lookup the function address from its name */ ··· 2515 2407 .release = seq_release, 2516 2408 }; 2517 2409 2410 + /* 2411 + * kprobes/error_injection_list -- shows which functions can be overriden for 2412 + * error injection. 
2413 + * */ 2414 + static void *kprobe_ei_seq_start(struct seq_file *m, loff_t *pos) 2415 + { 2416 + mutex_lock(&kprobe_ei_mutex); 2417 + return seq_list_start(&kprobe_error_injection_list, *pos); 2418 + } 2419 + 2420 + static void kprobe_ei_seq_stop(struct seq_file *m, void *v) 2421 + { 2422 + mutex_unlock(&kprobe_ei_mutex); 2423 + } 2424 + 2425 + static void *kprobe_ei_seq_next(struct seq_file *m, void *v, loff_t *pos) 2426 + { 2427 + return seq_list_next(v, &kprobe_error_injection_list, pos); 2428 + } 2429 + 2430 + static int kprobe_ei_seq_show(struct seq_file *m, void *v) 2431 + { 2432 + char buffer[KSYM_SYMBOL_LEN]; 2433 + struct kprobe_ei_entry *ent = 2434 + list_entry(v, struct kprobe_ei_entry, list); 2435 + 2436 + sprint_symbol(buffer, ent->start_addr); 2437 + seq_printf(m, "%s\n", buffer); 2438 + return 0; 2439 + } 2440 + 2441 + static const struct seq_operations kprobe_ei_seq_ops = { 2442 + .start = kprobe_ei_seq_start, 2443 + .next = kprobe_ei_seq_next, 2444 + .stop = kprobe_ei_seq_stop, 2445 + .show = kprobe_ei_seq_show, 2446 + }; 2447 + 2448 + static int kprobe_ei_open(struct inode *inode, struct file *filp) 2449 + { 2450 + return seq_open(filp, &kprobe_ei_seq_ops); 2451 + } 2452 + 2453 + static const struct file_operations debugfs_kprobe_ei_ops = { 2454 + .open = kprobe_ei_open, 2455 + .read = seq_read, 2456 + .llseek = seq_lseek, 2457 + .release = seq_release, 2458 + }; 2459 + 2518 2460 static void arm_all_kprobes(void) 2519 2461 { 2520 2462 struct hlist_head *head; ··· 2703 2545 2704 2546 file = debugfs_create_file("blacklist", 0444, dir, NULL, 2705 2547 &debugfs_kprobe_blacklist_ops); 2548 + if (!file) 2549 + goto error; 2550 + 2551 + file = debugfs_create_file("error_injection_list", 0444, dir, NULL, 2552 + &debugfs_kprobe_ei_ops); 2706 2553 if (!file) 2707 2554 goto error; 2708 2555
+5 -1
kernel/module.c
··· 3118 3118 sizeof(*mod->ftrace_callsites), 3119 3119 &mod->num_ftrace_callsites); 3120 3120 #endif 3121 - 3121 + #ifdef CONFIG_BPF_KPROBE_OVERRIDE 3122 + mod->kprobe_ei_funcs = section_objs(info, "_kprobe_error_inject_list", 3123 + sizeof(*mod->kprobe_ei_funcs), 3124 + &mod->num_kprobe_ei_funcs); 3125 + #endif 3122 3126 mod->extable = section_objs(info, "__ex_table", 3123 3127 sizeof(*mod->extable), &mod->num_exentries); 3124 3128
+11
kernel/trace/Kconfig
··· 530 530 531 531 If in doubt, say N. 532 532 533 + config BPF_KPROBE_OVERRIDE 534 + bool "Enable BPF programs to override a kprobed function" 535 + depends on BPF_EVENTS 536 + depends on KPROBES_ON_FTRACE 537 + depends on HAVE_KPROBE_OVERRIDE 538 + depends on DYNAMIC_FTRACE_WITH_REGS 539 + default n 540 + help 541 + Allows BPF to override the execution of a probed function and 542 + set a different return value. This is used for error injection. 543 + 533 544 config FTRACE_MCOUNT_RECORD 534 545 def_bool y 535 546 depends on DYNAMIC_FTRACE
+58
kernel/trace/bpf_trace.c
··· 13 13 #include <linux/filter.h> 14 14 #include <linux/uaccess.h> 15 15 #include <linux/ctype.h> 16 + #include <linux/kprobes.h> 17 + #include <asm/kprobes.h> 18 + 19 + #include "trace_probe.h" 16 20 #include "trace.h" 17 21 18 22 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); ··· 79 75 return ret; 80 76 } 81 77 EXPORT_SYMBOL_GPL(trace_call_bpf); 78 + 79 + #ifdef CONFIG_BPF_KPROBE_OVERRIDE 80 + BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) 81 + { 82 + __this_cpu_write(bpf_kprobe_override, 1); 83 + regs_set_return_value(regs, rc); 84 + arch_ftrace_kprobe_override_function(regs); 85 + return 0; 86 + } 87 + 88 + static const struct bpf_func_proto bpf_override_return_proto = { 89 + .func = bpf_override_return, 90 + .gpl_only = true, 91 + .ret_type = RET_INTEGER, 92 + .arg1_type = ARG_PTR_TO_CTX, 93 + .arg2_type = ARG_ANYTHING, 94 + }; 95 + #endif 82 96 83 97 BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) 84 98 { ··· 578 556 return &bpf_get_stackid_proto; 579 557 case BPF_FUNC_perf_event_read_value: 580 558 return &bpf_perf_event_read_value_proto; 559 + #ifdef CONFIG_BPF_KPROBE_OVERRIDE 560 + case BPF_FUNC_override_return: 561 + return &bpf_override_return_proto; 562 + #endif 581 563 default: 582 564 return tracing_func_proto(func_id); 583 565 } ··· 799 773 struct bpf_prog_array *new_array; 800 774 int ret = -EEXIST; 801 775 776 + /* 777 + * Kprobe override only works for ftrace based kprobes, and only if they 778 + * are on the opt-in list. 
779 + */ 780 + if (prog->kprobe_override && 781 + (!trace_kprobe_ftrace(event->tp_event) || 782 + !trace_kprobe_error_injectable(event->tp_event))) 783 + return -EINVAL; 784 + 802 785 mutex_lock(&bpf_event_mutex); 803 786 804 787 if (event->prog) ··· 859 824 860 825 unlock: 861 826 mutex_unlock(&bpf_event_mutex); 827 + } 828 + 829 + int perf_event_query_prog_array(struct perf_event *event, void __user *info) 830 + { 831 + struct perf_event_query_bpf __user *uquery = info; 832 + struct perf_event_query_bpf query = {}; 833 + int ret; 834 + 835 + if (!capable(CAP_SYS_ADMIN)) 836 + return -EPERM; 837 + if (event->attr.type != PERF_TYPE_TRACEPOINT) 838 + return -EINVAL; 839 + if (copy_from_user(&query, uquery, sizeof(query))) 840 + return -EFAULT; 841 + 842 + mutex_lock(&bpf_event_mutex); 843 + ret = bpf_prog_array_copy_info(event->tp_event->prog_array, 844 + uquery->ids, 845 + query.ids_len, 846 + &uquery->prog_cnt); 847 + mutex_unlock(&bpf_event_mutex); 848 + 849 + return ret; 862 850 }
+56 -8
kernel/trace/trace_kprobe.c
··· 42 42 (offsetof(struct trace_kprobe, tp.args) + \ 43 43 (sizeof(struct probe_arg) * (n))) 44 44 45 + DEFINE_PER_CPU(int, bpf_kprobe_override); 45 46 46 47 static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) 47 48 { ··· 86 85 nhit += *per_cpu_ptr(tk->nhit, cpu); 87 86 88 87 return nhit; 88 + } 89 + 90 + int trace_kprobe_ftrace(struct trace_event_call *call) 91 + { 92 + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; 93 + return kprobe_ftrace(&tk->rp.kp); 94 + } 95 + 96 + int trace_kprobe_error_injectable(struct trace_event_call *call) 97 + { 98 + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; 99 + unsigned long addr; 100 + 101 + if (tk->symbol) { 102 + addr = (unsigned long) 103 + kallsyms_lookup_name(trace_kprobe_symbol(tk)); 104 + addr += tk->rp.kp.offset; 105 + } else { 106 + addr = (unsigned long)tk->rp.kp.addr; 107 + } 108 + return within_kprobe_error_injection_list(addr); 89 109 } 90 110 91 111 static int register_kprobe_event(struct trace_kprobe *tk); ··· 1192 1170 #ifdef CONFIG_PERF_EVENTS 1193 1171 1194 1172 /* Kprobe profile handler */ 1195 - static void 1173 + static int 1196 1174 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) 1197 1175 { 1198 1176 struct trace_event_call *call = &tk->tp.call; ··· 1201 1179 int size, __size, dsize; 1202 1180 int rctx; 1203 1181 1204 - if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) 1205 - return; 1182 + if (bpf_prog_array_valid(call)) { 1183 + int ret; 1184 + 1185 + ret = trace_call_bpf(call, regs); 1186 + 1187 + /* 1188 + * We need to check and see if we modified the pc of the 1189 + * pt_regs, and if so clear the kprobe and return 1 so that we 1190 + * don't do the instruction skipping. Also reset our state so 1191 + * we are clean the next pass through. 
1192 + */ 1193 + if (__this_cpu_read(bpf_kprobe_override)) { 1194 + __this_cpu_write(bpf_kprobe_override, 0); 1195 + reset_current_kprobe(); 1196 + return 1; 1197 + } 1198 + if (!ret) 1199 + return 0; 1200 + } 1206 1201 1207 1202 head = this_cpu_ptr(call->perf_events); 1208 1203 if (hlist_empty(head)) 1209 - return; 1204 + return 0; 1210 1205 1211 1206 dsize = __get_data_size(&tk->tp, regs); 1212 1207 __size = sizeof(*entry) + tk->tp.size + dsize; ··· 1232 1193 1233 1194 entry = perf_trace_buf_alloc(size, NULL, &rctx); 1234 1195 if (!entry) 1235 - return; 1196 + return 0; 1236 1197 1237 1198 entry->ip = (unsigned long)tk->rp.kp.addr; 1238 1199 memset(&entry[1], 0, dsize); 1239 1200 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); 1240 1201 perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, 1241 1202 head, NULL); 1203 + return 0; 1242 1204 } 1243 1205 NOKPROBE_SYMBOL(kprobe_perf_func); 1244 1206 ··· 1315 1275 static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) 1316 1276 { 1317 1277 struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); 1278 + int ret = 0; 1318 1279 1319 1280 raw_cpu_inc(*tk->nhit); 1320 1281 1321 1282 if (tk->tp.flags & TP_FLAG_TRACE) 1322 1283 kprobe_trace_func(tk, regs); 1323 1284 #ifdef CONFIG_PERF_EVENTS 1324 - if (tk->tp.flags & TP_FLAG_PROFILE) 1325 - kprobe_perf_func(tk, regs); 1285 + if (tk->tp.flags & TP_FLAG_PROFILE) { 1286 + ret = kprobe_perf_func(tk, regs); 1287 + /* 1288 + * The ftrace kprobe handler leaves it up to us to re-enable 1289 + * preemption here before returning if we've modified the ip. 1290 + */ 1291 + if (ret) 1292 + preempt_enable_no_resched(); 1293 + } 1326 1294 #endif 1327 - return 0; /* We don't tweek kernel, so just return 0 */ 1295 + return ret; 1328 1296 } 1329 1297 NOKPROBE_SYMBOL(kprobe_dispatcher); 1330 1298
+12
kernel/trace/trace_probe.h
··· 252 252 unsigned long update_symbol_cache(struct symbol_cache *sc); 253 253 void free_symbol_cache(struct symbol_cache *sc); 254 254 struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); 255 + int trace_kprobe_ftrace(struct trace_event_call *call); 256 + int trace_kprobe_error_injectable(struct trace_event_call *call); 255 257 #else 256 258 /* uprobes do not support symbol fetch methods */ 257 259 #define fetch_symbol_u8 NULL ··· 278 276 alloc_symbol_cache(const char *sym, long offset) 279 277 { 280 278 return NULL; 279 + } 280 + 281 + static inline int trace_kprobe_ftrace(struct trace_event_call *call) 282 + { 283 + return 0; 284 + } 285 + 286 + static inline int trace_kprobe_error_injectable(struct trace_event_call *call) 287 + { 288 + return 0; 281 289 } 282 290 #endif /* CONFIG_KPROBE_EVENTS */ 283 291
+4
samples/bpf/Makefile
··· 12 12 hostprogs-y += tracex4 13 13 hostprogs-y += tracex5 14 14 hostprogs-y += tracex6 15 + hostprogs-y += tracex7 15 16 hostprogs-y += test_probe_write_user 16 17 hostprogs-y += trace_output 17 18 hostprogs-y += lathist ··· 59 58 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o 60 59 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o 61 60 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o 61 + tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o 62 62 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o 63 63 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o 64 64 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o ··· 103 101 always += tracex4_kern.o 104 102 always += tracex5_kern.o 105 103 always += tracex6_kern.o 104 + always += tracex7_kern.o 106 105 always += sock_flags_kern.o 107 106 always += test_probe_write_user_kern.o 108 107 always += trace_output_kern.o ··· 158 155 HOSTLOADLIBES_tracex4 += -lelf -lrt 159 156 HOSTLOADLIBES_tracex5 += -lelf 160 157 HOSTLOADLIBES_tracex6 += -lelf 158 + HOSTLOADLIBES_tracex7 += -lelf 161 159 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf 162 160 HOSTLOADLIBES_load_sock_ops += -lelf 163 161 HOSTLOADLIBES_test_probe_write_user += -lelf
+15
samples/bpf/test_override_return.sh
··· 1 + #!/bin/bash 2 + 3 + rm -f testfile.img 4 + dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 5 + DEVICE=$(losetup --show -f testfile.img) 6 + mkfs.btrfs -f $DEVICE 7 + mkdir tmpmnt 8 + ./tracex7 $DEVICE 9 + if [ $? -eq 0 ] 10 + then 11 + echo "SUCCESS!" 12 + else 13 + echo "FAILED!" 14 + fi 15 + losetup -d $DEVICE
+16
samples/bpf/tracex7_kern.c
··· 1 + #include <uapi/linux/ptrace.h> 2 + #include <uapi/linux/bpf.h> 3 + #include <linux/version.h> 4 + #include "bpf_helpers.h" 5 + 6 + SEC("kprobe/open_ctree") 7 + int bpf_prog1(struct pt_regs *ctx) 8 + { 9 + unsigned long rc = -12; 10 + 11 + bpf_override_return(ctx, rc); 12 + return 0; 13 + } 14 + 15 + char _license[] SEC("license") = "GPL"; 16 + u32 _version SEC("version") = LINUX_VERSION_CODE;
+28
samples/bpf/tracex7_user.c
··· 1 + #define _GNU_SOURCE 2 + 3 + #include <stdio.h> 4 + #include <linux/bpf.h> 5 + #include <unistd.h> 6 + #include "libbpf.h" 7 + #include "bpf_load.h" 8 + 9 + int main(int argc, char **argv) 10 + { 11 + FILE *f; 12 + char filename[256]; 13 + char command[256]; 14 + int ret; 15 + 16 + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); 17 + 18 + if (load_bpf_file(filename)) { 19 + printf("%s", bpf_log_buf); 20 + return 1; 21 + } 22 + 23 + snprintf(command, 256, "mount %s tmpmnt/", argv[1]); 24 + f = popen(command, "r"); 25 + ret = pclose(f); 26 + 27 + return ret ? 0 : 1; 28 + }
+20 -10
tools/bpf/bpftool/Documentation/Makefile
··· 3 3 4 4 INSTALL ?= install 5 5 RM ?= rm -f 6 + RMDIR ?= rmdir --ignore-fail-on-non-empty 6 7 7 - # Make the path relative to DESTDIR, not prefix 8 - ifndef DESTDIR 9 - prefix ?= /usr/local 8 + ifeq ($(V),1) 9 + Q = 10 + else 11 + Q = @ 10 12 endif 11 - mandir ?= $(prefix)/share/man 13 + 14 + prefix ?= /usr/local 15 + mandir ?= $(prefix)/man 12 16 man8dir = $(mandir)/man8 13 17 14 18 MAN8_RST = $(wildcard *.rst) ··· 24 20 man8: $(DOC_MAN8) 25 21 26 22 $(OUTPUT)%.8: %.rst 27 - rst2man $< > $@ 23 + $(QUIET_GEN)rst2man $< > $@ 28 24 29 25 clean: 30 - $(call QUIET_CLEAN, Documentation) $(RM) $(DOC_MAN8) 26 + $(call QUIET_CLEAN, Documentation) 27 + $(Q)$(RM) $(DOC_MAN8) 31 28 32 29 install: man 33 - $(call QUIET_INSTALL, Documentation-man) \ 34 - $(INSTALL) -d -m 755 $(DESTDIR)$(man8dir); \ 35 - $(INSTALL) -m 644 $(DOC_MAN8) $(DESTDIR)$(man8dir); 30 + $(call QUIET_INSTALL, Documentation-man) 31 + $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$(man8dir) 32 + $(Q)$(INSTALL) -m 644 $(DOC_MAN8) $(DESTDIR)$(man8dir) 36 33 37 - .PHONY: man man8 clean install 34 + uninstall: 35 + $(call QUIET_UNINST, Documentation-man) 36 + $(Q)$(RM) $(addprefix $(DESTDIR)$(man8dir)/,$(_DOC_MAN8)) 37 + $(Q)$(RMDIR) $(DESTDIR)$(man8dir) 38 + 39 + .PHONY: man man8 clean install uninstall 38 40 .DEFAULT_GOAL := man
+118
tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
··· 1 + ================ 2 + bpftool-cgroup 3 + ================ 4 + ------------------------------------------------------------------------------- 5 + tool for inspection and simple manipulation of eBPF progs 6 + ------------------------------------------------------------------------------- 7 + 8 + :Manual section: 8 9 + 10 + SYNOPSIS 11 + ======== 12 + 13 + **bpftool** [*OPTIONS*] **cgroup** *COMMAND* 14 + 15 + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } } 16 + 17 + *COMMANDS* := 18 + { **list** | **attach** | **detach** | **help** } 19 + 20 + MAP COMMANDS 21 + ============= 22 + 23 + | **bpftool** **cgroup list** *CGROUP* 24 + | **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] 25 + | **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* 26 + | **bpftool** **cgroup help** 27 + | 28 + | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } 29 + | *ATTACH_TYPE* := { *ingress* | *egress* | *sock_create* | *sock_ops* | *device* } 30 + | *ATTACH_FLAGS* := { *multi* | *override* } 31 + 32 + DESCRIPTION 33 + =========== 34 + **bpftool cgroup list** *CGROUP* 35 + List all programs attached to the cgroup *CGROUP*. 36 + 37 + Output will start with program ID followed by attach type, 38 + attach flags and program name. 39 + 40 + **bpftool cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] 41 + Attach program *PROG* to the cgroup *CGROUP* with attach type 42 + *ATTACH_TYPE* and optional *ATTACH_FLAGS*. 43 + 44 + *ATTACH_FLAGS* can be one of: **override** if a sub-cgroup installs 45 + some bpf program, the program in this cgroup yields to sub-cgroup 46 + program; **multi** if a sub-cgroup installs some bpf program, 47 + that cgroup program gets run in addition to the program in this 48 + cgroup. 49 + 50 + Only one program is allowed to be attached to a cgroup with 51 + no attach flags or the **override** flag. 
Attaching another 52 + program will release old program and attach the new one. 53 + 54 + Multiple programs are allowed to be attached to a cgroup with 55 + **multi**. They are executed in FIFO order (those that were 56 + attached first, run first). 57 + 58 + Non-default *ATTACH_FLAGS* are supported by kernel version 4.14 59 + and later. 60 + 61 + *ATTACH_TYPE* can be one of: 62 + **ingress** ingress path of the inet socket (since 4.10); 63 + **egress** egress path of the inet socket (since 4.10); 64 + **sock_create** opening of an inet socket (since 4.10); 65 + **sock_ops** various socket operations (since 4.12); 66 + **device** device access (since 4.15). 67 + 68 + **bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* 69 + Detach *PROG* from the cgroup *CGROUP* and attach type 70 + *ATTACH_TYPE*. 71 + 72 + **bpftool cgroup help** 73 + Print short help message. 74 + 75 + OPTIONS 76 + ======= 77 + -h, --help 78 + Print short generic help message (similar to **bpftool help**). 79 + 80 + -v, --version 81 + Print version number (similar to **bpftool version**). 82 + 83 + -j, --json 84 + Generate JSON output. For commands that cannot produce JSON, this 85 + option has no effect. 86 + 87 + -p, --pretty 88 + Generate human-readable JSON output. Implies **-j**. 89 + 90 + -f, --bpffs 91 + Show file names of pinned programs.
92 + 93 + EXAMPLES 94 + ======== 95 + | 96 + | **# mount -t bpf none /sys/fs/bpf/** 97 + | **# mkdir /sys/fs/cgroup/test.slice** 98 + | **# bpftool prog load ./device_cgroup.o /sys/fs/bpf/prog** 99 + | **# bpftool cgroup attach /sys/fs/cgroup/test.slice/ device id 1 multi** 100 + 101 + **# bpftool cgroup list /sys/fs/cgroup/test.slice/** 102 + 103 + :: 104 + 105 + ID AttachType AttachFlags Name 106 + 1 device multi bpf_prog1 107 + 108 + | 109 + | **# bpftool cgroup detach /sys/fs/cgroup/test.slice/ device id 1** 110 + | **# bpftool cgroup list /sys/fs/cgroup/test.slice/** 111 + 112 + :: 113 + 114 + ID AttachType AttachFlags Name 115 + 116 + SEE ALSO 117 + ======== 118 + **bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8)
+1 -1
tools/bpf/bpftool/Documentation/bpftool-map.rst
··· 128 128 129 129 SEE ALSO 130 130 ======== 131 - **bpftool**\ (8), **bpftool-prog**\ (8) 131 + **bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8)
+10 -2
tools/bpf/bpftool/Documentation/bpftool-prog.rst
··· 15 15 *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } } 16 16 17 17 *COMMANDS* := 18 - { **show** | **dump xlated** | **dump jited** | **pin** | **help** } 18 + { **show** | **dump xlated** | **dump jited** | **pin** | **load** | **help** } 19 19 20 20 MAP COMMANDS 21 21 ============= ··· 24 24 | **bpftool** **prog dump xlated** *PROG* [{**file** *FILE* | **opcodes**}] 25 25 | **bpftool** **prog dump jited** *PROG* [{**file** *FILE* | **opcodes**}] 26 26 | **bpftool** **prog pin** *PROG* *FILE* 27 + | **bpftool** **prog load** *OBJ* *FILE* 27 28 | **bpftool** **prog help** 28 29 | 29 30 | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } ··· 55 54 56 55 **bpftool prog pin** *PROG* *FILE* 57 56 Pin program *PROG* as *FILE*. 57 + 58 + Note: *FILE* must be located in *bpffs* mount. 59 + 60 + **bpftool prog load** *OBJ* *FILE* 61 + Load bpf program from binary *OBJ* and pin as *FILE*. 58 62 59 63 Note: *FILE* must be located in *bpffs* mount. 60 64 ··· 132 126 | 133 127 | **# mount -t bpf none /sys/fs/bpf/** 134 128 | **# bpftool prog pin id 10 /sys/fs/bpf/prog** 129 + | **# bpftool prog load ./my_prog.o /sys/fs/bpf/prog2** 135 130 | **# ls -l /sys/fs/bpf/** 136 131 | -rw------- 1 root root 0 Jul 22 01:43 prog 132 + | -rw------- 1 root root 0 Jul 22 01:44 prog2 137 133 138 134 **# bpftool prog dum jited pinned /sys/fs/bpf/prog opcodes** 139 135 ··· 155 147 156 148 SEE ALSO 157 149 ======== 158 - **bpftool**\ (8), **bpftool-map**\ (8) 150 + **bpftool**\ (8), **bpftool-map**\ (8), **bpftool-cgroup**\ (8)
+5 -3
tools/bpf/bpftool/Documentation/bpftool.rst
··· 16 16 17 17 **bpftool** **version** 18 18 19 - *OBJECT* := { **map** | **program** } 19 + *OBJECT* := { **map** | **program** | **cgroup** } 20 20 21 21 *OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** } 22 22 | { **-j** | **--json** } [{ **-p** | **--pretty** }] } ··· 26 26 | **pin** | **help** } 27 27 28 28 *PROG-COMMANDS* := { **show** | **dump jited** | **dump xlated** | **pin** 29 - | **help** } 29 + | **load** | **help** } 30 + 31 + *CGROUP-COMMANDS* := { **list** | **attach** | **detach** | **help** } 30 32 31 33 DESCRIPTION 32 34 =========== ··· 55 53 56 54 SEE ALSO 57 55 ======== 58 - **bpftool-map**\ (8), **bpftool-prog**\ (8) 56 + **bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8)
+31 -30
tools/bpf/bpftool/Makefile
··· 1 1 include ../../scripts/Makefile.include 2 - 3 2 include ../../scripts/utilities.mak 4 3 5 4 ifeq ($(srctree),) 6 5 srctree := $(patsubst %/,%,$(dir $(CURDIR))) 7 6 srctree := $(patsubst %/,%,$(dir $(srctree))) 8 7 srctree := $(patsubst %/,%,$(dir $(srctree))) 9 - #$(info Determined 'srctree' to be $(srctree)) 10 - endif 11 - 12 - ifneq ($(objtree),) 13 - #$(info Determined 'objtree' to be $(objtree)) 14 - endif 15 - 16 - ifneq ($(OUTPUT),) 17 - #$(info Determined 'OUTPUT' to be $(OUTPUT)) 18 - # Adding $(OUTPUT) as a directory to look for source files, 19 - # because use generated output files as sources dependency 20 - # for flex/bison parsers. 21 - VPATH += $(OUTPUT) 22 - export VPATH 23 8 endif 24 9 25 10 ifeq ($(V),1) ··· 13 28 Q = @ 14 29 endif 15 30 16 - BPF_DIR = $(srctree)/tools/lib/bpf/ 31 + BPF_DIR = $(srctree)/tools/lib/bpf/ 17 32 18 33 ifneq ($(OUTPUT),) 19 - BPF_PATH=$(OUTPUT) 34 + BPF_PATH = $(OUTPUT) 20 35 else 21 - BPF_PATH=$(BPF_DIR) 36 + BPF_PATH = $(BPF_DIR) 22 37 endif 23 38 24 39 LIBBPF = $(BPF_PATH)libbpf.a ··· 30 45 $(call QUIET_CLEAN, libbpf) 31 46 $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(OUTPUT) clean >/dev/null 32 47 33 - prefix = /usr/local 48 + prefix ?= /usr/local 34 49 bash_compdir ?= /usr/share/bash-completion/completions 35 50 36 51 CC = gcc ··· 40 55 CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/ 41 56 LIBS = -lelf -lbfd -lopcodes $(LIBBPF) 42 57 58 + INSTALL ?= install 59 + RM ?= rm -f 60 + 43 61 include $(wildcard *.d) 44 62 45 63 all: $(OUTPUT)bpftool 46 64 47 - SRCS=$(wildcard *.c) 48 - OBJS=$(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o 65 + SRCS = $(wildcard *.c) 66 + OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o 49 67 50 68 $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c 51 69 $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $< ··· 61 73 62 74 clean: $(LIBBPF)-clean 63 75 $(call QUIET_CLEAN, bpftool) 64 - $(Q)rm -rf 
$(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d 76 + $(Q)$(RM) $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d 65 77 66 - install: 67 - install -m 0755 -d $(prefix)/sbin 68 - install $(OUTPUT)bpftool $(prefix)/sbin/bpftool 69 - install -m 0755 -d $(bash_compdir) 70 - install -m 0644 bash-completion/bpftool $(bash_compdir) 78 + install: $(OUTPUT)bpftool 79 + $(call QUIET_INSTALL, bpftool) 80 + $(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(prefix)/sbin 81 + $(Q)$(INSTALL) $(OUTPUT)bpftool $(DESTDIR)$(prefix)/sbin/bpftool 82 + $(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(bash_compdir) 83 + $(Q)$(INSTALL) -m 0644 bash-completion/bpftool $(DESTDIR)$(bash_compdir) 84 + 85 + uninstall: 86 + $(call QUIET_UNINST, bpftool) 87 + $(Q)$(RM) $(DESTDIR)$(prefix)/sbin/bpftool 88 + $(Q)$(RM) $(DESTDIR)$(bash_compdir)/bpftool 71 89 72 90 doc: 73 - $(Q)$(MAKE) -C Documentation/ 91 + $(call descend,Documentation) 92 + 93 + doc-clean: 94 + $(call descend,Documentation,clean) 74 95 75 96 doc-install: 76 - $(Q)$(MAKE) -C Documentation/ install 97 + $(call descend,Documentation,install) 98 + 99 + doc-uninstall: 100 + $(call descend,Documentation,uninstall) 77 101 78 102 FORCE: 79 103 80 - .PHONY: all clean FORCE install doc doc-install 104 + .PHONY: all FORCE clean install uninstall 105 + .PHONY: doc doc-clean doc-install doc-uninstall 81 106 .DEFAULT_GOAL := all
+307
tools/bpf/bpftool/cgroup.c
··· 1 + // SPDX-License-Identifier: GPL-2.0+ 2 + // Copyright (C) 2017 Facebook 3 + // Author: Roman Gushchin <guro@fb.com> 4 + 5 + #include <fcntl.h> 6 + #include <stdlib.h> 7 + #include <string.h> 8 + #include <sys/stat.h> 9 + #include <sys/types.h> 10 + #include <unistd.h> 11 + 12 + #include <bpf.h> 13 + 14 + #include "main.h" 15 + 16 + #define HELP_SPEC_ATTACH_FLAGS \ 17 + "ATTACH_FLAGS := { multi | override }" 18 + 19 + #define HELP_SPEC_ATTACH_TYPES \ 20 + "ATTACH_TYPE := { ingress | egress | sock_create | sock_ops | device }" 21 + 22 + static const char * const attach_type_strings[] = { 23 + [BPF_CGROUP_INET_INGRESS] = "ingress", 24 + [BPF_CGROUP_INET_EGRESS] = "egress", 25 + [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create", 26 + [BPF_CGROUP_SOCK_OPS] = "sock_ops", 27 + [BPF_CGROUP_DEVICE] = "device", 28 + [__MAX_BPF_ATTACH_TYPE] = NULL, 29 + }; 30 + 31 + static enum bpf_attach_type parse_attach_type(const char *str) 32 + { 33 + enum bpf_attach_type type; 34 + 35 + for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { 36 + if (attach_type_strings[type] && 37 + is_prefix(str, attach_type_strings[type])) 38 + return type; 39 + } 40 + 41 + return __MAX_BPF_ATTACH_TYPE; 42 + } 43 + 44 + static int list_bpf_prog(int id, const char *attach_type_str, 45 + const char *attach_flags_str) 46 + { 47 + struct bpf_prog_info info = {}; 48 + __u32 info_len = sizeof(info); 49 + int prog_fd; 50 + 51 + prog_fd = bpf_prog_get_fd_by_id(id); 52 + if (prog_fd < 0) 53 + return -1; 54 + 55 + if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len)) { 56 + close(prog_fd); 57 + return -1; 58 + } 59 + 60 + if (json_output) { 61 + jsonw_start_object(json_wtr); 62 + jsonw_uint_field(json_wtr, "id", info.id); 63 + jsonw_string_field(json_wtr, "attach_type", 64 + attach_type_str); 65 + jsonw_string_field(json_wtr, "attach_flags", 66 + attach_flags_str); 67 + jsonw_string_field(json_wtr, "name", info.name); 68 + jsonw_end_object(json_wtr); 69 + } else { 70 + printf("%-8u %-15s %-15s %-15s\n", 
info.id, 71 + attach_type_str, 72 + attach_flags_str, 73 + info.name); 74 + } 75 + 76 + close(prog_fd); 77 + return 0; 78 + } 79 + 80 + static int list_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type) 81 + { 82 + __u32 prog_ids[1024] = {0}; 83 + char *attach_flags_str; 84 + __u32 prog_cnt, iter; 85 + __u32 attach_flags; 86 + char buf[32]; 87 + int ret; 88 + 89 + prog_cnt = ARRAY_SIZE(prog_ids); 90 + ret = bpf_prog_query(cgroup_fd, type, 0, &attach_flags, prog_ids, 91 + &prog_cnt); 92 + if (ret) 93 + return ret; 94 + 95 + if (prog_cnt == 0) 96 + return 0; 97 + 98 + switch (attach_flags) { 99 + case BPF_F_ALLOW_MULTI: 100 + attach_flags_str = "multi"; 101 + break; 102 + case BPF_F_ALLOW_OVERRIDE: 103 + attach_flags_str = "override"; 104 + break; 105 + case 0: 106 + attach_flags_str = ""; 107 + break; 108 + default: 109 + snprintf(buf, sizeof(buf), "unknown(%x)", attach_flags); 110 + attach_flags_str = buf; 111 + } 112 + 113 + for (iter = 0; iter < prog_cnt; iter++) 114 + list_bpf_prog(prog_ids[iter], attach_type_strings[type], 115 + attach_flags_str); 116 + 117 + return 0; 118 + } 119 + 120 + static int do_list(int argc, char **argv) 121 + { 122 + enum bpf_attach_type type; 123 + int cgroup_fd; 124 + int ret = -1; 125 + 126 + if (argc < 1) { 127 + p_err("too few parameters for cgroup list\n"); 128 + goto exit; 129 + } else if (argc > 1) { 130 + p_err("too many parameters for cgroup list\n"); 131 + goto exit; 132 + } 133 + 134 + cgroup_fd = open(argv[0], O_RDONLY); 135 + if (cgroup_fd < 0) { 136 + p_err("can't open cgroup %s\n", argv[1]); 137 + goto exit; 138 + } 139 + 140 + if (json_output) 141 + jsonw_start_array(json_wtr); 142 + else 143 + printf("%-8s %-15s %-15s %-15s\n", "ID", "AttachType", 144 + "AttachFlags", "Name"); 145 + 146 + for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { 147 + /* 148 + * Not all attach types may be supported, so it's expected, 149 + * that some requests will fail. 
150 + * If we were able to get the list for at least one 151 + * attach type, let's return 0. 152 + */ 153 + if (list_attached_bpf_progs(cgroup_fd, type) == 0) 154 + ret = 0; 155 + } 156 + 157 + if (json_output) 158 + jsonw_end_array(json_wtr); 159 + 160 + close(cgroup_fd); 161 + exit: 162 + return ret; 163 + } 164 + 165 + static int do_attach(int argc, char **argv) 166 + { 167 + enum bpf_attach_type attach_type; 168 + int cgroup_fd, prog_fd; 169 + int attach_flags = 0; 170 + int ret = -1; 171 + int i; 172 + 173 + if (argc < 4) { 174 + p_err("too few parameters for cgroup attach\n"); 175 + goto exit; 176 + } 177 + 178 + cgroup_fd = open(argv[0], O_RDONLY); 179 + if (cgroup_fd < 0) { 180 + p_err("can't open cgroup %s\n", argv[1]); 181 + goto exit; 182 + } 183 + 184 + attach_type = parse_attach_type(argv[1]); 185 + if (attach_type == __MAX_BPF_ATTACH_TYPE) { 186 + p_err("invalid attach type\n"); 187 + goto exit_cgroup; 188 + } 189 + 190 + argc -= 2; 191 + argv = &argv[2]; 192 + prog_fd = prog_parse_fd(&argc, &argv); 193 + if (prog_fd < 0) 194 + goto exit_cgroup; 195 + 196 + for (i = 0; i < argc; i++) { 197 + if (is_prefix(argv[i], "multi")) { 198 + attach_flags |= BPF_F_ALLOW_MULTI; 199 + } else if (is_prefix(argv[i], "override")) { 200 + attach_flags |= BPF_F_ALLOW_OVERRIDE; 201 + } else { 202 + p_err("unknown option: %s\n", argv[i]); 203 + goto exit_cgroup; 204 + } 205 + } 206 + 207 + if (bpf_prog_attach(prog_fd, cgroup_fd, attach_type, attach_flags)) { 208 + p_err("failed to attach program"); 209 + goto exit_prog; 210 + } 211 + 212 + if (json_output) 213 + jsonw_null(json_wtr); 214 + 215 + ret = 0; 216 + 217 + exit_prog: 218 + close(prog_fd); 219 + exit_cgroup: 220 + close(cgroup_fd); 221 + exit: 222 + return ret; 223 + } 224 + 225 + static int do_detach(int argc, char **argv) 226 + { 227 + enum bpf_attach_type attach_type; 228 + int prog_fd, cgroup_fd; 229 + int ret = -1; 230 + 231 + if (argc < 4) { 232 + p_err("too few parameters for cgroup detach\n"); 233 + 
goto exit; 234 + } 235 + 236 + cgroup_fd = open(argv[0], O_RDONLY); 237 + if (cgroup_fd < 0) { 238 + p_err("can't open cgroup %s\n", argv[1]); 239 + goto exit; 240 + } 241 + 242 + attach_type = parse_attach_type(argv[1]); 243 + if (attach_type == __MAX_BPF_ATTACH_TYPE) { 244 + p_err("invalid attach type"); 245 + goto exit_cgroup; 246 + } 247 + 248 + argc -= 2; 249 + argv = &argv[2]; 250 + prog_fd = prog_parse_fd(&argc, &argv); 251 + if (prog_fd < 0) 252 + goto exit_cgroup; 253 + 254 + if (bpf_prog_detach2(prog_fd, cgroup_fd, attach_type)) { 255 + p_err("failed to detach program"); 256 + goto exit_prog; 257 + } 258 + 259 + if (json_output) 260 + jsonw_null(json_wtr); 261 + 262 + ret = 0; 263 + 264 + exit_prog: 265 + close(prog_fd); 266 + exit_cgroup: 267 + close(cgroup_fd); 268 + exit: 269 + return ret; 270 + } 271 + 272 + static int do_help(int argc, char **argv) 273 + { 274 + if (json_output) { 275 + jsonw_null(json_wtr); 276 + return 0; 277 + } 278 + 279 + fprintf(stderr, 280 + "Usage: %s %s list CGROUP\n" 281 + " %s %s attach CGROUP ATTACH_TYPE PROG [ATTACH_FLAGS]\n" 282 + " %s %s detach CGROUP ATTACH_TYPE PROG\n" 283 + " %s %s help\n" 284 + "\n" 285 + " " HELP_SPEC_ATTACH_TYPES "\n" 286 + " " HELP_SPEC_ATTACH_FLAGS "\n" 287 + " " HELP_SPEC_PROGRAM "\n" 288 + " " HELP_SPEC_OPTIONS "\n" 289 + "", 290 + bin_name, argv[-2], bin_name, argv[-2], 291 + bin_name, argv[-2], bin_name, argv[-2]); 292 + 293 + return 0; 294 + } 295 + 296 + static const struct cmd cmds[] = { 297 + { "list", do_list }, 298 + { "attach", do_attach }, 299 + { "detach", do_detach }, 300 + { "help", do_help }, 301 + { 0 } 302 + }; 303 + 304 + int do_cgroup(int argc, char **argv) 305 + { 306 + return cmd_select(cmds, argc, argv, do_help); 307 + }
+40 -31
tools/bpf/bpftool/common.c
··· 163 163 return fd; 164 164 } 165 165 166 - int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32)) 166 + int do_pin_fd(int fd, const char *name) 167 167 { 168 168 char err_str[ERR_MAX_LEN]; 169 - unsigned int id; 170 - char *endptr; 171 169 char *file; 172 170 char *dir; 171 + int err = 0; 172 + 173 + err = bpf_obj_pin(fd, name); 174 + if (!err) 175 + goto out; 176 + 177 + file = malloc(strlen(name) + 1); 178 + strcpy(file, name); 179 + dir = dirname(file); 180 + 181 + if (errno != EPERM || is_bpffs(dir)) { 182 + p_err("can't pin the object (%s): %s", name, strerror(errno)); 183 + goto out_free; 184 + } 185 + 186 + /* Attempt to mount bpffs, then retry pinning. */ 187 + err = mnt_bpffs(dir, err_str, ERR_MAX_LEN); 188 + if (!err) { 189 + err = bpf_obj_pin(fd, name); 190 + if (err) 191 + p_err("can't pin the object (%s): %s", name, 192 + strerror(errno)); 193 + } else { 194 + err_str[ERR_MAX_LEN - 1] = '\0'; 195 + p_err("can't mount BPF file system to pin the object (%s): %s", 196 + name, err_str); 197 + } 198 + 199 + out_free: 200 + free(file); 201 + out: 202 + return err; 203 + } 204 + 205 + int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32)) 206 + { 207 + unsigned int id; 208 + char *endptr; 173 209 int err; 174 210 int fd; 175 211 ··· 231 195 return -1; 232 196 } 233 197 234 - err = bpf_obj_pin(fd, *argv); 235 - if (!err) 236 - goto out_close; 198 + err = do_pin_fd(fd, *argv); 237 199 238 - file = malloc(strlen(*argv) + 1); 239 - strcpy(file, *argv); 240 - dir = dirname(file); 241 - 242 - if (errno != EPERM || is_bpffs(dir)) { 243 - p_err("can't pin the object (%s): %s", *argv, strerror(errno)); 244 - goto out_free; 245 - } 246 - 247 - /* Attempt to mount bpffs, then retry pinning. 
*/ 248 - err = mnt_bpffs(dir, err_str, ERR_MAX_LEN); 249 - if (!err) { 250 - err = bpf_obj_pin(fd, *argv); 251 - if (err) 252 - p_err("can't pin the object (%s): %s", *argv, 253 - strerror(errno)); 254 - } else { 255 - err_str[ERR_MAX_LEN - 1] = '\0'; 256 - p_err("can't mount BPF file system to pin the object (%s): %s", 257 - *argv, err_str); 258 - } 259 - 260 - out_free: 261 - free(file); 262 - out_close: 263 200 close(fd); 264 201 return err; 265 202 }
+2 -1
tools/bpf/bpftool/main.c
··· 85 85 " %s batch file FILE\n" 86 86 " %s version\n" 87 87 "\n" 88 - " OBJECT := { prog | map }\n" 88 + " OBJECT := { prog | map | cgroup }\n" 89 89 " " HELP_SPEC_OPTIONS "\n" 90 90 "", 91 91 bin_name, bin_name, bin_name); ··· 173 173 { "batch", do_batch }, 174 174 { "prog", do_prog }, 175 175 { "map", do_map }, 176 + { "cgroup", do_cgroup }, 176 177 { "version", do_version }, 177 178 { 0 } 178 179 };
+2
tools/bpf/bpftool/main.h
··· 111 111 int open_obj_pinned(char *path); 112 112 int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type); 113 113 int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32)); 114 + int do_pin_fd(int fd, const char *name); 114 115 115 116 int do_prog(int argc, char **arg); 116 117 int do_map(int argc, char **arg); 118 + int do_cgroup(int argc, char **arg); 117 119 118 120 int prog_parse_fd(int *argc, char ***argv); 119 121
+28 -1
tools/bpf/bpftool/prog.c
··· 45 45 #include <sys/stat.h> 46 46 47 47 #include <bpf.h> 48 + #include <libbpf.h> 48 49 49 50 #include "main.h" 50 51 #include "disasm.h" ··· 636 635 return err; 637 636 } 638 637 638 + static int do_load(int argc, char **argv) 639 + { 640 + struct bpf_object *obj; 641 + int prog_fd; 642 + 643 + if (argc != 2) 644 + usage(); 645 + 646 + if (bpf_prog_load(argv[0], BPF_PROG_TYPE_UNSPEC, &obj, &prog_fd)) { 647 + p_err("failed to load program\n"); 648 + return -1; 649 + } 650 + 651 + if (do_pin_fd(prog_fd, argv[1])) { 652 + p_err("failed to pin program\n"); 653 + return -1; 654 + } 655 + 656 + if (json_output) 657 + jsonw_null(json_wtr); 658 + 659 + return 0; 660 + } 661 + 639 662 static int do_help(int argc, char **argv) 640 663 { 641 664 if (json_output) { ··· 672 647 " %s %s dump xlated PROG [{ file FILE | opcodes }]\n" 673 648 " %s %s dump jited PROG [{ file FILE | opcodes }]\n" 674 649 " %s %s pin PROG FILE\n" 650 + " %s %s load OBJ FILE\n" 675 651 " %s %s help\n" 676 652 "\n" 677 653 " " HELP_SPEC_PROGRAM "\n" 678 654 " " HELP_SPEC_OPTIONS "\n" 679 655 "", 680 656 bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], 681 - bin_name, argv[-2], bin_name, argv[-2]); 657 + bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]); 682 658 683 659 return 0; 684 660 } ··· 689 663 { "help", do_help }, 690 664 { "dump", do_dump }, 691 665 { "pin", do_pin }, 666 + { "load", do_load }, 692 667 { 0 } 693 668 }; 694 669
+12 -1
tools/include/uapi/linux/bpf.h
··· 197 197 */ 198 198 #define BPF_F_STRICT_ALIGNMENT (1U << 0) 199 199 200 + /* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ 200 201 #define BPF_PSEUDO_MAP_FD 1 202 + 203 + /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative 204 + * offset to another bpf function 205 + */ 206 + #define BPF_PSEUDO_CALL 1 201 207 202 208 /* flags for BPF_MAP_UPDATE_ELEM command */ 203 209 #define BPF_ANY 0 /* create new element or update existing */ ··· 683 677 * @buf: buf to fill 684 678 * @buf_size: size of the buf 685 679 * Return : 0 on success or negative error code 680 + * 681 + * int bpf_override_return(pt_regs, rc) 682 + * @pt_regs: pointer to struct pt_regs 683 + * @rc: the return value to set 686 684 */ 687 685 #define __BPF_FUNC_MAPPER(FN) \ 688 686 FN(unspec), \ ··· 746 736 FN(xdp_adjust_meta), \ 747 737 FN(perf_event_read_value), \ 748 738 FN(perf_prog_read_value), \ 749 - FN(getsockopt), 739 + FN(getsockopt), \ 740 + FN(override_return), 750 741 751 742 /* integer value in 'imm' field of BPF_CALL instruction selects which helper 752 743 * function eBPF program intends to call
+22
tools/include/uapi/linux/perf_event.h
··· 418 418 __u16 __reserved_2; /* align to __u64 */ 419 419 }; 420 420 421 + /* 422 + * Structure used by below PERF_EVENT_IOC_QUERY_BPF command 423 + * to query bpf programs attached to the same perf tracepoint 424 + * as the given perf event. 425 + */ 426 + struct perf_event_query_bpf { 427 + /* 428 + * The below ids array length 429 + */ 430 + __u32 ids_len; 431 + /* 432 + * Set by the kernel to indicate the number of 433 + * available programs 434 + */ 435 + __u32 prog_cnt; 436 + /* 437 + * User provided buffer to store program ids 438 + */ 439 + __u32 ids[0]; 440 + }; 441 + 421 442 #define perf_flags(attr) (*(&(attr)->read_format + 1)) 422 443 423 444 /* ··· 454 433 #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) 455 434 #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) 456 435 #define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) 436 + #define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) 457 437 458 438 enum perf_event_ioc_flags { 459 439 PERF_IOC_FLAG_GROUP = 1U << 0,
+2 -2
tools/lib/bpf/Makefile
··· 213 213 force: 214 214 215 215 elfdep: 216 - @if [ "$(feature-libelf)" != "1" ]; then echo "No libelf found"; exit -1 ; fi 216 + @if [ "$(feature-libelf)" != "1" ]; then echo "No libelf found"; exit 1 ; fi 217 217 218 218 bpfdep: 219 - @if [ "$(feature-bpf)" != "1" ]; then echo "BPF API too old"; exit -1 ; fi 219 + @if [ "$(feature-bpf)" != "1" ]; then echo "BPF API too old"; exit 1 ; fi 220 220 221 221 # Declare the contents of the .PHONY variable as phony. We keep that 222 222 # information in a variable so we can use it in if_changed and friends.
+1 -1
tools/lib/bpf/bpf.h
··· 40 40 __u32 map_flags); 41 41 42 42 /* Recommend log buffer size */ 43 - #define BPF_LOG_BUF_SIZE 65536 43 + #define BPF_LOG_BUF_SIZE (256 * 1024) 44 44 int bpf_load_program_name(enum bpf_prog_type type, const char *name, 45 45 const struct bpf_insn *insns, 46 46 size_t insns_cnt, const char *license,
+168 -31
tools/lib/bpf/libbpf.c
··· 174 174 char *name; 175 175 char *section_name; 176 176 struct bpf_insn *insns; 177 - size_t insns_cnt; 177 + size_t insns_cnt, main_prog_cnt; 178 178 enum bpf_prog_type type; 179 179 180 - struct { 180 + struct reloc_desc { 181 + enum { 182 + RELO_LD64, 183 + RELO_CALL, 184 + } type; 181 185 int insn_idx; 182 - int map_idx; 186 + union { 187 + int map_idx; 188 + int text_off; 189 + }; 183 190 } *reloc_desc; 184 191 int nr_reloc; 185 192 ··· 241 234 } *reloc; 242 235 int nr_reloc; 243 236 int maps_shndx; 237 + int text_shndx; 244 238 } efile; 245 239 /* 246 240 * All loaded bpf_object is linked in a list, which is ··· 383 375 size_t pi, si; 384 376 385 377 for (pi = 0; pi < obj->nr_programs; pi++) { 386 - char *name = NULL; 378 + const char *name = NULL; 387 379 388 380 prog = &obj->programs[pi]; 381 + if (prog->idx == obj->efile.text_shndx) { 382 + name = ".text"; 383 + goto skip_search; 384 + } 389 385 390 386 for (si = 0; si < symbols->d_size / sizeof(GElf_Sym) && !name; 391 387 si++) { ··· 398 386 if (!gelf_getsym(symbols, si, &sym)) 399 387 continue; 400 388 if (sym.st_shndx != prog->idx) 389 + continue; 390 + if (GELF_ST_BIND(sym.st_info) != STB_GLOBAL) 401 391 continue; 402 392 403 393 name = elf_strptr(obj->efile.elf, ··· 417 403 prog->section_name); 418 404 return -EINVAL; 419 405 } 420 - 406 + skip_search: 421 407 prog->name = strdup(name); 422 408 if (!prog->name) { 423 409 pr_warning("failed to allocate memory for prog sym %s\n", ··· 807 793 } else if ((sh.sh_type == SHT_PROGBITS) && 808 794 (sh.sh_flags & SHF_EXECINSTR) && 809 795 (data->d_size > 0)) { 796 + if (strcmp(name, ".text") == 0) 797 + obj->efile.text_shndx = idx; 810 798 err = bpf_object__add_program(obj, data->d_buf, 811 799 data->d_size, name, idx); 812 800 if (err) { ··· 870 854 } 871 855 872 856 static int 873 - bpf_program__collect_reloc(struct bpf_program *prog, 874 - size_t nr_maps, GElf_Shdr *shdr, 875 - Elf_Data *data, Elf_Data *symbols, 876 - int maps_shndx, struct bpf_map 
*maps) 857 + bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, 858 + Elf_Data *data, struct bpf_object *obj) 877 859 { 860 + Elf_Data *symbols = obj->efile.symbols; 861 + int text_shndx = obj->efile.text_shndx; 862 + int maps_shndx = obj->efile.maps_shndx; 863 + struct bpf_map *maps = obj->maps; 864 + size_t nr_maps = obj->nr_maps; 878 865 int i, nrels; 879 866 880 867 pr_debug("collecting relocating info for: '%s'\n", ··· 910 891 GELF_R_SYM(rel.r_info)); 911 892 return -LIBBPF_ERRNO__FORMAT; 912 893 } 894 + pr_debug("relo for %ld value %ld name %d\n", 895 + rel.r_info >> 32, sym.st_value, sym.st_name); 913 896 914 - if (sym.st_shndx != maps_shndx) { 897 + if (sym.st_shndx != maps_shndx && sym.st_shndx != text_shndx) { 915 898 pr_warning("Program '%s' contains non-map related relo data pointing to section %u\n", 916 899 prog->section_name, sym.st_shndx); 917 900 return -LIBBPF_ERRNO__RELOC; ··· 921 900 922 901 insn_idx = rel.r_offset / sizeof(struct bpf_insn); 923 902 pr_debug("relocation: insn_idx=%u\n", insn_idx); 903 + 904 + if (insns[insn_idx].code == (BPF_JMP | BPF_CALL)) { 905 + if (insns[insn_idx].src_reg != BPF_PSEUDO_CALL) { 906 + pr_warning("incorrect bpf_call opcode\n"); 907 + return -LIBBPF_ERRNO__RELOC; 908 + } 909 + prog->reloc_desc[i].type = RELO_CALL; 910 + prog->reloc_desc[i].insn_idx = insn_idx; 911 + prog->reloc_desc[i].text_off = sym.st_value; 912 + continue; 913 + } 924 914 925 915 if (insns[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { 926 916 pr_warning("bpf: relocation: invalid relo for insns[%d].code 0x%x\n", ··· 954 922 return -LIBBPF_ERRNO__RELOC; 955 923 } 956 924 925 + prog->reloc_desc[i].type = RELO_LD64; 957 926 prog->reloc_desc[i].insn_idx = insn_idx; 958 927 prog->reloc_desc[i].map_idx = map_idx; 959 928 } ··· 994 961 } 995 962 996 963 static int 964 + bpf_program__reloc_text(struct bpf_program *prog, struct bpf_object *obj, 965 + struct reloc_desc *relo) 966 + { 967 + struct bpf_insn *insn, *new_insn; 968 + 
struct bpf_program *text; 969 + size_t new_cnt; 970 + 971 + if (relo->type != RELO_CALL) 972 + return -LIBBPF_ERRNO__RELOC; 973 + 974 + if (prog->idx == obj->efile.text_shndx) { 975 + pr_warning("relo in .text insn %d into off %d\n", 976 + relo->insn_idx, relo->text_off); 977 + return -LIBBPF_ERRNO__RELOC; 978 + } 979 + 980 + if (prog->main_prog_cnt == 0) { 981 + text = bpf_object__find_prog_by_idx(obj, obj->efile.text_shndx); 982 + if (!text) { 983 + pr_warning("no .text section found yet relo into text exist\n"); 984 + return -LIBBPF_ERRNO__RELOC; 985 + } 986 + new_cnt = prog->insns_cnt + text->insns_cnt; 987 + new_insn = realloc(prog->insns, new_cnt * sizeof(*insn)); 988 + if (!new_insn) { 989 + pr_warning("oom in prog realloc\n"); 990 + return -ENOMEM; 991 + } 992 + memcpy(new_insn + prog->insns_cnt, text->insns, 993 + text->insns_cnt * sizeof(*insn)); 994 + prog->insns = new_insn; 995 + prog->main_prog_cnt = prog->insns_cnt; 996 + prog->insns_cnt = new_cnt; 997 + } 998 + insn = &prog->insns[relo->insn_idx]; 999 + insn->imm += prog->main_prog_cnt - relo->insn_idx; 1000 + pr_debug("added %zd insn from %s to prog %s\n", 1001 + text->insns_cnt, text->section_name, prog->section_name); 1002 + return 0; 1003 + } 1004 + 1005 + static int 997 1006 bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj) 998 1007 { 999 - int i; 1008 + int i, err; 1000 1009 1001 1010 if (!prog || !prog->reloc_desc) 1002 1011 return 0; 1003 1012 1004 1013 for (i = 0; i < prog->nr_reloc; i++) { 1005 - int insn_idx, map_idx; 1006 - struct bpf_insn *insns = prog->insns; 1014 + if (prog->reloc_desc[i].type == RELO_LD64) { 1015 + struct bpf_insn *insns = prog->insns; 1016 + int insn_idx, map_idx; 1007 1017 1008 - insn_idx = prog->reloc_desc[i].insn_idx; 1009 - map_idx = prog->reloc_desc[i].map_idx; 1018 + insn_idx = prog->reloc_desc[i].insn_idx; 1019 + map_idx = prog->reloc_desc[i].map_idx; 1010 1020 1011 - if (insn_idx >= (int)prog->insns_cnt) { 1012 - pr_warning("relocation 
out of range: '%s'\n", 1013 - prog->section_name); 1014 - return -LIBBPF_ERRNO__RELOC; 1021 + if (insn_idx >= (int)prog->insns_cnt) { 1022 + pr_warning("relocation out of range: '%s'\n", 1023 + prog->section_name); 1024 + return -LIBBPF_ERRNO__RELOC; 1025 + } 1026 + insns[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; 1027 + insns[insn_idx].imm = obj->maps[map_idx].fd; 1028 + } else { 1029 + err = bpf_program__reloc_text(prog, obj, 1030 + &prog->reloc_desc[i]); 1031 + if (err) 1032 + return err; 1015 1033 } 1016 - insns[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; 1017 - insns[insn_idx].imm = obj->maps[map_idx].fd; 1018 1034 } 1019 1035 1020 1036 zfree(&prog->reloc_desc); ··· 1106 1024 Elf_Data *data = obj->efile.reloc[i].data; 1107 1025 int idx = shdr->sh_info; 1108 1026 struct bpf_program *prog; 1109 - size_t nr_maps = obj->nr_maps; 1110 1027 1111 1028 if (shdr->sh_type != SHT_REL) { 1112 1029 pr_warning("internal error at %d\n", __LINE__); ··· 1119 1038 return -LIBBPF_ERRNO__RELOC; 1120 1039 } 1121 1040 1122 - err = bpf_program__collect_reloc(prog, nr_maps, 1041 + err = bpf_program__collect_reloc(prog, 1123 1042 shdr, data, 1124 - obj->efile.symbols, 1125 - obj->efile.maps_shndx, 1126 - obj->maps); 1043 + obj); 1127 1044 if (err) 1128 1045 return err; 1129 1046 } ··· 1274 1195 int err; 1275 1196 1276 1197 for (i = 0; i < obj->nr_programs; i++) { 1198 + if (obj->programs[i].idx == obj->efile.text_shndx) 1199 + continue; 1277 1200 err = bpf_program__load(&obj->programs[i], 1278 1201 obj->license, 1279 1202 obj->kern_version); ··· 1802 1721 BPF_PROG_TYPE_FNS(xdp, BPF_PROG_TYPE_XDP); 1803 1722 BPF_PROG_TYPE_FNS(perf_event, BPF_PROG_TYPE_PERF_EVENT); 1804 1723 1724 + #define BPF_PROG_SEC(string, type) { string, sizeof(string), type } 1725 + static const struct { 1726 + const char *sec; 1727 + size_t len; 1728 + enum bpf_prog_type prog_type; 1729 + } section_names[] = { 1730 + BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), 1731 + BPF_PROG_SEC("kprobe/", BPF_PROG_TYPE_KPROBE), 
1732 + BPF_PROG_SEC("kretprobe/", BPF_PROG_TYPE_KPROBE), 1733 + BPF_PROG_SEC("tracepoint/", BPF_PROG_TYPE_TRACEPOINT), 1734 + BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP), 1735 + BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT), 1736 + BPF_PROG_SEC("cgroup/skb", BPF_PROG_TYPE_CGROUP_SKB), 1737 + BPF_PROG_SEC("cgroup/sock", BPF_PROG_TYPE_CGROUP_SOCK), 1738 + BPF_PROG_SEC("cgroup/dev", BPF_PROG_TYPE_CGROUP_DEVICE), 1739 + BPF_PROG_SEC("sockops", BPF_PROG_TYPE_SOCK_OPS), 1740 + BPF_PROG_SEC("sk_skb", BPF_PROG_TYPE_SK_SKB), 1741 + }; 1742 + #undef BPF_PROG_SEC 1743 + 1744 + static enum bpf_prog_type bpf_program__guess_type(struct bpf_program *prog) 1745 + { 1746 + int i; 1747 + 1748 + if (!prog->section_name) 1749 + goto err; 1750 + 1751 + for (i = 0; i < ARRAY_SIZE(section_names); i++) 1752 + if (strncmp(prog->section_name, section_names[i].sec, 1753 + section_names[i].len) == 0) 1754 + return section_names[i].prog_type; 1755 + 1756 + err: 1757 + pr_warning("failed to guess program type based on section name %s\n", 1758 + prog->section_name); 1759 + 1760 + return BPF_PROG_TYPE_UNSPEC; 1761 + } 1762 + 1805 1763 int bpf_map__fd(struct bpf_map *map) 1806 1764 { 1807 1765 return map ? map->fd : -EINVAL; ··· 1938 1818 int bpf_prog_load(const char *file, enum bpf_prog_type type, 1939 1819 struct bpf_object **pobj, int *prog_fd) 1940 1820 { 1941 - struct bpf_program *prog; 1821 + struct bpf_program *prog, *first_prog = NULL; 1942 1822 struct bpf_object *obj; 1943 1823 int err; 1944 1824 ··· 1946 1826 if (IS_ERR(obj)) 1947 1827 return -ENOENT; 1948 1828 1949 - prog = bpf_program__next(NULL, obj); 1950 - if (!prog) { 1829 + bpf_object__for_each_program(prog, obj) { 1830 + /* 1831 + * If type is not specified, try to guess it based on 1832 + * section name. 
1833 + */ 1834 + if (type == BPF_PROG_TYPE_UNSPEC) { 1835 + type = bpf_program__guess_type(prog); 1836 + if (type == BPF_PROG_TYPE_UNSPEC) { 1837 + bpf_object__close(obj); 1838 + return -EINVAL; 1839 + } 1840 + } 1841 + 1842 + bpf_program__set_type(prog, type); 1843 + if (prog->idx != obj->efile.text_shndx && !first_prog) 1844 + first_prog = prog; 1845 + } 1846 + 1847 + if (!first_prog) { 1848 + pr_warning("object file doesn't contain bpf program\n"); 1951 1849 bpf_object__close(obj); 1952 1850 return -ENOENT; 1953 1851 } 1954 1852 1955 - bpf_program__set_type(prog, type); 1956 1853 err = bpf_object__load(obj); 1957 1854 if (err) { 1958 1855 bpf_object__close(obj); ··· 1977 1840 } 1978 1841 1979 1842 *pobj = obj; 1980 - *prog_fd = bpf_program__fd(prog); 1843 + *prog_fd = bpf_program__fd(first_prog); 1981 1844 return 0; 1982 1845 }
+1
tools/scripts/Makefile.include
··· 99 99 100 100 QUIET_CLEAN = @printf ' CLEAN %s\n' $1; 101 101 QUIET_INSTALL = @printf ' INSTALL %s\n' $1; 102 + QUIET_UNINST = @printf ' UNINST %s\n' $1; 102 103 endif 103 104 endif
+9 -3
tools/testing/selftests/bpf/Makefile
··· 18 18 19 19 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ 20 20 test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ 21 - sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o 21 + sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \ 22 + test_l4lb_noinline.o test_xdp_noinline.o 22 23 23 24 TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \ 24 25 test_offload.py ··· 51 50 CPU ?= generic 52 51 endif 53 52 53 + CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \ 54 + -Wno-compare-distinct-pointer-types 55 + 56 + $(OUTPUT)/test_l4lb_noinline.o: CLANG_FLAGS += -fno-inline 57 + $(OUTPUT)/test_xdp_noinline.o: CLANG_FLAGS += -fno-inline 58 + 54 59 %.o: %.c 55 - $(CLANG) -I. -I./include/uapi -I../../../include/uapi \ 56 - -Wno-compare-distinct-pointer-types \ 60 + $(CLANG) $(CLANG_FLAGS) \ 57 61 -O2 -target bpf -emit-llvm -c $< -o - | \ 58 62 $(LLC) -march=bpf -mcpu=$(CPU) -filetype=obj -o $@
+2 -1
tools/testing/selftests/bpf/bpf_helpers.h
··· 82 82 static int (*bpf_perf_prog_read_value)(void *ctx, void *buf, 83 83 unsigned int buf_size) = 84 84 (void *) BPF_FUNC_perf_prog_read_value; 85 - 85 + static int (*bpf_override_return)(void *ctx, unsigned long rc) = 86 + (void *) BPF_FUNC_override_return; 86 87 87 88 /* llvm builtin functions that eBPF C program may use to 88 89 * emit BPF_LD_ABS and BPF_LD_IND instructions
+1
tools/testing/selftests/bpf/config
··· 3 3 CONFIG_NET_CLS_BPF=m 4 4 CONFIG_BPF_EVENTS=y 5 5 CONFIG_TEST_BPF=m 6 + CONFIG_CGROUP_BPF=y
+473
tools/testing/selftests/bpf/test_l4lb_noinline.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2017 Facebook 3 + #include <stddef.h> 4 + #include <stdbool.h> 5 + #include <string.h> 6 + #include <linux/pkt_cls.h> 7 + #include <linux/bpf.h> 8 + #include <linux/in.h> 9 + #include <linux/if_ether.h> 10 + #include <linux/ip.h> 11 + #include <linux/ipv6.h> 12 + #include <linux/icmp.h> 13 + #include <linux/icmpv6.h> 14 + #include <linux/tcp.h> 15 + #include <linux/udp.h> 16 + #include "bpf_helpers.h" 17 + #include "test_iptunnel_common.h" 18 + #include "bpf_endian.h" 19 + 20 + int _version SEC("version") = 1; 21 + 22 + static __u32 rol32(__u32 word, unsigned int shift) 23 + { 24 + return (word << shift) | (word >> ((-shift) & 31)); 25 + } 26 + 27 + /* copy paste of jhash from kernel sources to make sure llvm 28 + * can compile it into valid sequence of bpf instructions 29 + */ 30 + #define __jhash_mix(a, b, c) \ 31 + { \ 32 + a -= c; a ^= rol32(c, 4); c += b; \ 33 + b -= a; b ^= rol32(a, 6); a += c; \ 34 + c -= b; c ^= rol32(b, 8); b += a; \ 35 + a -= c; a ^= rol32(c, 16); c += b; \ 36 + b -= a; b ^= rol32(a, 19); a += c; \ 37 + c -= b; c ^= rol32(b, 4); b += a; \ 38 + } 39 + 40 + #define __jhash_final(a, b, c) \ 41 + { \ 42 + c ^= b; c -= rol32(b, 14); \ 43 + a ^= c; a -= rol32(c, 11); \ 44 + b ^= a; b -= rol32(a, 25); \ 45 + c ^= b; c -= rol32(b, 16); \ 46 + a ^= c; a -= rol32(c, 4); \ 47 + b ^= a; b -= rol32(a, 14); \ 48 + c ^= b; c -= rol32(b, 24); \ 49 + } 50 + 51 + #define JHASH_INITVAL 0xdeadbeef 52 + 53 + typedef unsigned int u32; 54 + 55 + static u32 jhash(const void *key, u32 length, u32 initval) 56 + { 57 + u32 a, b, c; 58 + const unsigned char *k = key; 59 + 60 + a = b = c = JHASH_INITVAL + length + initval; 61 + 62 + while (length > 12) { 63 + a += *(u32 *)(k); 64 + b += *(u32 *)(k + 4); 65 + c += *(u32 *)(k + 8); 66 + __jhash_mix(a, b, c); 67 + length -= 12; 68 + k += 12; 69 + } 70 + switch (length) { 71 + case 12: c += (u32)k[11]<<24; 72 + case 11: c += (u32)k[10]<<16; 73 + case 10: c 
+= (u32)k[9]<<8; 74 + case 9: c += k[8]; 75 + case 8: b += (u32)k[7]<<24; 76 + case 7: b += (u32)k[6]<<16; 77 + case 6: b += (u32)k[5]<<8; 78 + case 5: b += k[4]; 79 + case 4: a += (u32)k[3]<<24; 80 + case 3: a += (u32)k[2]<<16; 81 + case 2: a += (u32)k[1]<<8; 82 + case 1: a += k[0]; 83 + __jhash_final(a, b, c); 84 + case 0: /* Nothing left to add */ 85 + break; 86 + } 87 + 88 + return c; 89 + } 90 + 91 + static u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) 92 + { 93 + a += initval; 94 + b += initval; 95 + c += initval; 96 + __jhash_final(a, b, c); 97 + return c; 98 + } 99 + 100 + static u32 jhash_2words(u32 a, u32 b, u32 initval) 101 + { 102 + return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); 103 + } 104 + 105 + #define PCKT_FRAGMENTED 65343 106 + #define IPV4_HDR_LEN_NO_OPT 20 107 + #define IPV4_PLUS_ICMP_HDR 28 108 + #define IPV6_PLUS_ICMP_HDR 48 109 + #define RING_SIZE 2 110 + #define MAX_VIPS 12 111 + #define MAX_REALS 5 112 + #define CTL_MAP_SIZE 16 113 + #define CH_RINGS_SIZE (MAX_VIPS * RING_SIZE) 114 + #define F_IPV6 (1 << 0) 115 + #define F_HASH_NO_SRC_PORT (1 << 0) 116 + #define F_ICMP (1 << 0) 117 + #define F_SYN_SET (1 << 1) 118 + 119 + struct packet_description { 120 + union { 121 + __be32 src; 122 + __be32 srcv6[4]; 123 + }; 124 + union { 125 + __be32 dst; 126 + __be32 dstv6[4]; 127 + }; 128 + union { 129 + __u32 ports; 130 + __u16 port16[2]; 131 + }; 132 + __u8 proto; 133 + __u8 flags; 134 + }; 135 + 136 + struct ctl_value { 137 + union { 138 + __u64 value; 139 + __u32 ifindex; 140 + __u8 mac[6]; 141 + }; 142 + }; 143 + 144 + struct vip_meta { 145 + __u32 flags; 146 + __u32 vip_num; 147 + }; 148 + 149 + struct real_definition { 150 + union { 151 + __be32 dst; 152 + __be32 dstv6[4]; 153 + }; 154 + __u8 flags; 155 + }; 156 + 157 + struct vip_stats { 158 + __u64 bytes; 159 + __u64 pkts; 160 + }; 161 + 162 + struct eth_hdr { 163 + unsigned char eth_dest[ETH_ALEN]; 164 + unsigned char eth_source[ETH_ALEN]; 165 + unsigned short 
eth_proto; 166 + }; 167 + 168 + struct bpf_map_def SEC("maps") vip_map = { 169 + .type = BPF_MAP_TYPE_HASH, 170 + .key_size = sizeof(struct vip), 171 + .value_size = sizeof(struct vip_meta), 172 + .max_entries = MAX_VIPS, 173 + }; 174 + 175 + struct bpf_map_def SEC("maps") ch_rings = { 176 + .type = BPF_MAP_TYPE_ARRAY, 177 + .key_size = sizeof(__u32), 178 + .value_size = sizeof(__u32), 179 + .max_entries = CH_RINGS_SIZE, 180 + }; 181 + 182 + struct bpf_map_def SEC("maps") reals = { 183 + .type = BPF_MAP_TYPE_ARRAY, 184 + .key_size = sizeof(__u32), 185 + .value_size = sizeof(struct real_definition), 186 + .max_entries = MAX_REALS, 187 + }; 188 + 189 + struct bpf_map_def SEC("maps") stats = { 190 + .type = BPF_MAP_TYPE_PERCPU_ARRAY, 191 + .key_size = sizeof(__u32), 192 + .value_size = sizeof(struct vip_stats), 193 + .max_entries = MAX_VIPS, 194 + }; 195 + 196 + struct bpf_map_def SEC("maps") ctl_array = { 197 + .type = BPF_MAP_TYPE_ARRAY, 198 + .key_size = sizeof(__u32), 199 + .value_size = sizeof(struct ctl_value), 200 + .max_entries = CTL_MAP_SIZE, 201 + }; 202 + 203 + static __u32 get_packet_hash(struct packet_description *pckt, 204 + bool ipv6) 205 + { 206 + if (ipv6) 207 + return jhash_2words(jhash(pckt->srcv6, 16, MAX_VIPS), 208 + pckt->ports, CH_RINGS_SIZE); 209 + else 210 + return jhash_2words(pckt->src, pckt->ports, CH_RINGS_SIZE); 211 + } 212 + 213 + static bool get_packet_dst(struct real_definition **real, 214 + struct packet_description *pckt, 215 + struct vip_meta *vip_info, 216 + bool is_ipv6) 217 + { 218 + __u32 hash = get_packet_hash(pckt, is_ipv6); 219 + __u32 key = RING_SIZE * vip_info->vip_num + hash % RING_SIZE; 220 + __u32 *real_pos; 221 + 222 + if (hash != 0x358459b7 /* jhash of ipv4 packet */ && 223 + hash != 0x2f4bc6bb /* jhash of ipv6 packet */) 224 + return 0; 225 + 226 + real_pos = bpf_map_lookup_elem(&ch_rings, &key); 227 + if (!real_pos) 228 + return false; 229 + key = *real_pos; 230 + *real = bpf_map_lookup_elem(&reals, &key); 231 + if 
(!(*real)) 232 + return false; 233 + return true; 234 + } 235 + 236 + static int parse_icmpv6(void *data, void *data_end, __u64 off, 237 + struct packet_description *pckt) 238 + { 239 + struct icmp6hdr *icmp_hdr; 240 + struct ipv6hdr *ip6h; 241 + 242 + icmp_hdr = data + off; 243 + if (icmp_hdr + 1 > data_end) 244 + return TC_ACT_SHOT; 245 + if (icmp_hdr->icmp6_type != ICMPV6_PKT_TOOBIG) 246 + return TC_ACT_OK; 247 + off += sizeof(struct icmp6hdr); 248 + ip6h = data + off; 249 + if (ip6h + 1 > data_end) 250 + return TC_ACT_SHOT; 251 + pckt->proto = ip6h->nexthdr; 252 + pckt->flags |= F_ICMP; 253 + memcpy(pckt->srcv6, ip6h->daddr.s6_addr32, 16); 254 + memcpy(pckt->dstv6, ip6h->saddr.s6_addr32, 16); 255 + return TC_ACT_UNSPEC; 256 + } 257 + 258 + static int parse_icmp(void *data, void *data_end, __u64 off, 259 + struct packet_description *pckt) 260 + { 261 + struct icmphdr *icmp_hdr; 262 + struct iphdr *iph; 263 + 264 + icmp_hdr = data + off; 265 + if (icmp_hdr + 1 > data_end) 266 + return TC_ACT_SHOT; 267 + if (icmp_hdr->type != ICMP_DEST_UNREACH || 268 + icmp_hdr->code != ICMP_FRAG_NEEDED) 269 + return TC_ACT_OK; 270 + off += sizeof(struct icmphdr); 271 + iph = data + off; 272 + if (iph + 1 > data_end) 273 + return TC_ACT_SHOT; 274 + if (iph->ihl != 5) 275 + return TC_ACT_SHOT; 276 + pckt->proto = iph->protocol; 277 + pckt->flags |= F_ICMP; 278 + pckt->src = iph->daddr; 279 + pckt->dst = iph->saddr; 280 + return TC_ACT_UNSPEC; 281 + } 282 + 283 + static bool parse_udp(void *data, __u64 off, void *data_end, 284 + struct packet_description *pckt) 285 + { 286 + struct udphdr *udp; 287 + udp = data + off; 288 + 289 + if (udp + 1 > data_end) 290 + return false; 291 + 292 + if (!(pckt->flags & F_ICMP)) { 293 + pckt->port16[0] = udp->source; 294 + pckt->port16[1] = udp->dest; 295 + } else { 296 + pckt->port16[0] = udp->dest; 297 + pckt->port16[1] = udp->source; 298 + } 299 + return true; 300 + } 301 + 302 + static bool parse_tcp(void *data, __u64 off, void *data_end, 303 + 
struct packet_description *pckt) 304 + { 305 + struct tcphdr *tcp; 306 + 307 + tcp = data + off; 308 + if (tcp + 1 > data_end) 309 + return false; 310 + 311 + if (tcp->syn) 312 + pckt->flags |= F_SYN_SET; 313 + 314 + if (!(pckt->flags & F_ICMP)) { 315 + pckt->port16[0] = tcp->source; 316 + pckt->port16[1] = tcp->dest; 317 + } else { 318 + pckt->port16[0] = tcp->dest; 319 + pckt->port16[1] = tcp->source; 320 + } 321 + return true; 322 + } 323 + 324 + static int process_packet(void *data, __u64 off, void *data_end, 325 + bool is_ipv6, struct __sk_buff *skb) 326 + { 327 + void *pkt_start = (void *)(long)skb->data; 328 + struct packet_description pckt = {}; 329 + struct eth_hdr *eth = pkt_start; 330 + struct bpf_tunnel_key tkey = {}; 331 + struct vip_stats *data_stats; 332 + struct real_definition *dst; 333 + struct vip_meta *vip_info; 334 + struct ctl_value *cval; 335 + __u32 v4_intf_pos = 1; 336 + __u32 v6_intf_pos = 2; 337 + struct ipv6hdr *ip6h; 338 + struct vip vip = {}; 339 + struct iphdr *iph; 340 + int tun_flag = 0; 341 + __u16 pkt_bytes; 342 + __u64 iph_len; 343 + __u32 ifindex; 344 + __u8 protocol; 345 + __u32 vip_num; 346 + int action; 347 + 348 + tkey.tunnel_ttl = 64; 349 + if (is_ipv6) { 350 + ip6h = data + off; 351 + if (ip6h + 1 > data_end) 352 + return TC_ACT_SHOT; 353 + 354 + iph_len = sizeof(struct ipv6hdr); 355 + protocol = ip6h->nexthdr; 356 + pckt.proto = protocol; 357 + pkt_bytes = bpf_ntohs(ip6h->payload_len); 358 + off += iph_len; 359 + if (protocol == IPPROTO_FRAGMENT) { 360 + return TC_ACT_SHOT; 361 + } else if (protocol == IPPROTO_ICMPV6) { 362 + action = parse_icmpv6(data, data_end, off, &pckt); 363 + if (action >= 0) 364 + return action; 365 + off += IPV6_PLUS_ICMP_HDR; 366 + } else { 367 + memcpy(pckt.srcv6, ip6h->saddr.s6_addr32, 16); 368 + memcpy(pckt.dstv6, ip6h->daddr.s6_addr32, 16); 369 + } 370 + } else { 371 + iph = data + off; 372 + if (iph + 1 > data_end) 373 + return TC_ACT_SHOT; 374 + if (iph->ihl != 5) 375 + return TC_ACT_SHOT; 
376 + 377 + protocol = iph->protocol; 378 + pckt.proto = protocol; 379 + pkt_bytes = bpf_ntohs(iph->tot_len); 380 + off += IPV4_HDR_LEN_NO_OPT; 381 + 382 + if (iph->frag_off & PCKT_FRAGMENTED) 383 + return TC_ACT_SHOT; 384 + if (protocol == IPPROTO_ICMP) { 385 + action = parse_icmp(data, data_end, off, &pckt); 386 + if (action >= 0) 387 + return action; 388 + off += IPV4_PLUS_ICMP_HDR; 389 + } else { 390 + pckt.src = iph->saddr; 391 + pckt.dst = iph->daddr; 392 + } 393 + } 394 + protocol = pckt.proto; 395 + 396 + if (protocol == IPPROTO_TCP) { 397 + if (!parse_tcp(data, off, data_end, &pckt)) 398 + return TC_ACT_SHOT; 399 + } else if (protocol == IPPROTO_UDP) { 400 + if (!parse_udp(data, off, data_end, &pckt)) 401 + return TC_ACT_SHOT; 402 + } else { 403 + return TC_ACT_SHOT; 404 + } 405 + 406 + if (is_ipv6) 407 + memcpy(vip.daddr.v6, pckt.dstv6, 16); 408 + else 409 + vip.daddr.v4 = pckt.dst; 410 + 411 + vip.dport = pckt.port16[1]; 412 + vip.protocol = pckt.proto; 413 + vip_info = bpf_map_lookup_elem(&vip_map, &vip); 414 + if (!vip_info) { 415 + vip.dport = 0; 416 + vip_info = bpf_map_lookup_elem(&vip_map, &vip); 417 + if (!vip_info) 418 + return TC_ACT_SHOT; 419 + pckt.port16[1] = 0; 420 + } 421 + 422 + if (vip_info->flags & F_HASH_NO_SRC_PORT) 423 + pckt.port16[0] = 0; 424 + 425 + if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6)) 426 + return TC_ACT_SHOT; 427 + 428 + if (dst->flags & F_IPV6) { 429 + cval = bpf_map_lookup_elem(&ctl_array, &v6_intf_pos); 430 + if (!cval) 431 + return TC_ACT_SHOT; 432 + ifindex = cval->ifindex; 433 + memcpy(tkey.remote_ipv6, dst->dstv6, 16); 434 + tun_flag = BPF_F_TUNINFO_IPV6; 435 + } else { 436 + cval = bpf_map_lookup_elem(&ctl_array, &v4_intf_pos); 437 + if (!cval) 438 + return TC_ACT_SHOT; 439 + ifindex = cval->ifindex; 440 + tkey.remote_ipv4 = dst->dst; 441 + } 442 + vip_num = vip_info->vip_num; 443 + data_stats = bpf_map_lookup_elem(&stats, &vip_num); 444 + if (!data_stats) 445 + return TC_ACT_SHOT; 446 + 
data_stats->pkts++; 447 + data_stats->bytes += pkt_bytes; 448 + bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), tun_flag); 449 + *(u32 *)eth->eth_dest = tkey.remote_ipv4; 450 + return bpf_redirect(ifindex, 0); 451 + } 452 + 453 + SEC("l4lb-demo") 454 + int balancer_ingress(struct __sk_buff *ctx) 455 + { 456 + void *data_end = (void *)(long)ctx->data_end; 457 + void *data = (void *)(long)ctx->data; 458 + struct eth_hdr *eth = data; 459 + __u32 eth_proto; 460 + __u32 nh_off; 461 + 462 + nh_off = sizeof(struct eth_hdr); 463 + if (data + nh_off > data_end) 464 + return TC_ACT_SHOT; 465 + eth_proto = eth->eth_proto; 466 + if (eth_proto == bpf_htons(ETH_P_IP)) 467 + return process_packet(data, nh_off, data_end, false, ctx); 468 + else if (eth_proto == bpf_htons(ETH_P_IPV6)) 469 + return process_packet(data, nh_off, data_end, true, ctx); 470 + else 471 + return TC_ACT_SHOT; 472 + } 473 + char _license[] SEC("license") = "GPL";
+225 -3
tools/testing/selftests/bpf/test_progs.c
··· 21 21 #include <linux/ipv6.h> 22 22 #include <linux/tcp.h> 23 23 #include <linux/filter.h> 24 + #include <linux/perf_event.h> 24 25 #include <linux/unistd.h> 25 26 27 + #include <sys/ioctl.h> 26 28 #include <sys/wait.h> 27 29 #include <sys/resource.h> 28 30 #include <sys/types.h> ··· 169 167 #define NUM_ITER 100000 170 168 #define VIP_NUM 5 171 169 172 - static void test_l4lb(void) 170 + static void test_l4lb(const char *file) 173 171 { 174 172 unsigned int nr_cpus = bpf_num_possible_cpus(); 175 - const char *file = "./test_l4lb.o"; 176 173 struct vip key = {.protocol = 6}; 177 174 struct vip_meta { 178 175 __u32 flags; ··· 243 242 if (bytes != MAGIC_BYTES * NUM_ITER * 2 || pkts != NUM_ITER * 2) { 244 243 error_cnt++; 245 244 printf("test_l4lb:FAIL:stats %lld %lld\n", bytes, pkts); 245 + } 246 + out: 247 + bpf_object__close(obj); 248 + } 249 + 250 + static void test_l4lb_all(void) 251 + { 252 + const char *file1 = "./test_l4lb.o"; 253 + const char *file2 = "./test_l4lb_noinline.o"; 254 + 255 + test_l4lb(file1); 256 + test_l4lb(file2); 257 + } 258 + 259 + static void test_xdp_noinline(void) 260 + { 261 + const char *file = "./test_xdp_noinline.o"; 262 + unsigned int nr_cpus = bpf_num_possible_cpus(); 263 + struct vip key = {.protocol = 6}; 264 + struct vip_meta { 265 + __u32 flags; 266 + __u32 vip_num; 267 + } value = {.vip_num = VIP_NUM}; 268 + __u32 stats_key = VIP_NUM; 269 + struct vip_stats { 270 + __u64 bytes; 271 + __u64 pkts; 272 + } stats[nr_cpus]; 273 + struct real_definition { 274 + union { 275 + __be32 dst; 276 + __be32 dstv6[4]; 277 + }; 278 + __u8 flags; 279 + } real_def = {.dst = MAGIC_VAL}; 280 + __u32 ch_key = 11, real_num = 3; 281 + __u32 duration, retval, size; 282 + int err, i, prog_fd, map_fd; 283 + __u64 bytes = 0, pkts = 0; 284 + struct bpf_object *obj; 285 + char buf[128]; 286 + u32 *magic = (u32 *)buf; 287 + 288 + err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); 289 + if (err) { 290 + error_cnt++; 291 + return; 292 + } 293 + 
294 + map_fd = bpf_find_map(__func__, obj, "vip_map"); 295 + if (map_fd < 0) 296 + goto out; 297 + bpf_map_update_elem(map_fd, &key, &value, 0); 298 + 299 + map_fd = bpf_find_map(__func__, obj, "ch_rings"); 300 + if (map_fd < 0) 301 + goto out; 302 + bpf_map_update_elem(map_fd, &ch_key, &real_num, 0); 303 + 304 + map_fd = bpf_find_map(__func__, obj, "reals"); 305 + if (map_fd < 0) 306 + goto out; 307 + bpf_map_update_elem(map_fd, &real_num, &real_def, 0); 308 + 309 + err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v4, sizeof(pkt_v4), 310 + buf, &size, &retval, &duration); 311 + CHECK(err || errno || retval != 1 || size != 54 || 312 + *magic != MAGIC_VAL, "ipv4", 313 + "err %d errno %d retval %d size %d magic %x\n", 314 + err, errno, retval, size, *magic); 315 + 316 + err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v6, sizeof(pkt_v6), 317 + buf, &size, &retval, &duration); 318 + CHECK(err || errno || retval != 1 || size != 74 || 319 + *magic != MAGIC_VAL, "ipv6", 320 + "err %d errno %d retval %d size %d magic %x\n", 321 + err, errno, retval, size, *magic); 322 + 323 + map_fd = bpf_find_map(__func__, obj, "stats"); 324 + if (map_fd < 0) 325 + goto out; 326 + bpf_map_lookup_elem(map_fd, &stats_key, stats); 327 + for (i = 0; i < nr_cpus; i++) { 328 + bytes += stats[i].bytes; 329 + pkts += stats[i].pkts; 330 + } 331 + if (bytes != MAGIC_BYTES * NUM_ITER * 2 || pkts != NUM_ITER * 2) { 332 + error_cnt++; 333 + printf("test_xdp_noinline:FAIL:stats %lld %lld\n", bytes, pkts); 246 334 } 247 335 out: 248 336 bpf_object__close(obj); ··· 707 617 } 708 618 } 709 619 620 + static void test_tp_attach_query(void) 621 + { 622 + const int num_progs = 3; 623 + int i, j, bytes, efd, err, prog_fd[num_progs], pmu_fd[num_progs]; 624 + __u32 duration = 0, info_len, saved_prog_ids[num_progs]; 625 + const char *file = "./test_tracepoint.o"; 626 + struct perf_event_query_bpf *query; 627 + struct perf_event_attr attr = {}; 628 + struct bpf_object *obj[num_progs]; 629 + struct bpf_prog_info 
prog_info; 630 + char buf[256]; 631 + 632 + snprintf(buf, sizeof(buf), 633 + "/sys/kernel/debug/tracing/events/sched/sched_switch/id"); 634 + efd = open(buf, O_RDONLY, 0); 635 + if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno)) 636 + return; 637 + bytes = read(efd, buf, sizeof(buf)); 638 + close(efd); 639 + if (CHECK(bytes <= 0 || bytes >= sizeof(buf), 640 + "read", "bytes %d errno %d\n", bytes, errno)) 641 + return; 642 + 643 + attr.config = strtol(buf, NULL, 0); 644 + attr.type = PERF_TYPE_TRACEPOINT; 645 + attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN; 646 + attr.sample_period = 1; 647 + attr.wakeup_events = 1; 648 + 649 + query = malloc(sizeof(*query) + sizeof(__u32) * num_progs); 650 + for (i = 0; i < num_progs; i++) { 651 + err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj[i], 652 + &prog_fd[i]); 653 + if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno)) 654 + goto cleanup1; 655 + 656 + bzero(&prog_info, sizeof(prog_info)); 657 + prog_info.jited_prog_len = 0; 658 + prog_info.xlated_prog_len = 0; 659 + prog_info.nr_map_ids = 0; 660 + info_len = sizeof(prog_info); 661 + err = bpf_obj_get_info_by_fd(prog_fd[i], &prog_info, &info_len); 662 + if (CHECK(err, "bpf_obj_get_info_by_fd", "err %d errno %d\n", 663 + err, errno)) 664 + goto cleanup1; 665 + saved_prog_ids[i] = prog_info.id; 666 + 667 + pmu_fd[i] = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 668 + 0 /* cpu 0 */, -1 /* group id */, 669 + 0 /* flags */); 670 + if (CHECK(pmu_fd[i] < 0, "perf_event_open", "err %d errno %d\n", 671 + pmu_fd[i], errno)) 672 + goto cleanup2; 673 + err = ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0); 674 + if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", 675 + err, errno)) 676 + goto cleanup3; 677 + 678 + if (i == 0) { 679 + /* check NULL prog array query */ 680 + query->ids_len = num_progs; 681 + err = ioctl(pmu_fd[i], PERF_EVENT_IOC_QUERY_BPF, query); 682 + if (CHECK(err || query->prog_cnt != 0, 683 + 
"perf_event_ioc_query_bpf", 684 + "err %d errno %d query->prog_cnt %u\n", 685 + err, errno, query->prog_cnt)) 686 + goto cleanup3; 687 + } 688 + 689 + err = ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, prog_fd[i]); 690 + if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", 691 + err, errno)) 692 + goto cleanup3; 693 + 694 + if (i == 1) { 695 + /* try to get # of programs only */ 696 + query->ids_len = 0; 697 + err = ioctl(pmu_fd[i], PERF_EVENT_IOC_QUERY_BPF, query); 698 + if (CHECK(err || query->prog_cnt != 2, 699 + "perf_event_ioc_query_bpf", 700 + "err %d errno %d query->prog_cnt %u\n", 701 + err, errno, query->prog_cnt)) 702 + goto cleanup3; 703 + 704 + /* try a few negative tests */ 705 + /* invalid query pointer */ 706 + err = ioctl(pmu_fd[i], PERF_EVENT_IOC_QUERY_BPF, 707 + (struct perf_event_query_bpf *)0x1); 708 + if (CHECK(!err || errno != EFAULT, 709 + "perf_event_ioc_query_bpf", 710 + "err %d errno %d\n", err, errno)) 711 + goto cleanup3; 712 + 713 + /* no enough space */ 714 + query->ids_len = 1; 715 + err = ioctl(pmu_fd[i], PERF_EVENT_IOC_QUERY_BPF, query); 716 + if (CHECK(!err || errno != ENOSPC || query->prog_cnt != 2, 717 + "perf_event_ioc_query_bpf", 718 + "err %d errno %d query->prog_cnt %u\n", 719 + err, errno, query->prog_cnt)) 720 + goto cleanup3; 721 + } 722 + 723 + query->ids_len = num_progs; 724 + err = ioctl(pmu_fd[i], PERF_EVENT_IOC_QUERY_BPF, query); 725 + if (CHECK(err || query->prog_cnt != (i + 1), 726 + "perf_event_ioc_query_bpf", 727 + "err %d errno %d query->prog_cnt %u\n", 728 + err, errno, query->prog_cnt)) 729 + goto cleanup3; 730 + for (j = 0; j < i + 1; j++) 731 + if (CHECK(saved_prog_ids[j] != query->ids[j], 732 + "perf_event_ioc_query_bpf", 733 + "#%d saved_prog_id %x query prog_id %x\n", 734 + j, saved_prog_ids[j], query->ids[j])) 735 + goto cleanup3; 736 + } 737 + 738 + i = num_progs - 1; 739 + for (; i >= 0; i--) { 740 + cleanup3: 741 + ioctl(pmu_fd[i], PERF_EVENT_IOC_DISABLE); 742 + cleanup2: 743 + close(pmu_fd[i]); 
744 + cleanup1: 745 + bpf_object__close(obj[i]); 746 + } 747 + free(query); 748 + } 749 + 710 750 int main(void) 711 751 { 712 752 struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY }; ··· 845 625 846 626 test_pkt_access(); 847 627 test_xdp(); 848 - test_l4lb(); 628 + test_l4lb_all(); 629 + test_xdp_noinline(); 849 630 test_tcp_estats(); 850 631 test_bpf_obj_id(); 851 632 test_pkt_md_access(); 852 633 test_obj_name(); 634 + test_tp_attach_query(); 853 635 854 636 printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); 855 637 return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
+26
tools/testing/selftests/bpf/test_tracepoint.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2017 Facebook 3 + 4 + #include <linux/bpf.h> 5 + #include "bpf_helpers.h" 6 + 7 + /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */ 8 + struct sched_switch_args { 9 + unsigned long long pad; 10 + char prev_comm[16]; 11 + int prev_pid; 12 + int prev_prio; 13 + long long prev_state; 14 + char next_comm[16]; 15 + int next_pid; 16 + int next_prio; 17 + }; 18 + 19 + SEC("tracepoint/sched/sched_switch") 20 + int oncpu(struct sched_switch_args *ctx) 21 + { 22 + return 0; 23 + } 24 + 25 + char _license[] SEC("license") = "GPL"; 26 + __u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */
+1621 -3
tools/testing/selftests/bpf/test_verifier.c
···
  * Testsuite for eBPF verifier
  *
  * Copyright (c) 2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2017 Facebook
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
···
 		.insns = {
 			BPF_ALU64_REG(BPF_MOV, BPF_REG_0, BPF_REG_2),
 		},
-		.errstr = "jump out of range",
+		.errstr = "not an exit",
 		.result = REJECT,
 	},
 	{
···
 		"helper access to variable memory: size > 0 not allowed on NULL (ARG_PTR_TO_MEM_OR_NULL)",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_1, 0),
-			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_2, 1),
 			BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128),
 			BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128),
 			BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64),
···
 			BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24),
 			BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
 			BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
-			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_2, 1),
 			BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128),
 			BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128),
 			BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 63),
···
 		.errstr = "R0 has unknown scalar value",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"calls: basic sanity",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.result = ACCEPT,
+	},
+	{
+		"calls: not on unpriviledged",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr_unpriv = "function calls to other bpf functions are allowed for root only",
+		.result_unpriv = REJECT,
+		.result = ACCEPT,
+	},
+	{
+		"calls: overlapping caller/callee",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "last insn is not an exit or jmp",
+		.result = REJECT,
+	},
+	{
+		"calls: wrong recursive calls",
+		.insns = {
+			BPF_JMP_IMM(BPF_JA, 0, 0, 4),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 4),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -2),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -2),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -2),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "jump out of range",
+		.result = REJECT,
+	},
+	{
+		"calls: wrong src reg",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 2, 0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "BPF_CALL uses reserved fields",
+		.result = REJECT,
+	},
+	{
+		"calls: wrong off value",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, -1, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "BPF_CALL uses reserved fields",
+		.result = REJECT,
+	},
+	{
+		"calls: jump back loop",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -1),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "back-edge from insn 0 to 0",
+		.result = REJECT,
+	},
+	{
+		"calls: conditional call",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "jump out of range",
+		.result = REJECT,
+	},
+	{
+		"calls: conditional call 2",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_0, 3),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.result = ACCEPT,
+	},
+	{
+		"calls: conditional call 3",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 4),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_IMM(BPF_JA, 0, 0, -6),
+			BPF_MOV64_IMM(BPF_REG_0, 3),
+			BPF_JMP_IMM(BPF_JA, 0, 0, -6),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "back-edge from insn",
+		.result = REJECT,
+	},
+	{
+		"calls: conditional call 4",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_IMM(BPF_JA, 0, 0, -5),
+			BPF_MOV64_IMM(BPF_REG_0, 3),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.result = ACCEPT,
+	},
+	{
+		"calls: conditional call 5",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_JMP_IMM(BPF_JA, 0, 0, -6),
+			BPF_MOV64_IMM(BPF_REG_0, 3),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "back-edge from insn",
+		.result = REJECT,
+	},
+	{
+		"calls: conditional call 6",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -2),
+			BPF_EXIT_INSN(),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "back-edge from insn",
+		.result = REJECT,
+	},
+	{
+		"calls: using r0 returned by callee",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.result = ACCEPT,
+	},
+	{
+		"calls: using uninit r0 from callee",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "!read_ok",
+		.result = REJECT,
+	},
+	{
+		"calls: callee is using r1",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, len)),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_ACT,
+		.result = ACCEPT,
+	},
+	{
+		"calls: callee using args1",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.errstr_unpriv = "allowed for root only",
+		.result_unpriv = REJECT,
+		.result = ACCEPT,
+	},
+	{
+		"calls: callee using wrong args2",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "R2 !read_ok",
+		.result = REJECT,
+	},
+	{
+		"calls: callee using two args",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
+				    offsetof(struct __sk_buff, len)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6,
+				    offsetof(struct __sk_buff, len)),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr_unpriv = "allowed for root only",
+		.result_unpriv = REJECT,
+		.result = ACCEPT,
+	},
+	{
+		"calls: callee changing pkt pointers",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct xdp_md, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct xdp_md, data_end)),
+			BPF_MOV64_REG(BPF_REG_8, BPF_REG_6),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_8, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_8, BPF_REG_7, 2),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			/* clear_all_pkt_pointers() has to walk all frames
+			 * to make sure that pkt pointers in the caller
+			 * are cleared when callee is calling a helper that
+			 * adjusts packet size
+			 */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+			BPF_MOV32_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_xdp_adjust_head),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "R6 invalid mem access 'inv'",
+		.prog_type = BPF_PROG_TYPE_XDP,
+	},
+	{
+		"calls: two calls with args",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_7),
+			BPF_EXIT_INSN(),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, len)),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"calls: calls with stack arith",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -64),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -64),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -64),
+			BPF_MOV64_IMM(BPF_REG_0, 42),
+			BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"calls: calls with misaligned stack access",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -63),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -61),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -63),
+			BPF_MOV64_IMM(BPF_REG_0, 42),
+			BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.flags = F_LOAD_WITH_STRICT_ALIGNMENT,
+		.errstr = "misaligned stack access",
+		.result = REJECT,
+	},
+	{
+		"calls: calls control flow, jump test",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 42),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 43),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+			BPF_JMP_IMM(BPF_JA, 0, 0, -3),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"calls: calls control flow, jump test 2",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 42),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 43),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -3),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "jump out of range from insn 1 to 4",
+		.result = REJECT,
+	},
+	{
+		"calls: two calls with bad jump",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_7),
+			BPF_EXIT_INSN(),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, len)),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -3),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "jump out of range from insn 11 to 9",
+		.result = REJECT,
+	},
+	{
+		"calls: recursive call. test1",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -1),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "back-edge",
+		.result = REJECT,
+	},
+	{
+		"calls: recursive call. test2",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -3),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "back-edge",
+		.result = REJECT,
+	},
+	{
+		"calls: unreachable code",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "unreachable insn 6",
+		.result = REJECT,
+	},
+	{
+		"calls: invalid call",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -4),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "invalid destination",
+		.result = REJECT,
+	},
+	{
+		"calls: invalid call 2",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 0x7fffffff),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "invalid destination",
+		.result = REJECT,
+	},
+	{
+		"calls: jumping across function bodies. test1",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, -3),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "jump out of range",
+		.result = REJECT,
+	},
+	{
+		"calls: jumping across function bodies. test2",
+		.insns = {
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 3),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "jump out of range",
+		.result = REJECT,
+	},
+	{
+		"calls: call without exit",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, -2),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "not an exit",
+		.result = REJECT,
+	},
+	{
+		"calls: call into middle of ld_imm64",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			BPF_LD_IMM64(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "last insn",
+		.result = REJECT,
+	},
+	{
+		"calls: call into middle of other call",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "last insn",
+		.result = REJECT,
+	},
+	{
+		"calls: ld_abs with changing ctx data in callee",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_LD_ABS(BPF_B, 0),
+			BPF_LD_ABS(BPF_H, 0),
+			BPF_LD_ABS(BPF_W, 0),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_7),
+			BPF_LD_ABS(BPF_B, 0),
+			BPF_LD_ABS(BPF_H, 0),
+			BPF_LD_ABS(BPF_W, 0),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_2, 1),
+			BPF_MOV64_IMM(BPF_REG_3, 2),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_skb_vlan_push),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "BPF_LD_[ABS|IND] instructions cannot be mixed",
+		.result = REJECT,
+	},
+	{
+		"calls: two calls with bad fallthrough",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_7),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_0),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, len)),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+		.errstr = "not an exit",
+		.result = REJECT,
+	},
+	{
+		"calls: two calls with stack read",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_7),
+			BPF_EXIT_INSN(),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_XDP,
+		.result = ACCEPT,
+	},
+	{
+		"calls: two calls with stack write",
+		.insns = {
+			/* main prog */
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -16),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 7),
+			BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_8),
+			/* write into stack frame of main prog */
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 2 */
+			/* read from stack frame of main prog */
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_XDP,
+		.result = ACCEPT,
+	},
+	{
+		"calls: spill into caller stack frame",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_XDP,
+		.errstr = "cannot spill",
+		.result = REJECT,
+	},
+	{
+		"calls: write into caller stack frame",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+			BPF_EXIT_INSN(),
+			BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 42),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_XDP,
+		.result = ACCEPT,
+	},
+	{
+		"calls: write into callee stack frame",
+		.insns = {
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 42),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, -8),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_XDP,
+		.errstr = "cannot return stack pointer",
+		.result = REJECT,
+	},
+	{
+		"calls: two calls with stack write and void return",
+		.insns = {
+			/* main prog */
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -16),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+
+			/* subprog 2 */
+			/* write into stack frame of main prog */
+			BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 0),
+			BPF_EXIT_INSN(), /* void return */
+		},
+		.prog_type = BPF_PROG_TYPE_XDP,
+		.result = ACCEPT,
+	},
+	{
+		"calls: ambiguous return value",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_EXIT_INSN(),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.errstr_unpriv = "allowed for root only",
+		.result_unpriv = REJECT,
+		.errstr = "R0 !read_ok",
+		.result = REJECT,
+	},
+	{
+		"calls: two calls that return map_value",
+		.insns = {
+			/* main prog */
+			/* pass fp-16, fp-8 into a function */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8),
+
+			/* fetch map_value_ptr from the stack of this function */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+			/* write into map value */
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+			/* fetch secound map_value_ptr from the stack */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -16),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+			/* write into map value */
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			/* call 3rd function twice */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+			/* first time with fp-8 */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			/* second time with fp-16 */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+			BPF_EXIT_INSN(),
+
+			/* subprog 2 */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			/* lookup from map */
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			/* write map_value_ptr into stack frame of main prog */
+			BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(), /* return 0 */
+		},
+		.prog_type = BPF_PROG_TYPE_XDP,
+		.fixup_map1 = { 23 },
+		.result = ACCEPT,
+	},
+	{
+		"calls: two calls that return map_value with bool condition",
+		.insns = {
+			/* main prog */
+			/* pass fp-16, fp-8 into a function */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			/* call 3rd function twice */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+			/* first time with fp-8 */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 9),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2),
+			/* fetch map_value_ptr from the stack of this function */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+			/* write into map value */
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			/* second time with fp-16 */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2),
+			/* fetch secound map_value_ptr from the stack */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+			/* write into map value */
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 2 */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			/* lookup from map */
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(), /* return 0 */
+			/* write map_value_ptr into stack frame of main prog */
+			BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(), /* return 1 */
+		},
+		.prog_type = BPF_PROG_TYPE_XDP,
+		.fixup_map1 = { 23 },
+		.result = ACCEPT,
+	},
+	{
+		"calls: two calls that return map_value with incorrect bool check",
+		.insns = {
+			/* main prog */
+			/* pass fp-16, fp-8 into a function */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			/* call 3rd function twice */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+			/* first time with fp-8 */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 9),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2),
+			/* fetch map_value_ptr from the stack of this function */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+			/* write into map value */
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			/* second time with fp-16 */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+			/* fetch secound map_value_ptr from the stack */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+			/* write into map value */
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 2 */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			/* lookup from map */
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(), /* return 0 */
+			/* write map_value_ptr into stack frame of main prog */
+			BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(), /* return 1 */
+		},
+		.prog_type = BPF_PROG_TYPE_XDP,
+		.fixup_map1 = { 23 },
+		.result = REJECT,
+		.errstr = "invalid read from stack off -16+0 size 8",
+	},
+	{
+		"calls: two calls that receive map_value via arg=ptr_stack_of_caller. test1",
+		.insns = {
+			/* main prog */
+			/* pass fp-16, fp-8 into a function */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+			/* 1st lookup from map */
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_8, 0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			/* write map_value_ptr into stack frame of main prog at fp-8 */
+			BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_8, 1),
+
+			/* 2nd lookup from map */
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* 20 */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, /* 24 */
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_9, 0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			/* write map_value_ptr into stack frame of main prog at fp-16 */
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_9, 1),
+
+			/* call 3rd func with fp-8, 0|1, fp-16, 0|1 */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* 30 */
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_8),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_7),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_9),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), /* 34 */
+			BPF_EXIT_INSN(),
+
+			/* subprog 2 */
+			/* if arg2 == 1 do *arg1 = 0 */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, 2),
+			/* fetch map_value_ptr from the stack of this function */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0),
+			/* write into map value */
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+
+			/* if arg4 == 1 do *arg3 = 0 */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_4, 1, 2),
+			/* fetch map_value_ptr from the stack of this function */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0),
+			/* write into map value */
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 2, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.fixup_map1 = { 12, 22 },
+		.result = REJECT,
+		.errstr = "invalid access to map value, value_size=8 off=2 size=8",
+	},
+	{
+		"calls: two calls that receive map_value via arg=ptr_stack_of_caller. test2",
+		.insns = {
+			/* main prog */
+			/* pass fp-16, fp-8 into a function */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+			/* 1st lookup from map */
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_8, 0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			/* write map_value_ptr into stack frame of main prog at fp-8 */
+			BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_8, 1),
+
+			/* 2nd lookup from map */
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* 20 */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, /* 24 */
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_9, 0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			/* write map_value_ptr into stack frame of main prog at fp-16 */
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_9, 1),
+
+			/* call 3rd func with fp-8, 0|1, fp-16, 0|1 */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* 30 */
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_8),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_7),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_9),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), /* 34 */
+			BPF_EXIT_INSN(),
+
+			/* subprog 2 */
+			/* if arg2 == 1 do *arg1 = 0 */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, 2),
+			/* fetch map_value_ptr from the stack of this function */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0),
+			/* write into map value */
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+
+			/* if arg4 == 1 do *arg3 = 0 */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_4, 1, 2),
+			/* fetch map_value_ptr from the stack of this function */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0),
+			/* write into map value */
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.fixup_map1 = { 12, 22 },
+		.result = ACCEPT,
+	},
+	{
+		"calls: two jumps that receive map_value via arg=ptr_stack_of_jumper. test3",
+		.insns = {
+			/* main prog */
+			/* pass fp-16, fp-8 into a function */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+			/* 1st lookup from map */
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -24, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -24),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_8, 0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			/* write map_value_ptr into stack frame of main prog at fp-8 */
+			BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_8, 1),
+
+			/* 2nd lookup from map */
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -24),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_9, 0), // 26
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			/* write map_value_ptr into stack frame of main prog at fp-16 */
+			BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_9, 1),
+
+			/* call 3rd func with fp-8, 0|1, fp-16, 0|1 */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), // 30
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_8),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_7),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_9),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1), // 34
+			BPF_JMP_IMM(BPF_JA, 0, 0, -30),
+
+			/* subprog 2 */
+			/* if arg2 == 1 do
*arg1 = 0 */ 9185 + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, 2), 9186 + /* fetch map_value_ptr from the stack of this function */ 9187 + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0), 9188 + /* write into map value */ 9189 + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), 9190 + 9191 + /* if arg4 == 1 do *arg3 = 0 */ 9192 + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, 1, 2), 9193 + /* fetch map_value_ptr from the stack of this function */ 9194 + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), 9195 + /* write into map value */ 9196 + BPF_ST_MEM(BPF_DW, BPF_REG_0, 2, 0), 9197 + BPF_JMP_IMM(BPF_JA, 0, 0, -8), 9198 + }, 9199 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 9200 + .fixup_map1 = { 12, 22 }, 9201 + .result = REJECT, 9202 + .errstr = "invalid access to map value, value_size=8 off=2 size=8", 9203 + }, 9204 + { 9205 + "calls: two calls that receive map_value_ptr_or_null via arg. test1", 9206 + .insns = { 9207 + /* main prog */ 9208 + /* pass fp-16, fp-8 into a function */ 9209 + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), 9210 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), 9211 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 9212 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16), 9213 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), 9214 + BPF_MOV64_IMM(BPF_REG_0, 0), 9215 + BPF_EXIT_INSN(), 9216 + 9217 + /* subprog 1 */ 9218 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 9219 + BPF_MOV64_REG(BPF_REG_7, BPF_REG_2), 9220 + /* 1st lookup from map */ 9221 + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), 9222 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 9223 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 9224 + BPF_LD_MAP_FD(BPF_REG_1, 0), 9225 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 9226 + BPF_FUNC_map_lookup_elem), 9227 + /* write map_value_ptr_or_null into stack frame of main prog at fp-8 */ 9228 + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0), 9229 + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), 9230 + BPF_MOV64_IMM(BPF_REG_8, 0), 9231 + BPF_JMP_IMM(BPF_JA, 0, 0, 1), 9232 + BPF_MOV64_IMM(BPF_REG_8, 1), 9233 + 9234 + /* 2nd lookup from map */ 9235 + 
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 9236 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 9237 + BPF_LD_MAP_FD(BPF_REG_1, 0), 9238 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 9239 + BPF_FUNC_map_lookup_elem), 9240 + /* write map_value_ptr_or_null into stack frame of main prog at fp-16 */ 9241 + BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0), 9242 + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), 9243 + BPF_MOV64_IMM(BPF_REG_9, 0), 9244 + BPF_JMP_IMM(BPF_JA, 0, 0, 1), 9245 + BPF_MOV64_IMM(BPF_REG_9, 1), 9246 + 9247 + /* call 3rd func with fp-8, 0|1, fp-16, 0|1 */ 9248 + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), 9249 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_8), 9250 + BPF_MOV64_REG(BPF_REG_3, BPF_REG_7), 9251 + BPF_MOV64_REG(BPF_REG_4, BPF_REG_9), 9252 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), 9253 + BPF_EXIT_INSN(), 9254 + 9255 + /* subprog 2 */ 9256 + /* if arg2 == 1 do *arg1 = 0 */ 9257 + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, 2), 9258 + /* fetch map_value_ptr from the stack of this function */ 9259 + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0), 9260 + /* write into map value */ 9261 + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), 9262 + 9263 + /* if arg4 == 1 do *arg3 = 0 */ 9264 + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, 1, 2), 9265 + /* fetch map_value_ptr from the stack of this function */ 9266 + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), 9267 + /* write into map value */ 9268 + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), 9269 + BPF_EXIT_INSN(), 9270 + }, 9271 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 9272 + .fixup_map1 = { 12, 22 }, 9273 + .result = ACCEPT, 9274 + }, 9275 + { 9276 + "calls: two calls that receive map_value_ptr_or_null via arg. 
test2", 9277 + .insns = { 9278 + /* main prog */ 9279 + /* pass fp-16, fp-8 into a function */ 9280 + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), 9281 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), 9282 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 9283 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16), 9284 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), 9285 + BPF_MOV64_IMM(BPF_REG_0, 0), 9286 + BPF_EXIT_INSN(), 9287 + 9288 + /* subprog 1 */ 9289 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), 9290 + BPF_MOV64_REG(BPF_REG_7, BPF_REG_2), 9291 + /* 1st lookup from map */ 9292 + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), 9293 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 9294 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 9295 + BPF_LD_MAP_FD(BPF_REG_1, 0), 9296 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 9297 + BPF_FUNC_map_lookup_elem), 9298 + /* write map_value_ptr_or_null into stack frame of main prog at fp-8 */ 9299 + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0), 9300 + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), 9301 + BPF_MOV64_IMM(BPF_REG_8, 0), 9302 + BPF_JMP_IMM(BPF_JA, 0, 0, 1), 9303 + BPF_MOV64_IMM(BPF_REG_8, 1), 9304 + 9305 + /* 2nd lookup from map */ 9306 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 9307 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 9308 + BPF_LD_MAP_FD(BPF_REG_1, 0), 9309 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 9310 + BPF_FUNC_map_lookup_elem), 9311 + /* write map_value_ptr_or_null into stack frame of main prog at fp-16 */ 9312 + BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0), 9313 + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), 9314 + BPF_MOV64_IMM(BPF_REG_9, 0), 9315 + BPF_JMP_IMM(BPF_JA, 0, 0, 1), 9316 + BPF_MOV64_IMM(BPF_REG_9, 1), 9317 + 9318 + /* call 3rd func with fp-8, 0|1, fp-16, 0|1 */ 9319 + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), 9320 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_8), 9321 + BPF_MOV64_REG(BPF_REG_3, BPF_REG_7), 9322 + BPF_MOV64_REG(BPF_REG_4, BPF_REG_9), 9323 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), 9324 + BPF_EXIT_INSN(), 9325 + 9326 + /* subprog 2 */ 9327 + /* if arg2 == 1 do *arg1 
= 0 */ 9328 + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, 2), 9329 + /* fetch map_value_ptr from the stack of this function */ 9330 + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0), 9331 + /* write into map value */ 9332 + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), 9333 + 9334 + /* if arg4 == 0 do *arg3 = 0 */ 9335 + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, 0, 2), 9336 + /* fetch map_value_ptr from the stack of this function */ 9337 + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), 9338 + /* write into map value */ 9339 + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), 9340 + BPF_EXIT_INSN(), 9341 + }, 9342 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 9343 + .fixup_map1 = { 12, 22 }, 9344 + .result = REJECT, 9345 + .errstr = "R0 invalid mem access 'inv'", 9346 + }, 9347 + { 9348 + "calls: pkt_ptr spill into caller stack", 9349 + .insns = { 9350 + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), 9351 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), 9352 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), 9353 + BPF_EXIT_INSN(), 9354 + 9355 + /* subprog 1 */ 9356 + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 9357 + offsetof(struct __sk_buff, data)), 9358 + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 9359 + offsetof(struct __sk_buff, data_end)), 9360 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), 9361 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), 9362 + /* spill unchecked pkt_ptr into stack of caller */ 9363 + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), 9364 + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 2), 9365 + /* now the pkt range is verified, read pkt_ptr from stack */ 9366 + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_4, 0), 9367 + /* write 4 bytes into packet */ 9368 + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), 9369 + BPF_EXIT_INSN(), 9370 + }, 9371 + .result = ACCEPT, 9372 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 9373 + }, 9374 + { 9375 + "calls: pkt_ptr spill into caller stack 2", 9376 + .insns = { 9377 + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), 9378 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), 9379 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), 9380 + /* 
Marking is still kept, but not in all cases safe. */ 9381 + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), 9382 + BPF_ST_MEM(BPF_W, BPF_REG_4, 0, 0), 9383 + BPF_EXIT_INSN(), 9384 + 9385 + /* subprog 1 */ 9386 + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 9387 + offsetof(struct __sk_buff, data)), 9388 + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 9389 + offsetof(struct __sk_buff, data_end)), 9390 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), 9391 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), 9392 + /* spill unchecked pkt_ptr into stack of caller */ 9393 + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), 9394 + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 2), 9395 + /* now the pkt range is verified, read pkt_ptr from stack */ 9396 + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_4, 0), 9397 + /* write 4 bytes into packet */ 9398 + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), 9399 + BPF_EXIT_INSN(), 9400 + }, 9401 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 9402 + .errstr = "invalid access to packet", 9403 + .result = REJECT, 9404 + }, 9405 + { 9406 + "calls: pkt_ptr spill into caller stack 3", 9407 + .insns = { 9408 + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), 9409 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), 9410 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), 9411 + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), 9412 + /* Marking is still kept and safe here. 
*/ 9413 + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), 9414 + BPF_ST_MEM(BPF_W, BPF_REG_4, 0, 0), 9415 + BPF_EXIT_INSN(), 9416 + 9417 + /* subprog 1 */ 9418 + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 9419 + offsetof(struct __sk_buff, data)), 9420 + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 9421 + offsetof(struct __sk_buff, data_end)), 9422 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), 9423 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), 9424 + /* spill unchecked pkt_ptr into stack of caller */ 9425 + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), 9426 + BPF_MOV64_IMM(BPF_REG_5, 0), 9427 + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 3), 9428 + BPF_MOV64_IMM(BPF_REG_5, 1), 9429 + /* now the pkt range is verified, read pkt_ptr from stack */ 9430 + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_4, 0), 9431 + /* write 4 bytes into packet */ 9432 + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), 9433 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), 9434 + BPF_EXIT_INSN(), 9435 + }, 9436 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 9437 + .result = ACCEPT, 9438 + }, 9439 + { 9440 + "calls: pkt_ptr spill into caller stack 4", 9441 + .insns = { 9442 + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), 9443 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), 9444 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), 9445 + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), 9446 + /* Check marking propagated. 
*/ 9447 + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), 9448 + BPF_ST_MEM(BPF_W, BPF_REG_4, 0, 0), 9449 + BPF_EXIT_INSN(), 9450 + 9451 + /* subprog 1 */ 9452 + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 9453 + offsetof(struct __sk_buff, data)), 9454 + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 9455 + offsetof(struct __sk_buff, data_end)), 9456 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), 9457 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), 9458 + /* spill unchecked pkt_ptr into stack of caller */ 9459 + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), 9460 + BPF_MOV64_IMM(BPF_REG_5, 0), 9461 + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 2), 9462 + BPF_MOV64_IMM(BPF_REG_5, 1), 9463 + /* don't read back pkt_ptr from stack here */ 9464 + /* write 4 bytes into packet */ 9465 + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), 9466 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), 9467 + BPF_EXIT_INSN(), 9468 + }, 9469 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 9470 + .result = ACCEPT, 9471 + }, 9472 + { 9473 + "calls: pkt_ptr spill into caller stack 5", 9474 + .insns = { 9475 + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), 9476 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), 9477 + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_1, 0), 9478 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), 9479 + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), 9480 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_4, 0), 9481 + BPF_EXIT_INSN(), 9482 + 9483 + /* subprog 1 */ 9484 + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 9485 + offsetof(struct __sk_buff, data)), 9486 + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 9487 + offsetof(struct __sk_buff, data_end)), 9488 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), 9489 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), 9490 + BPF_MOV64_IMM(BPF_REG_5, 0), 9491 + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 3), 9492 + /* spill checked pkt_ptr into stack of caller */ 9493 + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), 9494 + BPF_MOV64_IMM(BPF_REG_5, 1), 9495 + /* don't read back pkt_ptr from stack here */ 9496 + /* write 4 bytes into packet */ 9497 
+ BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), 9498 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), 9499 + BPF_EXIT_INSN(), 9500 + }, 9501 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 9502 + .errstr = "same insn cannot be used with different", 9503 + .result = REJECT, 9504 + }, 9505 + { 9506 + "calls: pkt_ptr spill into caller stack 6", 9507 + .insns = { 9508 + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 9509 + offsetof(struct __sk_buff, data_end)), 9510 + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), 9511 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), 9512 + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), 9513 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), 9514 + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), 9515 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_4, 0), 9516 + BPF_EXIT_INSN(), 9517 + 9518 + /* subprog 1 */ 9519 + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 9520 + offsetof(struct __sk_buff, data)), 9521 + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 9522 + offsetof(struct __sk_buff, data_end)), 9523 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), 9524 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), 9525 + BPF_MOV64_IMM(BPF_REG_5, 0), 9526 + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 3), 9527 + /* spill checked pkt_ptr into stack of caller */ 9528 + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), 9529 + BPF_MOV64_IMM(BPF_REG_5, 1), 9530 + /* don't read back pkt_ptr from stack here */ 9531 + /* write 4 bytes into packet */ 9532 + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), 9533 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), 9534 + BPF_EXIT_INSN(), 9535 + }, 9536 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 9537 + .errstr = "R4 invalid mem access", 9538 + .result = REJECT, 9539 + }, 9540 + { 9541 + "calls: pkt_ptr spill into caller stack 7", 9542 + .insns = { 9543 + BPF_MOV64_IMM(BPF_REG_2, 0), 9544 + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), 9545 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), 9546 + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), 9547 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), 9548 + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), 9549 
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_4, 0), 9550 + BPF_EXIT_INSN(), 9551 + 9552 + /* subprog 1 */ 9553 + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 9554 + offsetof(struct __sk_buff, data)), 9555 + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 9556 + offsetof(struct __sk_buff, data_end)), 9557 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), 9558 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), 9559 + BPF_MOV64_IMM(BPF_REG_5, 0), 9560 + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 3), 9561 + /* spill checked pkt_ptr into stack of caller */ 9562 + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), 9563 + BPF_MOV64_IMM(BPF_REG_5, 1), 9564 + /* don't read back pkt_ptr from stack here */ 9565 + /* write 4 bytes into packet */ 9566 + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), 9567 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), 9568 + BPF_EXIT_INSN(), 9569 + }, 9570 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 9571 + .errstr = "R4 invalid mem access", 9572 + .result = REJECT, 9573 + }, 9574 + { 9575 + "calls: pkt_ptr spill into caller stack 8", 9576 + .insns = { 9577 + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 9578 + offsetof(struct __sk_buff, data)), 9579 + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 9580 + offsetof(struct __sk_buff, data_end)), 9581 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), 9582 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), 9583 + BPF_JMP_REG(BPF_JLE, BPF_REG_0, BPF_REG_3, 1), 9584 + BPF_EXIT_INSN(), 9585 + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), 9586 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), 9587 + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), 9588 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), 9589 + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), 9590 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_4, 0), 9591 + BPF_EXIT_INSN(), 9592 + 9593 + /* subprog 1 */ 9594 + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 9595 + offsetof(struct __sk_buff, data)), 9596 + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 9597 + offsetof(struct __sk_buff, data_end)), 9598 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), 9599 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 
8), 9600 + BPF_MOV64_IMM(BPF_REG_5, 0), 9601 + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 3), 9602 + /* spill checked pkt_ptr into stack of caller */ 9603 + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), 9604 + BPF_MOV64_IMM(BPF_REG_5, 1), 9605 + /* don't read back pkt_ptr from stack here */ 9606 + /* write 4 bytes into packet */ 9607 + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), 9608 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), 9609 + BPF_EXIT_INSN(), 9610 + }, 9611 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 9612 + .result = ACCEPT, 9613 + }, 9614 + { 9615 + "calls: pkt_ptr spill into caller stack 9", 9616 + .insns = { 9617 + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 9618 + offsetof(struct __sk_buff, data)), 9619 + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 9620 + offsetof(struct __sk_buff, data_end)), 9621 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), 9622 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), 9623 + BPF_JMP_REG(BPF_JLE, BPF_REG_0, BPF_REG_3, 1), 9624 + BPF_EXIT_INSN(), 9625 + BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), 9626 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), 9627 + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), 9628 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), 9629 + BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), 9630 + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_4, 0), 9631 + BPF_EXIT_INSN(), 9632 + 9633 + /* subprog 1 */ 9634 + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 9635 + offsetof(struct __sk_buff, data)), 9636 + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 9637 + offsetof(struct __sk_buff, data_end)), 9638 + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), 9639 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), 9640 + BPF_MOV64_IMM(BPF_REG_5, 0), 9641 + /* spill unchecked pkt_ptr into stack of caller */ 9642 + BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), 9643 + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 2), 9644 + BPF_MOV64_IMM(BPF_REG_5, 1), 9645 + /* don't read back pkt_ptr from stack here */ 9646 + /* write 4 bytes into packet */ 9647 + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), 9648 + BPF_MOV64_REG(BPF_REG_0, 
BPF_REG_5), 9649 + BPF_EXIT_INSN(), 9650 + }, 9651 + .prog_type = BPF_PROG_TYPE_SCHED_CLS, 9652 + .errstr = "invalid access to packet", 9653 + .result = REJECT, 9654 + }, 9655 + { 9656 + "calls: caller stack init to zero or map_value_or_null", 9657 + .insns = { 9658 + BPF_MOV64_IMM(BPF_REG_0, 0), 9659 + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), 9660 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 9661 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 9662 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), 9663 + /* fetch map_value_or_null or const_zero from stack */ 9664 + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), 9665 + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), 9666 + /* store into map_value */ 9667 + BPF_ST_MEM(BPF_W, BPF_REG_0, 0, 0), 9668 + BPF_EXIT_INSN(), 9669 + 9670 + /* subprog 1 */ 9671 + /* if (ctx == 0) return; */ 9672 + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 8), 9673 + /* else bpf_map_lookup() and *(fp - 8) = r0 */ 9674 + BPF_MOV64_REG(BPF_REG_6, BPF_REG_2), 9675 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 9676 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 9677 + BPF_LD_MAP_FD(BPF_REG_1, 0), 9678 + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), 9679 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 9680 + BPF_FUNC_map_lookup_elem), 9681 + /* write map_value_ptr_or_null into stack frame of main prog at fp-8 */ 9682 + BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0), 9683 + BPF_EXIT_INSN(), 9684 + }, 9685 + .fixup_map1 = { 13 }, 9686 + .result = ACCEPT, 9687 + .prog_type = BPF_PROG_TYPE_XDP, 9688 + }, 9689 + { 9690 + "calls: stack init to zero and pruning", 9691 + .insns = { 9692 + /* first make allocated_stack 16 byte */ 9693 + BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0), 9694 + /* now fork the execution such that the false branch 9695 + * of JGT insn will be verified second and it skisp zero 9696 + * init of fp-8 stack slot. 
If stack liveness marking 9697 + * is missing live_read marks from call map_lookup 9698 + * processing then pruning will incorrectly assume 9699 + * that fp-8 stack slot was unused in the fall-through 9700 + * branch and will accept the program incorrectly 9701 + */ 9702 + BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 2, 2), 9703 + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), 9704 + BPF_JMP_IMM(BPF_JA, 0, 0, 0), 9705 + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), 9706 + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), 9707 + BPF_LD_MAP_FD(BPF_REG_1, 0), 9708 + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 9709 + BPF_FUNC_map_lookup_elem), 9710 + BPF_EXIT_INSN(), 9711 + }, 9712 + .fixup_map2 = { 6 }, 9713 + .errstr = "invalid indirect read from stack off -8+0 size 8", 9714 + .result = REJECT, 9715 + .prog_type = BPF_PROG_TYPE_XDP, 8100 9716 }, 8101 9717 }; 8102 9718
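The "calls:" tests above all exercise one pattern: a subprogram receives pointers into its caller's stack frame, conditionally writes a map value pointer through them, and a third function dereferences a pointer only when the matching flag says the lookup that filled it succeeded. For orientation only, the shape that the accepted test2 encodes in BPF assembly looks roughly like this plain-C sketch (function and variable names are illustrative, not part of the patch, and a plain global stands in for `bpf_map_lookup_elem()`):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Stand-in for a map value slot; the real tests get this pointer
 * from bpf_map_lookup_elem(). */
static uint64_t map_slot;

/* "subprog 2": dereference each caller-stack slot only when the
 * matching flag says the lookup that filled it succeeded. */
static void use_values(uint64_t **p1, int ok1, uint64_t **p2, int ok2)
{
	if (ok1 == 1)
		**p1 = 0;	/* write into map value */
	if (ok2 == 1)
		**p2 = 0;
}

/* "subprog 1": store lookup results through pointers into the
 * CALLER's stack frame (fp-8 and fp-16 in the BPF version). */
static void fill_values(uint64_t **fp8, uint64_t **fp16)
{
	uint64_t *v1 = &map_slot;	/* pretend lookup #1 succeeded */
	uint64_t *v2 = NULL;		/* pretend lookup #2 missed */
	int ok1 = 0, ok2 = 0;

	if (v1) {
		*fp8 = v1;
		ok1 = 1;
	}
	if (v2) {
		*fp16 = v2;
		ok2 = 1;
	}
	use_values(fp8, ok1, fp16, ok2);
}
```

The verifier's job in these tests is to track exactly this: that a caller's stack slot may now hold a map value pointer written by a callee, and that it is only dereferenced on the path where the guard proves it valid.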
+833  tools/testing/selftests/bpf/test_xdp_noinline.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // Copyright (c) 2017 Facebook 3 + #include <stddef.h> 4 + #include <stdbool.h> 5 + #include <string.h> 6 + #include <linux/pkt_cls.h> 7 + #include <linux/bpf.h> 8 + #include <linux/in.h> 9 + #include <linux/if_ether.h> 10 + #include <linux/ip.h> 11 + #include <linux/ipv6.h> 12 + #include <linux/icmp.h> 13 + #include <linux/icmpv6.h> 14 + #include <linux/tcp.h> 15 + #include <linux/udp.h> 16 + #include "bpf_helpers.h" 17 + 18 + #define bpf_printk(fmt, ...) \ 19 + ({ \ 20 + char ____fmt[] = fmt; \ 21 + bpf_trace_printk(____fmt, sizeof(____fmt), \ 22 + ##__VA_ARGS__); \ 23 + }) 24 + 25 + static __u32 rol32(__u32 word, unsigned int shift) 26 + { 27 + return (word << shift) | (word >> ((-shift) & 31)); 28 + } 29 + 30 + /* copy paste of jhash from kernel sources to make sure llvm 31 + * can compile it into valid sequence of bpf instructions 32 + */ 33 + #define __jhash_mix(a, b, c) \ 34 + { \ 35 + a -= c; a ^= rol32(c, 4); c += b; \ 36 + b -= a; b ^= rol32(a, 6); a += c; \ 37 + c -= b; c ^= rol32(b, 8); b += a; \ 38 + a -= c; a ^= rol32(c, 16); c += b; \ 39 + b -= a; b ^= rol32(a, 19); a += c; \ 40 + c -= b; c ^= rol32(b, 4); b += a; \ 41 + } 42 + 43 + #define __jhash_final(a, b, c) \ 44 + { \ 45 + c ^= b; c -= rol32(b, 14); \ 46 + a ^= c; a -= rol32(c, 11); \ 47 + b ^= a; b -= rol32(a, 25); \ 48 + c ^= b; c -= rol32(b, 16); \ 49 + a ^= c; a -= rol32(c, 4); \ 50 + b ^= a; b -= rol32(a, 14); \ 51 + c ^= b; c -= rol32(b, 24); \ 52 + } 53 + 54 + #define JHASH_INITVAL 0xdeadbeef 55 + 56 + typedef unsigned int u32; 57 + 58 + static __attribute__ ((noinline)) 59 + u32 jhash(const void *key, u32 length, u32 initval) 60 + { 61 + u32 a, b, c; 62 + const unsigned char *k = key; 63 + 64 + a = b = c = JHASH_INITVAL + length + initval; 65 + 66 + while (length > 12) { 67 + a += *(u32 *)(k); 68 + b += *(u32 *)(k + 4); 69 + c += *(u32 *)(k + 8); 70 + __jhash_mix(a, b, c); 71 + length -= 12; 72 + k += 12; 73 + } 74 + switch (length) { 75 + 
case 12: c += (u32)k[11]<<24; 76 + case 11: c += (u32)k[10]<<16; 77 + case 10: c += (u32)k[9]<<8; 78 + case 9: c += k[8]; 79 + case 8: b += (u32)k[7]<<24; 80 + case 7: b += (u32)k[6]<<16; 81 + case 6: b += (u32)k[5]<<8; 82 + case 5: b += k[4]; 83 + case 4: a += (u32)k[3]<<24; 84 + case 3: a += (u32)k[2]<<16; 85 + case 2: a += (u32)k[1]<<8; 86 + case 1: a += k[0]; 87 + __jhash_final(a, b, c); 88 + case 0: /* Nothing left to add */ 89 + break; 90 + } 91 + 92 + return c; 93 + } 94 + 95 + static __attribute__ ((noinline)) 96 + u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) 97 + { 98 + a += initval; 99 + b += initval; 100 + c += initval; 101 + __jhash_final(a, b, c); 102 + return c; 103 + } 104 + 105 + static __attribute__ ((noinline)) 106 + u32 jhash_2words(u32 a, u32 b, u32 initval) 107 + { 108 + return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); 109 + } 110 + 111 + struct flow_key { 112 + union { 113 + __be32 src; 114 + __be32 srcv6[4]; 115 + }; 116 + union { 117 + __be32 dst; 118 + __be32 dstv6[4]; 119 + }; 120 + union { 121 + __u32 ports; 122 + __u16 port16[2]; 123 + }; 124 + __u8 proto; 125 + }; 126 + 127 + struct packet_description { 128 + struct flow_key flow; 129 + __u8 flags; 130 + }; 131 + 132 + struct ctl_value { 133 + union { 134 + __u64 value; 135 + __u32 ifindex; 136 + __u8 mac[6]; 137 + }; 138 + }; 139 + 140 + struct vip_definition { 141 + union { 142 + __be32 vip; 143 + __be32 vipv6[4]; 144 + }; 145 + __u16 port; 146 + __u16 family; 147 + __u8 proto; 148 + }; 149 + 150 + struct vip_meta { 151 + __u32 flags; 152 + __u32 vip_num; 153 + }; 154 + 155 + struct real_pos_lru { 156 + __u32 pos; 157 + __u64 atime; 158 + }; 159 + 160 + struct real_definition { 161 + union { 162 + __be32 dst; 163 + __be32 dstv6[4]; 164 + }; 165 + __u8 flags; 166 + }; 167 + 168 + struct lb_stats { 169 + __u64 v2; 170 + __u64 v1; 171 + }; 172 + 173 + struct bpf_map_def __attribute__ ((section("maps"), used)) vip_map = { 174 + .type = BPF_MAP_TYPE_HASH, 175 + 
.key_size = sizeof(struct vip_definition), 176 + .value_size = sizeof(struct vip_meta), 177 + .max_entries = 512, 178 + .map_flags = 0, 179 + }; 180 + 181 + struct bpf_map_def __attribute__ ((section("maps"), used)) lru_cache = { 182 + .type = BPF_MAP_TYPE_LRU_HASH, 183 + .key_size = sizeof(struct flow_key), 184 + .value_size = sizeof(struct real_pos_lru), 185 + .max_entries = 300, 186 + .map_flags = 1U << 1, 187 + }; 188 + 189 + struct bpf_map_def __attribute__ ((section("maps"), used)) ch_rings = { 190 + .type = BPF_MAP_TYPE_ARRAY, 191 + .key_size = sizeof(__u32), 192 + .value_size = sizeof(__u32), 193 + .max_entries = 12 * 655, 194 + .map_flags = 0, 195 + }; 196 + 197 + struct bpf_map_def __attribute__ ((section("maps"), used)) reals = { 198 + .type = BPF_MAP_TYPE_ARRAY, 199 + .key_size = sizeof(__u32), 200 + .value_size = sizeof(struct real_definition), 201 + .max_entries = 40, 202 + .map_flags = 0, 203 + }; 204 + 205 + struct bpf_map_def __attribute__ ((section("maps"), used)) stats = { 206 + .type = BPF_MAP_TYPE_PERCPU_ARRAY, 207 + .key_size = sizeof(__u32), 208 + .value_size = sizeof(struct lb_stats), 209 + .max_entries = 515, 210 + .map_flags = 0, 211 + }; 212 + 213 + struct bpf_map_def __attribute__ ((section("maps"), used)) ctl_array = { 214 + .type = BPF_MAP_TYPE_ARRAY, 215 + .key_size = sizeof(__u32), 216 + .value_size = sizeof(struct ctl_value), 217 + .max_entries = 16, 218 + .map_flags = 0, 219 + }; 220 + 221 + struct eth_hdr { 222 + unsigned char eth_dest[6]; 223 + unsigned char eth_source[6]; 224 + unsigned short eth_proto; 225 + }; 226 + 227 + static inline __u64 calc_offset(bool is_ipv6, bool is_icmp) 228 + { 229 + __u64 off = sizeof(struct eth_hdr); 230 + if (is_ipv6) { 231 + off += sizeof(struct ipv6hdr); 232 + if (is_icmp) 233 + off += sizeof(struct icmp6hdr) + sizeof(struct ipv6hdr); 234 + } else { 235 + off += sizeof(struct iphdr); 236 + if (is_icmp) 237 + off += sizeof(struct icmphdr) + sizeof(struct iphdr); 238 + } 239 + return off; 240 + } 
241 + 242 + static __attribute__ ((noinline)) 243 + bool parse_udp(void *data, void *data_end, 244 + bool is_ipv6, struct packet_description *pckt) 245 + { 246 + 247 + bool is_icmp = !((pckt->flags & (1 << 0)) == 0); 248 + __u64 off = calc_offset(is_ipv6, is_icmp); 249 + struct udphdr *udp; 250 + udp = data + off; 251 + 252 + if (udp + 1 > data_end) 253 + return 0; 254 + if (!is_icmp) { 255 + pckt->flow.port16[0] = udp->source; 256 + pckt->flow.port16[1] = udp->dest; 257 + } else { 258 + pckt->flow.port16[0] = udp->dest; 259 + pckt->flow.port16[1] = udp->source; 260 + } 261 + return 1; 262 + } 263 + 264 + static __attribute__ ((noinline)) 265 + bool parse_tcp(void *data, void *data_end, 266 + bool is_ipv6, struct packet_description *pckt) 267 + { 268 + 269 + bool is_icmp = !((pckt->flags & (1 << 0)) == 0); 270 + __u64 off = calc_offset(is_ipv6, is_icmp); 271 + struct tcphdr *tcp; 272 + 273 + tcp = data + off; 274 + if (tcp + 1 > data_end) 275 + return 0; 276 + if (tcp->syn) 277 + pckt->flags |= (1 << 1); 278 + if (!is_icmp) { 279 + pckt->flow.port16[0] = tcp->source; 280 + pckt->flow.port16[1] = tcp->dest; 281 + } else { 282 + pckt->flow.port16[0] = tcp->dest; 283 + pckt->flow.port16[1] = tcp->source; 284 + } 285 + return 1; 286 + } 287 + 288 + static __attribute__ ((noinline)) 289 + bool encap_v6(struct xdp_md *xdp, struct ctl_value *cval, 290 + struct packet_description *pckt, 291 + struct real_definition *dst, __u32 pkt_bytes) 292 + { 293 + struct eth_hdr *new_eth; 294 + struct eth_hdr *old_eth; 295 + struct ipv6hdr *ip6h; 296 + __u32 ip_suffix; 297 + void *data_end; 298 + void *data; 299 + 300 + if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr))) 301 + return 0; 302 + data = (void *)(long)xdp->data; 303 + data_end = (void *)(long)xdp->data_end; 304 + new_eth = data; 305 + ip6h = data + sizeof(struct eth_hdr); 306 + old_eth = data + sizeof(struct ipv6hdr); 307 + if (new_eth + 1 > data_end || 308 + old_eth + 1 > data_end || ip6h + 1 > data_end) 309 + 
+		return 0;
+	memcpy(new_eth->eth_dest, cval->mac, 6);
+	memcpy(new_eth->eth_source, old_eth->eth_dest, 6);
+	new_eth->eth_proto = 56710;
+	ip6h->version = 6;
+	ip6h->priority = 0;
+	memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
+
+	ip6h->nexthdr = IPPROTO_IPV6;
+	ip_suffix = pckt->flow.srcv6[3] ^ pckt->flow.port16[0];
+	ip6h->payload_len =
+	    __builtin_bswap16(pkt_bytes + sizeof(struct ipv6hdr));
+	ip6h->hop_limit = 4;
+
+	ip6h->saddr.in6_u.u6_addr32[0] = 1;
+	ip6h->saddr.in6_u.u6_addr32[1] = 2;
+	ip6h->saddr.in6_u.u6_addr32[2] = 3;
+	ip6h->saddr.in6_u.u6_addr32[3] = ip_suffix;
+	memcpy(ip6h->daddr.in6_u.u6_addr32, dst->dstv6, 16);
+	return 1;
+}
+
+static __attribute__ ((noinline))
+bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval,
+	      struct packet_description *pckt,
+	      struct real_definition *dst, __u32 pkt_bytes)
+{
+
+	__u32 ip_suffix = __builtin_bswap16(pckt->flow.port16[0]);
+	struct eth_hdr *new_eth;
+	struct eth_hdr *old_eth;
+	__u16 *next_iph_u16;
+	struct iphdr *iph;
+	__u32 csum = 0;
+	void *data_end;
+	void *data;
+
+	ip_suffix <<= 15;
+	ip_suffix ^= pckt->flow.src;
+	if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr)))
+		return 0;
+	data = (void *)(long)xdp->data;
+	data_end = (void *)(long)xdp->data_end;
+	new_eth = data;
+	iph = data + sizeof(struct eth_hdr);
+	old_eth = data + sizeof(struct iphdr);
+	if (new_eth + 1 > data_end ||
+	    old_eth + 1 > data_end || iph + 1 > data_end)
+		return 0;
+	memcpy(new_eth->eth_dest, cval->mac, 6);
+	memcpy(new_eth->eth_source, old_eth->eth_dest, 6);
+	new_eth->eth_proto = 8;
+	iph->version = 4;
+	iph->ihl = 5;
+	iph->frag_off = 0;
+	iph->protocol = IPPROTO_IPIP;
+	iph->check = 0;
+	iph->tos = 1;
+	iph->tot_len = __builtin_bswap16(pkt_bytes + sizeof(struct iphdr));
+	/* don't update iph->daddr, since it will overwrite old eth_proto
+	 * and multiple iterations of bpf_prog_run() will fail
+	 */
+
+	iph->saddr = ((0xFFFF0000 & ip_suffix) | 4268) ^ dst->dst;
+	iph->ttl = 4;
+
+	next_iph_u16 = (__u16 *) iph;
+#pragma clang loop unroll(full)
+	for (int i = 0; i < sizeof(struct iphdr) >> 1; i++)
+		csum += *next_iph_u16++;
+	iph->check = ~((csum & 0xffff) + (csum >> 16));
+	if (bpf_xdp_adjust_head(xdp, (int)sizeof(struct iphdr)))
+		return 0;
+	return 1;
+}
+
+static __attribute__ ((noinline))
+bool decap_v6(struct xdp_md *xdp, void **data, void **data_end, bool inner_v4)
+{
+	struct eth_hdr *new_eth;
+	struct eth_hdr *old_eth;
+
+	old_eth = *data;
+	new_eth = *data + sizeof(struct ipv6hdr);
+	memcpy(new_eth->eth_source, old_eth->eth_source, 6);
+	memcpy(new_eth->eth_dest, old_eth->eth_dest, 6);
+	if (inner_v4)
+		new_eth->eth_proto = 8;
+	else
+		new_eth->eth_proto = 56710;
+	if (bpf_xdp_adjust_head(xdp, (int)sizeof(struct ipv6hdr)))
+		return 0;
+	*data = (void *)(long)xdp->data;
+	*data_end = (void *)(long)xdp->data_end;
+	return 1;
+}
+
+static __attribute__ ((noinline))
+bool decap_v4(struct xdp_md *xdp, void **data, void **data_end)
+{
+	struct eth_hdr *new_eth;
+	struct eth_hdr *old_eth;
+
+	old_eth = *data;
+	new_eth = *data + sizeof(struct iphdr);
+	memcpy(new_eth->eth_source, old_eth->eth_source, 6);
+	memcpy(new_eth->eth_dest, old_eth->eth_dest, 6);
+	new_eth->eth_proto = 8;
+	if (bpf_xdp_adjust_head(xdp, (int)sizeof(struct iphdr)))
+		return 0;
+	*data = (void *)(long)xdp->data;
+	*data_end = (void *)(long)xdp->data_end;
+	return 1;
+}
+
+static __attribute__ ((noinline))
+int swap_mac_and_send(void *data, void *data_end)
+{
+	unsigned char tmp_mac[6];
+	struct eth_hdr *eth;
+
+	eth = data;
+	memcpy(tmp_mac, eth->eth_source, 6);
+	memcpy(eth->eth_source, eth->eth_dest, 6);
+	memcpy(eth->eth_dest, tmp_mac, 6);
+	return XDP_TX;
+}
+
+static __attribute__ ((noinline))
+int send_icmp_reply(void *data, void *data_end)
+{
+	struct icmphdr *icmp_hdr;
+	__u16 *next_iph_u16;
+	__u32 tmp_addr = 0;
+	struct iphdr *iph;
+	__u32 csum1 = 0;
+	__u32 csum = 0;
+	__u64 off = 0;
+
+	if (data + sizeof(struct eth_hdr)
+	     + sizeof(struct iphdr) + sizeof(struct icmphdr) > data_end)
+		return XDP_DROP;
+	off += sizeof(struct eth_hdr);
+	iph = data + off;
+	off += sizeof(struct iphdr);
+	icmp_hdr = data + off;
+	icmp_hdr->type = 0;
+	icmp_hdr->checksum += 0x0007;
+	iph->ttl = 4;
+	tmp_addr = iph->daddr;
+	iph->daddr = iph->saddr;
+	iph->saddr = tmp_addr;
+	iph->check = 0;
+	next_iph_u16 = (__u16 *) iph;
+#pragma clang loop unroll(full)
+	for (int i = 0; i < sizeof(struct iphdr) >> 1; i++)
+		csum += *next_iph_u16++;
+	iph->check = ~((csum & 0xffff) + (csum >> 16));
+	return swap_mac_and_send(data, data_end);
+}
+
+static __attribute__ ((noinline))
+int send_icmp6_reply(void *data, void *data_end)
+{
+	struct icmp6hdr *icmp_hdr;
+	struct ipv6hdr *ip6h;
+	__be32 tmp_addr[4];
+	__u64 off = 0;
+
+	if (data + sizeof(struct eth_hdr)
+	     + sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) > data_end)
+		return XDP_DROP;
+	off += sizeof(struct eth_hdr);
+	ip6h = data + off;
+	off += sizeof(struct ipv6hdr);
+	icmp_hdr = data + off;
+	icmp_hdr->icmp6_type = 129;
+	icmp_hdr->icmp6_cksum -= 0x0001;
+	ip6h->hop_limit = 4;
+	memcpy(tmp_addr, ip6h->saddr.in6_u.u6_addr32, 16);
+	memcpy(ip6h->saddr.in6_u.u6_addr32, ip6h->daddr.in6_u.u6_addr32, 16);
+	memcpy(ip6h->daddr.in6_u.u6_addr32, tmp_addr, 16);
+	return
+	    swap_mac_and_send(data, data_end);
+}
+
+static __attribute__ ((noinline))
+int parse_icmpv6(void *data, void *data_end, __u64 off,
+		 struct packet_description *pckt)
+{
+	struct icmp6hdr *icmp_hdr;
+	struct ipv6hdr *ip6h;
+
+	icmp_hdr = data + off;
+	if (icmp_hdr + 1 > data_end)
+		return XDP_DROP;
+	if (icmp_hdr->icmp6_type == 128)
+		return send_icmp6_reply(data, data_end);
+	if (icmp_hdr->icmp6_type != 3)
+		return XDP_PASS;
+	off += sizeof(struct icmp6hdr);
+	ip6h = data + off;
+	if (ip6h + 1 > data_end)
+		return XDP_DROP;
+	pckt->flow.proto = ip6h->nexthdr;
+	pckt->flags |= (1 << 0);
+	memcpy(pckt->flow.srcv6, ip6h->daddr.in6_u.u6_addr32, 16);
+	memcpy(pckt->flow.dstv6, ip6h->saddr.in6_u.u6_addr32, 16);
+	return -1;
+}
+
+static __attribute__ ((noinline))
+int parse_icmp(void *data, void *data_end, __u64 off,
+	       struct packet_description *pckt)
+{
+	struct icmphdr *icmp_hdr;
+	struct iphdr *iph;
+
+	icmp_hdr = data + off;
+	if (icmp_hdr + 1 > data_end)
+		return XDP_DROP;
+	if (icmp_hdr->type == 8)
+		return send_icmp_reply(data, data_end);
+	if ((icmp_hdr->type != 3) || (icmp_hdr->code != 4))
+		return XDP_PASS;
+	off += sizeof(struct icmphdr);
+	iph = data + off;
+	if (iph + 1 > data_end)
+		return XDP_DROP;
+	if (iph->ihl != 5)
+		return XDP_DROP;
+	pckt->flow.proto = iph->protocol;
+	pckt->flags |= (1 << 0);
+	pckt->flow.src = iph->daddr;
+	pckt->flow.dst = iph->saddr;
+	return -1;
+}
+
+static __attribute__ ((noinline))
+__u32 get_packet_hash(struct packet_description *pckt,
+		      bool hash_16bytes)
+{
+	if (hash_16bytes)
+		return jhash_2words(jhash(pckt->flow.srcv6, 16, 12),
+				    pckt->flow.ports, 24);
+	else
+		return jhash_2words(pckt->flow.src, pckt->flow.ports,
+				    24);
+}
+
+__attribute__
+((noinline))
+static bool get_packet_dst(struct real_definition **real,
+			   struct packet_description *pckt,
+			   struct vip_meta *vip_info,
+			   bool is_ipv6, void *lru_map)
+{
+	struct real_pos_lru new_dst_lru = { };
+	bool hash_16bytes = is_ipv6;
+	__u32 *real_pos, hash, key;
+	__u64 cur_time;
+
+	if (vip_info->flags & (1 << 2))
+		hash_16bytes = 1;
+	if (vip_info->flags & (1 << 3)) {
+		pckt->flow.port16[0] = pckt->flow.port16[1];
+		memset(pckt->flow.srcv6, 0, 16);
+	}
+	hash = get_packet_hash(pckt, hash_16bytes);
+	if (hash != 0x358459b7 /* jhash of ipv4 packet */ &&
+	    hash != 0x2f4bc6bb /* jhash of ipv6 packet */)
+		return 0;
+	key = 2 * vip_info->vip_num + hash % 2;
+	real_pos = bpf_map_lookup_elem(&ch_rings, &key);
+	if (!real_pos)
+		return 0;
+	key = *real_pos;
+	*real = bpf_map_lookup_elem(&reals, &key);
+	if (!(*real))
+		return 0;
+	if (!(vip_info->flags & (1 << 1))) {
+		__u32 conn_rate_key = 512 + 2;
+		struct lb_stats *conn_rate_stats =
+		    bpf_map_lookup_elem(&stats, &conn_rate_key);
+
+		if (!conn_rate_stats)
+			return 1;
+		cur_time = bpf_ktime_get_ns();
+		if ((cur_time - conn_rate_stats->v2) >> 32 > 0xffFFFF) {
+			conn_rate_stats->v1 = 1;
+			conn_rate_stats->v2 = cur_time;
+		} else {
+			conn_rate_stats->v1 += 1;
+			if (conn_rate_stats->v1 >= 1)
+				return 1;
+		}
+		if (pckt->flow.proto == IPPROTO_UDP)
+			new_dst_lru.atime = cur_time;
+		new_dst_lru.pos = key;
+		bpf_map_update_elem(lru_map, &pckt->flow, &new_dst_lru, 0);
+	}
+	return 1;
+}
+
+__attribute__ ((noinline))
+static void connection_table_lookup(struct real_definition **real,
+				    struct packet_description *pckt,
+				    void *lru_map)
+{
+
+	struct real_pos_lru *dst_lru;
+	__u64 cur_time;
+	__u32 key;
+
+	dst_lru = bpf_map_lookup_elem(lru_map, &pckt->flow);
+	if
+	    (!dst_lru)
+		return;
+	if (pckt->flow.proto == IPPROTO_UDP) {
+		cur_time = bpf_ktime_get_ns();
+		if (cur_time - dst_lru->atime > 300000)
+			return;
+		dst_lru->atime = cur_time;
+	}
+	key = dst_lru->pos;
+	*real = bpf_map_lookup_elem(&reals, &key);
+}
+
+/* don't believe your eyes!
+ * below function has 6 arguments whereas bpf and llvm allow maximum of 5
+ * but since it's _static_ llvm can optimize one argument away
+ */
+__attribute__ ((noinline))
+static int process_l3_headers_v6(struct packet_description *pckt,
+				 __u8 *protocol, __u64 off,
+				 __u16 *pkt_bytes, void *data,
+				 void *data_end)
+{
+	struct ipv6hdr *ip6h;
+	__u64 iph_len;
+	int action;
+
+	ip6h = data + off;
+	if (ip6h + 1 > data_end)
+		return XDP_DROP;
+	iph_len = sizeof(struct ipv6hdr);
+	*protocol = ip6h->nexthdr;
+	pckt->flow.proto = *protocol;
+	*pkt_bytes = __builtin_bswap16(ip6h->payload_len);
+	off += iph_len;
+	if (*protocol == 45) {
+		return XDP_DROP;
+	} else if (*protocol == 59) {
+		action = parse_icmpv6(data, data_end, off, pckt);
+		if (action >= 0)
+			return action;
+	} else {
+		memcpy(pckt->flow.srcv6, ip6h->saddr.in6_u.u6_addr32, 16);
+		memcpy(pckt->flow.dstv6, ip6h->daddr.in6_u.u6_addr32, 16);
+	}
+	return -1;
+}
+
+__attribute__ ((noinline))
+static int process_l3_headers_v4(struct packet_description *pckt,
+				 __u8 *protocol, __u64 off,
+				 __u16 *pkt_bytes, void *data,
+				 void *data_end)
+{
+	struct iphdr *iph;
+	__u64 iph_len;
+	int action;
+
+	iph = data + off;
+	if (iph + 1 > data_end)
+		return XDP_DROP;
+	if (iph->ihl != 5)
+		return XDP_DROP;
+	*protocol = iph->protocol;
+	pckt->flow.proto = *protocol;
+	*pkt_bytes = __builtin_bswap16(iph->tot_len);
+	off += 20;
+	if (iph->frag_off & 65343)
+		return XDP_DROP;
+	if (*protocol == IPPROTO_ICMP) {
+		action = parse_icmp(data, data_end, off, pckt);
+		if (action >= 0)
+			return action;
+	} else {
+		pckt->flow.src = iph->saddr;
+		pckt->flow.dst = iph->daddr;
+	}
+	return -1;
+}
+
+__attribute__ ((noinline))
+static int process_packet(void *data, __u64 off, void *data_end,
+			  bool is_ipv6, struct xdp_md *xdp)
+{
+
+	struct real_definition *dst = NULL;
+	struct packet_description pckt = { };
+	struct vip_definition vip = { };
+	struct lb_stats *data_stats;
+	struct eth_hdr *eth = data;
+	void *lru_map = &lru_cache;
+	struct vip_meta *vip_info;
+	__u32 lru_stats_key = 513;
+	__u32 mac_addr_pos = 0;
+	__u32 stats_key = 512;
+	struct ctl_value *cval;
+	__u16 pkt_bytes;
+	__u64 iph_len;
+	__u8 protocol;
+	__u32 vip_num;
+	int action;
+
+	if (is_ipv6)
+		action = process_l3_headers_v6(&pckt, &protocol, off,
+					       &pkt_bytes, data, data_end);
+	else
+		action = process_l3_headers_v4(&pckt, &protocol, off,
+					       &pkt_bytes, data, data_end);
+	if (action >= 0)
+		return action;
+	protocol = pckt.flow.proto;
+	if (protocol == IPPROTO_TCP) {
+		if (!parse_tcp(data, data_end, is_ipv6, &pckt))
+			return XDP_DROP;
+	} else if (protocol == IPPROTO_UDP) {
+		if (!parse_udp(data, data_end, is_ipv6, &pckt))
+			return XDP_DROP;
+	} else {
+		return XDP_TX;
+	}
+
+	if (is_ipv6)
+		memcpy(vip.vipv6, pckt.flow.dstv6, 16);
+	else
+		vip.vip = pckt.flow.dst;
+	vip.port = pckt.flow.port16[1];
+	vip.proto = pckt.flow.proto;
+	vip_info = bpf_map_lookup_elem(&vip_map, &vip);
+	if (!vip_info) {
+		vip.port = 0;
+		vip_info = bpf_map_lookup_elem(&vip_map, &vip);
+		if (!vip_info)
+			return XDP_PASS;
+		if (!(vip_info->flags & (1 << 4)))
+			pckt.flow.port16[1] = 0;
+	}
+	if (data_end - data > 1400)
+		return
+		    XDP_DROP;
+	data_stats = bpf_map_lookup_elem(&stats, &stats_key);
+	if (!data_stats)
+		return XDP_DROP;
+	data_stats->v1 += 1;
+	if (!dst) {
+		if (vip_info->flags & (1 << 0))
+			pckt.flow.port16[0] = 0;
+		if (!(pckt.flags & (1 << 1)) && !(vip_info->flags & (1 << 1)))
+			connection_table_lookup(&dst, &pckt, lru_map);
+		if (dst)
+			goto out;
+		if (pckt.flow.proto == IPPROTO_TCP) {
+			struct lb_stats *lru_stats =
+			    bpf_map_lookup_elem(&stats, &lru_stats_key);
+
+			if (!lru_stats)
+				return XDP_DROP;
+			if (pckt.flags & (1 << 1))
+				lru_stats->v1 += 1;
+			else
+				lru_stats->v2 += 1;
+		}
+		if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6, lru_map))
+			return XDP_DROP;
+		data_stats->v2 += 1;
+	}
+out:
+	cval = bpf_map_lookup_elem(&ctl_array, &mac_addr_pos);
+	if (!cval)
+		return XDP_DROP;
+	if (dst->flags & (1 << 0)) {
+		if (!encap_v6(xdp, cval, &pckt, dst, pkt_bytes))
+			return XDP_DROP;
+	} else {
+		if (!encap_v4(xdp, cval, &pckt, dst, pkt_bytes))
+			return XDP_DROP;
+	}
+	vip_num = vip_info->vip_num;
+	data_stats = bpf_map_lookup_elem(&stats, &vip_num);
+	if (!data_stats)
+		return XDP_DROP;
+	data_stats->v1 += 1;
+	data_stats->v2 += pkt_bytes;
+
+	data = (void *)(long)xdp->data;
+	data_end = (void *)(long)xdp->data_end;
+	if (data + 4 > data_end)
+		return XDP_DROP;
+	*(u32 *)data = dst->dst;
+	return XDP_DROP;
+}
+
+__attribute__ ((section("xdp-test"), used))
+int balancer_ingress(struct xdp_md *ctx)
+{
+	void *data = (void *)(long)ctx->data;
+	void *data_end = (void *)(long)ctx->data_end;
+	struct eth_hdr *eth = data;
+	__u32 eth_proto;
+	__u32 nh_off;
+
+	nh_off = sizeof(struct eth_hdr);
+	if (data + nh_off > data_end)
+		return XDP_DROP;
+	eth_proto = eth->eth_proto;
+	if (eth_proto == 8)
+		return
+		    process_packet(data, nh_off, data_end, 0, ctx);
+	else if (eth_proto == 56710)
+		return process_packet(data, nh_off, data_end, 1, ctx);
+	else
+		return XDP_DROP;
+}
+
+char _license[] __attribute__ ((section("license"), used)) = "GPL";
+int _version __attribute__ ((section("version"), used)) = 1;