Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

selftests/bpf: tests for using dynptrs to parse skb and xdp buffers

Test skb and xdp dynptr functionality in the following ways:

1) progs/test_cls_redirect_dynptr.c
* Rewrite "progs/test_cls_redirect.c" test to use dynptrs to parse
skb data

* This is a great example of how dynptrs can be used to simplify a
lot of the parsing logic for non-statically known values.

When measuring the user + system time between the original version
vs. using dynptrs, and averaging the time for 10 runs (using
"time ./test_progs -t cls_redirect"):
original version: 0.092 sec
with dynptrs: 0.078 sec

2) progs/test_xdp_dynptr.c
* Rewrite "progs/test_xdp.c" test to use dynptrs to parse xdp data

When measuring the user + system time between the original version
vs. using dynptrs, and averaging the time for 10 runs (using
"time ./test_progs -t xdp_attach"):
original version: 0.118 sec
with dynptrs: 0.094 sec

3) progs/test_l4lb_noinline_dynptr.c
* Rewrite "progs/test_l4lb_noinline.c" test to use dynptrs to parse
skb data

When measuring the user + system time between the original version
vs. using dynptrs, and averaging the time for 10 runs (using
"time ./test_progs -t l4lb_all"):
original version: 0.062 sec
with dynptrs: 0.081 sec

For number of processed verifier instructions:
original version: 6268 insns
with dynptrs: 2588 insns

4) progs/test_parse_tcp_hdr_opt_dynptr.c
* Add sample code for tcp hdr opt lookup using dynptrs.
This logic is lifted from a real-world use case of packet parsing
in katran [0], a layer 4 load balancer. The original version
"progs/test_parse_tcp_hdr_opt.c" (not using dynptrs) is included
here as well, for comparison.

When measuring the user + system time between the original version
vs. using dynptrs, and averaging the time for 10 runs (using
"time ./test_progs -t parse_tcp_hdr_opt"):
original version: 0.031 sec
with dynptrs: 0.045 sec

5) progs/dynptr_success.c
* Add test case "test_skb_readonly" for testing attempts at writes
on a prog type with read-only skb ctx.
* Add "test_dynptr_skb_data" for testing that bpf_dynptr_data isn't
supported for skb progs.

6) progs/dynptr_fail.c
* Add test cases "skb_invalid_data_slice{1,2,3,4}" and
"xdp_invalid_data_slice{1,2}" for testing that helpers that modify the
underlying packet buffer automatically invalidate the associated
data slice.
* Add test cases "skb_invalid_ctx" and "xdp_invalid_ctx" for testing
that prog types that do not support bpf_dynptr_from_skb/xdp don't
have access to the API.
* Add test cases "dynptr_slice_var_len{1,2}" for testing that a
variable-sized len can't be passed in to bpf_dynptr_slice.
* Add test case "skb_invalid_slice_write" for testing that writes to a
read-only data slice are rejected by the verifier.
* Add test case "data_slice_out_of_bounds_skb" for testing that
writes to an area outside the slice are rejected.
* Add test case "invalid_slice_rdwr_rdonly" for testing that prog
types that don't allow writes to packet data don't accept any calls
to bpf_dynptr_slice_rdwr.

[0] https://github.com/facebookincubator/katran/blob/main/katran/lib/bpf/pckt_parsing.h

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230301154953.641654-11-joannelkoong@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Authored by Joanne Koong and committed by Alexei Starovoitov
commit 66e3a13e (parent cfa7b011)

+2522 -23
+2
tools/testing/selftests/bpf/DENYLIST.s390x
··· 4 4 bpf_cookie # failed to open_and_load program: -524 (trampoline) 5 5 bpf_loop # attaches to __x64_sys_nanosleep 6 6 cgrp_local_storage # prog_attach unexpected error: -524 (trampoline) 7 + dynptr/test_dynptr_skb_data 8 + dynptr/test_skb_readonly 7 9 fexit_sleep # fexit_skel_load fexit skeleton failed (trampoline) 8 10 get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace) 9 11 kprobe_multi_bench_attach # bpf_program__attach_kprobe_multi_opts unexpected error: -95
+38
tools/testing/selftests/bpf/bpf_kfuncs.h
··· 1 + #ifndef __BPF_KFUNCS__ 2 + #define __BPF_KFUNCS__ 3 + 4 + /* Description 5 + * Initializes an skb-type dynptr 6 + * Returns 7 + * Error code 8 + */ 9 + extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags, 10 + struct bpf_dynptr *ptr__uninit) __ksym; 11 + 12 + /* Description 13 + * Initializes an xdp-type dynptr 14 + * Returns 15 + * Error code 16 + */ 17 + extern int bpf_dynptr_from_xdp(struct xdp_md *xdp, __u64 flags, 18 + struct bpf_dynptr *ptr__uninit) __ksym; 19 + 20 + /* Description 21 + * Obtain a read-only pointer to the dynptr's data 22 + * Returns 23 + * Either a direct pointer to the dynptr data or a pointer to the user-provided 24 + * buffer if unable to obtain a direct pointer 25 + */ 26 + extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset, 27 + void *buffer, __u32 buffer__szk) __ksym; 28 + 29 + /* Description 30 + * Obtain a read-write pointer to the dynptr's data 31 + * Returns 32 + * Either a direct pointer to the dynptr data or a pointer to the user-provided 33 + * buffer if unable to obtain a direct pointer 34 + */ 35 + extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u32 offset, 36 + void *buffer, __u32 buffer__szk) __ksym; 37 + 38 + #endif
+25
tools/testing/selftests/bpf/prog_tests/cls_redirect.c
··· 13 13 14 14 #include "progs/test_cls_redirect.h" 15 15 #include "test_cls_redirect.skel.h" 16 + #include "test_cls_redirect_dynptr.skel.h" 16 17 #include "test_cls_redirect_subprogs.skel.h" 17 18 18 19 #define ENCAP_IP INADDR_LOOPBACK ··· 447 446 close_fds((int *)conns, sizeof(conns) / sizeof(conns[0][0])); 448 447 } 449 448 449 + static void test_cls_redirect_dynptr(void) 450 + { 451 + struct test_cls_redirect_dynptr *skel; 452 + int err; 453 + 454 + skel = test_cls_redirect_dynptr__open(); 455 + if (!ASSERT_OK_PTR(skel, "skel_open")) 456 + return; 457 + 458 + skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP); 459 + skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT); 460 + 461 + err = test_cls_redirect_dynptr__load(skel); 462 + if (!ASSERT_OK(err, "skel_load")) 463 + goto cleanup; 464 + 465 + test_cls_redirect_common(skel->progs.cls_redirect); 466 + 467 + cleanup: 468 + test_cls_redirect_dynptr__destroy(skel); 469 + } 470 + 450 471 static void test_cls_redirect_inlined(void) 451 472 { 452 473 struct test_cls_redirect *skel; ··· 519 496 test_cls_redirect_inlined(); 520 497 if (test__start_subtest("cls_redirect_subprogs")) 521 498 test_cls_redirect_subprogs(); 499 + if (test__start_subtest("cls_redirect_dynptr")) 500 + test_cls_redirect_dynptr(); 522 501 }
+58 -16
tools/testing/selftests/bpf/prog_tests/dynptr.c
··· 2 2 /* Copyright (c) 2022 Facebook */ 3 3 4 4 #include <test_progs.h> 5 + #include <network_helpers.h> 5 6 #include "dynptr_fail.skel.h" 6 7 #include "dynptr_success.skel.h" 7 8 8 - static const char * const success_tests[] = { 9 - "test_read_write", 10 - "test_data_slice", 11 - "test_ringbuf", 9 + enum test_setup_type { 10 + SETUP_SYSCALL_SLEEP, 11 + SETUP_SKB_PROG, 12 12 }; 13 13 14 - static void verify_success(const char *prog_name) 14 + static struct { 15 + const char *prog_name; 16 + enum test_setup_type type; 17 + } success_tests[] = { 18 + {"test_read_write", SETUP_SYSCALL_SLEEP}, 19 + {"test_dynptr_data", SETUP_SYSCALL_SLEEP}, 20 + {"test_ringbuf", SETUP_SYSCALL_SLEEP}, 21 + {"test_skb_readonly", SETUP_SKB_PROG}, 22 + {"test_dynptr_skb_data", SETUP_SKB_PROG}, 23 + }; 24 + 25 + static void verify_success(const char *prog_name, enum test_setup_type setup_type) 15 26 { 16 27 struct dynptr_success *skel; 17 28 struct bpf_program *prog; 18 29 struct bpf_link *link; 30 + int err; 19 31 20 32 skel = dynptr_success__open(); 21 33 if (!ASSERT_OK_PTR(skel, "dynptr_success__open")) ··· 35 23 36 24 skel->bss->pid = getpid(); 37 25 38 - dynptr_success__load(skel); 39 - if (!ASSERT_OK_PTR(skel, "dynptr_success__load")) 40 - goto cleanup; 41 - 42 26 prog = bpf_object__find_program_by_name(skel->obj, prog_name); 43 27 if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) 44 28 goto cleanup; 45 29 46 - link = bpf_program__attach(prog); 47 - if (!ASSERT_OK_PTR(link, "bpf_program__attach")) 30 + bpf_program__set_autoload(prog, true); 31 + 32 + err = dynptr_success__load(skel); 33 + if (!ASSERT_OK(err, "dynptr_success__load")) 48 34 goto cleanup; 49 35 50 - usleep(1); 36 + switch (setup_type) { 37 + case SETUP_SYSCALL_SLEEP: 38 + link = bpf_program__attach(prog); 39 + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) 40 + goto cleanup; 41 + 42 + usleep(1); 43 + 44 + bpf_link__destroy(link); 45 + break; 46 + case SETUP_SKB_PROG: 47 + { 48 + int prog_fd; 49 + char 
buf[64]; 50 + 51 + LIBBPF_OPTS(bpf_test_run_opts, topts, 52 + .data_in = &pkt_v4, 53 + .data_size_in = sizeof(pkt_v4), 54 + .data_out = buf, 55 + .data_size_out = sizeof(buf), 56 + .repeat = 1, 57 + ); 58 + 59 + prog_fd = bpf_program__fd(prog); 60 + if (!ASSERT_GE(prog_fd, 0, "prog_fd")) 61 + goto cleanup; 62 + 63 + err = bpf_prog_test_run_opts(prog_fd, &topts); 64 + 65 + if (!ASSERT_OK(err, "test_run")) 66 + goto cleanup; 67 + 68 + break; 69 + } 70 + } 51 71 52 72 ASSERT_EQ(skel->bss->err, 0, "err"); 53 - 54 - bpf_link__destroy(link); 55 73 56 74 cleanup: 57 75 dynptr_success__destroy(skel); ··· 92 50 int i; 93 51 94 52 for (i = 0; i < ARRAY_SIZE(success_tests); i++) { 95 - if (!test__start_subtest(success_tests[i])) 53 + if (!test__start_subtest(success_tests[i].prog_name)) 96 54 continue; 97 55 98 - verify_success(success_tests[i]); 56 + verify_success(success_tests[i].prog_name, success_tests[i].type); 99 57 } 100 58 101 59 RUN_TESTS(dynptr_fail);
+2
tools/testing/selftests/bpf/prog_tests/l4lb_all.c
··· 93 93 test_l4lb("test_l4lb.bpf.o"); 94 94 if (test__start_subtest("l4lb_noinline")) 95 95 test_l4lb("test_l4lb_noinline.bpf.o"); 96 + if (test__start_subtest("l4lb_noinline_dynptr")) 97 + test_l4lb("test_l4lb_noinline_dynptr.bpf.o"); 96 98 }
+93
tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <test_progs.h> 4 + #include <network_helpers.h> 5 + #include "test_parse_tcp_hdr_opt.skel.h" 6 + #include "test_parse_tcp_hdr_opt_dynptr.skel.h" 7 + #include "test_tcp_hdr_options.h" 8 + 9 + struct test_pkt { 10 + struct ipv6_packet pk6_v6; 11 + u8 options[16]; 12 + } __packed; 13 + 14 + struct test_pkt pkt = { 15 + .pk6_v6.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6), 16 + .pk6_v6.iph.nexthdr = IPPROTO_TCP, 17 + .pk6_v6.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES), 18 + .pk6_v6.tcp.urg_ptr = 123, 19 + .pk6_v6.tcp.doff = 9, /* 16 bytes of options */ 20 + 21 + .options = { 22 + TCPOPT_MSS, 4, 0x05, 0xB4, TCPOPT_NOP, TCPOPT_NOP, 23 + 0, 6, 0xBB, 0xBB, 0xBB, 0xBB, TCPOPT_EOL 24 + }, 25 + }; 26 + 27 + static void test_parse_opt(void) 28 + { 29 + struct test_parse_tcp_hdr_opt *skel; 30 + struct bpf_program *prog; 31 + char buf[128]; 32 + int err; 33 + 34 + LIBBPF_OPTS(bpf_test_run_opts, topts, 35 + .data_in = &pkt, 36 + .data_size_in = sizeof(pkt), 37 + .data_out = buf, 38 + .data_size_out = sizeof(buf), 39 + .repeat = 3, 40 + ); 41 + 42 + skel = test_parse_tcp_hdr_opt__open_and_load(); 43 + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) 44 + return; 45 + 46 + pkt.options[6] = skel->rodata->tcp_hdr_opt_kind_tpr; 47 + prog = skel->progs.xdp_ingress_v6; 48 + 49 + err = bpf_prog_test_run_opts(bpf_program__fd(prog), &topts); 50 + ASSERT_OK(err, "ipv6 test_run"); 51 + ASSERT_EQ(topts.retval, XDP_PASS, "ipv6 test_run retval"); 52 + ASSERT_EQ(skel->bss->server_id, 0xBBBBBBBB, "server id"); 53 + 54 + test_parse_tcp_hdr_opt__destroy(skel); 55 + } 56 + 57 + static void test_parse_opt_dynptr(void) 58 + { 59 + struct test_parse_tcp_hdr_opt_dynptr *skel; 60 + struct bpf_program *prog; 61 + char buf[128]; 62 + int err; 63 + 64 + LIBBPF_OPTS(bpf_test_run_opts, topts, 65 + .data_in = &pkt, 66 + .data_size_in = sizeof(pkt), 67 + .data_out = buf, 68 + .data_size_out = sizeof(buf), 69 + .repeat = 3, 70 + ); 71 + 72 + 
skel = test_parse_tcp_hdr_opt_dynptr__open_and_load(); 73 + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) 74 + return; 75 + 76 + pkt.options[6] = skel->rodata->tcp_hdr_opt_kind_tpr; 77 + prog = skel->progs.xdp_ingress_v6; 78 + 79 + err = bpf_prog_test_run_opts(bpf_program__fd(prog), &topts); 80 + ASSERT_OK(err, "ipv6 test_run"); 81 + ASSERT_EQ(topts.retval, XDP_PASS, "ipv6 test_run retval"); 82 + ASSERT_EQ(skel->bss->server_id, 0xBBBBBBBB, "server id"); 83 + 84 + test_parse_tcp_hdr_opt_dynptr__destroy(skel); 85 + } 86 + 87 + void test_parse_tcp_hdr_opt(void) 88 + { 89 + if (test__start_subtest("parse_tcp_hdr_opt")) 90 + test_parse_opt(); 91 + if (test__start_subtest("parse_tcp_hdr_opt_dynptr")) 92 + test_parse_opt_dynptr(); 93 + }
+9 -2
tools/testing/selftests/bpf/prog_tests/xdp_attach.c
··· 4 4 #define IFINDEX_LO 1 5 5 #define XDP_FLAGS_REPLACE (1U << 4) 6 6 7 - void serial_test_xdp_attach(void) 7 + static void test_xdp_attach(const char *file) 8 8 { 9 9 __u32 duration = 0, id1, id2, id0 = 0, len; 10 10 struct bpf_object *obj1, *obj2, *obj3; 11 - const char *file = "./test_xdp.bpf.o"; 12 11 struct bpf_prog_info info = {}; 13 12 int err, fd1, fd2, fd3; 14 13 LIBBPF_OPTS(bpf_xdp_attach_opts, opts); ··· 83 84 bpf_object__close(obj2); 84 85 out_1: 85 86 bpf_object__close(obj1); 87 + } 88 + 89 + void serial_test_xdp_attach(void) 90 + { 91 + if (test__start_subtest("xdp_attach")) 92 + test_xdp_attach("./test_xdp.bpf.o"); 93 + if (test__start_subtest("xdp_attach_dynptr")) 94 + test_xdp_attach("./test_xdp_dynptr.bpf.o"); 86 95 }
+286 -1
tools/testing/selftests/bpf/progs/dynptr_fail.c
··· 5 5 #include <string.h> 6 6 #include <linux/bpf.h> 7 7 #include <bpf/bpf_helpers.h> 8 + #include <linux/if_ether.h> 8 9 #include "bpf_misc.h" 10 + #include "bpf_kfuncs.h" 9 11 10 12 char _license[] SEC("license") = "GPL"; 11 13 ··· 246 244 return 0; 247 245 } 248 246 247 + /* A data slice can't be accessed out of bounds */ 248 + SEC("?tc") 249 + __failure __msg("value is outside of the allowed memory range") 250 + int data_slice_out_of_bounds_skb(struct __sk_buff *skb) 251 + { 252 + struct bpf_dynptr ptr; 253 + struct ethhdr *hdr; 254 + char buffer[sizeof(*hdr)] = {}; 255 + 256 + bpf_dynptr_from_skb(skb, 0, &ptr); 257 + 258 + hdr = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer)); 259 + if (!hdr) 260 + return SK_DROP; 261 + 262 + /* this should fail */ 263 + *(__u8*)(hdr + 1) = 1; 264 + 265 + return SK_PASS; 266 + } 267 + 249 268 SEC("?raw_tp") 250 269 __failure __msg("value is outside of the allowed memory range") 251 270 int data_slice_out_of_bounds_map_value(void *ctx) ··· 422 399 423 400 /* this should fail */ 424 401 bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 8, 0, 0); 425 - 426 402 return 0; 427 403 } 428 404 ··· 1066 1044 return 0; 1067 1045 } 1068 1046 1047 + /* bpf_dynptr_slice()s are read-only and cannot be written to */ 1048 + SEC("?tc") 1049 + __failure __msg("R0 cannot write into rdonly_mem") 1050 + int skb_invalid_slice_write(struct __sk_buff *skb) 1051 + { 1052 + struct bpf_dynptr ptr; 1053 + struct ethhdr *hdr; 1054 + char buffer[sizeof(*hdr)] = {}; 1055 + 1056 + bpf_dynptr_from_skb(skb, 0, &ptr); 1057 + 1058 + hdr = bpf_dynptr_slice(&ptr, 0, buffer, sizeof(buffer)); 1059 + if (!hdr) 1060 + return SK_DROP; 1061 + 1062 + /* this should fail */ 1063 + hdr->h_proto = 1; 1064 + 1065 + return SK_PASS; 1066 + } 1067 + 1068 + /* The read-only data slice is invalidated whenever a helper changes packet data */ 1069 + SEC("?tc") 1070 + __failure __msg("invalid mem access 'scalar'") 1071 + int skb_invalid_data_slice1(struct 
__sk_buff *skb) 1072 + { 1073 + struct bpf_dynptr ptr; 1074 + struct ethhdr *hdr; 1075 + char buffer[sizeof(*hdr)] = {}; 1076 + 1077 + bpf_dynptr_from_skb(skb, 0, &ptr); 1078 + 1079 + hdr = bpf_dynptr_slice(&ptr, 0, buffer, sizeof(buffer)); 1080 + if (!hdr) 1081 + return SK_DROP; 1082 + 1083 + val = hdr->h_proto; 1084 + 1085 + if (bpf_skb_pull_data(skb, skb->len)) 1086 + return SK_DROP; 1087 + 1088 + /* this should fail */ 1089 + val = hdr->h_proto; 1090 + 1091 + return SK_PASS; 1092 + } 1093 + 1094 + /* The read-write data slice is invalidated whenever a helper changes packet data */ 1095 + SEC("?tc") 1096 + __failure __msg("invalid mem access 'scalar'") 1097 + int skb_invalid_data_slice2(struct __sk_buff *skb) 1098 + { 1099 + struct bpf_dynptr ptr; 1100 + struct ethhdr *hdr; 1101 + char buffer[sizeof(*hdr)] = {}; 1102 + 1103 + bpf_dynptr_from_skb(skb, 0, &ptr); 1104 + 1105 + hdr = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer)); 1106 + if (!hdr) 1107 + return SK_DROP; 1108 + 1109 + hdr->h_proto = 123; 1110 + 1111 + if (bpf_skb_pull_data(skb, skb->len)) 1112 + return SK_DROP; 1113 + 1114 + /* this should fail */ 1115 + hdr->h_proto = 1; 1116 + 1117 + return SK_PASS; 1118 + } 1119 + 1120 + /* The read-only data slice is invalidated whenever bpf_dynptr_write() is called */ 1121 + SEC("?tc") 1122 + __failure __msg("invalid mem access 'scalar'") 1123 + int skb_invalid_data_slice3(struct __sk_buff *skb) 1124 + { 1125 + char write_data[64] = "hello there, world!!"; 1126 + struct bpf_dynptr ptr; 1127 + struct ethhdr *hdr; 1128 + char buffer[sizeof(*hdr)] = {}; 1129 + 1130 + bpf_dynptr_from_skb(skb, 0, &ptr); 1131 + 1132 + hdr = bpf_dynptr_slice(&ptr, 0, buffer, sizeof(buffer)); 1133 + if (!hdr) 1134 + return SK_DROP; 1135 + 1136 + val = hdr->h_proto; 1137 + 1138 + bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0); 1139 + 1140 + /* this should fail */ 1141 + val = hdr->h_proto; 1142 + 1143 + return SK_PASS; 1144 + } 1145 + 1146 + /* The read-write data 
slice is invalidated whenever bpf_dynptr_write() is called */ 1147 + SEC("?tc") 1148 + __failure __msg("invalid mem access 'scalar'") 1149 + int skb_invalid_data_slice4(struct __sk_buff *skb) 1150 + { 1151 + char write_data[64] = "hello there, world!!"; 1152 + struct bpf_dynptr ptr; 1153 + struct ethhdr *hdr; 1154 + char buffer[sizeof(*hdr)] = {}; 1155 + 1156 + bpf_dynptr_from_skb(skb, 0, &ptr); 1157 + hdr = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer)); 1158 + if (!hdr) 1159 + return SK_DROP; 1160 + 1161 + hdr->h_proto = 123; 1162 + 1163 + bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0); 1164 + 1165 + /* this should fail */ 1166 + hdr->h_proto = 1; 1167 + 1168 + return SK_PASS; 1169 + } 1170 + 1171 + /* The read-only data slice is invalidated whenever a helper changes packet data */ 1172 + SEC("?xdp") 1173 + __failure __msg("invalid mem access 'scalar'") 1174 + int xdp_invalid_data_slice1(struct xdp_md *xdp) 1175 + { 1176 + struct bpf_dynptr ptr; 1177 + struct ethhdr *hdr; 1178 + char buffer[sizeof(*hdr)] = {}; 1179 + 1180 + bpf_dynptr_from_xdp(xdp, 0, &ptr); 1181 + hdr = bpf_dynptr_slice(&ptr, 0, buffer, sizeof(buffer)); 1182 + if (!hdr) 1183 + return SK_DROP; 1184 + 1185 + val = hdr->h_proto; 1186 + 1187 + if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(*hdr))) 1188 + return XDP_DROP; 1189 + 1190 + /* this should fail */ 1191 + val = hdr->h_proto; 1192 + 1193 + return XDP_PASS; 1194 + } 1195 + 1196 + /* The read-write data slice is invalidated whenever a helper changes packet data */ 1197 + SEC("?xdp") 1198 + __failure __msg("invalid mem access 'scalar'") 1199 + int xdp_invalid_data_slice2(struct xdp_md *xdp) 1200 + { 1201 + struct bpf_dynptr ptr; 1202 + struct ethhdr *hdr; 1203 + char buffer[sizeof(*hdr)] = {}; 1204 + 1205 + bpf_dynptr_from_xdp(xdp, 0, &ptr); 1206 + hdr = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer)); 1207 + if (!hdr) 1208 + return SK_DROP; 1209 + 1210 + hdr->h_proto = 9; 1211 + 1212 + if (bpf_xdp_adjust_head(xdp, 0 
- (int)sizeof(*hdr))) 1213 + return XDP_DROP; 1214 + 1215 + /* this should fail */ 1216 + hdr->h_proto = 1; 1217 + 1218 + return XDP_PASS; 1219 + } 1220 + 1221 + /* Only supported prog type can create skb-type dynptrs */ 1222 + SEC("?raw_tp") 1223 + __failure __msg("calling kernel function bpf_dynptr_from_skb is not allowed") 1224 + int skb_invalid_ctx(void *ctx) 1225 + { 1226 + struct bpf_dynptr ptr; 1227 + 1228 + /* this should fail */ 1229 + bpf_dynptr_from_skb(ctx, 0, &ptr); 1230 + 1231 + return 0; 1232 + } 1233 + 1069 1234 /* Reject writes to dynptr slot for uninit arg */ 1070 1235 SEC("?raw_tp") 1071 1236 __failure __msg("potential write to dynptr at off=-16") ··· 1268 1059 bpf_get_current_comm(data.buf, 80); 1269 1060 1270 1061 return 0; 1062 + } 1063 + 1064 + /* Only supported prog type can create xdp-type dynptrs */ 1065 + SEC("?raw_tp") 1066 + __failure __msg("calling kernel function bpf_dynptr_from_xdp is not allowed") 1067 + int xdp_invalid_ctx(void *ctx) 1068 + { 1069 + struct bpf_dynptr ptr; 1070 + 1071 + /* this should fail */ 1072 + bpf_dynptr_from_xdp(ctx, 0, &ptr); 1073 + 1074 + return 0; 1075 + } 1076 + 1077 + __u32 hdr_size = sizeof(struct ethhdr); 1078 + /* Can't pass in variable-sized len to bpf_dynptr_slice */ 1079 + SEC("?tc") 1080 + __failure __msg("unbounded memory access") 1081 + int dynptr_slice_var_len1(struct __sk_buff *skb) 1082 + { 1083 + struct bpf_dynptr ptr; 1084 + struct ethhdr *hdr; 1085 + char buffer[sizeof(*hdr)] = {}; 1086 + 1087 + bpf_dynptr_from_skb(skb, 0, &ptr); 1088 + 1089 + /* this should fail */ 1090 + hdr = bpf_dynptr_slice(&ptr, 0, buffer, hdr_size); 1091 + if (!hdr) 1092 + return SK_DROP; 1093 + 1094 + return SK_PASS; 1095 + } 1096 + 1097 + /* Can't pass in variable-sized len to bpf_dynptr_slice */ 1098 + SEC("?tc") 1099 + __failure __msg("must be a known constant") 1100 + int dynptr_slice_var_len2(struct __sk_buff *skb) 1101 + { 1102 + char buffer[sizeof(struct ethhdr)] = {}; 1103 + struct bpf_dynptr ptr; 1104 + 
struct ethhdr *hdr; 1105 + 1106 + bpf_dynptr_from_skb(skb, 0, &ptr); 1107 + 1108 + if (hdr_size <= sizeof(buffer)) { 1109 + /* this should fail */ 1110 + hdr = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, hdr_size); 1111 + if (!hdr) 1112 + return SK_DROP; 1113 + hdr->h_proto = 12; 1114 + } 1115 + 1116 + return SK_PASS; 1271 1117 } 1272 1118 1273 1119 static int callback(__u32 index, void *data) ··· 1353 1089 1354 1090 /* this should fail */ 1355 1091 *slice = 1; 1092 + 1093 + return 0; 1094 + } 1095 + 1096 + /* Program types that don't allow writes to packet data should fail if 1097 + * bpf_dynptr_slice_rdwr is called 1098 + */ 1099 + SEC("cgroup_skb/ingress") 1100 + __failure __msg("the prog does not allow writes to packet data") 1101 + int invalid_slice_rdwr_rdonly(struct __sk_buff *skb) 1102 + { 1103 + char buffer[sizeof(struct ethhdr)] = {}; 1104 + struct bpf_dynptr ptr; 1105 + struct ethhdr *hdr; 1106 + 1107 + bpf_dynptr_from_skb(skb, 0, &ptr); 1108 + 1109 + /* this should fail since cgroup_skb doesn't allow 1110 + * changing packet data 1111 + */ 1112 + hdr = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer)); 1356 1113 1357 1114 return 0; 1358 1115 }
+51 -4
tools/testing/selftests/bpf/progs/dynptr_success.c
··· 5 5 #include <linux/bpf.h> 6 6 #include <bpf/bpf_helpers.h> 7 7 #include "bpf_misc.h" 8 + #include "bpf_kfuncs.h" 8 9 #include "errno.h" 9 10 10 11 char _license[] SEC("license") = "GPL"; ··· 31 30 __type(value, __u32); 32 31 } array_map SEC(".maps"); 33 32 34 - SEC("tp/syscalls/sys_enter_nanosleep") 33 + SEC("?tp/syscalls/sys_enter_nanosleep") 35 34 int test_read_write(void *ctx) 36 35 { 37 36 char write_data[64] = "hello there, world!!"; ··· 62 61 return 0; 63 62 } 64 63 65 - SEC("tp/syscalls/sys_enter_nanosleep") 66 - int test_data_slice(void *ctx) 64 + SEC("?tp/syscalls/sys_enter_nanosleep") 65 + int test_dynptr_data(void *ctx) 67 66 { 68 67 __u32 key = 0, val = 235, *map_val; 69 68 struct bpf_dynptr ptr; ··· 132 131 return 0; 133 132 } 134 133 135 - SEC("tp/syscalls/sys_enter_nanosleep") 134 + SEC("?tp/syscalls/sys_enter_nanosleep") 136 135 int test_ringbuf(void *ctx) 137 136 { 138 137 struct bpf_dynptr ptr; ··· 163 162 done: 164 163 bpf_ringbuf_discard_dynptr(&ptr, 0); 165 164 return 0; 165 + } 166 + 167 + SEC("?cgroup_skb/egress") 168 + int test_skb_readonly(struct __sk_buff *skb) 169 + { 170 + __u8 write_data[2] = {1, 2}; 171 + struct bpf_dynptr ptr; 172 + __u64 *data; 173 + int ret; 174 + 175 + if (bpf_dynptr_from_skb(skb, 0, &ptr)) { 176 + err = 1; 177 + return 1; 178 + } 179 + 180 + /* since cgroup skbs are read only, writes should fail */ 181 + ret = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0); 182 + if (ret != -EINVAL) { 183 + err = 2; 184 + return 1; 185 + } 186 + 187 + return 1; 188 + } 189 + 190 + SEC("?cgroup_skb/egress") 191 + int test_dynptr_skb_data(struct __sk_buff *skb) 192 + { 193 + __u8 write_data[2] = {1, 2}; 194 + struct bpf_dynptr ptr; 195 + __u64 *data; 196 + int ret; 197 + 198 + if (bpf_dynptr_from_skb(skb, 0, &ptr)) { 199 + err = 1; 200 + return 1; 201 + } 202 + 203 + /* This should return NULL. 
Must use bpf_dynptr_slice API */ 204 + data = bpf_dynptr_data(&ptr, 0, 1); 205 + if (data) { 206 + err = 2; 207 + return 1; 208 + } 209 + 210 + return 1; 166 211 }
+980
tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 + // Copyright (c) 2019, 2020 Cloudflare 3 + 4 + #include <stdbool.h> 5 + #include <stddef.h> 6 + #include <stdint.h> 7 + #include <string.h> 8 + 9 + #include <linux/bpf.h> 10 + #include <linux/icmp.h> 11 + #include <linux/icmpv6.h> 12 + #include <linux/if_ether.h> 13 + #include <linux/in.h> 14 + #include <linux/ip.h> 15 + #include <linux/ipv6.h> 16 + #include <linux/pkt_cls.h> 17 + #include <linux/tcp.h> 18 + #include <linux/udp.h> 19 + 20 + #include <bpf/bpf_helpers.h> 21 + #include <bpf/bpf_endian.h> 22 + 23 + #include "test_cls_redirect.h" 24 + #include "bpf_kfuncs.h" 25 + 26 + #define offsetofend(TYPE, MEMBER) \ 27 + (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER))) 28 + 29 + #define IP_OFFSET_MASK (0x1FFF) 30 + #define IP_MF (0x2000) 31 + 32 + char _license[] SEC("license") = "Dual BSD/GPL"; 33 + 34 + /** 35 + * Destination port and IP used for UDP encapsulation. 36 + */ 37 + volatile const __be16 ENCAPSULATION_PORT; 38 + volatile const __be32 ENCAPSULATION_IP; 39 + 40 + typedef struct { 41 + uint64_t processed_packets_total; 42 + uint64_t l3_protocol_packets_total_ipv4; 43 + uint64_t l3_protocol_packets_total_ipv6; 44 + uint64_t l4_protocol_packets_total_tcp; 45 + uint64_t l4_protocol_packets_total_udp; 46 + uint64_t accepted_packets_total_syn; 47 + uint64_t accepted_packets_total_syn_cookies; 48 + uint64_t accepted_packets_total_last_hop; 49 + uint64_t accepted_packets_total_icmp_echo_request; 50 + uint64_t accepted_packets_total_established; 51 + uint64_t forwarded_packets_total_gue; 52 + uint64_t forwarded_packets_total_gre; 53 + 54 + uint64_t errors_total_unknown_l3_proto; 55 + uint64_t errors_total_unknown_l4_proto; 56 + uint64_t errors_total_malformed_ip; 57 + uint64_t errors_total_fragmented_ip; 58 + uint64_t errors_total_malformed_icmp; 59 + uint64_t errors_total_unwanted_icmp; 60 + uint64_t errors_total_malformed_icmp_pkt_too_big; 61 + uint64_t errors_total_malformed_tcp; 62 + 
uint64_t errors_total_malformed_udp; 63 + uint64_t errors_total_icmp_echo_replies; 64 + uint64_t errors_total_malformed_encapsulation; 65 + uint64_t errors_total_encap_adjust_failed; 66 + uint64_t errors_total_encap_buffer_too_small; 67 + uint64_t errors_total_redirect_loop; 68 + uint64_t errors_total_encap_mtu_violate; 69 + } metrics_t; 70 + 71 + typedef enum { 72 + INVALID = 0, 73 + UNKNOWN, 74 + ECHO_REQUEST, 75 + SYN, 76 + SYN_COOKIE, 77 + ESTABLISHED, 78 + } verdict_t; 79 + 80 + typedef struct { 81 + uint16_t src, dst; 82 + } flow_ports_t; 83 + 84 + _Static_assert( 85 + sizeof(flow_ports_t) != 86 + offsetofend(struct bpf_sock_tuple, ipv4.dport) - 87 + offsetof(struct bpf_sock_tuple, ipv4.sport) - 1, 88 + "flow_ports_t must match sport and dport in struct bpf_sock_tuple"); 89 + _Static_assert( 90 + sizeof(flow_ports_t) != 91 + offsetofend(struct bpf_sock_tuple, ipv6.dport) - 92 + offsetof(struct bpf_sock_tuple, ipv6.sport) - 1, 93 + "flow_ports_t must match sport and dport in struct bpf_sock_tuple"); 94 + 95 + struct iphdr_info { 96 + void *hdr; 97 + __u64 len; 98 + }; 99 + 100 + typedef int ret_t; 101 + 102 + /* This is a bit of a hack. We need a return value which allows us to 103 + * indicate that the regular flow of the program should continue, 104 + * while allowing functions to use XDP_PASS and XDP_DROP, etc. 105 + */ 106 + static const ret_t CONTINUE_PROCESSING = -1; 107 + 108 + /* Convenience macro to call functions which return ret_t. 
109 + */ 110 + #define MAYBE_RETURN(x) \ 111 + do { \ 112 + ret_t __ret = x; \ 113 + if (__ret != CONTINUE_PROCESSING) \ 114 + return __ret; \ 115 + } while (0) 116 + 117 + static bool ipv4_is_fragment(const struct iphdr *ip) 118 + { 119 + uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK); 120 + return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0; 121 + } 122 + 123 + static int pkt_parse_ipv4(struct bpf_dynptr *dynptr, __u64 *offset, struct iphdr *iphdr) 124 + { 125 + if (bpf_dynptr_read(iphdr, sizeof(*iphdr), dynptr, *offset, 0)) 126 + return -1; 127 + 128 + *offset += sizeof(*iphdr); 129 + 130 + if (iphdr->ihl < 5) 131 + return -1; 132 + 133 + /* skip ipv4 options */ 134 + *offset += (iphdr->ihl - 5) * 4; 135 + 136 + return 0; 137 + } 138 + 139 + /* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */ 140 + static bool pkt_parse_icmp_l4_ports(struct bpf_dynptr *dynptr, __u64 *offset, flow_ports_t *ports) 141 + { 142 + if (bpf_dynptr_read(ports, sizeof(*ports), dynptr, *offset, 0)) 143 + return false; 144 + 145 + *offset += sizeof(*ports); 146 + 147 + /* Ports in the L4 headers are reversed, since we are parsing an ICMP 148 + * payload which is going towards the eyeball. 149 + */ 150 + uint16_t dst = ports->src; 151 + ports->src = ports->dst; 152 + ports->dst = dst; 153 + return true; 154 + } 155 + 156 + static uint16_t pkt_checksum_fold(uint32_t csum) 157 + { 158 + /* The highest reasonable value for an IPv4 header 159 + * checksum requires two folds, so we just do that always. 160 + */ 161 + csum = (csum & 0xffff) + (csum >> 16); 162 + csum = (csum & 0xffff) + (csum >> 16); 163 + return (uint16_t)~csum; 164 + } 165 + 166 + static void pkt_ipv4_checksum(struct iphdr *iph) 167 + { 168 + iph->check = 0; 169 + 170 + /* An IP header without options is 20 bytes. Two of those 171 + * are the checksum, which we always set to zero. 
Hence, 172 + * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7, 173 + * which fits in 32 bit. 174 + */ 175 + _Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes"); 176 + uint32_t acc = 0; 177 + uint16_t *ipw = (uint16_t *)iph; 178 + 179 + for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) 180 + acc += ipw[i]; 181 + 182 + iph->check = pkt_checksum_fold(acc); 183 + } 184 + 185 + static bool pkt_skip_ipv6_extension_headers(struct bpf_dynptr *dynptr, __u64 *offset, 186 + const struct ipv6hdr *ipv6, uint8_t *upper_proto, 187 + bool *is_fragment) 188 + { 189 + /* We understand five extension headers. 190 + * https://tools.ietf.org/html/rfc8200#section-4.1 states that all 191 + * headers should occur once, except Destination Options, which may 192 + * occur twice. Hence we give up after 6 headers. 193 + */ 194 + struct { 195 + uint8_t next; 196 + uint8_t len; 197 + } exthdr = { 198 + .next = ipv6->nexthdr, 199 + }; 200 + *is_fragment = false; 201 + 202 + for (int i = 0; i < 6; i++) { 203 + switch (exthdr.next) { 204 + case IPPROTO_FRAGMENT: 205 + *is_fragment = true; 206 + /* NB: We don't check that hdrlen == 0 as per spec. */ 207 + /* fallthrough; */ 208 + 209 + case IPPROTO_HOPOPTS: 210 + case IPPROTO_ROUTING: 211 + case IPPROTO_DSTOPTS: 212 + case IPPROTO_MH: 213 + if (bpf_dynptr_read(&exthdr, sizeof(exthdr), dynptr, *offset, 0)) 214 + return false; 215 + 216 + /* hdrlen is in 8-octet units, and excludes the first 8 octets. */ 217 + *offset += (exthdr.len + 1) * 8; 218 + 219 + /* Decode next header */ 220 + break; 221 + 222 + default: 223 + /* The next header is not one of the known extension 224 + * headers, treat it as the upper layer header. 225 + * 226 + * This handles IPPROTO_NONE. 227 + * 228 + * Encapsulating Security Payload (50) and Authentication 229 + * Header (51) also end up here (and will trigger an 230 + * unknown proto error later). They have a custom header 231 + * format and seem too esoteric to care about. 
232 + */ 233 + *upper_proto = exthdr.next; 234 + return true; 235 + } 236 + } 237 + 238 + /* We never found an upper layer header. */ 239 + return false; 240 + } 241 + 242 + static int pkt_parse_ipv6(struct bpf_dynptr *dynptr, __u64 *offset, struct ipv6hdr *ipv6, 243 + uint8_t *proto, bool *is_fragment) 244 + { 245 + if (bpf_dynptr_read(ipv6, sizeof(*ipv6), dynptr, *offset, 0)) 246 + return -1; 247 + 248 + *offset += sizeof(*ipv6); 249 + 250 + if (!pkt_skip_ipv6_extension_headers(dynptr, offset, ipv6, proto, is_fragment)) 251 + return -1; 252 + 253 + return 0; 254 + } 255 + 256 + /* Global metrics, per CPU 257 + */ 258 + struct { 259 + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 260 + __uint(max_entries, 1); 261 + __type(key, unsigned int); 262 + __type(value, metrics_t); 263 + } metrics_map SEC(".maps"); 264 + 265 + static metrics_t *get_global_metrics(void) 266 + { 267 + uint64_t key = 0; 268 + return bpf_map_lookup_elem(&metrics_map, &key); 269 + } 270 + 271 + static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap) 272 + { 273 + const int payload_off = 274 + sizeof(*encap) + 275 + sizeof(struct in_addr) * encap->unigue.hop_count; 276 + int32_t encap_overhead = payload_off - sizeof(struct ethhdr); 277 + 278 + /* Changing the ethertype if the encapsulated packet is ipv6 */ 279 + if (encap->gue.proto_ctype == IPPROTO_IPV6) 280 + encap->eth.h_proto = bpf_htons(ETH_P_IPV6); 281 + 282 + if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC, 283 + BPF_F_ADJ_ROOM_FIXED_GSO | 284 + BPF_F_ADJ_ROOM_NO_CSUM_RESET) || 285 + bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC)) 286 + return TC_ACT_SHOT; 287 + 288 + return bpf_redirect(skb->ifindex, BPF_F_INGRESS); 289 + } 290 + 291 + static ret_t forward_with_gre(struct __sk_buff *skb, struct bpf_dynptr *dynptr, 292 + encap_headers_t *encap, struct in_addr *next_hop, 293 + metrics_t *metrics) 294 + { 295 + const int payload_off = 296 + sizeof(*encap) + 297 + sizeof(struct in_addr) * encap->unigue.hop_count; 298 + 
int32_t encap_overhead = 299 + payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr); 300 + int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead; 301 + __u8 encap_buffer[sizeof(encap_gre_t)] = {}; 302 + uint16_t proto = ETH_P_IP; 303 + uint32_t mtu_len = 0; 304 + encap_gre_t *encap_gre; 305 + 306 + metrics->forwarded_packets_total_gre++; 307 + 308 + /* Loop protection: the inner packet's TTL is decremented as a safeguard 309 + * against any forwarding loop. As the only interesting field is the TTL 310 + * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes 311 + * as they handle the split packets if needed (no need for the data to be 312 + * in the linear section). 313 + */ 314 + if (encap->gue.proto_ctype == IPPROTO_IPV6) { 315 + proto = ETH_P_IPV6; 316 + uint8_t ttl; 317 + int rc; 318 + 319 + rc = bpf_skb_load_bytes( 320 + skb, payload_off + offsetof(struct ipv6hdr, hop_limit), 321 + &ttl, 1); 322 + if (rc != 0) { 323 + metrics->errors_total_malformed_encapsulation++; 324 + return TC_ACT_SHOT; 325 + } 326 + 327 + if (ttl == 0) { 328 + metrics->errors_total_redirect_loop++; 329 + return TC_ACT_SHOT; 330 + } 331 + 332 + ttl--; 333 + rc = bpf_skb_store_bytes( 334 + skb, payload_off + offsetof(struct ipv6hdr, hop_limit), 335 + &ttl, 1, 0); 336 + if (rc != 0) { 337 + metrics->errors_total_malformed_encapsulation++; 338 + return TC_ACT_SHOT; 339 + } 340 + } else { 341 + uint8_t ttl; 342 + int rc; 343 + 344 + rc = bpf_skb_load_bytes( 345 + skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 346 + 1); 347 + if (rc != 0) { 348 + metrics->errors_total_malformed_encapsulation++; 349 + return TC_ACT_SHOT; 350 + } 351 + 352 + if (ttl == 0) { 353 + metrics->errors_total_redirect_loop++; 354 + return TC_ACT_SHOT; 355 + } 356 + 357 + /* IPv4 also has a checksum to patch. While the TTL is only one byte, 358 + * this function only works for 2 and 4 bytes arguments (the result is 359 + * the same). 
360 + */ 361 + rc = bpf_l3_csum_replace( 362 + skb, payload_off + offsetof(struct iphdr, check), ttl, 363 + ttl - 1, 2); 364 + if (rc != 0) { 365 + metrics->errors_total_malformed_encapsulation++; 366 + return TC_ACT_SHOT; 367 + } 368 + 369 + ttl--; 370 + rc = bpf_skb_store_bytes( 371 + skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1, 372 + 0); 373 + if (rc != 0) { 374 + metrics->errors_total_malformed_encapsulation++; 375 + return TC_ACT_SHOT; 376 + } 377 + } 378 + 379 + if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) { 380 + metrics->errors_total_encap_mtu_violate++; 381 + return TC_ACT_SHOT; 382 + } 383 + 384 + if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET, 385 + BPF_F_ADJ_ROOM_FIXED_GSO | 386 + BPF_F_ADJ_ROOM_NO_CSUM_RESET) || 387 + bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) { 388 + metrics->errors_total_encap_adjust_failed++; 389 + return TC_ACT_SHOT; 390 + } 391 + 392 + if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) { 393 + metrics->errors_total_encap_buffer_too_small++; 394 + return TC_ACT_SHOT; 395 + } 396 + 397 + encap_gre = bpf_dynptr_slice_rdwr(dynptr, 0, encap_buffer, sizeof(encap_buffer)); 398 + if (!encap_gre) { 399 + metrics->errors_total_encap_buffer_too_small++; 400 + return TC_ACT_SHOT; 401 + } 402 + 403 + encap_gre->ip.protocol = IPPROTO_GRE; 404 + encap_gre->ip.daddr = next_hop->s_addr; 405 + encap_gre->ip.saddr = ENCAPSULATION_IP; 406 + encap_gre->ip.tot_len = 407 + bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta); 408 + encap_gre->gre.flags = 0; 409 + encap_gre->gre.protocol = bpf_htons(proto); 410 + pkt_ipv4_checksum((void *)&encap_gre->ip); 411 + 412 + if (encap_gre == encap_buffer) 413 + bpf_dynptr_write(dynptr, 0, encap_buffer, sizeof(encap_buffer), 0); 414 + 415 + return bpf_redirect(skb->ifindex, 0); 416 + } 417 + 418 + static ret_t forward_to_next_hop(struct __sk_buff *skb, struct bpf_dynptr *dynptr, 419 + encap_headers_t *encap, struct in_addr *next_hop, 420 + metrics_t *metrics) 421 + { 422 + /* swap L2 
addresses */ 423 + /* This assumes that packets are received from a router. 424 + * So just swapping the MAC addresses here will make the packet go back to 425 + * the router, which will send it to the appropriate machine. 426 + */ 427 + unsigned char temp[ETH_ALEN]; 428 + memcpy(temp, encap->eth.h_dest, sizeof(temp)); 429 + memcpy(encap->eth.h_dest, encap->eth.h_source, 430 + sizeof(encap->eth.h_dest)); 431 + memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source)); 432 + 433 + if (encap->unigue.next_hop == encap->unigue.hop_count - 1 && 434 + encap->unigue.last_hop_gre) { 435 + return forward_with_gre(skb, dynptr, encap, next_hop, metrics); 436 + } 437 + 438 + metrics->forwarded_packets_total_gue++; 439 + uint32_t old_saddr = encap->ip.saddr; 440 + encap->ip.saddr = encap->ip.daddr; 441 + encap->ip.daddr = next_hop->s_addr; 442 + if (encap->unigue.next_hop < encap->unigue.hop_count) { 443 + encap->unigue.next_hop++; 444 + } 445 + 446 + /* Remove ip->saddr, add next_hop->s_addr */ 447 + const uint64_t off = offsetof(typeof(*encap), ip.check); 448 + int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4); 449 + if (ret < 0) { 450 + return TC_ACT_SHOT; 451 + } 452 + 453 + return bpf_redirect(skb->ifindex, 0); 454 + } 455 + 456 + static ret_t skip_next_hops(__u64 *offset, int n) 457 + { 458 + __u32 res; 459 + switch (n) { 460 + case 1: 461 + *offset += sizeof(struct in_addr); 462 + case 0: 463 + return CONTINUE_PROCESSING; 464 + 465 + default: 466 + return TC_ACT_SHOT; 467 + } 468 + } 469 + 470 + /* Get the next hop from the GLB header. 471 + * 472 + * Sets next_hop->s_addr to 0 if there are no more hops left. 473 + * pkt is positioned just after the variable length GLB header 474 + * iff the call is successful. 
475 + */ 476 + static ret_t get_next_hop(struct bpf_dynptr *dynptr, __u64 *offset, encap_headers_t *encap, 477 + struct in_addr *next_hop) 478 + { 479 + if (encap->unigue.next_hop > encap->unigue.hop_count) 480 + return TC_ACT_SHOT; 481 + 482 + /* Skip "used" next hops. */ 483 + MAYBE_RETURN(skip_next_hops(offset, encap->unigue.next_hop)); 484 + 485 + if (encap->unigue.next_hop == encap->unigue.hop_count) { 486 + /* No more next hops, we are at the end of the GLB header. */ 487 + next_hop->s_addr = 0; 488 + return CONTINUE_PROCESSING; 489 + } 490 + 491 + if (bpf_dynptr_read(next_hop, sizeof(*next_hop), dynptr, *offset, 0)) 492 + return TC_ACT_SHOT; 493 + 494 + *offset += sizeof(*next_hop); 495 + 496 + /* Skip the remainig next hops (may be zero). */ 497 + return skip_next_hops(offset, encap->unigue.hop_count - encap->unigue.next_hop - 1); 498 + } 499 + 500 + /* Fill a bpf_sock_tuple to be used with the socket lookup functions. 501 + * This is a kludge that let's us work around verifier limitations: 502 + * 503 + * fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321) 504 + * 505 + * clang will substitue a costant for sizeof, which allows the verifier 506 + * to track it's value. Based on this, it can figure out the constant 507 + * return value, and calling code works while still being "generic" to 508 + * IPv4 and IPv6. 
509 + */ 510 + static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph, 511 + uint64_t iphlen, uint16_t sport, uint16_t dport) 512 + { 513 + switch (iphlen) { 514 + case sizeof(struct iphdr): { 515 + struct iphdr *ipv4 = (struct iphdr *)iph; 516 + tuple->ipv4.daddr = ipv4->daddr; 517 + tuple->ipv4.saddr = ipv4->saddr; 518 + tuple->ipv4.sport = sport; 519 + tuple->ipv4.dport = dport; 520 + return sizeof(tuple->ipv4); 521 + } 522 + 523 + case sizeof(struct ipv6hdr): { 524 + struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph; 525 + memcpy(&tuple->ipv6.daddr, &ipv6->daddr, 526 + sizeof(tuple->ipv6.daddr)); 527 + memcpy(&tuple->ipv6.saddr, &ipv6->saddr, 528 + sizeof(tuple->ipv6.saddr)); 529 + tuple->ipv6.sport = sport; 530 + tuple->ipv6.dport = dport; 531 + return sizeof(tuple->ipv6); 532 + } 533 + 534 + default: 535 + return 0; 536 + } 537 + } 538 + 539 + static verdict_t classify_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, 540 + uint64_t tuplen, void *iph, struct tcphdr *tcp) 541 + { 542 + struct bpf_sock *sk = 543 + bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); 544 + 545 + if (sk == NULL) 546 + return UNKNOWN; 547 + 548 + if (sk->state != BPF_TCP_LISTEN) { 549 + bpf_sk_release(sk); 550 + return ESTABLISHED; 551 + } 552 + 553 + if (iph != NULL && tcp != NULL) { 554 + /* Kludge: we've run out of arguments, but need the length of the ip header. 
*/ 555 + uint64_t iphlen = sizeof(struct iphdr); 556 + 557 + if (tuplen == sizeof(tuple->ipv6)) 558 + iphlen = sizeof(struct ipv6hdr); 559 + 560 + if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp, 561 + sizeof(*tcp)) == 0) { 562 + bpf_sk_release(sk); 563 + return SYN_COOKIE; 564 + } 565 + } 566 + 567 + bpf_sk_release(sk); 568 + return UNKNOWN; 569 + } 570 + 571 + static verdict_t classify_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, uint64_t tuplen) 572 + { 573 + struct bpf_sock *sk = 574 + bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); 575 + 576 + if (sk == NULL) 577 + return UNKNOWN; 578 + 579 + if (sk->state == BPF_TCP_ESTABLISHED) { 580 + bpf_sk_release(sk); 581 + return ESTABLISHED; 582 + } 583 + 584 + bpf_sk_release(sk); 585 + return UNKNOWN; 586 + } 587 + 588 + static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, struct bpf_sock_tuple *tuple, 589 + uint64_t tuplen, metrics_t *metrics) 590 + { 591 + switch (proto) { 592 + case IPPROTO_TCP: 593 + return classify_tcp(skb, tuple, tuplen, NULL, NULL); 594 + 595 + case IPPROTO_UDP: 596 + return classify_udp(skb, tuple, tuplen); 597 + 598 + default: 599 + metrics->errors_total_malformed_icmp++; 600 + return INVALID; 601 + } 602 + } 603 + 604 + static verdict_t process_icmpv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr, __u64 *offset, 605 + metrics_t *metrics) 606 + { 607 + struct icmphdr icmp; 608 + struct iphdr ipv4; 609 + 610 + if (bpf_dynptr_read(&icmp, sizeof(icmp), dynptr, *offset, 0)) { 611 + metrics->errors_total_malformed_icmp++; 612 + return INVALID; 613 + } 614 + 615 + *offset += sizeof(icmp); 616 + 617 + /* We should never receive encapsulated echo replies. 
*/ 618 + if (icmp.type == ICMP_ECHOREPLY) { 619 + metrics->errors_total_icmp_echo_replies++; 620 + return INVALID; 621 + } 622 + 623 + if (icmp.type == ICMP_ECHO) 624 + return ECHO_REQUEST; 625 + 626 + if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) { 627 + metrics->errors_total_unwanted_icmp++; 628 + return INVALID; 629 + } 630 + 631 + if (pkt_parse_ipv4(dynptr, offset, &ipv4)) { 632 + metrics->errors_total_malformed_icmp_pkt_too_big++; 633 + return INVALID; 634 + } 635 + 636 + /* The source address in the outer IP header is from the entity that 637 + * originated the ICMP message. Use the original IP header to restore 638 + * the correct flow tuple. 639 + */ 640 + struct bpf_sock_tuple tuple; 641 + tuple.ipv4.saddr = ipv4.daddr; 642 + tuple.ipv4.daddr = ipv4.saddr; 643 + 644 + if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv4.sport)) { 645 + metrics->errors_total_malformed_icmp_pkt_too_big++; 646 + return INVALID; 647 + } 648 + 649 + return classify_icmp(skb, ipv4.protocol, &tuple, 650 + sizeof(tuple.ipv4), metrics); 651 + } 652 + 653 + static verdict_t process_icmpv6(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb, 654 + metrics_t *metrics) 655 + { 656 + struct bpf_sock_tuple tuple; 657 + struct ipv6hdr ipv6; 658 + struct icmp6hdr icmp6; 659 + bool is_fragment; 660 + uint8_t l4_proto; 661 + 662 + if (bpf_dynptr_read(&icmp6, sizeof(icmp6), dynptr, *offset, 0)) { 663 + metrics->errors_total_malformed_icmp++; 664 + return INVALID; 665 + } 666 + 667 + /* We should never receive encapsulated echo replies. 
*/ 668 + if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) { 669 + metrics->errors_total_icmp_echo_replies++; 670 + return INVALID; 671 + } 672 + 673 + if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) { 674 + return ECHO_REQUEST; 675 + } 676 + 677 + if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) { 678 + metrics->errors_total_unwanted_icmp++; 679 + return INVALID; 680 + } 681 + 682 + if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) { 683 + metrics->errors_total_malformed_icmp_pkt_too_big++; 684 + return INVALID; 685 + } 686 + 687 + if (is_fragment) { 688 + metrics->errors_total_fragmented_ip++; 689 + return INVALID; 690 + } 691 + 692 + /* Swap source and dest addresses. */ 693 + memcpy(&tuple.ipv6.saddr, &ipv6.daddr, sizeof(tuple.ipv6.saddr)); 694 + memcpy(&tuple.ipv6.daddr, &ipv6.saddr, sizeof(tuple.ipv6.daddr)); 695 + 696 + if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv6.sport)) { 697 + metrics->errors_total_malformed_icmp_pkt_too_big++; 698 + return INVALID; 699 + } 700 + 701 + return classify_icmp(skb, l4_proto, &tuple, sizeof(tuple.ipv6), 702 + metrics); 703 + } 704 + 705 + static verdict_t process_tcp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb, 706 + struct iphdr_info *info, metrics_t *metrics) 707 + { 708 + struct bpf_sock_tuple tuple; 709 + struct tcphdr tcp; 710 + uint64_t tuplen; 711 + 712 + metrics->l4_protocol_packets_total_tcp++; 713 + 714 + if (bpf_dynptr_read(&tcp, sizeof(tcp), dynptr, *offset, 0)) { 715 + metrics->errors_total_malformed_tcp++; 716 + return INVALID; 717 + } 718 + 719 + *offset += sizeof(tcp); 720 + 721 + if (tcp.syn) 722 + return SYN; 723 + 724 + tuplen = fill_tuple(&tuple, info->hdr, info->len, tcp.source, tcp.dest); 725 + return classify_tcp(skb, &tuple, tuplen, info->hdr, &tcp); 726 + } 727 + 728 + static verdict_t process_udp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb, 729 + struct iphdr_info *info, metrics_t *metrics) 730 + { 731 + struct bpf_sock_tuple tuple; 
732 + struct udphdr udph; 733 + uint64_t tuplen; 734 + 735 + metrics->l4_protocol_packets_total_udp++; 736 + 737 + if (bpf_dynptr_read(&udph, sizeof(udph), dynptr, *offset, 0)) { 738 + metrics->errors_total_malformed_udp++; 739 + return INVALID; 740 + } 741 + *offset += sizeof(udph); 742 + 743 + tuplen = fill_tuple(&tuple, info->hdr, info->len, udph.source, udph.dest); 744 + return classify_udp(skb, &tuple, tuplen); 745 + } 746 + 747 + static verdict_t process_ipv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr, 748 + __u64 *offset, metrics_t *metrics) 749 + { 750 + struct iphdr ipv4; 751 + struct iphdr_info info = { 752 + .hdr = &ipv4, 753 + .len = sizeof(ipv4), 754 + }; 755 + 756 + metrics->l3_protocol_packets_total_ipv4++; 757 + 758 + if (pkt_parse_ipv4(dynptr, offset, &ipv4)) { 759 + metrics->errors_total_malformed_ip++; 760 + return INVALID; 761 + } 762 + 763 + if (ipv4.version != 4) { 764 + metrics->errors_total_malformed_ip++; 765 + return INVALID; 766 + } 767 + 768 + if (ipv4_is_fragment(&ipv4)) { 769 + metrics->errors_total_fragmented_ip++; 770 + return INVALID; 771 + } 772 + 773 + switch (ipv4.protocol) { 774 + case IPPROTO_ICMP: 775 + return process_icmpv4(skb, dynptr, offset, metrics); 776 + 777 + case IPPROTO_TCP: 778 + return process_tcp(dynptr, offset, skb, &info, metrics); 779 + 780 + case IPPROTO_UDP: 781 + return process_udp(dynptr, offset, skb, &info, metrics); 782 + 783 + default: 784 + metrics->errors_total_unknown_l4_proto++; 785 + return INVALID; 786 + } 787 + } 788 + 789 + static verdict_t process_ipv6(struct __sk_buff *skb, struct bpf_dynptr *dynptr, 790 + __u64 *offset, metrics_t *metrics) 791 + { 792 + struct ipv6hdr ipv6; 793 + struct iphdr_info info = { 794 + .hdr = &ipv6, 795 + .len = sizeof(ipv6), 796 + }; 797 + uint8_t l4_proto; 798 + bool is_fragment; 799 + 800 + metrics->l3_protocol_packets_total_ipv6++; 801 + 802 + if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) { 803 + metrics->errors_total_malformed_ip++; 
		return INVALID;
	}

	/* NOTE(review): this is the tail of a function whose signature sits
	 * above this hunk (presumably the IPv6 leg of packet validation) —
	 * confirm against the full file.
	 */
	if (ipv6.version != 6) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	/* Fragments carry no L4 header, so they cannot be classified. */
	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	/* Dispatch on the upper-layer protocol of the inner packet. */
	switch (l4_proto) {
	case IPPROTO_ICMPV6:
		return process_icmpv6(dynptr, offset, skb, metrics);

	case IPPROTO_TCP:
		return process_tcp(dynptr, offset, skb, &info, metrics);

	case IPPROTO_UDP:
		return process_udp(dynptr, offset, skb, &info, metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

/* Main TC classifier: validates the GUE encapsulation headers via a dynptr
 * slice, then either accepts the packet locally or forwards it to the next
 * hop recorded in the encapsulation.  Returns a TC_ACT_* verdict.
 */
SEC("tc")
int cls_redirect(struct __sk_buff *skb)
{
	__u8 encap_buffer[sizeof(encap_headers_t)] = {};
	struct bpf_dynptr dynptr;
	struct in_addr next_hop;
	/* Tracks offset of the dynptr. This will be unnecessary once
	 * bpf_dynptr_advance() is available.
	 */
	__u64 off = 0;
	ret_t ret;

	bpf_dynptr_from_skb(skb, 0, &dynptr);

	metrics_t *metrics = get_global_metrics();
	if (metrics == NULL)
		return TC_ACT_SHOT;

	metrics->processed_packets_total++;

	/* Pass bogus packets as long as we're not sure they're
	 * destined for us.
	 */
	if (skb->protocol != bpf_htons(ETH_P_IP))
		return TC_ACT_OK;

	encap_headers_t *encap;

	/* Make sure that all encapsulation headers are available in
	 * the linear portion of the skb. This makes it easy to manipulate them.
	 */
	if (bpf_skb_pull_data(skb, sizeof(*encap)))
		return TC_ACT_OK;

	/* The slice either points into the linear skb data or falls back to
	 * encap_buffer (a copy), in which case modifications must be written
	 * back with bpf_dynptr_write() — see the end of this function.
	 */
	encap = bpf_dynptr_slice_rdwr(&dynptr, 0, encap_buffer, sizeof(encap_buffer));
	if (!encap)
		return TC_ACT_OK;

	off += sizeof(*encap);

	if (encap->ip.ihl != 5)
		/* We never have any options. */
		return TC_ACT_OK;

	if (encap->ip.daddr != ENCAPSULATION_IP ||
	    encap->ip.protocol != IPPROTO_UDP)
		return TC_ACT_OK;

	/* TODO Check UDP length? */
	if (encap->udp.dest != ENCAPSULATION_PORT)
		return TC_ACT_OK;

	/* We now know that the packet is destined to us, we can
	 * drop bogus ones.
	 */
	if (ipv4_is_fragment((void *)&encap->ip)) {
		metrics->errors_total_fragmented_ip++;
		return TC_ACT_SHOT;
	}

	/* GUE sanity checks: this deployment never uses variants, control
	 * messages or flags, so anything non-zero is malformed.
	 */
	if (encap->gue.variant != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.control != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.flags != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	/* hlen is in 32-bit words and must cover the unigue header plus the
	 * advertised hop list.
	 */
	if (encap->gue.hlen !=
	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.version != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.reserved != 0)
		return TC_ACT_SHOT;

	MAYBE_RETURN(get_next_hop(&dynptr, &off, encap, &next_hop));

	/* A zero next hop means we are the last hop: accept locally. */
	if (next_hop.s_addr == 0) {
		metrics->accepted_packets_total_last_hop++;
		return accept_locally(skb, encap);
	}

	verdict_t verdict;
	switch (encap->gue.proto_ctype) {
	case IPPROTO_IPIP:
		verdict = process_ipv4(skb, &dynptr, &off, metrics);
		break;

	case IPPROTO_IPV6:
		verdict = process_ipv6(skb, &dynptr, &off, metrics);
		break;

	default:
		metrics->errors_total_unknown_l3_proto++;
		return TC_ACT_SHOT;
	}

	switch (verdict) {
	case INVALID:
		/* metrics have already been bumped */
		return TC_ACT_SHOT;

	case UNKNOWN:
		return forward_to_next_hop(skb, &dynptr, encap, &next_hop, metrics);

	case ECHO_REQUEST:
		metrics->accepted_packets_total_icmp_echo_request++;
		break;

	case SYN:
		if (encap->unigue.forward_syn) {
			return forward_to_next_hop(skb, &dynptr, encap, &next_hop,
						   metrics);
		}

		metrics->accepted_packets_total_syn++;
		break;

	case SYN_COOKIE:
		metrics->accepted_packets_total_syn_cookies++;
		break;

	case ESTABLISHED:
		metrics->accepted_packets_total_established++;
		break;
	}

	ret = accept_locally(skb, encap);

	/* If the slice fell back to the stack buffer, flush any header
	 * modifications back into the packet.
	 */
	if (encap == encap_buffer)
		bpf_dynptr_write(&dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);

	return ret;
}
+487
tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2017 Facebook
#include <stddef.h>
#include <stdbool.h>
#include <string.h>
#include <linux/pkt_cls.h>
#include <linux/bpf.h>
#include <linux/in.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <bpf/bpf_helpers.h>
#include "test_iptunnel_common.h"
#include <bpf/bpf_endian.h>

#include "bpf_kfuncs.h"

/* Rotate a 32-bit word left by @shift bits. */
static __always_inline __u32 rol32(__u32 word, unsigned int shift)
{
	return (word << shift) | (word >> ((-shift) & 31));
}

/* copy paste of jhash from kernel sources to make sure llvm
 * can compile it into valid sequence of bpf instructions
 */
#define __jhash_mix(a, b, c)			\
{						\
	a -= c;  a ^= rol32(c, 4);  c += b;	\
	b -= a;  b ^= rol32(a, 6);  a += c;	\
	c -= b;  c ^= rol32(b, 8);  b += a;	\
	a -= c;  a ^= rol32(c, 16); c += b;	\
	b -= a;  b ^= rol32(a, 19); a += c;	\
	c -= b;  c ^= rol32(b, 4);  b += a;	\
}

#define __jhash_final(a, b, c)			\
{						\
	c ^= b; c -= rol32(b, 14);		\
	a ^= c; a -= rol32(c, 11);		\
	b ^= a; b -= rol32(a, 25);		\
	c ^= b; c -= rol32(b, 16);		\
	a ^= c; a -= rol32(c, 4);		\
	b ^= a; b -= rol32(a, 14);		\
	c ^= b; c -= rol32(b, 24);		\
}

#define JHASH_INITVAL 0xdeadbeef

typedef unsigned int u32;

/* Jenkins hash over @length bytes of @key, seeded with @initval.
 * The switch cases below fall through deliberately (classic jhash tail).
 */
static __noinline u32 jhash(const void *key, u32 length, u32 initval)
{
	u32 a, b, c;
	const unsigned char *k = key;

	a = b = c = JHASH_INITVAL + length + initval;

	while (length > 12) {
		a += *(u32 *)(k);
		b += *(u32 *)(k + 4);
		c += *(u32 *)(k + 8);
		__jhash_mix(a, b, c);
		length -= 12;
		k += 12;
	}
	switch (length) {
	case 12: c += (u32)k[11]<<24;
	case 11: c += (u32)k[10]<<16;
	case 10: c += (u32)k[9]<<8;
	case 9: c += k[8];
	case 8: b += (u32)k[7]<<24;
	case 7: b += (u32)k[6]<<16;
	case 6: b += (u32)k[5]<<8;
	case 5: b += k[4];
	case 4: a += (u32)k[3]<<24;
	case 3: a += (u32)k[2]<<16;
	case 2: a += (u32)k[1]<<8;
	case 1: a += k[0];
		__jhash_final(a, b, c);
	case 0: /* Nothing left to add */
		break;
	}

	return c;
}

/* Final-mix three words; building block for the fixed-arity jhash variants. */
static __noinline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
{
	a += initval;
	b += initval;
	c += initval;
	__jhash_final(a, b, c);
	return c;
}

/* Hash two 32-bit words. */
static __noinline u32 jhash_2words(u32 a, u32 b, u32 initval)
{
	return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2));
}

#define PCKT_FRAGMENTED 65343
#define IPV4_HDR_LEN_NO_OPT 20
#define IPV4_PLUS_ICMP_HDR 28
#define IPV6_PLUS_ICMP_HDR 48
#define RING_SIZE 2
#define MAX_VIPS 12
#define MAX_REALS 5
#define CTL_MAP_SIZE 16
#define CH_RINGS_SIZE (MAX_VIPS * RING_SIZE)
#define F_IPV6 (1 << 0)
#define F_HASH_NO_SRC_PORT (1 << 0)
#define F_ICMP (1 << 0)
#define F_SYN_SET (1 << 1)

/* Flow tuple extracted from a packet; v4 and v6 addresses share storage. */
struct packet_description {
	union {
		__be32 src;
		__be32 srcv6[4];
	};
	union {
		__be32 dst;
		__be32 dstv6[4];
	};
	union {
		__u32 ports;
		__u16 port16[2];
	};
	__u8 proto;
	__u8 flags;
};

/* Control value: either an ifindex or a MAC, depending on the map slot. */
struct ctl_value {
	union {
		__u64 value;
		__u32 ifindex;
		__u8 mac[6];
	};
};

struct vip_meta {
	__u32 flags;
	__u32 vip_num;
};

/* Backend ("real server") address plus flags (F_IPV6). */
struct real_definition {
	union {
		__be32 dst;
		__be32 dstv6[4];
	};
	__u8 flags;
};

struct vip_stats {
	__u64 bytes;
	__u64 pkts;
};

struct eth_hdr {
	unsigned char eth_dest[ETH_ALEN];
	unsigned char eth_source[ETH_ALEN];
	unsigned short eth_proto;
};

/* VIP -> metadata (flags, stats index). */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, MAX_VIPS);
	__type(key, struct vip);
	__type(value, struct vip_meta);
} vip_map SEC(".maps");

/* Consistent-hashing ring: slot -> real server index. */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, CH_RINGS_SIZE);
	__type(key, __u32);
	__type(value, __u32);
} ch_rings SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, MAX_REALS);
	__type(key, __u32);
	__type(value, struct real_definition);
} reals SEC(".maps");

/* Per-CPU packet/byte counters, indexed by vip_num. */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, MAX_VIPS);
	__type(key, __u32);
	__type(value, struct vip_stats);
} stats SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, CTL_MAP_SIZE);
	__type(key, __u32);
	__type(value, struct ctl_value);
} ctl_array SEC(".maps");

/* Flow hash used to pick a slot on the consistent-hashing ring. */
static __noinline __u32 get_packet_hash(struct packet_description *pckt, bool ipv6)
{
	if (ipv6)
		return jhash_2words(jhash(pckt->srcv6, 16, MAX_VIPS),
				    pckt->ports, CH_RINGS_SIZE);
	else
		return jhash_2words(pckt->src, pckt->ports, CH_RINGS_SIZE);
}

/* Resolve the backend for @pckt via ch_rings -> reals.  Returns false on any
 * lookup miss.  The hard-coded hash comparison pins the expected jhash output
 * for the fixed test packets — a test artifact, not load-balancer logic.
 */
static __noinline bool get_packet_dst(struct real_definition **real,
				      struct packet_description *pckt,
				      struct vip_meta *vip_info,
				      bool is_ipv6)
{
	__u32 hash = get_packet_hash(pckt, is_ipv6);
	__u32 key = RING_SIZE * vip_info->vip_num + hash % RING_SIZE;
	__u32 *real_pos;

	if (hash != 0x358459b7 /* jhash of ipv4 packet */  &&
	    hash != 0x2f4bc6bb /* jhash of ipv6 packet */)
		return false;

	real_pos = bpf_map_lookup_elem(&ch_rings, &key);
	if (!real_pos)
		return false;
	key = *real_pos;
	*real = bpf_map_lookup_elem(&reals, &key);
	if (!(*real))
		return false;
	return true;
}

/* Handle ICMPv6 "packet too big": pull the embedded inner IPv6 header and
 * record the flow with src/dst swapped.  @off is passed by value, so the
 * caller's offset is unaffected.  One buffer (ipv6hdr-sized, larger than
 * icmp6hdr) backs both slices.
 */
static __noinline int parse_icmpv6(struct bpf_dynptr *skb_ptr, __u64 off,
				   struct packet_description *pckt)
{
	__u8 buffer[sizeof(struct ipv6hdr)] = {};
	struct icmp6hdr *icmp_hdr;
	struct ipv6hdr *ip6h;

	icmp_hdr = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer));
	if (!icmp_hdr)
		return TC_ACT_SHOT;

	if (icmp_hdr->icmp6_type != ICMPV6_PKT_TOOBIG)
		return TC_ACT_OK;
	off += sizeof(struct icmp6hdr);
	ip6h = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer));
	if (!ip6h)
		return TC_ACT_SHOT;
	pckt->proto = ip6h->nexthdr;
	pckt->flags |= F_ICMP;
	/* Swapped on purpose: classify the original (embedded) flow. */
	memcpy(pckt->srcv6, ip6h->daddr.s6_addr32, 16);
	memcpy(pckt->dstv6, ip6h->saddr.s6_addr32, 16);
	return TC_ACT_UNSPEC;
}

/* IPv4 counterpart: handle ICMP "fragmentation needed" and record the
 * embedded flow with src/dst swapped.
 */
static __noinline int parse_icmp(struct bpf_dynptr *skb_ptr, __u64 off,
				 struct packet_description *pckt)
{
	__u8 buffer_icmp[sizeof(struct iphdr)] = {};
	__u8 buffer_ip[sizeof(struct iphdr)] = {};
	struct icmphdr *icmp_hdr;
	struct iphdr *iph;

	icmp_hdr = bpf_dynptr_slice(skb_ptr, off, buffer_icmp, sizeof(buffer_icmp));
	if (!icmp_hdr)
		return TC_ACT_SHOT;
	if (icmp_hdr->type != ICMP_DEST_UNREACH ||
	    icmp_hdr->code != ICMP_FRAG_NEEDED)
		return TC_ACT_OK;
	off += sizeof(struct icmphdr);
	iph = bpf_dynptr_slice(skb_ptr, off, buffer_ip, sizeof(buffer_ip));
	if (!iph || iph->ihl != 5)
		return TC_ACT_SHOT;
	pckt->proto = iph->protocol;
	pckt->flags |= F_ICMP;
	pckt->src = iph->daddr;
	pckt->dst = iph->saddr;
	return TC_ACT_UNSPEC;
}

/* Extract UDP ports; reversed when the flow came from an ICMP payload. */
static __noinline bool parse_udp(struct bpf_dynptr *skb_ptr, __u64 off,
				 struct packet_description *pckt)
{
	__u8 buffer[sizeof(struct udphdr)] = {};
	struct udphdr *udp;

	udp = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer));
	if (!udp)
		return false;

	if (!(pckt->flags & F_ICMP)) {
		pckt->port16[0] = udp->source;
		pckt->port16[1] = udp->dest;
	} else {
		pckt->port16[0] = udp->dest;
		pckt->port16[1] = udp->source;
	}
	return true;
}

/* Extract TCP ports (reversed for ICMP-embedded flows) and note SYN. */
static __noinline bool parse_tcp(struct bpf_dynptr *skb_ptr, __u64 off,
				 struct packet_description *pckt)
{
	__u8 buffer[sizeof(struct tcphdr)] = {};
	struct tcphdr *tcp;

	tcp = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer));
	if (!tcp)
		return false;

	if (tcp->syn)
		pckt->flags |= F_SYN_SET;

	if (!(pckt->flags & F_ICMP)) {
		pckt->port16[0] = tcp->source;
		pckt->port16[1] = tcp->dest;
	} else {
		pckt->port16[0] = tcp->dest;
		pckt->port16[1] = tcp->source;
	}
	return true;
}

/* Core L4LB path: parse L3/L4 out of the dynptr, look up the VIP, pick a
 * backend via consistent hashing, set the tunnel key and redirect.
 * Returns a TC_ACT_* verdict or the result of bpf_redirect().
 */
static __noinline int process_packet(struct bpf_dynptr *skb_ptr,
				     struct eth_hdr *eth, __u64 off,
				     bool is_ipv6, struct __sk_buff *skb)
{
	struct packet_description pckt = {};
	struct bpf_tunnel_key tkey = {};
	struct vip_stats *data_stats;
	struct real_definition *dst;
	struct vip_meta *vip_info;
	struct ctl_value *cval;
	__u32 v4_intf_pos = 1;
	__u32 v6_intf_pos = 2;
	struct ipv6hdr *ip6h;
	struct vip vip = {};
	struct iphdr *iph;
	int tun_flag = 0;
	__u16 pkt_bytes;
	__u64 iph_len;
	__u32 ifindex;
	__u8 protocol;
	__u32 vip_num;
	int action;

	tkey.tunnel_ttl = 64;
	if (is_ipv6) {
		__u8 buffer[sizeof(struct ipv6hdr)] = {};

		ip6h = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer));
		if (!ip6h)
			return TC_ACT_SHOT;

		iph_len = sizeof(struct ipv6hdr);
		protocol = ip6h->nexthdr;
		pckt.proto = protocol;
		pkt_bytes = bpf_ntohs(ip6h->payload_len);
		off += iph_len;
		if (protocol == IPPROTO_FRAGMENT) {
			/* Fragments carry no L4 ports to hash on. */
			return TC_ACT_SHOT;
		} else if (protocol == IPPROTO_ICMPV6) {
			action = parse_icmpv6(skb_ptr, off, &pckt);
			if (action >= 0)
				return action;
			/* Skip the ICMPv6 header plus the embedded IPv6 header. */
			off += IPV6_PLUS_ICMP_HDR;
		} else {
			memcpy(pckt.srcv6, ip6h->saddr.s6_addr32, 16);
			memcpy(pckt.dstv6, ip6h->daddr.s6_addr32, 16);
		}
	} else {
		__u8 buffer[sizeof(struct iphdr)] = {};

		iph = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer));
		/* IP options are not supported (ihl must be 5). */
		if (!iph || iph->ihl != 5)
			return TC_ACT_SHOT;

		protocol = iph->protocol;
		pckt.proto = protocol;
		pkt_bytes = bpf_ntohs(iph->tot_len);
		off += IPV4_HDR_LEN_NO_OPT;

		if (iph->frag_off & PCKT_FRAGMENTED)
			return TC_ACT_SHOT;
		if (protocol == IPPROTO_ICMP) {
			action = parse_icmp(skb_ptr, off, &pckt);
			if (action >= 0)
				return action;
			off += IPV4_PLUS_ICMP_HDR;
		} else {
			pckt.src = iph->saddr;
			pckt.dst = iph->daddr;
		}
	}
	/* parse_icmp*/parse_icmpv6 may have replaced the protocol. */
	protocol = pckt.proto;

	if (protocol == IPPROTO_TCP) {
		if (!parse_tcp(skb_ptr, off, &pckt))
			return TC_ACT_SHOT;
	} else if (protocol == IPPROTO_UDP) {
		if (!parse_udp(skb_ptr, off, &pckt))
			return TC_ACT_SHOT;
	} else {
		return TC_ACT_SHOT;
	}

	if (is_ipv6)
		memcpy(vip.daddr.v6, pckt.dstv6, 16);
	else
		vip.daddr.v4 = pckt.dst;

	vip.dport = pckt.port16[1];
	vip.protocol = pckt.proto;
	vip_info = bpf_map_lookup_elem(&vip_map, &vip);
	if (!vip_info) {
		/* Fall back to a port-wildcard VIP entry. */
		vip.dport = 0;
		vip_info = bpf_map_lookup_elem(&vip_map, &vip);
		if (!vip_info)
			return TC_ACT_SHOT;
		pckt.port16[1] = 0;
	}

	if (vip_info->flags & F_HASH_NO_SRC_PORT)
		pckt.port16[0] = 0;

	if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6))
		return TC_ACT_SHOT;

	if (dst->flags & F_IPV6) {
		cval = bpf_map_lookup_elem(&ctl_array, &v6_intf_pos);
		if (!cval)
			return TC_ACT_SHOT;
		ifindex = cval->ifindex;
		memcpy(tkey.remote_ipv6, dst->dstv6, 16);
		tun_flag = BPF_F_TUNINFO_IPV6;
	} else {
		cval = bpf_map_lookup_elem(&ctl_array, &v4_intf_pos);
		if (!cval)
			return TC_ACT_SHOT;
		ifindex = cval->ifindex;
		tkey.remote_ipv4 = dst->dst;
	}
	vip_num = vip_info->vip_num;
	data_stats = bpf_map_lookup_elem(&stats, &vip_num);
	if (!data_stats)
		return TC_ACT_SHOT;
	data_stats->pkts++;
	data_stats->bytes += pkt_bytes;
	bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), tun_flag);
	/* Test artifact: stash the chosen backend IPv4 address in the
	 * destination MAC so userspace can assert on it.
	 */
	*(u32 *)eth->eth_dest = tkey.remote_ipv4;
	return bpf_redirect(ifindex, 0);
}

/* TC entry point: slice the Ethernet header, dispatch by EtherType, and
 * write back the (possibly modified) header if the slice fell back to the
 * stack buffer.
 */
SEC("tc")
int balancer_ingress(struct __sk_buff *ctx)
{
	__u8 buffer[sizeof(struct eth_hdr)] = {};
	struct bpf_dynptr ptr;
	struct eth_hdr *eth;
	__u32 eth_proto;
	__u32 nh_off;
	int err;

	nh_off = sizeof(struct eth_hdr);

	bpf_dynptr_from_skb(ctx, 0, &ptr);
	eth = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer));
	if (!eth)
		return TC_ACT_SHOT;
	eth_proto = eth->eth_proto;
	if (eth_proto == bpf_htons(ETH_P_IP))
		err = process_packet(&ptr, eth, nh_off, false, ctx);
	else if (eth_proto == bpf_htons(ETH_P_IPV6))
		err = process_packet(&ptr, eth, nh_off, true, ctx);
	else
		return TC_ACT_SHOT;

	/* process_packet() wrote the backend address into eth->eth_dest;
	 * flush it to the packet if eth points at the local copy.
	 */
	if (eth == buffer)
		bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0);

	return err;
}

char _license[] SEC("license") = "GPL";
+119
tools/testing/selftests/bpf/progs/test_parse_tcp_hdr_opt.c
// SPDX-License-Identifier: GPL-2.0

/* This parsing logic is taken from the open source library katran, a layer 4
 * load balancer.
 *
 * This code logic using dynptrs can be found in test_parse_tcp_hdr_opt_dynptr.c
 *
 * https://github.com/facebookincubator/katran/blob/main/katran/lib/bpf/pckt_parsing.h
 */

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <linux/tcp.h>
#include <stdbool.h>
#include <linux/ipv6.h>
#include <linux/if_ether.h>
#include "test_tcp_hdr_options.h"

char _license[] SEC("license") = "GPL";

/* Kind number used for experiments */
const __u32 tcp_hdr_opt_kind_tpr = 0xFD;
/* Length of the tcp header option */
const __u32 tcp_hdr_opt_len_tpr = 6;
/* maximum number of header options to check to lookup server_id */
const __u32 tcp_hdr_opt_max_opt_checks = 15;

__u32 server_id;

/* Cursor state threaded through parse_hdr_opt() iterations. */
struct hdr_opt_state {
	__u32 server_id;
	__u8 byte_offset;		/* absolute offset of the next option byte */
	__u8 hdr_bytes_remaining;	/* option bytes left in the TCP header */
};

/* Examine one TCP option at state->byte_offset.
 * Returns 1 when the TPR option was found (server_id filled in),
 * 0 to continue with the next option, -1 to stop (EOL, truncation,
 * or malformed length).  Every access is bounds-checked against
 * data_end, as required by the verifier for direct packet access.
 */
static int parse_hdr_opt(const struct xdp_md *xdp, struct hdr_opt_state *state)
{
	const void *data = (void *)(long)xdp->data;
	const void *data_end = (void *)(long)xdp->data_end;
	__u8 *tcp_opt, kind, hdr_len;

	tcp_opt = (__u8 *)(data + state->byte_offset);
	if (tcp_opt + 1 > data_end)
		return -1;

	kind = tcp_opt[0];

	if (kind == TCPOPT_EOL)
		return -1;

	/* NOP is a lone padding byte with no length field. */
	if (kind == TCPOPT_NOP) {
		state->hdr_bytes_remaining--;
		state->byte_offset++;
		return 0;
	}

	/* All other options need at least kind + length bytes. */
	if (state->hdr_bytes_remaining < 2 ||
	    tcp_opt + sizeof(__u8) + sizeof(__u8) > data_end)
		return -1;

	hdr_len = tcp_opt[1];
	if (hdr_len > state->hdr_bytes_remaining)
		return -1;

	if (kind == tcp_hdr_opt_kind_tpr) {
		if (hdr_len != tcp_hdr_opt_len_tpr)
			return -1;

		if (tcp_opt + tcp_hdr_opt_len_tpr > data_end)
			return -1;

		/* Payload: 4-byte server id right after kind and length. */
		state->server_id = *(__u32 *)&tcp_opt[2];
		return 1;
	}

	/* Unknown option: skip over it. */
	state->hdr_bytes_remaining -= hdr_len;
	state->byte_offset += hdr_len;
	return 0;
}

/* XDP entry point: walk the TCP options of an Ethernet/IPv6/TCP packet
 * looking for the TPR option; publishes the found id in the global
 * server_id and returns XDP_PASS, otherwise XDP_DROP.
 * Assumes a fixed eth + ipv6 header layout (no extension headers).
 */
SEC("xdp")
int xdp_ingress_v6(struct xdp_md *xdp)
{
	const void *data = (void *)(long)xdp->data;
	const void *data_end = (void *)(long)xdp->data_end;
	struct hdr_opt_state opt_state = {};
	__u8 tcp_hdr_opt_len = 0;
	struct tcphdr *tcp_hdr;
	__u64 tcp_offset = 0;
	__u32 off;
	int err;

	tcp_offset = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
	tcp_hdr = (struct tcphdr *)(data + tcp_offset);
	if (tcp_hdr + 1 > data_end)
		return XDP_DROP;

	/* doff counts 32-bit words of the whole TCP header. */
	tcp_hdr_opt_len = (tcp_hdr->doff * 4) - sizeof(struct tcphdr);
	if (tcp_hdr_opt_len < tcp_hdr_opt_len_tpr)
		return XDP_DROP;

	opt_state.hdr_bytes_remaining = tcp_hdr_opt_len;
	/* Fits in __u8: 14 (eth) + 40 (ipv6) + 20 (tcp) = 74. */
	opt_state.byte_offset = sizeof(struct tcphdr) + tcp_offset;

	/* max number of bytes of options in tcp header is 40 bytes */
	for (int i = 0; i < tcp_hdr_opt_max_opt_checks; i++) {
		err = parse_hdr_opt(xdp, &opt_state);

		if (err || !opt_state.hdr_bytes_remaining)
			break;
	}

	if (!opt_state.server_id)
		return XDP_DROP;

	server_id = opt_state.server_id;

	return XDP_PASS;
}
+114
tools/testing/selftests/bpf/progs/test_parse_tcp_hdr_opt_dynptr.c
// SPDX-License-Identifier: GPL-2.0

/* This logic is lifted from a real-world use case of packet parsing, used in
 * the open source library katran, a layer 4 load balancer.
 *
 * This test demonstrates how to parse packet contents using dynptrs. The
 * original code (parsing without dynptrs) can be found in test_parse_tcp_hdr_opt.c
 */

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <linux/tcp.h>
#include <stdbool.h>
#include <linux/ipv6.h>
#include <linux/if_ether.h>
#include "test_tcp_hdr_options.h"
#include "bpf_kfuncs.h"

char _license[] SEC("license") = "GPL";

/* Kind number used for experiments */
const __u32 tcp_hdr_opt_kind_tpr = 0xFD;
/* Length of the tcp header option */
const __u32 tcp_hdr_opt_len_tpr = 6;
/* maximum number of header options to check to lookup server_id */
const __u32 tcp_hdr_opt_max_opt_checks = 15;

__u32 server_id;

/* Examine one TCP option at *off via a dynptr slice.
 * Returns 1 when the TPR option was found (*server_id filled in),
 * 0 to continue with the next option, -1 to stop (EOL, short packet,
 * or malformed length).  Bounds checking is done by bpf_dynptr_slice()
 * instead of explicit data_end comparisons.
 *
 * NOTE(review): the slice always requests 6 bytes (kind + len +
 * server_id), so a 1-byte NOP within the last 5 bytes of the packet
 * makes the slice fail where the non-dynptr version would continue —
 * confirm this is acceptable for the test's fixed packets.
 */
static int parse_hdr_opt(struct bpf_dynptr *ptr, __u32 *off, __u8 *hdr_bytes_remaining,
			 __u32 *server_id)
{
	__u8 *tcp_opt, kind, hdr_len;
	__u8 buffer[sizeof(kind) + sizeof(hdr_len) + sizeof(*server_id)];
	__u8 *data;

	__builtin_memset(buffer, 0, sizeof(buffer));

	data = bpf_dynptr_slice(ptr, *off, buffer, sizeof(buffer));
	if (!data)
		return -1;

	kind = data[0];

	if (kind == TCPOPT_EOL)
		return -1;

	/* NOP is a lone padding byte with no length field. */
	if (kind == TCPOPT_NOP) {
		*off += 1;
		*hdr_bytes_remaining -= 1;
		return 0;
	}

	/* All other options need at least kind + length bytes. */
	if (*hdr_bytes_remaining < 2)
		return -1;

	hdr_len = data[1];
	if (hdr_len > *hdr_bytes_remaining)
		return -1;

	if (kind == tcp_hdr_opt_kind_tpr) {
		if (hdr_len != tcp_hdr_opt_len_tpr)
			return -1;

		/* Payload: 4-byte server id right after kind and length. */
		__builtin_memcpy(server_id, (__u32 *)(data + 2), sizeof(*server_id));
		return 1;
	}

	/* Unknown option: skip over it. */
	*off += hdr_len;
	*hdr_bytes_remaining -= hdr_len;
	return 0;
}

/* XDP entry point: walk the TCP options of an Ethernet/IPv6/TCP packet
 * looking for the TPR option; publishes the found id in the global
 * server_id and returns XDP_PASS, otherwise XDP_DROP.
 * Assumes a fixed eth + ipv6 header layout (no extension headers).
 */
SEC("xdp")
int xdp_ingress_v6(struct xdp_md *xdp)
{
	__u8 buffer[sizeof(struct tcphdr)] = {};
	__u8 hdr_bytes_remaining;
	struct tcphdr *tcp_hdr;
	__u8 tcp_hdr_opt_len;
	int err = 0;
	__u32 off;

	struct bpf_dynptr ptr;

	bpf_dynptr_from_xdp(xdp, 0, &ptr);

	off = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);

	tcp_hdr = bpf_dynptr_slice(&ptr, off, buffer, sizeof(buffer));
	if (!tcp_hdr)
		return XDP_DROP;

	/* doff counts 32-bit words of the whole TCP header. */
	tcp_hdr_opt_len = (tcp_hdr->doff * 4) - sizeof(struct tcphdr);
	if (tcp_hdr_opt_len < tcp_hdr_opt_len_tpr)
		return XDP_DROP;

	hdr_bytes_remaining = tcp_hdr_opt_len;

	off += sizeof(struct tcphdr);

	/* max number of bytes of options in tcp header is 40 bytes */
	for (int i = 0; i < tcp_hdr_opt_max_opt_checks; i++) {
		err = parse_hdr_opt(&ptr, &off, &hdr_bytes_remaining, &server_id);

		if (err || !hdr_bytes_remaining)
			break;
	}

	if (!server_id)
		return XDP_DROP;

	return XDP_PASS;
}
+257
tools/testing/selftests/bpf/progs/test_xdp_dynptr.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2022 Meta */ 3 + #include <stddef.h> 4 + #include <string.h> 5 + #include <linux/bpf.h> 6 + #include <linux/if_ether.h> 7 + #include <linux/if_packet.h> 8 + #include <linux/ip.h> 9 + #include <linux/ipv6.h> 10 + #include <linux/in.h> 11 + #include <linux/udp.h> 12 + #include <linux/tcp.h> 13 + #include <linux/pkt_cls.h> 14 + #include <sys/socket.h> 15 + #include <bpf/bpf_helpers.h> 16 + #include <bpf/bpf_endian.h> 17 + #include "test_iptunnel_common.h" 18 + #include "bpf_kfuncs.h" 19 + 20 + const size_t tcphdr_sz = sizeof(struct tcphdr); 21 + const size_t udphdr_sz = sizeof(struct udphdr); 22 + const size_t ethhdr_sz = sizeof(struct ethhdr); 23 + const size_t iphdr_sz = sizeof(struct iphdr); 24 + const size_t ipv6hdr_sz = sizeof(struct ipv6hdr); 25 + 26 + struct { 27 + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 28 + __uint(max_entries, 256); 29 + __type(key, __u32); 30 + __type(value, __u64); 31 + } rxcnt SEC(".maps"); 32 + 33 + struct { 34 + __uint(type, BPF_MAP_TYPE_HASH); 35 + __uint(max_entries, MAX_IPTNL_ENTRIES); 36 + __type(key, struct vip); 37 + __type(value, struct iptnl_info); 38 + } vip2tnl SEC(".maps"); 39 + 40 + static __always_inline void count_tx(__u32 protocol) 41 + { 42 + __u64 *rxcnt_count; 43 + 44 + rxcnt_count = bpf_map_lookup_elem(&rxcnt, &protocol); 45 + if (rxcnt_count) 46 + *rxcnt_count += 1; 47 + } 48 + 49 + static __always_inline int get_dport(void *trans_data, __u8 protocol) 50 + { 51 + struct tcphdr *th; 52 + struct udphdr *uh; 53 + 54 + switch (protocol) { 55 + case IPPROTO_TCP: 56 + th = (struct tcphdr *)trans_data; 57 + return th->dest; 58 + case IPPROTO_UDP: 59 + uh = (struct udphdr *)trans_data; 60 + return uh->dest; 61 + default: 62 + return 0; 63 + } 64 + } 65 + 66 + static __always_inline void set_ethhdr(struct ethhdr *new_eth, 67 + const struct ethhdr *old_eth, 68 + const struct iptnl_info *tnl, 69 + __be16 h_proto) 70 + { 71 + memcpy(new_eth->h_source, 
old_eth->h_dest, sizeof(new_eth->h_source)); 72 + memcpy(new_eth->h_dest, tnl->dmac, sizeof(new_eth->h_dest)); 73 + new_eth->h_proto = h_proto; 74 + } 75 + 76 + static __always_inline int handle_ipv4(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr) 77 + { 78 + __u8 eth_buffer[ethhdr_sz + iphdr_sz + ethhdr_sz]; 79 + __u8 iph_buffer_tcp[iphdr_sz + tcphdr_sz]; 80 + __u8 iph_buffer_udp[iphdr_sz + udphdr_sz]; 81 + struct bpf_dynptr new_xdp_ptr; 82 + struct iptnl_info *tnl; 83 + struct ethhdr *new_eth; 84 + struct ethhdr *old_eth; 85 + __u32 transport_hdr_sz; 86 + struct iphdr *iph; 87 + __u16 *next_iph; 88 + __u16 payload_len; 89 + struct vip vip = {}; 90 + int dport; 91 + __u32 csum = 0; 92 + int i; 93 + 94 + __builtin_memset(eth_buffer, 0, sizeof(eth_buffer)); 95 + __builtin_memset(iph_buffer_tcp, 0, sizeof(iph_buffer_tcp)); 96 + __builtin_memset(iph_buffer_udp, 0, sizeof(iph_buffer_udp)); 97 + 98 + if (ethhdr_sz + iphdr_sz + tcphdr_sz > xdp->data_end - xdp->data) 99 + iph = bpf_dynptr_slice(xdp_ptr, ethhdr_sz, iph_buffer_udp, sizeof(iph_buffer_udp)); 100 + else 101 + iph = bpf_dynptr_slice(xdp_ptr, ethhdr_sz, iph_buffer_tcp, sizeof(iph_buffer_tcp)); 102 + 103 + if (!iph) 104 + return XDP_DROP; 105 + 106 + dport = get_dport(iph + 1, iph->protocol); 107 + if (dport == -1) 108 + return XDP_DROP; 109 + 110 + vip.protocol = iph->protocol; 111 + vip.family = AF_INET; 112 + vip.daddr.v4 = iph->daddr; 113 + vip.dport = dport; 114 + payload_len = bpf_ntohs(iph->tot_len); 115 + 116 + tnl = bpf_map_lookup_elem(&vip2tnl, &vip); 117 + /* It only does v4-in-v4 */ 118 + if (!tnl || tnl->family != AF_INET) 119 + return XDP_PASS; 120 + 121 + if (bpf_xdp_adjust_head(xdp, 0 - (int)iphdr_sz)) 122 + return XDP_DROP; 123 + 124 + bpf_dynptr_from_xdp(xdp, 0, &new_xdp_ptr); 125 + new_eth = bpf_dynptr_slice_rdwr(&new_xdp_ptr, 0, eth_buffer, sizeof(eth_buffer)); 126 + if (!new_eth) 127 + return XDP_DROP; 128 + 129 + iph = (struct iphdr *)(new_eth + 1); 130 + old_eth = (struct ethhdr *)(iph + 
1); 131 + 132 + set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IP)); 133 + 134 + if (new_eth == eth_buffer) 135 + bpf_dynptr_write(&new_xdp_ptr, 0, eth_buffer, sizeof(eth_buffer), 0); 136 + 137 + iph->version = 4; 138 + iph->ihl = iphdr_sz >> 2; 139 + iph->frag_off = 0; 140 + iph->protocol = IPPROTO_IPIP; 141 + iph->check = 0; 142 + iph->tos = 0; 143 + iph->tot_len = bpf_htons(payload_len + iphdr_sz); 144 + iph->daddr = tnl->daddr.v4; 145 + iph->saddr = tnl->saddr.v4; 146 + iph->ttl = 8; 147 + 148 + next_iph = (__u16 *)iph; 149 + for (i = 0; i < iphdr_sz >> 1; i++) 150 + csum += *next_iph++; 151 + 152 + iph->check = ~((csum & 0xffff) + (csum >> 16)); 153 + 154 + count_tx(vip.protocol); 155 + 156 + return XDP_TX; 157 + } 158 + 159 + static __always_inline int handle_ipv6(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr) 160 + { 161 + __u8 eth_buffer[ethhdr_sz + ipv6hdr_sz + ethhdr_sz]; 162 + __u8 ip6h_buffer_tcp[ipv6hdr_sz + tcphdr_sz]; 163 + __u8 ip6h_buffer_udp[ipv6hdr_sz + udphdr_sz]; 164 + struct bpf_dynptr new_xdp_ptr; 165 + struct iptnl_info *tnl; 166 + struct ethhdr *new_eth; 167 + struct ethhdr *old_eth; 168 + __u32 transport_hdr_sz; 169 + struct ipv6hdr *ip6h; 170 + __u16 payload_len; 171 + struct vip vip = {}; 172 + int dport; 173 + 174 + __builtin_memset(eth_buffer, 0, sizeof(eth_buffer)); 175 + __builtin_memset(ip6h_buffer_tcp, 0, sizeof(ip6h_buffer_tcp)); 176 + __builtin_memset(ip6h_buffer_udp, 0, sizeof(ip6h_buffer_udp)); 177 + 178 + if (ethhdr_sz + iphdr_sz + tcphdr_sz > xdp->data_end - xdp->data) 179 + ip6h = bpf_dynptr_slice(xdp_ptr, ethhdr_sz, ip6h_buffer_udp, sizeof(ip6h_buffer_udp)); 180 + else 181 + ip6h = bpf_dynptr_slice(xdp_ptr, ethhdr_sz, ip6h_buffer_tcp, sizeof(ip6h_buffer_tcp)); 182 + 183 + if (!ip6h) 184 + return XDP_DROP; 185 + 186 + dport = get_dport(ip6h + 1, ip6h->nexthdr); 187 + if (dport == -1) 188 + return XDP_DROP; 189 + 190 + vip.protocol = ip6h->nexthdr; 191 + vip.family = AF_INET6; 192 + memcpy(vip.daddr.v6, 
ip6h->daddr.s6_addr32, sizeof(vip.daddr)); 193 + vip.dport = dport; 194 + payload_len = ip6h->payload_len; 195 + 196 + tnl = bpf_map_lookup_elem(&vip2tnl, &vip); 197 + /* It only does v6-in-v6 */ 198 + if (!tnl || tnl->family != AF_INET6) 199 + return XDP_PASS; 200 + 201 + if (bpf_xdp_adjust_head(xdp, 0 - (int)ipv6hdr_sz)) 202 + return XDP_DROP; 203 + 204 + bpf_dynptr_from_xdp(xdp, 0, &new_xdp_ptr); 205 + new_eth = bpf_dynptr_slice_rdwr(&new_xdp_ptr, 0, eth_buffer, sizeof(eth_buffer)); 206 + if (!new_eth) 207 + return XDP_DROP; 208 + 209 + ip6h = (struct ipv6hdr *)(new_eth + 1); 210 + old_eth = (struct ethhdr *)(ip6h + 1); 211 + 212 + set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IPV6)); 213 + 214 + if (new_eth == eth_buffer) 215 + bpf_dynptr_write(&new_xdp_ptr, 0, eth_buffer, sizeof(eth_buffer), 0); 216 + 217 + ip6h->version = 6; 218 + ip6h->priority = 0; 219 + memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl)); 220 + ip6h->payload_len = bpf_htons(bpf_ntohs(payload_len) + ipv6hdr_sz); 221 + ip6h->nexthdr = IPPROTO_IPV6; 222 + ip6h->hop_limit = 8; 223 + memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6)); 224 + memcpy(ip6h->daddr.s6_addr32, tnl->daddr.v6, sizeof(tnl->daddr.v6)); 225 + 226 + count_tx(vip.protocol); 227 + 228 + return XDP_TX; 229 + } 230 + 231 + SEC("xdp") 232 + int _xdp_tx_iptunnel(struct xdp_md *xdp) 233 + { 234 + __u8 buffer[ethhdr_sz]; 235 + struct bpf_dynptr ptr; 236 + struct ethhdr *eth; 237 + __u16 h_proto; 238 + 239 + __builtin_memset(buffer, 0, sizeof(buffer)); 240 + 241 + bpf_dynptr_from_xdp(xdp, 0, &ptr); 242 + eth = bpf_dynptr_slice(&ptr, 0, buffer, sizeof(buffer)); 243 + if (!eth) 244 + return XDP_DROP; 245 + 246 + h_proto = eth->h_proto; 247 + 248 + if (h_proto == bpf_htons(ETH_P_IP)) 249 + return handle_ipv4(xdp, &ptr); 250 + else if (h_proto == bpf_htons(ETH_P_IPV6)) 251 + 252 + return handle_ipv6(xdp, &ptr); 253 + else 254 + return XDP_DROP; 255 + } 256 + 257 + char _license[] SEC("license") = "GPL";
+1
tools/testing/selftests/bpf/test_tcp_hdr_options.h
··· 50 50 51 51 #define TCPOPT_EOL 0 52 52 #define TCPOPT_NOP 1 53 + #define TCPOPT_MSS 2 53 54 #define TCPOPT_WINDOW 3 54 55 #define TCPOPT_EXP 254 55 56