Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'cloudflare-prog'

Lorenz Bauer says:

====================
We've been developing an in-house L4 load balancer based on XDP
and TC for a while. Following Alexei's call for more up-to-date examples of
production BPF in the kernel tree [1], Cloudflare is making this available
under dual GPL-2.0 or BSD 3-clause terms.

The code requires at least v5.3 to function correctly.

1: https://lore.kernel.org/bpf/20200326210719.den5isqxntnoqhmv@ast-mbp/
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>

+1575
+456
tools/testing/selftests/bpf/prog_tests/cls_redirect.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 + // Copyright (c) 2020 Cloudflare 3 + 4 + #define _GNU_SOURCE 5 + 6 + #include <arpa/inet.h> 7 + #include <string.h> 8 + 9 + #include <linux/pkt_cls.h> 10 + 11 + #include <test_progs.h> 12 + 13 + #include "progs/test_cls_redirect.h" 14 + #include "test_cls_redirect.skel.h" 15 + 16 + #define ENCAP_IP INADDR_LOOPBACK 17 + #define ENCAP_PORT (1234) 18 + 19 + struct addr_port { 20 + in_port_t port; 21 + union { 22 + struct in_addr in_addr; 23 + struct in6_addr in6_addr; 24 + }; 25 + }; 26 + 27 + struct tuple { 28 + int family; 29 + struct addr_port src; 30 + struct addr_port dst; 31 + }; 32 + 33 + static int start_server(const struct sockaddr *addr, socklen_t len, int type) 34 + { 35 + int fd = socket(addr->sa_family, type, 0); 36 + if (CHECK_FAIL(fd == -1)) 37 + return -1; 38 + if (CHECK_FAIL(bind(fd, addr, len) == -1)) 39 + goto err; 40 + if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1)) 41 + goto err; 42 + 43 + return fd; 44 + 45 + err: 46 + close(fd); 47 + return -1; 48 + } 49 + 50 + static int connect_to_server(const struct sockaddr *addr, socklen_t len, 51 + int type) 52 + { 53 + int fd = socket(addr->sa_family, type, 0); 54 + if (CHECK_FAIL(fd == -1)) 55 + return -1; 56 + if (CHECK_FAIL(connect(fd, addr, len))) 57 + goto err; 58 + 59 + return fd; 60 + 61 + err: 62 + close(fd); 63 + return -1; 64 + } 65 + 66 + static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap) 67 + { 68 + const struct sockaddr_in6 *in6; 69 + const struct sockaddr_in *in; 70 + 71 + switch (sa->sa_family) { 72 + case AF_INET: 73 + in = (const struct sockaddr_in *)sa; 74 + ap->in_addr = in->sin_addr; 75 + ap->port = in->sin_port; 76 + return true; 77 + 78 + case AF_INET6: 79 + in6 = (const struct sockaddr_in6 *)sa; 80 + ap->in6_addr = in6->sin6_addr; 81 + ap->port = in6->sin6_port; 82 + return true; 83 + 84 + default: 85 + return false; 86 + } 87 + } 88 + 89 + static bool set_up_conn(const struct 
sockaddr *addr, socklen_t len, int type, 90 + int *server, int *conn, struct tuple *tuple) 91 + { 92 + struct sockaddr_storage ss; 93 + socklen_t slen = sizeof(ss); 94 + struct sockaddr *sa = (struct sockaddr *)&ss; 95 + 96 + *server = start_server(addr, len, type); 97 + if (*server < 0) 98 + return false; 99 + 100 + if (CHECK_FAIL(getsockname(*server, sa, &slen))) 101 + goto close_server; 102 + 103 + *conn = connect_to_server(sa, slen, type); 104 + if (*conn < 0) 105 + goto close_server; 106 + 107 + /* We want to simulate packets arriving at conn, so we have to 108 + * swap src and dst. 109 + */ 110 + slen = sizeof(ss); 111 + if (CHECK_FAIL(getsockname(*conn, sa, &slen))) 112 + goto close_conn; 113 + 114 + if (CHECK_FAIL(!fill_addr_port(sa, &tuple->dst))) 115 + goto close_conn; 116 + 117 + slen = sizeof(ss); 118 + if (CHECK_FAIL(getpeername(*conn, sa, &slen))) 119 + goto close_conn; 120 + 121 + if (CHECK_FAIL(!fill_addr_port(sa, &tuple->src))) 122 + goto close_conn; 123 + 124 + tuple->family = ss.ss_family; 125 + return true; 126 + 127 + close_conn: 128 + close(*conn); 129 + *conn = -1; 130 + close_server: 131 + close(*server); 132 + *server = -1; 133 + return false; 134 + } 135 + 136 + static socklen_t prepare_addr(struct sockaddr_storage *addr, int family) 137 + { 138 + struct sockaddr_in *addr4; 139 + struct sockaddr_in6 *addr6; 140 + 141 + switch (family) { 142 + case AF_INET: 143 + addr4 = (struct sockaddr_in *)addr; 144 + memset(addr4, 0, sizeof(*addr4)); 145 + addr4->sin_family = family; 146 + addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 147 + return sizeof(*addr4); 148 + case AF_INET6: 149 + addr6 = (struct sockaddr_in6 *)addr; 150 + memset(addr6, 0, sizeof(*addr6)); 151 + addr6->sin6_family = family; 152 + addr6->sin6_addr = in6addr_loopback; 153 + return sizeof(*addr6); 154 + default: 155 + fprintf(stderr, "Invalid family %d", family); 156 + return 0; 157 + } 158 + } 159 + 160 + static bool was_decapsulated(struct bpf_prog_test_run_attr *tattr) 161 + 
{ 162 + return tattr->data_size_out < tattr->data_size_in; 163 + } 164 + 165 + enum type { 166 + UDP, 167 + TCP, 168 + __NR_KIND, 169 + }; 170 + 171 + enum hops { 172 + NO_HOPS, 173 + ONE_HOP, 174 + }; 175 + 176 + enum flags { 177 + NONE, 178 + SYN, 179 + ACK, 180 + }; 181 + 182 + enum conn { 183 + KNOWN_CONN, 184 + UNKNOWN_CONN, 185 + }; 186 + 187 + enum result { 188 + ACCEPT, 189 + FORWARD, 190 + }; 191 + 192 + struct test_cfg { 193 + enum type type; 194 + enum result result; 195 + enum conn conn; 196 + enum hops hops; 197 + enum flags flags; 198 + }; 199 + 200 + static int test_str(void *buf, size_t len, const struct test_cfg *test, 201 + int family) 202 + { 203 + const char *family_str, *type, *conn, *hops, *result, *flags; 204 + 205 + family_str = "IPv4"; 206 + if (family == AF_INET6) 207 + family_str = "IPv6"; 208 + 209 + type = "TCP"; 210 + if (test->type == UDP) 211 + type = "UDP"; 212 + 213 + conn = "known"; 214 + if (test->conn == UNKNOWN_CONN) 215 + conn = "unknown"; 216 + 217 + hops = "no hops"; 218 + if (test->hops == ONE_HOP) 219 + hops = "one hop"; 220 + 221 + result = "accept"; 222 + if (test->result == FORWARD) 223 + result = "forward"; 224 + 225 + flags = "none"; 226 + if (test->flags == SYN) 227 + flags = "SYN"; 228 + else if (test->flags == ACK) 229 + flags = "ACK"; 230 + 231 + return snprintf(buf, len, "%s %s %s %s (%s, flags: %s)", family_str, 232 + type, result, conn, hops, flags); 233 + } 234 + 235 + static struct test_cfg tests[] = { 236 + { TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, SYN }, 237 + { TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, ACK }, 238 + { TCP, FORWARD, UNKNOWN_CONN, ONE_HOP, ACK }, 239 + { TCP, ACCEPT, KNOWN_CONN, ONE_HOP, ACK }, 240 + { UDP, ACCEPT, UNKNOWN_CONN, NO_HOPS, NONE }, 241 + { UDP, FORWARD, UNKNOWN_CONN, ONE_HOP, NONE }, 242 + { UDP, ACCEPT, KNOWN_CONN, ONE_HOP, NONE }, 243 + }; 244 + 245 + static void encap_init(encap_headers_t *encap, uint8_t hop_count, uint8_t proto) 246 + { 247 + const uint8_t hlen = 248 + (sizeof(struct 
guehdr) / sizeof(uint32_t)) + hop_count; 249 + *encap = (encap_headers_t){ 250 + .eth = { .h_proto = htons(ETH_P_IP) }, 251 + .ip = { 252 + .ihl = 5, 253 + .version = 4, 254 + .ttl = IPDEFTTL, 255 + .protocol = IPPROTO_UDP, 256 + .daddr = htonl(ENCAP_IP) 257 + }, 258 + .udp = { 259 + .dest = htons(ENCAP_PORT), 260 + }, 261 + .gue = { 262 + .hlen = hlen, 263 + .proto_ctype = proto 264 + }, 265 + .unigue = { 266 + .hop_count = hop_count 267 + }, 268 + }; 269 + } 270 + 271 + static size_t build_input(const struct test_cfg *test, void *const buf, 272 + const struct tuple *tuple) 273 + { 274 + in_port_t sport = tuple->src.port; 275 + encap_headers_t encap; 276 + struct iphdr ip; 277 + struct ipv6hdr ipv6; 278 + struct tcphdr tcp; 279 + struct udphdr udp; 280 + struct in_addr next_hop; 281 + uint8_t *p = buf; 282 + int proto; 283 + 284 + proto = IPPROTO_IPIP; 285 + if (tuple->family == AF_INET6) 286 + proto = IPPROTO_IPV6; 287 + 288 + encap_init(&encap, test->hops == ONE_HOP ? 1 : 0, proto); 289 + p = mempcpy(p, &encap, sizeof(encap)); 290 + 291 + if (test->hops == ONE_HOP) { 292 + next_hop = (struct in_addr){ .s_addr = htonl(0x7f000002) }; 293 + p = mempcpy(p, &next_hop, sizeof(next_hop)); 294 + } 295 + 296 + proto = IPPROTO_TCP; 297 + if (test->type == UDP) 298 + proto = IPPROTO_UDP; 299 + 300 + switch (tuple->family) { 301 + case AF_INET: 302 + ip = (struct iphdr){ 303 + .ihl = 5, 304 + .version = 4, 305 + .ttl = IPDEFTTL, 306 + .protocol = proto, 307 + .saddr = tuple->src.in_addr.s_addr, 308 + .daddr = tuple->dst.in_addr.s_addr, 309 + }; 310 + p = mempcpy(p, &ip, sizeof(ip)); 311 + break; 312 + case AF_INET6: 313 + ipv6 = (struct ipv6hdr){ 314 + .version = 6, 315 + .hop_limit = IPDEFTTL, 316 + .nexthdr = proto, 317 + .saddr = tuple->src.in6_addr, 318 + .daddr = tuple->dst.in6_addr, 319 + }; 320 + p = mempcpy(p, &ipv6, sizeof(ipv6)); 321 + break; 322 + default: 323 + return 0; 324 + } 325 + 326 + if (test->conn == UNKNOWN_CONN) 327 + sport--; 328 + 329 + switch 
(test->type) { 330 + case TCP: 331 + tcp = (struct tcphdr){ 332 + .source = sport, 333 + .dest = tuple->dst.port, 334 + }; 335 + if (test->flags == SYN) 336 + tcp.syn = true; 337 + if (test->flags == ACK) 338 + tcp.ack = true; 339 + p = mempcpy(p, &tcp, sizeof(tcp)); 340 + break; 341 + case UDP: 342 + udp = (struct udphdr){ 343 + .source = sport, 344 + .dest = tuple->dst.port, 345 + }; 346 + p = mempcpy(p, &udp, sizeof(udp)); 347 + break; 348 + default: 349 + return 0; 350 + } 351 + 352 + return (void *)p - buf; 353 + } 354 + 355 + static void close_fds(int *fds, int n) 356 + { 357 + int i; 358 + 359 + for (i = 0; i < n; i++) 360 + if (fds[i] > 0) 361 + close(fds[i]); 362 + } 363 + 364 + void test_cls_redirect(void) 365 + { 366 + struct test_cls_redirect *skel = NULL; 367 + struct bpf_prog_test_run_attr tattr = {}; 368 + int families[] = { AF_INET, AF_INET6 }; 369 + struct sockaddr_storage ss; 370 + struct sockaddr *addr; 371 + socklen_t slen; 372 + int i, j, err; 373 + 374 + int servers[__NR_KIND][ARRAY_SIZE(families)] = {}; 375 + int conns[__NR_KIND][ARRAY_SIZE(families)] = {}; 376 + struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)]; 377 + 378 + skel = test_cls_redirect__open(); 379 + if (CHECK_FAIL(!skel)) 380 + return; 381 + 382 + skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP); 383 + skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT); 384 + 385 + if (CHECK_FAIL(test_cls_redirect__load(skel))) 386 + goto cleanup; 387 + 388 + addr = (struct sockaddr *)&ss; 389 + for (i = 0; i < ARRAY_SIZE(families); i++) { 390 + slen = prepare_addr(&ss, families[i]); 391 + if (CHECK_FAIL(!slen)) 392 + goto cleanup; 393 + 394 + if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_DGRAM, 395 + &servers[UDP][i], &conns[UDP][i], 396 + &tuples[UDP][i]))) 397 + goto cleanup; 398 + 399 + if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_STREAM, 400 + &servers[TCP][i], &conns[TCP][i], 401 + &tuples[TCP][i]))) 402 + goto cleanup; 403 + } 404 + 405 + tattr.prog_fd = 
bpf_program__fd(skel->progs.cls_redirect); 406 + for (i = 0; i < ARRAY_SIZE(tests); i++) { 407 + struct test_cfg *test = &tests[i]; 408 + 409 + for (j = 0; j < ARRAY_SIZE(families); j++) { 410 + struct tuple *tuple = &tuples[test->type][j]; 411 + char input[256]; 412 + char tmp[256]; 413 + 414 + test_str(tmp, sizeof(tmp), test, tuple->family); 415 + if (!test__start_subtest(tmp)) 416 + continue; 417 + 418 + tattr.data_out = tmp; 419 + tattr.data_size_out = sizeof(tmp); 420 + 421 + tattr.data_in = input; 422 + tattr.data_size_in = build_input(test, input, tuple); 423 + if (CHECK_FAIL(!tattr.data_size_in)) 424 + continue; 425 + 426 + err = bpf_prog_test_run_xattr(&tattr); 427 + if (CHECK_FAIL(err)) 428 + continue; 429 + 430 + if (tattr.retval != TC_ACT_REDIRECT) { 431 + PRINT_FAIL("expected TC_ACT_REDIRECT, got %d\n", 432 + tattr.retval); 433 + continue; 434 + } 435 + 436 + switch (test->result) { 437 + case ACCEPT: 438 + if (CHECK_FAIL(!was_decapsulated(&tattr))) 439 + continue; 440 + break; 441 + case FORWARD: 442 + if (CHECK_FAIL(was_decapsulated(&tattr))) 443 + continue; 444 + break; 445 + default: 446 + PRINT_FAIL("unknown result %d\n", test->result); 447 + continue; 448 + } 449 + } 450 + } 451 + 452 + cleanup: 453 + test_cls_redirect__destroy(skel); 454 + close_fds((int *)servers, sizeof(servers) / sizeof(servers[0][0])); 455 + close_fds((int *)conns, sizeof(conns) / sizeof(conns[0][0])); 456 + }
+1058
tools/testing/selftests/bpf/progs/test_cls_redirect.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 + // Copyright (c) 2019, 2020 Cloudflare 3 + 4 + #include <stdbool.h> 5 + #include <stddef.h> 6 + #include <stdint.h> 7 + #include <string.h> 8 + 9 + #include <linux/bpf.h> 10 + #include <linux/icmp.h> 11 + #include <linux/icmpv6.h> 12 + #include <linux/if_ether.h> 13 + #include <linux/in.h> 14 + #include <linux/ip.h> 15 + #include <linux/ipv6.h> 16 + #include <linux/pkt_cls.h> 17 + #include <linux/tcp.h> 18 + #include <linux/udp.h> 19 + 20 + #include <bpf/bpf_helpers.h> 21 + #include <bpf/bpf_endian.h> 22 + 23 + #include "test_cls_redirect.h" 24 + 25 + #define offsetofend(TYPE, MEMBER) \ 26 + (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER))) 27 + 28 + #define IP_OFFSET_MASK (0x1FFF) 29 + #define IP_MF (0x2000) 30 + 31 + char _license[] SEC("license") = "Dual BSD/GPL"; 32 + 33 + /** 34 + * Destination port and IP used for UDP encapsulation. 35 + */ 36 + static volatile const __be16 ENCAPSULATION_PORT; 37 + static volatile const __be32 ENCAPSULATION_IP; 38 + 39 + typedef struct { 40 + uint64_t processed_packets_total; 41 + uint64_t l3_protocol_packets_total_ipv4; 42 + uint64_t l3_protocol_packets_total_ipv6; 43 + uint64_t l4_protocol_packets_total_tcp; 44 + uint64_t l4_protocol_packets_total_udp; 45 + uint64_t accepted_packets_total_syn; 46 + uint64_t accepted_packets_total_syn_cookies; 47 + uint64_t accepted_packets_total_last_hop; 48 + uint64_t accepted_packets_total_icmp_echo_request; 49 + uint64_t accepted_packets_total_established; 50 + uint64_t forwarded_packets_total_gue; 51 + uint64_t forwarded_packets_total_gre; 52 + 53 + uint64_t errors_total_unknown_l3_proto; 54 + uint64_t errors_total_unknown_l4_proto; 55 + uint64_t errors_total_malformed_ip; 56 + uint64_t errors_total_fragmented_ip; 57 + uint64_t errors_total_malformed_icmp; 58 + uint64_t errors_total_unwanted_icmp; 59 + uint64_t errors_total_malformed_icmp_pkt_too_big; 60 + uint64_t errors_total_malformed_tcp; 61 + uint64_t 
errors_total_malformed_udp; 62 + uint64_t errors_total_icmp_echo_replies; 63 + uint64_t errors_total_malformed_encapsulation; 64 + uint64_t errors_total_encap_adjust_failed; 65 + uint64_t errors_total_encap_buffer_too_small; 66 + uint64_t errors_total_redirect_loop; 67 + } metrics_t; 68 + 69 + typedef enum { 70 + INVALID = 0, 71 + UNKNOWN, 72 + ECHO_REQUEST, 73 + SYN, 74 + SYN_COOKIE, 75 + ESTABLISHED, 76 + } verdict_t; 77 + 78 + typedef struct { 79 + uint16_t src, dst; 80 + } flow_ports_t; 81 + 82 + _Static_assert( 83 + sizeof(flow_ports_t) != 84 + offsetofend(struct bpf_sock_tuple, ipv4.dport) - 85 + offsetof(struct bpf_sock_tuple, ipv4.sport) - 1, 86 + "flow_ports_t must match sport and dport in struct bpf_sock_tuple"); 87 + _Static_assert( 88 + sizeof(flow_ports_t) != 89 + offsetofend(struct bpf_sock_tuple, ipv6.dport) - 90 + offsetof(struct bpf_sock_tuple, ipv6.sport) - 1, 91 + "flow_ports_t must match sport and dport in struct bpf_sock_tuple"); 92 + 93 + typedef int ret_t; 94 + 95 + /* This is a bit of a hack. We need a return value which allows us to 96 + * indicate that the regular flow of the program should continue, 97 + * while allowing functions to use XDP_PASS and XDP_DROP, etc. 98 + */ 99 + static const ret_t CONTINUE_PROCESSING = -1; 100 + 101 + /* Convenience macro to call functions which return ret_t. 102 + */ 103 + #define MAYBE_RETURN(x) \ 104 + do { \ 105 + ret_t __ret = x; \ 106 + if (__ret != CONTINUE_PROCESSING) \ 107 + return __ret; \ 108 + } while (0) 109 + 110 + /* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes), 111 + * or not aligned if the arch supports efficient unaligned access. 112 + * 113 + * Since the verifier ensures that eBPF packet accesses follow these rules, 114 + * we can tell LLVM to emit code as if we always had a larger alignment. 115 + * It will yell at us if we end up on a platform where this is not valid. 
116 + */ 117 + typedef uint8_t *net_ptr __attribute__((align_value(8))); 118 + 119 + typedef struct buf { 120 + struct __sk_buff *skb; 121 + net_ptr head; 122 + /* NB: tail musn't have alignment other than 1, otherwise 123 + * LLVM will go and eliminate code, e.g. when checking packet lengths. 124 + */ 125 + uint8_t *const tail; 126 + } buf_t; 127 + 128 + static size_t buf_off(const buf_t *buf) 129 + { 130 + /* Clang seems to optimize constructs like 131 + * a - b + c 132 + * if c is known: 133 + * r? = c 134 + * r? -= b 135 + * r? += a 136 + * 137 + * This is a problem if a and b are packet pointers, 138 + * since the verifier allows subtracting two pointers to 139 + * get a scalar, but not a scalar and a pointer. 140 + * 141 + * Use inline asm to break this optimization. 142 + */ 143 + size_t off = (size_t)buf->head; 144 + asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data)); 145 + return off; 146 + } 147 + 148 + static bool buf_copy(buf_t *buf, void *dst, size_t len) 149 + { 150 + if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) { 151 + return false; 152 + } 153 + 154 + buf->head += len; 155 + return true; 156 + } 157 + 158 + static bool buf_skip(buf_t *buf, const size_t len) 159 + { 160 + /* Check whether off + len is valid in the non-linear part. */ 161 + if (buf_off(buf) + len > buf->skb->len) { 162 + return false; 163 + } 164 + 165 + buf->head += len; 166 + return true; 167 + } 168 + 169 + /* Returns a pointer to the start of buf, or NULL if len is 170 + * larger than the remaining data. Consumes len bytes on a successful 171 + * call. 172 + * 173 + * If scratch is not NULL, the function will attempt to load non-linear 174 + * data via bpf_skb_load_bytes. On success, scratch is returned. 175 + */ 176 + static void *buf_assign(buf_t *buf, const size_t len, void *scratch) 177 + { 178 + if (buf->head + len > buf->tail) { 179 + if (scratch == NULL) { 180 + return NULL; 181 + } 182 + 183 + return buf_copy(buf, scratch, len) ? 
scratch : NULL; 184 + } 185 + 186 + void *ptr = buf->head; 187 + buf->head += len; 188 + return ptr; 189 + } 190 + 191 + static bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4) 192 + { 193 + if (ipv4->ihl <= 5) { 194 + return true; 195 + } 196 + 197 + return buf_skip(buf, (ipv4->ihl - 5) * 4); 198 + } 199 + 200 + static bool ipv4_is_fragment(const struct iphdr *ip) 201 + { 202 + uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK); 203 + return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0; 204 + } 205 + 206 + static struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch) 207 + { 208 + struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch); 209 + if (ipv4 == NULL) { 210 + return NULL; 211 + } 212 + 213 + if (ipv4->ihl < 5) { 214 + return NULL; 215 + } 216 + 217 + if (!pkt_skip_ipv4_options(pkt, ipv4)) { 218 + return NULL; 219 + } 220 + 221 + return ipv4; 222 + } 223 + 224 + /* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */ 225 + static bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports) 226 + { 227 + if (!buf_copy(pkt, ports, sizeof(*ports))) { 228 + return false; 229 + } 230 + 231 + /* Ports in the L4 headers are reversed, since we are parsing an ICMP 232 + * payload which is going towards the eyeball. 233 + */ 234 + uint16_t dst = ports->src; 235 + ports->src = ports->dst; 236 + ports->dst = dst; 237 + return true; 238 + } 239 + 240 + static uint16_t pkt_checksum_fold(uint32_t csum) 241 + { 242 + /* The highest reasonable value for an IPv4 header 243 + * checksum requires two folds, so we just do that always. 244 + */ 245 + csum = (csum & 0xffff) + (csum >> 16); 246 + csum = (csum & 0xffff) + (csum >> 16); 247 + return (uint16_t)~csum; 248 + } 249 + 250 + static void pkt_ipv4_checksum(struct iphdr *iph) 251 + { 252 + iph->check = 0; 253 + 254 + /* An IP header without options is 20 bytes. Two of those 255 + * are the checksum, which we always set to zero. 
Hence, 256 + * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7, 257 + * which fits in 32 bit. 258 + */ 259 + _Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes"); 260 + uint32_t acc = 0; 261 + uint16_t *ipw = (uint16_t *)iph; 262 + 263 + #pragma clang loop unroll(full) 264 + for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) { 265 + acc += ipw[i]; 266 + } 267 + 268 + iph->check = pkt_checksum_fold(acc); 269 + } 270 + 271 + static bool pkt_skip_ipv6_extension_headers(buf_t *pkt, 272 + const struct ipv6hdr *ipv6, 273 + uint8_t *upper_proto, 274 + bool *is_fragment) 275 + { 276 + /* We understand five extension headers. 277 + * https://tools.ietf.org/html/rfc8200#section-4.1 states that all 278 + * headers should occur once, except Destination Options, which may 279 + * occur twice. Hence we give up after 6 headers. 280 + */ 281 + struct { 282 + uint8_t next; 283 + uint8_t len; 284 + } exthdr = { 285 + .next = ipv6->nexthdr, 286 + }; 287 + *is_fragment = false; 288 + 289 + #pragma clang loop unroll(full) 290 + for (int i = 0; i < 6; i++) { 291 + switch (exthdr.next) { 292 + case IPPROTO_FRAGMENT: 293 + *is_fragment = true; 294 + /* NB: We don't check that hdrlen == 0 as per spec. */ 295 + /* fallthrough; */ 296 + 297 + case IPPROTO_HOPOPTS: 298 + case IPPROTO_ROUTING: 299 + case IPPROTO_DSTOPTS: 300 + case IPPROTO_MH: 301 + if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) { 302 + return false; 303 + } 304 + 305 + /* hdrlen is in 8-octet units, and excludes the first 8 octets. */ 306 + if (!buf_skip(pkt, 307 + (exthdr.len + 1) * 8 - sizeof(exthdr))) { 308 + return false; 309 + } 310 + 311 + /* Decode next header */ 312 + break; 313 + 314 + default: 315 + /* The next header is not one of the known extension 316 + * headers, treat it as the upper layer header. 317 + * 318 + * This handles IPPROTO_NONE. 
319 + * 320 + * Encapsulating Security Payload (50) and Authentication 321 + * Header (51) also end up here (and will trigger an 322 + * unknown proto error later). They have a custom header 323 + * format and seem too esoteric to care about. 324 + */ 325 + *upper_proto = exthdr.next; 326 + return true; 327 + } 328 + } 329 + 330 + /* We never found an upper layer header. */ 331 + return false; 332 + } 333 + 334 + /* This function has to be inlined, because the verifier otherwise rejects it 335 + * due to returning a pointer to the stack. This is technically correct, since 336 + * scratch is allocated on the stack. However, this usage should be safe since 337 + * it's the callers stack after all. 338 + */ 339 + static inline __attribute__((__always_inline__)) struct ipv6hdr * 340 + pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto, 341 + bool *is_fragment) 342 + { 343 + struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch); 344 + if (ipv6 == NULL) { 345 + return NULL; 346 + } 347 + 348 + if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) { 349 + return NULL; 350 + } 351 + 352 + return ipv6; 353 + } 354 + 355 + /* Global metrics, per CPU 356 + */ 357 + struct bpf_map_def metrics_map SEC("maps") = { 358 + .type = BPF_MAP_TYPE_PERCPU_ARRAY, 359 + .key_size = sizeof(unsigned int), 360 + .value_size = sizeof(metrics_t), 361 + .max_entries = 1, 362 + }; 363 + 364 + static metrics_t *get_global_metrics(void) 365 + { 366 + uint64_t key = 0; 367 + return bpf_map_lookup_elem(&metrics_map, &key); 368 + } 369 + 370 + static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap) 371 + { 372 + const int payload_off = 373 + sizeof(*encap) + 374 + sizeof(struct in_addr) * encap->unigue.hop_count; 375 + int32_t encap_overhead = payload_off - sizeof(struct ethhdr); 376 + 377 + // Changing the ethertype if the encapsulated packet is ipv6 378 + if (encap->gue.proto_ctype == IPPROTO_IPV6) { 379 + encap->eth.h_proto = 
bpf_htons(ETH_P_IPV6); 380 + } 381 + 382 + if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC, 383 + BPF_F_ADJ_ROOM_FIXED_GSO)) { 384 + return TC_ACT_SHOT; 385 + } 386 + 387 + return bpf_redirect(skb->ifindex, BPF_F_INGRESS); 388 + } 389 + 390 + static ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap, 391 + struct in_addr *next_hop, metrics_t *metrics) 392 + { 393 + metrics->forwarded_packets_total_gre++; 394 + 395 + const int payload_off = 396 + sizeof(*encap) + 397 + sizeof(struct in_addr) * encap->unigue.hop_count; 398 + int32_t encap_overhead = 399 + payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr); 400 + int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead; 401 + uint16_t proto = ETH_P_IP; 402 + 403 + /* Loop protection: the inner packet's TTL is decremented as a safeguard 404 + * against any forwarding loop. As the only interesting field is the TTL 405 + * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes 406 + * as they handle the split packets if needed (no need for the data to be 407 + * in the linear section). 
408 + */ 409 + if (encap->gue.proto_ctype == IPPROTO_IPV6) { 410 + proto = ETH_P_IPV6; 411 + uint8_t ttl; 412 + int rc; 413 + 414 + rc = bpf_skb_load_bytes( 415 + skb, payload_off + offsetof(struct ipv6hdr, hop_limit), 416 + &ttl, 1); 417 + if (rc != 0) { 418 + metrics->errors_total_malformed_encapsulation++; 419 + return TC_ACT_SHOT; 420 + } 421 + 422 + if (ttl == 0) { 423 + metrics->errors_total_redirect_loop++; 424 + return TC_ACT_SHOT; 425 + } 426 + 427 + ttl--; 428 + rc = bpf_skb_store_bytes( 429 + skb, payload_off + offsetof(struct ipv6hdr, hop_limit), 430 + &ttl, 1, 0); 431 + if (rc != 0) { 432 + metrics->errors_total_malformed_encapsulation++; 433 + return TC_ACT_SHOT; 434 + } 435 + } else { 436 + uint8_t ttl; 437 + int rc; 438 + 439 + rc = bpf_skb_load_bytes( 440 + skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 441 + 1); 442 + if (rc != 0) { 443 + metrics->errors_total_malformed_encapsulation++; 444 + return TC_ACT_SHOT; 445 + } 446 + 447 + if (ttl == 0) { 448 + metrics->errors_total_redirect_loop++; 449 + return TC_ACT_SHOT; 450 + } 451 + 452 + /* IPv4 also has a checksum to patch. While the TTL is only one byte, 453 + * this function only works for 2 and 4 bytes arguments (the result is 454 + * the same). 
455 + */ 456 + rc = bpf_l3_csum_replace( 457 + skb, payload_off + offsetof(struct iphdr, check), ttl, 458 + ttl - 1, 2); 459 + if (rc != 0) { 460 + metrics->errors_total_malformed_encapsulation++; 461 + return TC_ACT_SHOT; 462 + } 463 + 464 + ttl--; 465 + rc = bpf_skb_store_bytes( 466 + skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1, 467 + 0); 468 + if (rc != 0) { 469 + metrics->errors_total_malformed_encapsulation++; 470 + return TC_ACT_SHOT; 471 + } 472 + } 473 + 474 + if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET, 475 + BPF_F_ADJ_ROOM_FIXED_GSO)) { 476 + metrics->errors_total_encap_adjust_failed++; 477 + return TC_ACT_SHOT; 478 + } 479 + 480 + if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) { 481 + metrics->errors_total_encap_buffer_too_small++; 482 + return TC_ACT_SHOT; 483 + } 484 + 485 + buf_t pkt = { 486 + .skb = skb, 487 + .head = (uint8_t *)(long)skb->data, 488 + .tail = (uint8_t *)(long)skb->data_end, 489 + }; 490 + 491 + encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL); 492 + if (encap_gre == NULL) { 493 + metrics->errors_total_encap_buffer_too_small++; 494 + return TC_ACT_SHOT; 495 + } 496 + 497 + encap_gre->ip.protocol = IPPROTO_GRE; 498 + encap_gre->ip.daddr = next_hop->s_addr; 499 + encap_gre->ip.saddr = ENCAPSULATION_IP; 500 + encap_gre->ip.tot_len = 501 + bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta); 502 + encap_gre->gre.flags = 0; 503 + encap_gre->gre.protocol = bpf_htons(proto); 504 + pkt_ipv4_checksum((void *)&encap_gre->ip); 505 + 506 + return bpf_redirect(skb->ifindex, 0); 507 + } 508 + 509 + static ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap, 510 + struct in_addr *next_hop, metrics_t *metrics) 511 + { 512 + /* swap L2 addresses */ 513 + /* This assumes that packets are received from a router. 514 + * So just swapping the MAC addresses here will make the packet go back to 515 + * the router, which will send it to the appropriate machine. 
516 + */ 517 + unsigned char temp[ETH_ALEN]; 518 + memcpy(temp, encap->eth.h_dest, sizeof(temp)); 519 + memcpy(encap->eth.h_dest, encap->eth.h_source, 520 + sizeof(encap->eth.h_dest)); 521 + memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source)); 522 + 523 + if (encap->unigue.next_hop == encap->unigue.hop_count - 1 && 524 + encap->unigue.last_hop_gre) { 525 + return forward_with_gre(skb, encap, next_hop, metrics); 526 + } 527 + 528 + metrics->forwarded_packets_total_gue++; 529 + uint32_t old_saddr = encap->ip.saddr; 530 + encap->ip.saddr = encap->ip.daddr; 531 + encap->ip.daddr = next_hop->s_addr; 532 + if (encap->unigue.next_hop < encap->unigue.hop_count) { 533 + encap->unigue.next_hop++; 534 + } 535 + 536 + /* Remove ip->saddr, add next_hop->s_addr */ 537 + const uint64_t off = offsetof(typeof(*encap), ip.check); 538 + int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4); 539 + if (ret < 0) { 540 + return TC_ACT_SHOT; 541 + } 542 + 543 + return bpf_redirect(skb->ifindex, 0); 544 + } 545 + 546 + static ret_t skip_next_hops(buf_t *pkt, int n) 547 + { 548 + switch (n) { 549 + case 1: 550 + if (!buf_skip(pkt, sizeof(struct in_addr))) 551 + return TC_ACT_SHOT; 552 + case 0: 553 + return CONTINUE_PROCESSING; 554 + 555 + default: 556 + return TC_ACT_SHOT; 557 + } 558 + } 559 + 560 + /* Get the next hop from the GLB header. 561 + * 562 + * Sets next_hop->s_addr to 0 if there are no more hops left. 563 + * pkt is positioned just after the variable length GLB header 564 + * iff the call is successful. 565 + */ 566 + static ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap, 567 + struct in_addr *next_hop) 568 + { 569 + if (encap->unigue.next_hop > encap->unigue.hop_count) { 570 + return TC_ACT_SHOT; 571 + } 572 + 573 + /* Skip "used" next hops. */ 574 + MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop)); 575 + 576 + if (encap->unigue.next_hop == encap->unigue.hop_count) { 577 + /* No more next hops, we are at the end of the GLB header. 
*/ 578 + next_hop->s_addr = 0; 579 + return CONTINUE_PROCESSING; 580 + } 581 + 582 + if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) { 583 + return TC_ACT_SHOT; 584 + } 585 + 586 + /* Skip the remainig next hops (may be zero). */ 587 + return skip_next_hops(pkt, encap->unigue.hop_count - 588 + encap->unigue.next_hop - 1); 589 + } 590 + 591 + /* Fill a bpf_sock_tuple to be used with the socket lookup functions. 592 + * This is a kludge that let's us work around verifier limitations: 593 + * 594 + * fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321) 595 + * 596 + * clang will substitue a costant for sizeof, which allows the verifier 597 + * to track it's value. Based on this, it can figure out the constant 598 + * return value, and calling code works while still being "generic" to 599 + * IPv4 and IPv6. 600 + */ 601 + static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph, 602 + uint64_t iphlen, uint16_t sport, uint16_t dport) 603 + { 604 + switch (iphlen) { 605 + case sizeof(struct iphdr): { 606 + struct iphdr *ipv4 = (struct iphdr *)iph; 607 + tuple->ipv4.daddr = ipv4->daddr; 608 + tuple->ipv4.saddr = ipv4->saddr; 609 + tuple->ipv4.sport = sport; 610 + tuple->ipv4.dport = dport; 611 + return sizeof(tuple->ipv4); 612 + } 613 + 614 + case sizeof(struct ipv6hdr): { 615 + struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph; 616 + memcpy(&tuple->ipv6.daddr, &ipv6->daddr, 617 + sizeof(tuple->ipv6.daddr)); 618 + memcpy(&tuple->ipv6.saddr, &ipv6->saddr, 619 + sizeof(tuple->ipv6.saddr)); 620 + tuple->ipv6.sport = sport; 621 + tuple->ipv6.dport = dport; 622 + return sizeof(tuple->ipv6); 623 + } 624 + 625 + default: 626 + return 0; 627 + } 628 + } 629 + 630 + static verdict_t classify_tcp(struct __sk_buff *skb, 631 + struct bpf_sock_tuple *tuple, uint64_t tuplen, 632 + void *iph, struct tcphdr *tcp) 633 + { 634 + struct bpf_sock *sk = 635 + bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); 636 + if (sk == NULL) { 637 + return UNKNOWN; 638 + } 639 + 640 + 
if (sk->state != BPF_TCP_LISTEN) { 641 + bpf_sk_release(sk); 642 + return ESTABLISHED; 643 + } 644 + 645 + if (iph != NULL && tcp != NULL) { 646 + /* Kludge: we've run out of arguments, but need the length of the ip header. */ 647 + uint64_t iphlen = sizeof(struct iphdr); 648 + if (tuplen == sizeof(tuple->ipv6)) { 649 + iphlen = sizeof(struct ipv6hdr); 650 + } 651 + 652 + if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp, 653 + sizeof(*tcp)) == 0) { 654 + bpf_sk_release(sk); 655 + return SYN_COOKIE; 656 + } 657 + } 658 + 659 + bpf_sk_release(sk); 660 + return UNKNOWN; 661 + } 662 + 663 + static verdict_t classify_udp(struct __sk_buff *skb, 664 + struct bpf_sock_tuple *tuple, uint64_t tuplen) 665 + { 666 + struct bpf_sock *sk = 667 + bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); 668 + if (sk == NULL) { 669 + return UNKNOWN; 670 + } 671 + 672 + if (sk->state == BPF_TCP_ESTABLISHED) { 673 + bpf_sk_release(sk); 674 + return ESTABLISHED; 675 + } 676 + 677 + bpf_sk_release(sk); 678 + return UNKNOWN; 679 + } 680 + 681 + static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, 682 + struct bpf_sock_tuple *tuple, uint64_t tuplen, 683 + metrics_t *metrics) 684 + { 685 + switch (proto) { 686 + case IPPROTO_TCP: 687 + return classify_tcp(skb, tuple, tuplen, NULL, NULL); 688 + 689 + case IPPROTO_UDP: 690 + return classify_udp(skb, tuple, tuplen); 691 + 692 + default: 693 + metrics->errors_total_malformed_icmp++; 694 + return INVALID; 695 + } 696 + } 697 + 698 + static verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics) 699 + { 700 + struct icmphdr icmp; 701 + if (!buf_copy(pkt, &icmp, sizeof(icmp))) { 702 + metrics->errors_total_malformed_icmp++; 703 + return INVALID; 704 + } 705 + 706 + /* We should never receive encapsulated echo replies. 
*/ 707 + if (icmp.type == ICMP_ECHOREPLY) { 708 + metrics->errors_total_icmp_echo_replies++; 709 + return INVALID; 710 + } 711 + 712 + if (icmp.type == ICMP_ECHO) { 713 + return ECHO_REQUEST; 714 + } 715 + 716 + if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) { 717 + metrics->errors_total_unwanted_icmp++; 718 + return INVALID; 719 + } 720 + 721 + struct iphdr _ip4; 722 + const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4); 723 + if (ipv4 == NULL) { 724 + metrics->errors_total_malformed_icmp_pkt_too_big++; 725 + return INVALID; 726 + } 727 + 728 + /* The source address in the outer IP header is from the entity that 729 + * originated the ICMP message. Use the original IP header to restore 730 + * the correct flow tuple. 731 + */ 732 + struct bpf_sock_tuple tuple; 733 + tuple.ipv4.saddr = ipv4->daddr; 734 + tuple.ipv4.daddr = ipv4->saddr; 735 + 736 + if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) { 737 + metrics->errors_total_malformed_icmp_pkt_too_big++; 738 + return INVALID; 739 + } 740 + 741 + return classify_icmp(pkt->skb, ipv4->protocol, &tuple, 742 + sizeof(tuple.ipv4), metrics); 743 + } 744 + 745 + static verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics) 746 + { 747 + struct icmp6hdr icmp6; 748 + if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) { 749 + metrics->errors_total_malformed_icmp++; 750 + return INVALID; 751 + } 752 + 753 + /* We should never receive encapsulated echo replies. 
*/ 754 + if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) { 755 + metrics->errors_total_icmp_echo_replies++; 756 + return INVALID; 757 + } 758 + 759 + if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) { 760 + return ECHO_REQUEST; 761 + } 762 + 763 + if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) { 764 + metrics->errors_total_unwanted_icmp++; 765 + return INVALID; 766 + } 767 + 768 + bool is_fragment; 769 + uint8_t l4_proto; 770 + struct ipv6hdr _ipv6; 771 + const struct ipv6hdr *ipv6 = 772 + pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment); 773 + if (ipv6 == NULL) { 774 + metrics->errors_total_malformed_icmp_pkt_too_big++; 775 + return INVALID; 776 + } 777 + 778 + if (is_fragment) { 779 + metrics->errors_total_fragmented_ip++; 780 + return INVALID; 781 + } 782 + 783 + /* Swap source and dest addresses. */ 784 + struct bpf_sock_tuple tuple; 785 + memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr)); 786 + memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr)); 787 + 788 + if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) { 789 + metrics->errors_total_malformed_icmp_pkt_too_big++; 790 + return INVALID; 791 + } 792 + 793 + return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6), 794 + metrics); 795 + } 796 + 797 + static verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen, 798 + metrics_t *metrics) 799 + { 800 + metrics->l4_protocol_packets_total_tcp++; 801 + 802 + struct tcphdr _tcp; 803 + struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp); 804 + if (tcp == NULL) { 805 + metrics->errors_total_malformed_tcp++; 806 + return INVALID; 807 + } 808 + 809 + if (tcp->syn) { 810 + return SYN; 811 + } 812 + 813 + struct bpf_sock_tuple tuple; 814 + uint64_t tuplen = 815 + fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest); 816 + return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp); 817 + } 818 + 819 + static verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen, 820 + metrics_t *metrics) 821 + { 822 + 
metrics->l4_protocol_packets_total_udp++; 823 + 824 + struct udphdr _udp; 825 + struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp); 826 + if (udph == NULL) { 827 + metrics->errors_total_malformed_udp++; 828 + return INVALID; 829 + } 830 + 831 + struct bpf_sock_tuple tuple; 832 + uint64_t tuplen = 833 + fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest); 834 + return classify_udp(pkt->skb, &tuple, tuplen); 835 + } 836 + 837 + static verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics) 838 + { 839 + metrics->l3_protocol_packets_total_ipv4++; 840 + 841 + struct iphdr _ip4; 842 + struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4); 843 + if (ipv4 == NULL) { 844 + metrics->errors_total_malformed_ip++; 845 + return INVALID; 846 + } 847 + 848 + if (ipv4->version != 4) { 849 + metrics->errors_total_malformed_ip++; 850 + return INVALID; 851 + } 852 + 853 + if (ipv4_is_fragment(ipv4)) { 854 + metrics->errors_total_fragmented_ip++; 855 + return INVALID; 856 + } 857 + 858 + switch (ipv4->protocol) { 859 + case IPPROTO_ICMP: 860 + return process_icmpv4(pkt, metrics); 861 + 862 + case IPPROTO_TCP: 863 + return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics); 864 + 865 + case IPPROTO_UDP: 866 + return process_udp(pkt, ipv4, sizeof(*ipv4), metrics); 867 + 868 + default: 869 + metrics->errors_total_unknown_l4_proto++; 870 + return INVALID; 871 + } 872 + } 873 + 874 + static verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics) 875 + { 876 + metrics->l3_protocol_packets_total_ipv6++; 877 + 878 + uint8_t l4_proto; 879 + bool is_fragment; 880 + struct ipv6hdr _ipv6; 881 + struct ipv6hdr *ipv6 = 882 + pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment); 883 + if (ipv6 == NULL) { 884 + metrics->errors_total_malformed_ip++; 885 + return INVALID; 886 + } 887 + 888 + if (ipv6->version != 6) { 889 + metrics->errors_total_malformed_ip++; 890 + return INVALID; 891 + } 892 + 893 + if (is_fragment) { 894 + metrics->errors_total_fragmented_ip++; 895 + return INVALID; 896 + } 897 + 
898 + switch (l4_proto) { 899 + case IPPROTO_ICMPV6: 900 + return process_icmpv6(pkt, metrics); 901 + 902 + case IPPROTO_TCP: 903 + return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics); 904 + 905 + case IPPROTO_UDP: 906 + return process_udp(pkt, ipv6, sizeof(*ipv6), metrics); 907 + 908 + default: 909 + metrics->errors_total_unknown_l4_proto++; 910 + return INVALID; 911 + } 912 + } 913 + 914 + SEC("classifier/cls_redirect") 915 + int cls_redirect(struct __sk_buff *skb) 916 + { 917 + metrics_t *metrics = get_global_metrics(); 918 + if (metrics == NULL) { 919 + return TC_ACT_SHOT; 920 + } 921 + 922 + metrics->processed_packets_total++; 923 + 924 + /* Pass bogus packets as long as we're not sure they're 925 + * destined for us. 926 + */ 927 + if (skb->protocol != bpf_htons(ETH_P_IP)) { 928 + return TC_ACT_OK; 929 + } 930 + 931 + encap_headers_t *encap; 932 + 933 + /* Make sure that all encapsulation headers are available in 934 + * the linear portion of the skb. This makes it easy to manipulate them. 935 + */ 936 + if (bpf_skb_pull_data(skb, sizeof(*encap))) { 937 + return TC_ACT_OK; 938 + } 939 + 940 + buf_t pkt = { 941 + .skb = skb, 942 + .head = (uint8_t *)(long)skb->data, 943 + .tail = (uint8_t *)(long)skb->data_end, 944 + }; 945 + 946 + encap = buf_assign(&pkt, sizeof(*encap), NULL); 947 + if (encap == NULL) { 948 + return TC_ACT_OK; 949 + } 950 + 951 + if (encap->ip.ihl != 5) { 952 + /* We never have any options. */ 953 + return TC_ACT_OK; 954 + } 955 + 956 + if (encap->ip.daddr != ENCAPSULATION_IP || 957 + encap->ip.protocol != IPPROTO_UDP) { 958 + return TC_ACT_OK; 959 + } 960 + 961 + /* TODO Check UDP length? */ 962 + if (encap->udp.dest != ENCAPSULATION_PORT) { 963 + return TC_ACT_OK; 964 + } 965 + 966 + /* We now know that the packet is destined to us, we can 967 + * drop bogus ones. 
968 + */ 969 + if (ipv4_is_fragment((void *)&encap->ip)) { 970 + metrics->errors_total_fragmented_ip++; 971 + return TC_ACT_SHOT; 972 + } 973 + 974 + if (encap->gue.variant != 0) { 975 + metrics->errors_total_malformed_encapsulation++; 976 + return TC_ACT_SHOT; 977 + } 978 + 979 + if (encap->gue.control != 0) { 980 + metrics->errors_total_malformed_encapsulation++; 981 + return TC_ACT_SHOT; 982 + } 983 + 984 + if (encap->gue.flags != 0) { 985 + metrics->errors_total_malformed_encapsulation++; 986 + return TC_ACT_SHOT; 987 + } 988 + 989 + if (encap->gue.hlen != 990 + sizeof(encap->unigue) / 4 + encap->unigue.hop_count) { 991 + metrics->errors_total_malformed_encapsulation++; 992 + return TC_ACT_SHOT; 993 + } 994 + 995 + if (encap->unigue.version != 0) { 996 + metrics->errors_total_malformed_encapsulation++; 997 + return TC_ACT_SHOT; 998 + } 999 + 1000 + if (encap->unigue.reserved != 0) { 1001 + return TC_ACT_SHOT; 1002 + } 1003 + 1004 + struct in_addr next_hop; 1005 + MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop)); 1006 + 1007 + if (next_hop.s_addr == 0) { 1008 + metrics->accepted_packets_total_last_hop++; 1009 + return accept_locally(skb, encap); 1010 + } 1011 + 1012 + verdict_t verdict; 1013 + switch (encap->gue.proto_ctype) { 1014 + case IPPROTO_IPIP: 1015 + verdict = process_ipv4(&pkt, metrics); 1016 + break; 1017 + 1018 + case IPPROTO_IPV6: 1019 + verdict = process_ipv6(&pkt, metrics); 1020 + break; 1021 + 1022 + default: 1023 + metrics->errors_total_unknown_l3_proto++; 1024 + return TC_ACT_SHOT; 1025 + } 1026 + 1027 + switch (verdict) { 1028 + case INVALID: 1029 + /* metrics have already been bumped */ 1030 + return TC_ACT_SHOT; 1031 + 1032 + case UNKNOWN: 1033 + return forward_to_next_hop(skb, encap, &next_hop, metrics); 1034 + 1035 + case ECHO_REQUEST: 1036 + metrics->accepted_packets_total_icmp_echo_request++; 1037 + break; 1038 + 1039 + case SYN: 1040 + if (encap->unigue.forward_syn) { 1041 + return forward_to_next_hop(skb, encap, &next_hop, 1042 + 
metrics); 1043 + } 1044 + 1045 + metrics->accepted_packets_total_syn++; 1046 + break; 1047 + 1048 + case SYN_COOKIE: 1049 + metrics->accepted_packets_total_syn_cookies++; 1050 + break; 1051 + 1052 + case ESTABLISHED: 1053 + metrics->accepted_packets_total_established++; 1054 + break; 1055 + } 1056 + 1057 + return accept_locally(skb, encap); 1058 + }
+54
tools/testing/selftests/bpf/progs/test_cls_redirect.h
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
/* Copyright 2019, 2020 Cloudflare */

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/udp.h>

/* Minimal GRE header: only the flag bits and the protocol (EtherType)
 * of the encapsulated payload.
 */
struct gre_base_hdr {
	uint16_t flags;
	uint16_t protocol;
} __attribute__((packed));

/* GUE (Generic UDP Encapsulation) header. The bit-field declaration
 * order flips with host endianness so the on-wire layout is identical
 * on both.
 */
struct guehdr {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	uint8_t hlen : 5, control : 1, variant : 2;
#else
	uint8_t variant : 2, control : 1, hlen : 5;
#endif
	uint8_t proto_ctype;
	uint16_t flags;
};

/* Private GUE extension carrying the routing state: next_hop indexes
 * into a list of hop_count next-hop addresses that immediately follow
 * this struct on the wire.
 */
struct unigue {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	uint8_t _r : 2, last_hop_gre : 1, forward_syn : 1, version : 4;
#else
	uint8_t version : 4, forward_syn : 1, last_hop_gre : 1, _r : 2;
#endif
	uint8_t reserved;
	uint8_t next_hop;
	uint8_t hop_count;
	// Next hops go here
} __attribute__((packed));

/* Full GRE encapsulation: Ethernet + outer IPv4 + GRE. */
typedef struct {
	struct ethhdr eth;
	struct iphdr ip;
	struct gre_base_hdr gre;
} __attribute__((packed)) encap_gre_t;

/* Full GUE encapsulation: Ethernet + outer IPv4 + UDP + GUE + unigue. */
typedef struct {
	struct ethhdr eth;
	struct iphdr ip;
	struct udphdr udp;
	struct guehdr gue;
	struct unigue unigue;
} __attribute__((packed)) encap_headers_t;
+7
tools/testing/selftests/bpf/test_progs.h
} __packed;
extern struct ipv6_packet pkt_v6;

/* Mark the current test as failed and print a printf-style message to
 * stdout, prefixed with the calling function name and line number.
 */
#define PRINT_FAIL(format...)						\
	({								\
		test__fail();						\
		fprintf(stdout, "%s:FAIL:%d ", __func__, __LINE__);	\
		fprintf(stdout, ##format);				\
	})

#define _CHECK(condition, tag, duration, format...) ({ \
	int __ret = !!(condition); \
	int __save_errno = errno; \