Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ipip,ip_tunnel,sit: Add FOU support for externally controlled ipip devices

Today, ipip devices in collect-metadata mode cannot send FOU- or
GUE-encapsulated packets. This patch lifts the restriction by adding
a struct ip_tunnel_encap to the tunnel metadata.

On the egress path, the members of this struct can be set by the
bpf_skb_set_fou_encap kfunc via a BPF tc-hook. Instead of dropping packets
that request additional UDP encapsulation, ip_md_tunnel_xmit now
evaluates the contents of this struct and adds the corresponding FOU or
GUE header. Furthermore, it makes sure that the additional header bytes
are taken into account for PMTU discovery.

On the ingress path, an ipip device in collect-metadata mode will fill this
struct and a BPF tc-hook can obtain the information via a call to the
bpf_skb_get_fou_encap kfunc.

The minor change to ip_tunnel_encap, which now takes a pointer to
struct ip_tunnel_encap instead of struct ip_tunnel, allows us to control
the FOU encap type and parameters on a per-packet level.

Signed-off-by: Christian Ehrig <cehrig@cloudflare.com>
Link: https://lore.kernel.org/r/cfea47de655d0f870248abf725932f851b53960a.1680874078.git.cehrig@cloudflare.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Christian Ehrig and committed by
Alexei Starovoitov
ac931d4c ed17aa92

+37 -16
+15 -13
include/net/ip_tunnels.h
··· 57 57 __u8 flow_flags; 58 58 }; 59 59 60 + struct ip_tunnel_encap { 61 + u16 type; 62 + u16 flags; 63 + __be16 sport; 64 + __be16 dport; 65 + }; 66 + 60 67 /* Flags for ip_tunnel_info mode. */ 61 68 #define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ 62 69 #define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */ ··· 73 66 #define IP_TUNNEL_OPTS_MAX \ 74 67 GENMASK((sizeof_field(struct ip_tunnel_info, \ 75 68 options_len) * BITS_PER_BYTE) - 1, 0) 76 - 77 69 struct ip_tunnel_info { 78 70 struct ip_tunnel_key key; 71 + struct ip_tunnel_encap encap; 79 72 #ifdef CONFIG_DST_CACHE 80 73 struct dst_cache dst_cache; 81 74 #endif ··· 92 85 u16 relay_prefixlen; 93 86 }; 94 87 #endif 95 - 96 - struct ip_tunnel_encap { 97 - u16 type; 98 - u16 flags; 99 - __be16 sport; 100 - __be16 dport; 101 - }; 102 88 103 89 struct ip_tunnel_prl_entry { 104 90 struct ip_tunnel_prl_entry __rcu *next; ··· 293 293 __be32 remote, __be32 local, 294 294 __be32 key); 295 295 296 + void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info); 296 297 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, 297 298 const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, 298 299 bool log_ecn_error); ··· 372 371 return hlen; 373 372 } 374 373 375 - static inline int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, 374 + static inline int ip_tunnel_encap(struct sk_buff *skb, 375 + struct ip_tunnel_encap *e, 376 376 u8 *protocol, struct flowi4 *fl4) 377 377 { 378 378 const struct ip_tunnel_encap_ops *ops; 379 379 int ret = -EINVAL; 380 380 381 - if (t->encap.type == TUNNEL_ENCAP_NONE) 381 + if (e->type == TUNNEL_ENCAP_NONE) 382 382 return 0; 383 383 384 - if (t->encap.type >= MAX_IPTUN_ENCAP_OPS) 384 + if (e->type >= MAX_IPTUN_ENCAP_OPS) 385 385 return -EINVAL; 386 386 387 387 rcu_read_lock(); 388 - ops = rcu_dereference(iptun_encaps[t->encap.type]); 388 + ops = rcu_dereference(iptun_encaps[e->type]); 389 389 if (likely(ops && 
ops->build_header)) 390 - ret = ops->build_header(skb, &t->encap, protocol, fl4); 390 + ret = ops->build_header(skb, e, protocol, fl4); 391 391 rcu_read_unlock(); 392 392 393 393 return ret;
+20 -2
net/ipv4/ip_tunnel.c
··· 359 359 return ERR_PTR(err); 360 360 } 361 361 362 + void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info) 363 + { 364 + const struct iphdr *iph = ip_hdr(skb); 365 + const struct udphdr *udph; 366 + 367 + if (iph->protocol != IPPROTO_UDP) 368 + return; 369 + 370 + udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2)); 371 + info->encap.sport = udph->source; 372 + info->encap.dport = udph->dest; 373 + } 374 + EXPORT_SYMBOL(ip_tunnel_md_udp_encap); 375 + 362 376 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, 363 377 const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, 364 378 bool log_ecn_error) ··· 586 572 tunnel_id_to_key32(key->tun_id), RT_TOS(tos), 587 573 dev_net(dev), 0, skb->mark, skb_get_hash(skb), 588 574 key->flow_flags); 589 - if (tunnel->encap.type != TUNNEL_ENCAP_NONE) 575 + 576 + if (!tunnel_hlen) 577 + tunnel_hlen = ip_encap_hlen(&tun_info->encap); 578 + 579 + if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0) 590 580 goto tx_error; 591 581 592 582 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); ··· 750 732 dev_net(dev), tunnel->parms.link, 751 733 tunnel->fwmark, skb_get_hash(skb), 0); 752 734 753 - if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) 735 + if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) 754 736 goto tx_error; 755 737 756 738 if (connected && md) {
+1
net/ipv4/ipip.c
··· 241 241 tun_dst = ip_tun_rx_dst(skb, 0, 0, 0); 242 242 if (!tun_dst) 243 243 return 0; 244 + ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info); 244 245 } 245 246 skb_reset_mac_header(skb); 246 247
+1 -1
net/ipv6/sit.c
··· 1024 1024 ttl = iph6->hop_limit; 1025 1025 tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); 1026 1026 1027 - if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) { 1027 + if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) { 1028 1028 ip_rt_put(rt); 1029 1029 goto tx_error; 1030 1030 }