Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ip_tunnel: add collect_md mode to IPIP tunnel

Similar to gre, vxlan and geneve tunnels, allow IPIP tunnels to
operate in 'collect metadata' mode.
The bpf_skb_[gs]et_tunnel_key() helpers can make use of it right away.
OVS can use it as well in the future (once appropriate ovs-vport
abstractions and user APIs are added).
Note that, just like in other tunnels, we cannot cache the dst,
since tunnel_info metadata can be different for every packet.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Alexei Starovoitov and committed by David S. Miller
cfc7381b eb94737d

+108 -6
+2
include/net/ip_tunnels.h
··· 255 255 256 256 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, 257 257 const struct iphdr *tnl_params, const u8 protocol); 258 + void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, 259 + const u8 proto); 258 260 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); 259 261 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); 260 262 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu);
+1
include/uapi/linux/if_tunnel.h
··· 73 73 IFLA_IPTUN_ENCAP_FLAGS, 74 74 IFLA_IPTUN_ENCAP_SPORT, 75 75 IFLA_IPTUN_ENCAP_DPORT, 76 + IFLA_IPTUN_COLLECT_METADATA, 76 77 __IFLA_IPTUN_MAX, 77 78 }; 78 79 #define IFLA_IPTUN_MAX (__IFLA_IPTUN_MAX - 1)
+76
net/ipv4/ip_tunnel.c
··· 55 55 #include <net/netns/generic.h> 56 56 #include <net/rtnetlink.h> 57 57 #include <net/udp.h> 58 + #include <net/dst_metadata.h> 58 59 59 60 #if IS_ENABLED(CONFIG_IPV6) 60 61 #include <net/ipv6.h> ··· 546 545 #endif 547 546 return 0; 548 547 } 548 + 549 + void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) 550 + { 551 + struct ip_tunnel *tunnel = netdev_priv(dev); 552 + u32 headroom = sizeof(struct iphdr); 553 + struct ip_tunnel_info *tun_info; 554 + const struct ip_tunnel_key *key; 555 + const struct iphdr *inner_iph; 556 + struct rtable *rt; 557 + struct flowi4 fl4; 558 + __be16 df = 0; 559 + u8 tos, ttl; 560 + 561 + tun_info = skb_tunnel_info(skb); 562 + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || 563 + ip_tunnel_info_af(tun_info) != AF_INET)) 564 + goto tx_error; 565 + key = &tun_info->key; 566 + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 567 + inner_iph = (const struct iphdr *)skb_inner_network_header(skb); 568 + tos = key->tos; 569 + if (tos == 1) { 570 + if (skb->protocol == htons(ETH_P_IP)) 571 + tos = inner_iph->tos; 572 + else if (skb->protocol == htons(ETH_P_IPV6)) 573 + tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); 574 + } 575 + init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0, 576 + RT_TOS(tos), tunnel->parms.link); 577 + if (tunnel->encap.type != TUNNEL_ENCAP_NONE) 578 + goto tx_error; 579 + rt = ip_route_output_key(tunnel->net, &fl4); 580 + if (IS_ERR(rt)) { 581 + dev->stats.tx_carrier_errors++; 582 + goto tx_error; 583 + } 584 + if (rt->dst.dev == dev) { 585 + ip_rt_put(rt); 586 + dev->stats.collisions++; 587 + goto tx_error; 588 + } 589 + tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); 590 + ttl = key->ttl; 591 + if (ttl == 0) { 592 + if (skb->protocol == htons(ETH_P_IP)) 593 + ttl = inner_iph->ttl; 594 + else if (skb->protocol == htons(ETH_P_IPV6)) 595 + ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; 596 + else 597 + ttl = 
ip4_dst_hoplimit(&rt->dst); 598 + } 599 + if (key->tun_flags & TUNNEL_DONT_FRAGMENT) 600 + df = htons(IP_DF); 601 + else if (skb->protocol == htons(ETH_P_IP)) 602 + df = inner_iph->frag_off & htons(IP_DF); 603 + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; 604 + if (headroom > dev->needed_headroom) 605 + dev->needed_headroom = headroom; 606 + 607 + if (skb_cow_head(skb, dev->needed_headroom)) { 608 + ip_rt_put(rt); 609 + goto tx_dropped; 610 + } 611 + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos, 612 + key->ttl, df, !net_eq(tunnel->net, dev_net(dev))); 613 + return; 614 + tx_error: 615 + dev->stats.tx_errors++; 616 + goto kfree; 617 + tx_dropped: 618 + dev->stats.tx_dropped++; 619 + kfree: 620 + kfree_skb(skb); 621 + } 622 + EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit); 549 623 550 624 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, 551 625 const struct iphdr *tnl_params, u8 protocol)
+29 -6
net/ipv4/ipip.c
··· 115 115 #include <net/xfrm.h> 116 116 #include <net/net_namespace.h> 117 117 #include <net/netns/generic.h> 118 + #include <net/dst_metadata.h> 118 119 119 120 static bool log_ecn_error = true; 120 121 module_param(log_ecn_error, bool, 0644); ··· 194 193 { 195 194 struct net *net = dev_net(skb->dev); 196 195 struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); 196 + struct metadata_dst *tun_dst = NULL; 197 197 struct ip_tunnel *tunnel; 198 198 const struct iphdr *iph; 199 199 ··· 218 216 tpi = &ipip_tpi; 219 217 if (iptunnel_pull_header(skb, 0, tpi->proto, false)) 220 218 goto drop; 221 - return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error); 219 + if (tunnel->collect_md) { 220 + tun_dst = ip_tun_rx_dst(skb, 0, 0, 0); 221 + if (!tun_dst) 222 + return 0; 223 + } 224 + return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); 222 225 } 223 226 224 227 return -1; ··· 277 270 278 271 skb_set_inner_ipproto(skb, ipproto); 279 272 280 - ip_tunnel_xmit(skb, dev, tiph, ipproto); 273 + if (tunnel->collect_md) 274 + ip_md_tunnel_xmit(skb, dev, ipproto); 275 + else 276 + ip_tunnel_xmit(skb, dev, tiph, ipproto); 281 277 return NETDEV_TX_OK; 282 278 283 279 tx_error: ··· 390 380 } 391 381 392 382 static void ipip_netlink_parms(struct nlattr *data[], 393 - struct ip_tunnel_parm *parms) 383 + struct ip_tunnel_parm *parms, bool *collect_md) 394 384 { 395 385 memset(parms, 0, sizeof(*parms)); 396 386 397 387 parms->iph.version = 4; 398 388 parms->iph.protocol = IPPROTO_IPIP; 399 389 parms->iph.ihl = 5; 390 + *collect_md = false; 400 391 401 392 if (!data) 402 393 return; ··· 425 414 426 415 if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) 427 416 parms->iph.frag_off = htons(IP_DF); 417 + 418 + if (data[IFLA_IPTUN_COLLECT_METADATA]) 419 + *collect_md = true; 428 420 } 429 421 430 422 /* This function returns true when ENCAP attributes are present in the nl msg */ ··· 467 453 static int ipip_newlink(struct net *src_net, struct net_device 
*dev, 468 454 struct nlattr *tb[], struct nlattr *data[]) 469 455 { 456 + struct ip_tunnel *t = netdev_priv(dev); 470 457 struct ip_tunnel_parm p; 471 458 struct ip_tunnel_encap ipencap; 472 459 473 460 if (ipip_netlink_encap_parms(data, &ipencap)) { 474 - struct ip_tunnel *t = netdev_priv(dev); 475 461 int err = ip_tunnel_encap_setup(t, &ipencap); 476 462 477 463 if (err < 0) 478 464 return err; 479 465 } 480 466 481 - ipip_netlink_parms(data, &p); 467 + ipip_netlink_parms(data, &p, &t->collect_md); 482 468 return ip_tunnel_newlink(dev, tb, &p); 483 469 } 484 470 ··· 487 473 { 488 474 struct ip_tunnel_parm p; 489 475 struct ip_tunnel_encap ipencap; 476 + bool collect_md; 490 477 491 478 if (ipip_netlink_encap_parms(data, &ipencap)) { 492 479 struct ip_tunnel *t = netdev_priv(dev); ··· 497 482 return err; 498 483 } 499 484 500 - ipip_netlink_parms(data, &p); 485 + ipip_netlink_parms(data, &p, &collect_md); 486 + if (collect_md) 487 + return -EINVAL; 501 488 502 489 if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || 503 490 (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) ··· 533 516 nla_total_size(2) + 534 517 /* IFLA_IPTUN_ENCAP_DPORT */ 535 518 nla_total_size(2) + 519 + /* IFLA_IPTUN_COLLECT_METADATA */ 520 + nla_total_size(0) + 536 521 0; 537 522 } 538 523 ··· 563 544 tunnel->encap.flags)) 564 545 goto nla_put_failure; 565 546 547 + if (tunnel->collect_md) 548 + if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) 549 + goto nla_put_failure; 566 550 return 0; 567 551 568 552 nla_put_failure: ··· 584 562 [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, 585 563 [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, 586 564 [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, 565 + [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, 587 566 }; 588 567 589 568 static struct rtnl_link_ops ipip_link_ops __read_mostly = {