Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: flowtable: Add IPIP tx sw acceleration

Introduce sw acceleration for tx path of IPIP tunnels relying on the
netfilter flowtable infrastructure.
This patch introduces basic infrastructure to accelerate other tunnel
types (e.g. IP6IP6).
IPIP sw tx acceleration can be tested running the following scenario where
the traffic is forwarded between two NICs (eth0 and eth1) and an IPIP
tunnel is used to access a remote site (using eth1 as the underlay device):

ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (192.168.100.2)

$ip addr show
6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff
inet 192.168.0.2/24 scope global eth0
valid_lft forever preferred_lft forever
7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff
inet 192.168.1.1/24 scope global eth1
valid_lft forever preferred_lft forever
8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000
link/ipip 192.168.1.1 peer 192.168.1.2
inet 192.168.100.1/24 scope global tun0
valid_lft forever preferred_lft forever

$ip route show
default via 192.168.100.2 dev tun0
192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.2
192.168.1.0/24 dev eth1 proto kernel scope link src 192.168.1.1
192.168.100.0/24 dev tun0 proto kernel scope link src 192.168.100.1

$nft list ruleset
table inet filter {
flowtable ft {
hook ingress priority filter
devices = { eth0, eth1 }
}

chain forward {
type filter hook forward priority filter; policy accept;
meta l4proto { tcp, udp } flow add @ft
}
}

Reproducing the scenario described above using veths I got the following
results:
- TCP stream transmitted into the IPIP tunnel:
- net-next: (baseline) ~85Gbps
- net-next + IPIP flowtable support: ~102Gbps

Co-developed-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Lorenzo Bianconi and committed by
Pablo Neira Ayuso
d30301ba ab427db1

+106 -4
+62
net/netfilter/nf_flow_table_ip.c
··· 437 437 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); 438 438 439 439 mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; 440 + if (flow->tuplehash[!dir].tuple.tun_num) 441 + mtu -= sizeof(*iph); 442 + 440 443 if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) 441 444 return 0; 442 445 ··· 511 508 return 0; 512 509 } 513 510 511 + static int nf_flow_tunnel_ipip_push(struct net *net, struct sk_buff *skb, 512 + struct flow_offload_tuple *tuple, 513 + __be32 *ip_daddr) 514 + { 515 + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); 516 + struct rtable *rt = dst_rtable(tuple->dst_cache); 517 + u8 tos = iph->tos, ttl = iph->ttl; 518 + __be16 frag_off = iph->frag_off; 519 + u32 headroom = sizeof(*iph); 520 + int err; 521 + 522 + err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4); 523 + if (err) 524 + return err; 525 + 526 + skb_set_inner_ipproto(skb, IPPROTO_IPIP); 527 + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; 528 + err = skb_cow_head(skb, headroom); 529 + if (err) 530 + return err; 531 + 532 + skb_scrub_packet(skb, true); 533 + skb_clear_hash_if_not_l4(skb); 534 + 535 + /* Push down and install the IP header. */ 536 + skb_push(skb, sizeof(*iph)); 537 + skb_reset_network_header(skb); 538 + 539 + iph = ip_hdr(skb); 540 + iph->version = 4; 541 + iph->ihl = sizeof(*iph) >> 2; 542 + iph->frag_off = ip_mtu_locked(&rt->dst) ? 
0 : frag_off; 543 + iph->protocol = tuple->tun.l3_proto; 544 + iph->tos = tos; 545 + iph->daddr = tuple->tun.src_v4.s_addr; 546 + iph->saddr = tuple->tun.dst_v4.s_addr; 547 + iph->ttl = ttl; 548 + iph->tot_len = htons(skb->len); 549 + __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); 550 + ip_send_check(iph); 551 + 552 + *ip_daddr = tuple->tun.src_v4.s_addr; 553 + 554 + return 0; 555 + } 556 + 557 + static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb, 558 + struct flow_offload_tuple *tuple, 559 + __be32 *ip_daddr) 560 + { 561 + if (tuple->tun_num) 562 + return nf_flow_tunnel_ipip_push(net, skb, tuple, ip_daddr); 563 + 564 + return 0; 565 + } 566 + 514 567 static int nf_flow_encap_push(struct sk_buff *skb, 515 568 struct flow_offload_tuple *tuple) 516 569 { ··· 630 571 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); 631 572 other_tuple = &flow->tuplehash[!dir].tuple; 632 573 ip_daddr = other_tuple->src_v4.s_addr; 574 + 575 + if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0) 576 + return NF_DROP; 633 577 634 578 if (nf_flow_encap_push(skb, other_tuple) < 0) 635 579 return NF_DROP;
+44 -4
net/netfilter/nf_flow_table_path.c
··· 197 197 return found; 198 198 } 199 199 200 - static void nft_dev_forward_path(struct nf_flow_route *route, 200 + static int nft_flow_tunnel_update_route(const struct nft_pktinfo *pkt, 201 + struct flow_offload_tunnel *tun, 202 + struct nf_flow_route *route, 203 + enum ip_conntrack_dir dir) 204 + { 205 + struct dst_entry *cur_dst = route->tuple[dir].dst; 206 + struct dst_entry *tun_dst = NULL; 207 + struct flowi fl = {}; 208 + 209 + switch (nft_pf(pkt)) { 210 + case NFPROTO_IPV4: 211 + fl.u.ip4.daddr = tun->dst_v4.s_addr; 212 + fl.u.ip4.saddr = tun->src_v4.s_addr; 213 + fl.u.ip4.flowi4_iif = nft_in(pkt)->ifindex; 214 + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); 215 + fl.u.ip4.flowi4_mark = pkt->skb->mark; 216 + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; 217 + break; 218 + case NFPROTO_IPV6: 219 + fl.u.ip6.daddr = tun->dst_v6; 220 + fl.u.ip6.saddr = tun->src_v6; 221 + fl.u.ip6.flowi6_iif = nft_in(pkt)->ifindex; 222 + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); 223 + fl.u.ip6.flowi6_mark = pkt->skb->mark; 224 + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; 225 + break; 226 + } 227 + 228 + nf_route(nft_net(pkt), &tun_dst, &fl, false, nft_pf(pkt)); 229 + if (!tun_dst) 230 + return -ENOENT; 231 + 232 + route->tuple[dir].dst = tun_dst; 233 + dst_release(cur_dst); 234 + 235 + return 0; 236 + } 237 + 238 + static void nft_dev_forward_path(const struct nft_pktinfo *pkt, 239 + struct nf_flow_route *route, 201 240 const struct nf_conn *ct, 202 241 enum ip_conntrack_dir dir, 203 242 struct nft_flowtable *ft) ··· 259 220 route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; 260 221 } 261 222 262 - if (info.num_tuns) { 223 + if (info.num_tuns && 224 + !nft_flow_tunnel_update_route(pkt, &info.tun, route, dir)) { 263 225 route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6; 264 226 route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6; 265 227 route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto; ··· 321 281 nft_default_forward_path(route, other_dst, !dir); 
322 282 323 283 if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) 324 - nft_dev_forward_path(route, ct, dir, ft); 284 + nft_dev_forward_path(pkt, route, ct, dir, ft); 325 285 if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) 326 - nft_dev_forward_path(route, ct, !dir, ft); 286 + nft_dev_forward_path(pkt, route, ct, !dir, ft); 327 287 328 288 return 0; 329 289 }