Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ipv4: use dst hint for ipv4 list receive

This is similar to the previous change, with some additional IPv4-specific
quirks. Even when using the route hint we still have to perform
additional per-packet checks on source address validity: a new
helper is added to wrap them.

Hints are explicitly disabled if the destination is a local broadcast;
that keeps the code simple, and local broadcasts are a slower path anyway.

UDP flood performance vs recvmmsg() receiver:

vanilla  patched  delta
Kpps     Kpps     %
1683     1871     +11

In the worst case scenario - each packet has a different
destination address - the performance delta is within noise
range.

v3 -> v4:
- re-enable hints for forward

v2 -> v3:
- really fix build (sic) and hint usage check
- use fib4_has_custom_rules() helpers (David A.)
- add ip_extract_route_hint() helper (Edward C.)
- use prev skb as hint instead of copying data (Willem)

v1 -> v2:
- fix build issue with !CONFIG_IP_MULTIPLE_TABLES

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Paolo Abeni and committed by
David S. Miller
02b24941 c43c3d76

+77 -4
+4
include/net/route.h
··· 185 185 u8 tos, struct net_device *devin, 186 186 struct fib_result *res); 187 187 188 + int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src, 189 + u8 tos, struct net_device *devin, 190 + const struct sk_buff *hint); 191 + 188 192 static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, 189 193 u8 tos, struct net_device *devin) 190 194 {
+31 -4
net/ipv4/ip_input.c
··· 302 302 return true; 303 303 } 304 304 305 + static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, 306 + const struct sk_buff *hint) 307 + { 308 + return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr && 309 + ip_hdr(hint)->tos == iph->tos; 310 + } 311 + 305 312 INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *)); 306 313 INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *)); 307 314 static int ip_rcv_finish_core(struct net *net, struct sock *sk, 308 - struct sk_buff *skb, struct net_device *dev) 315 + struct sk_buff *skb, struct net_device *dev, 316 + const struct sk_buff *hint) 309 317 { 310 318 const struct iphdr *iph = ip_hdr(skb); 311 319 int (*edemux)(struct sk_buff *skb); 312 320 struct rtable *rt; 313 321 int err; 322 + 323 + if (ip_can_use_hint(skb, iph, hint)) { 324 + err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos, 325 + dev, hint); 326 + if (unlikely(err)) 327 + goto drop_error; 328 + } 314 329 315 330 if (net->ipv4.sysctl_ip_early_demux && 316 331 !skb_dst(skb) && ··· 423 408 if (!skb) 424 409 return NET_RX_SUCCESS; 425 410 426 - ret = ip_rcv_finish_core(net, sk, skb, dev); 411 + ret = ip_rcv_finish_core(net, sk, skb, dev, NULL); 427 412 if (ret != NET_RX_DROP) 428 413 ret = dst_input(skb); 429 414 return ret; ··· 550 535 } 551 536 } 552 537 538 + static struct sk_buff *ip_extract_route_hint(const struct net *net, 539 + struct sk_buff *skb, int rt_type) 540 + { 541 + if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST) 542 + return NULL; 543 + 544 + return skb; 545 + } 546 + 553 547 static void ip_list_rcv_finish(struct net *net, struct sock *sk, 554 548 struct list_head *head) 555 549 { 550 + struct sk_buff *skb, *next, *hint = NULL; 556 551 struct dst_entry *curr_dst = NULL; 557 - struct sk_buff *skb, *next; 558 552 struct list_head sublist; 559 553 560 554 INIT_LIST_HEAD(&sublist); ··· 578 554 skb = l3mdev_ip_rcv(skb); 579 555 if (!skb) 580 556 continue; 581 
- if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP) 557 + if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP) 582 558 continue; 583 559 584 560 dst = skb_dst(skb); 585 561 if (curr_dst != dst) { 562 + hint = ip_extract_route_hint(net, skb, 563 + ((struct rtable *)dst)->rt_type); 564 + 586 565 /* dispatch old sublist */ 587 566 if (!list_empty(&sublist)) 588 567 ip_sublist_rcv_finish(&sublist);
+42
net/ipv4/route.c
··· 2019 2019 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); 2020 2020 } 2021 2021 2022 + /* Implements all the saddr-related checks as ip_route_input_slow(), 2023 + * assuming daddr is valid and the destination is not a local broadcast one. 2024 + * Uses the provided hint instead of performing a route lookup. 2025 + */ 2026 + int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2027 + u8 tos, struct net_device *dev, 2028 + const struct sk_buff *hint) 2029 + { 2030 + struct in_device *in_dev = __in_dev_get_rcu(dev); 2031 + struct rtable *rt = (struct rtable *)hint; 2032 + struct net *net = dev_net(dev); 2033 + int err = -EINVAL; 2034 + u32 tag = 0; 2035 + 2036 + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) 2037 + goto martian_source; 2038 + 2039 + if (ipv4_is_zeronet(saddr)) 2040 + goto martian_source; 2041 + 2042 + if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) 2043 + goto martian_source; 2044 + 2045 + if (rt->rt_type != RTN_LOCAL) 2046 + goto skip_validate_source; 2047 + 2048 + tos &= IPTOS_RT_MASK; 2049 + err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag); 2050 + if (err < 0) 2051 + goto martian_source; 2052 + 2053 + skip_validate_source: 2054 + skb_dst_copy(skb, hint); 2055 + return 0; 2056 + 2057 + martian_source: 2058 + ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2059 + return err; 2060 + } 2061 + 2022 2062 /* 2023 2063 * NOTE. We drop all the packets that has local source 2024 2064 * addresses, because every properly looped back packet 2025 2065 * must have correct destination already attached by output routine. 2066 + * Changes in the enforced policies must be applied also to 2067 + * ip_route_use_hint(). 2026 2068 * 2027 2069 * Such approach solves two big problems: 2028 2070 * 1. Not simplex devices are handled properly.