Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'net-introduce-and-use-route-hint'

Paolo Abeni says:

====================
net: introduce and use route hint

This series leverages the listification infrastructure to avoid
unnecessary route lookups on ingress packets. In the absence of custom
rules, packets with equal daddr will usually land on the same dst.

When processing packet bursts (lists) we can easily reference the previous
dst entry. When we hit the 'same destination' condition we can avoid the
route lookup, copying the already available dst.

Detailed performance numbers are available in the individual commit
messages.

v3 -> v4:
- move helpers to their own patches (Eric D.)
- enable hints for SUBTREE builds (David A.)
- re-enable hints for ipv4 forward (David A.)

v2 -> v3:
- use fib*_has_custom_rules() helpers (David A.)
- add ip*_extract_route_hint() helper (Edward C.)
- use prev skb as hint instead of copying data (Willem)

v1 -> v2:
- fix build issue with !CONFIG_IP*_MULTIPLE_TABLES
- fix potential race in ip6_list_rcv_finish()
====================

Acked-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

+160 -16
+39
include/net/ip6_fib.h
··· 90 90 91 91 #ifndef CONFIG_IPV6_SUBTREES 92 92 #define FIB6_SUBTREE(fn) NULL 93 + 94 + static inline bool fib6_routes_require_src(const struct net *net) 95 + { 96 + return false; 97 + } 98 + 99 + static inline void fib6_routes_require_src_inc(struct net *net) {} 100 + static inline void fib6_routes_require_src_dec(struct net *net) {} 101 + 93 102 #else 103 + 104 + static inline bool fib6_routes_require_src(const struct net *net) 105 + { 106 + return net->ipv6.fib6_routes_require_src > 0; 107 + } 108 + 109 + static inline void fib6_routes_require_src_inc(struct net *net) 110 + { 111 + net->ipv6.fib6_routes_require_src++; 112 + } 113 + 114 + static inline void fib6_routes_require_src_dec(struct net *net) 115 + { 116 + net->ipv6.fib6_routes_require_src--; 117 + } 118 + 94 119 #define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) 95 120 #endif 96 121 ··· 235 210 static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) 236 211 { 237 212 return ((struct rt6_info *)dst)->rt6i_idev; 213 + } 214 + 215 + static inline bool fib6_requires_src(const struct fib6_info *rt) 216 + { 217 + return rt->fib6_src.plen > 0; 238 218 } 239 219 240 220 static inline void fib6_clean_expires(struct fib6_info *f6i) ··· 532 502 } 533 503 534 504 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 505 + static inline bool fib6_has_custom_rules(const struct net *net) 506 + { 507 + return net->ipv6.fib6_has_custom_rules; 508 + } 509 + 535 510 int fib6_rules_init(void); 536 511 void fib6_rules_cleanup(void); 537 512 bool fib6_rule_default(const struct fib_rule *rule); ··· 562 527 return true; 563 528 } 564 529 #else 530 + static inline bool fib6_has_custom_rules(const struct net *net) 531 + { 532 + return false; 533 + } 565 534 static inline int fib6_rules_init(void) 566 535 { 567 536 return 0;
+10
include/net/ip_fib.h
··· 311 311 return err; 312 312 } 313 313 314 + static inline bool fib4_has_custom_rules(const struct net *net) 315 + { 316 + return false; 317 + } 318 + 314 319 static inline bool fib4_rule_default(const struct fib_rule *rule) 315 320 { 316 321 return true; ··· 381 376 rcu_read_unlock(); 382 377 383 378 return err; 379 + } 380 + 381 + static inline bool fib4_has_custom_rules(const struct net *net) 382 + { 383 + return net->ipv4.fib_has_custom_rules; 384 384 } 385 385 386 386 bool fib4_rule_default(const struct fib_rule *rule);
+3
include/net/netns/ipv6.h
··· 83 83 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 84 84 unsigned int fib6_rules_require_fldissect; 85 85 bool fib6_has_custom_rules; 86 + #ifdef CONFIG_IPV6_SUBTREES 87 + unsigned int fib6_routes_require_src; 88 + #endif 86 89 struct rt6_info *ip6_prohibit_entry; 87 90 struct rt6_info *ip6_blk_hole_entry; 88 91 struct fib6_table *fib6_local_tbl;
+4
include/net/route.h
··· 185 185 u8 tos, struct net_device *devin, 186 186 struct fib_result *res); 187 187 188 + int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src, 189 + u8 tos, struct net_device *devin, 190 + const struct sk_buff *hint); 191 + 188 192 static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, 189 193 u8 tos, struct net_device *devin) 190 194 {
-10
net/ipv4/fib_frontend.c
··· 70 70 fib_free_table(main_table); 71 71 return -ENOMEM; 72 72 } 73 - 74 - static bool fib4_has_custom_rules(struct net *net) 75 - { 76 - return false; 77 - } 78 73 #else 79 74 80 75 struct fib_table *fib_new_table(struct net *net, u32 id) ··· 125 130 return tb; 126 131 } 127 132 return NULL; 128 - } 129 - 130 - static bool fib4_has_custom_rules(struct net *net) 131 - { 132 - return net->ipv4.fib_has_custom_rules; 133 133 } 134 134 #endif /* CONFIG_IP_MULTIPLE_TABLES */ 135 135
+31 -4
net/ipv4/ip_input.c
··· 302 302 return true; 303 303 } 304 304 305 + static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, 306 + const struct sk_buff *hint) 307 + { 308 + return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr && 309 + ip_hdr(hint)->tos == iph->tos; 310 + } 311 + 305 312 INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *)); 306 313 INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *)); 307 314 static int ip_rcv_finish_core(struct net *net, struct sock *sk, 308 - struct sk_buff *skb, struct net_device *dev) 315 + struct sk_buff *skb, struct net_device *dev, 316 + const struct sk_buff *hint) 309 317 { 310 318 const struct iphdr *iph = ip_hdr(skb); 311 319 int (*edemux)(struct sk_buff *skb); 312 320 struct rtable *rt; 313 321 int err; 322 + 323 + if (ip_can_use_hint(skb, iph, hint)) { 324 + err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos, 325 + dev, hint); 326 + if (unlikely(err)) 327 + goto drop_error; 328 + } 314 329 315 330 if (net->ipv4.sysctl_ip_early_demux && 316 331 !skb_dst(skb) && ··· 423 408 if (!skb) 424 409 return NET_RX_SUCCESS; 425 410 426 - ret = ip_rcv_finish_core(net, sk, skb, dev); 411 + ret = ip_rcv_finish_core(net, sk, skb, dev, NULL); 427 412 if (ret != NET_RX_DROP) 428 413 ret = dst_input(skb); 429 414 return ret; ··· 550 535 } 551 536 } 552 537 538 + static struct sk_buff *ip_extract_route_hint(const struct net *net, 539 + struct sk_buff *skb, int rt_type) 540 + { 541 + if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST) 542 + return NULL; 543 + 544 + return skb; 545 + } 546 + 553 547 static void ip_list_rcv_finish(struct net *net, struct sock *sk, 554 548 struct list_head *head) 555 549 { 550 + struct sk_buff *skb, *next, *hint = NULL; 556 551 struct dst_entry *curr_dst = NULL; 557 - struct sk_buff *skb, *next; 558 552 struct list_head sublist; 559 553 560 554 INIT_LIST_HEAD(&sublist); ··· 578 554 skb = l3mdev_ip_rcv(skb); 579 555 if (!skb) 580 556 continue; 581 
- if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP) 557 + if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP) 582 558 continue; 583 559 584 560 dst = skb_dst(skb); 585 561 if (curr_dst != dst) { 562 + hint = ip_extract_route_hint(net, skb, 563 + ((struct rtable *)dst)->rt_type); 564 + 586 565 /* dispatch old sublist */ 587 566 if (!list_empty(&sublist)) 588 567 ip_sublist_rcv_finish(&sublist);
+42
net/ipv4/route.c
··· 2019 2019 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); 2020 2020 } 2021 2021 2022 + /* Implements all the saddr-related checks as ip_route_input_slow(), 2023 + * assuming daddr is valid and the destination is not a local broadcast one. 2024 + * Uses the provided hint instead of performing a route lookup. 2025 + */ 2026 + int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2027 + u8 tos, struct net_device *dev, 2028 + const struct sk_buff *hint) 2029 + { 2030 + struct in_device *in_dev = __in_dev_get_rcu(dev); 2031 + struct rtable *rt = (struct rtable *)hint; 2032 + struct net *net = dev_net(dev); 2033 + int err = -EINVAL; 2034 + u32 tag = 0; 2035 + 2036 + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) 2037 + goto martian_source; 2038 + 2039 + if (ipv4_is_zeronet(saddr)) 2040 + goto martian_source; 2041 + 2042 + if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) 2043 + goto martian_source; 2044 + 2045 + if (rt->rt_type != RTN_LOCAL) 2046 + goto skip_validate_source; 2047 + 2048 + tos &= IPTOS_RT_MASK; 2049 + err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag); 2050 + if (err < 0) 2051 + goto martian_source; 2052 + 2053 + skip_validate_source: 2054 + skb_dst_copy(skb, hint); 2055 + return 0; 2056 + 2057 + martian_source: 2058 + ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2059 + return err; 2060 + } 2061 + 2022 2062 /* 2023 2063 * NOTE. We drop all the packets that has local source 2024 2064 * addresses, because every properly looped back packet 2025 2065 * must have correct destination already attached by output routine. 2066 + * Changes in the enforced policies must be applied also to 2067 + * ip_route_use_hint(). 2026 2068 * 2027 2069 * Such approach solves two big problems: 2028 2070 * 1. Not simplex devices are handled properly.
+4
net/ipv6/ip6_fib.c
··· 1461 1461 } 1462 1462 #endif 1463 1463 goto failure; 1464 + } else if (fib6_requires_src(rt)) { 1465 + fib6_routes_require_src_inc(info->nl_net); 1464 1466 } 1465 1467 return err; 1466 1468 ··· 1935 1933 struct fib6_info *cur = rcu_dereference_protected(*rtp, 1936 1934 lockdep_is_held(&table->tb6_lock)); 1937 1935 if (rt == cur) { 1936 + if (fib6_requires_src(cur)) 1937 + fib6_routes_require_src_dec(info->nl_net); 1938 1938 fib6_del_route(table, fn, rtp, info); 1939 1939 return 0; 1940 1940 }
+24 -2
net/ipv6/ip6_input.c
··· 86 86 } 87 87 } 88 88 89 + static bool ip6_can_use_hint(const struct sk_buff *skb, 90 + const struct sk_buff *hint) 91 + { 92 + return hint && !skb_dst(skb) && 93 + ipv6_addr_equal(&ipv6_hdr(hint)->daddr, &ipv6_hdr(skb)->daddr); 94 + } 95 + 96 + static struct sk_buff *ip6_extract_route_hint(const struct net *net, 97 + struct sk_buff *skb) 98 + { 99 + if (fib6_routes_require_src(net) || fib6_has_custom_rules(net)) 100 + return NULL; 101 + 102 + return skb; 103 + } 104 + 89 105 static void ip6_list_rcv_finish(struct net *net, struct sock *sk, 90 106 struct list_head *head) 91 107 { 108 + struct sk_buff *skb, *next, *hint = NULL; 92 109 struct dst_entry *curr_dst = NULL; 93 - struct sk_buff *skb, *next; 94 110 struct list_head sublist; 95 111 96 112 INIT_LIST_HEAD(&sublist); ··· 120 104 skb = l3mdev_ip6_rcv(skb); 121 105 if (!skb) 122 106 continue; 123 - ip6_rcv_finish_core(net, sk, skb); 107 + 108 + if (ip6_can_use_hint(skb, hint)) 109 + skb_dst_copy(skb, hint); 110 + else 111 + ip6_rcv_finish_core(net, sk, skb); 124 112 dst = skb_dst(skb); 125 113 if (curr_dst != dst) { 114 + hint = ip6_extract_route_hint(net, skb); 115 + 126 116 /* dispatch old sublist */ 127 117 if (!list_empty(&sublist)) 128 118 ip6_sublist_rcv_finish(&sublist);
+3
net/ipv6/route.c
··· 6199 6199 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 6200 6200 ip6_template_metrics, true); 6201 6201 INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached); 6202 + #ifdef CONFIG_IPV6_SUBTREES 6203 + net->ipv6.fib6_routes_require_src = 0; 6204 + #endif 6202 6205 #endif 6203 6206 6204 6207 net->ipv6.sysctl.flush_delay = 0;