Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'ipv6-avoid-taking-refcnt-on-dst-during-route-lookup'

Wei Wang says:

====================
ipv6: avoid taking refcnt on dst during route lookup

Ipv6 route lookup code always grabs refcnt on the dst for the caller.
But for certain cases, grabbing refcnt is not always necessary if the
call path is rcu protected and the caller does not cache the dst.
Another issue in the route lookup logic is:
When there are multiple custom rules, we have to do the lookup into
each table associated to each rule individually. And when we can't
find the route in one table, we grab and release refcnt on
net->ipv6.ip6_null_entry before going to the next table.
This operation is completely redundant, and causes false issue because
net->ipv6.ip6_null_entry is a shared object.

This patch set introduces a new flag RT6_LOOKUP_F_DST_NOREF for route
lookup callers to set, to avoid any manipulation on the dst refcnt. And
it converts the major input and output path to use it.

The performance gain is noticable.
I ran synflood tests between 2 hosts under the same switch. Both hosts
have 20G mlx NIC, and 8 tx/rx queues.
Sender sends pure SYN flood with random src IPs and ports using trafgen.
Receiver has a simple TCP listener on the target port.
Both hosts have multiple custom rules:
- For incoming packets, only local table is traversed.
- For outgoing packets, 3 tables are traversed to find the route.
The packet processing rate on the receiver is as follows:
- Before the fix: 3.78Mpps
- After the fix: 5.50Mpps

v2->v3:
- Handled fib6_rule_lookup() when CONFIG_IPV6_MULTIPLE_TABLES is not
configured in patch 03 (suggested by David Ahern)
- Removed the renaming of l3mdev_link_scope_lookup() in patch 05
(suggested by David Ahern)
- Moved definition of ip6_route_output_flags() from an inline function
in /net/ipv6/route.c to net/ipv6/route.c in order to address kbuild
error in patch 05

v1->v2:
- Added a helper ip6_rt_put_flags() in patch 3 suggested by David Miller
====================

Reviewed-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

+95 -61
+3 -2
drivers/net/vrf.c
··· 1072 1072 #if IS_ENABLED(CONFIG_IPV6) 1073 1073 /* send to link-local or multicast address via interface enslaved to 1074 1074 * VRF device. Force lookup to VRF table without changing flow struct 1075 + * Note: Caller to this function must hold rcu_read_lock() and no refcnt 1076 + * is taken on the dst by this function. 1075 1077 */ 1076 1078 static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, 1077 1079 struct flowi6 *fl6) 1078 1080 { 1079 1081 struct net *net = dev_net(dev); 1080 - int flags = RT6_LOOKUP_F_IFACE; 1082 + int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF; 1081 1083 struct dst_entry *dst = NULL; 1082 1084 struct rt6_info *rt; 1083 1085 ··· 1089 1087 */ 1090 1088 if (fl6->flowi6_oif == dev->ifindex) { 1091 1089 dst = &net->ipv6.ip6_null_entry->dst; 1092 - dst_hold(dst); 1093 1090 return dst; 1094 1091 } 1095 1092
+15
include/net/ip6_route.h
··· 36 36 #define RT6_LOOKUP_F_SRCPREF_PUBLIC 0x00000010 37 37 #define RT6_LOOKUP_F_SRCPREF_COA 0x00000020 38 38 #define RT6_LOOKUP_F_IGNORE_LINKSTATE 0x00000040 39 + #define RT6_LOOKUP_F_DST_NOREF 0x00000080 39 40 40 41 /* We do not (yet ?) support IPv6 jumbograms (RFC 2675) 41 42 * Unlike IPv4, hdr->seg_len doesn't include the IPv6 header ··· 84 83 struct flowi6 *fl6, 85 84 const struct sk_buff *skb, int flags); 86 85 86 + struct dst_entry *ip6_route_output_flags_noref(struct net *net, 87 + const struct sock *sk, 88 + struct flowi6 *fl6, int flags); 89 + 87 90 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, 88 91 struct flowi6 *fl6, int flags); 89 92 ··· 96 91 struct flowi6 *fl6) 97 92 { 98 93 return ip6_route_output_flags(net, sk, fl6, 0); 94 + } 95 + 96 + /* Only conditionally release dst if flags indicates 97 + * !RT6_LOOKUP_F_DST_NOREF or dst is in uncached_list. 98 + */ 99 + static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags) 100 + { 101 + if (!(flags & RT6_LOOKUP_F_DST_NOREF) || 102 + !list_empty(&rt->rt6i_uncached)) 103 + ip6_rt_put(rt); 99 104 } 100 105 101 106 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
+7 -5
net/ipv6/fib6_rules.c
··· 113 113 rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags); 114 114 if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) 115 115 return &rt->dst; 116 - ip6_rt_put(rt); 116 + ip6_rt_put_flags(rt, flags); 117 117 rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); 118 118 if (rt->dst.error != -EAGAIN) 119 119 return &rt->dst; 120 - ip6_rt_put(rt); 120 + ip6_rt_put_flags(rt, flags); 121 121 } 122 122 123 - dst_hold(&net->ipv6.ip6_null_entry->dst); 123 + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) 124 + dst_hold(&net->ipv6.ip6_null_entry->dst); 124 125 return &net->ipv6.ip6_null_entry->dst; 125 126 } 126 127 ··· 238 237 goto out; 239 238 } 240 239 again: 241 - ip6_rt_put(rt); 240 + ip6_rt_put_flags(rt, flags); 242 241 err = -EAGAIN; 243 242 rt = NULL; 244 243 goto out; 245 244 246 245 discard_pkt: 247 - dst_hold(&rt->dst); 246 + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) 247 + dst_hold(&rt->dst); 248 248 out: 249 249 res->rt6 = rt; 250 250 return err;
+3 -2
net/ipv6/ip6_fib.c
··· 316 316 317 317 rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); 318 318 if (rt->dst.error == -EAGAIN) { 319 - ip6_rt_put(rt); 319 + ip6_rt_put_flags(rt, flags); 320 320 rt = net->ipv6.ip6_null_entry; 321 - dst_hold(&rt->dst); 321 + if (!(flags | RT6_LOOKUP_F_DST_NOREF)) 322 + dst_hold(&rt->dst); 322 323 } 323 324 324 325 return &rt->dst;
+64 -48
net/ipv6/route.c
··· 1391 1391 1392 1392 pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu); 1393 1393 1394 - if (pcpu_rt) 1395 - ip6_hold_safe(NULL, &pcpu_rt); 1396 - 1397 1394 return pcpu_rt; 1398 1395 } 1399 1396 ··· 1400 1403 struct rt6_info *pcpu_rt, *prev, **p; 1401 1404 1402 1405 pcpu_rt = ip6_rt_pcpu_alloc(res); 1403 - if (!pcpu_rt) { 1404 - dst_hold(&net->ipv6.ip6_null_entry->dst); 1405 - return net->ipv6.ip6_null_entry; 1406 - } 1406 + if (!pcpu_rt) 1407 + return NULL; 1407 1408 1408 - dst_hold(&pcpu_rt->dst); 1409 1409 p = this_cpu_ptr(res->nh->rt6i_pcpu); 1410 1410 prev = cmpxchg(p, NULL, pcpu_rt); 1411 1411 BUG_ON(prev); ··· 2183 2189 const struct sk_buff *skb, int flags) 2184 2190 { 2185 2191 struct fib6_result res = {}; 2186 - struct rt6_info *rt; 2192 + struct rt6_info *rt = NULL; 2187 2193 int strict = 0; 2194 + 2195 + WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && 2196 + !rcu_read_lock_held()); 2188 2197 2189 2198 strict |= flags & RT6_LOOKUP_F_IFACE; 2190 2199 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; ··· 2197 2200 rcu_read_lock(); 2198 2201 2199 2202 fib6_table_lookup(net, table, oif, fl6, &res, strict); 2200 - if (res.f6i == net->ipv6.fib6_null_entry) { 2201 - rt = net->ipv6.ip6_null_entry; 2202 - rcu_read_unlock(); 2203 - dst_hold(&rt->dst); 2204 - return rt; 2205 - } 2203 + if (res.f6i == net->ipv6.fib6_null_entry) 2204 + goto out; 2206 2205 2207 2206 fib6_select_path(net, &res, fl6, oif, false, skb, strict); 2208 2207 2209 2208 /*Search through exception table */ 2210 2209 rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); 2211 2210 if (rt) { 2212 - if (ip6_hold_safe(net, &rt)) 2213 - dst_use_noref(&rt->dst, jiffies); 2214 - 2215 - rcu_read_unlock(); 2216 - return rt; 2211 + goto out; 2217 2212 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && 2218 2213 !res.nh->fib_nh_gw_family)) { 2219 2214 /* Create a RTF_CACHE clone which will not be ··· 2213 2224 * the daddr in the skb during the neighbor look-up is different 2214 2225 * from the fl6->daddr used to look-up route here. 2215 2226 */ 2216 - struct rt6_info *uncached_rt; 2227 + rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); 2217 2228 2218 - uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); 2219 - 2220 - rcu_read_unlock(); 2221 - 2222 - if (uncached_rt) { 2223 - /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() 2224 - * No need for another dst_hold() 2229 + if (rt) { 2230 + /* 1 refcnt is taken during ip6_rt_cache_alloc(). 2231 + * As rt6_uncached_list_add() does not consume refcnt, 2232 + * this refcnt is always returned to the caller even 2233 + * if caller sets RT6_LOOKUP_F_DST_NOREF flag. 2225 2234 */ 2226 - rt6_uncached_list_add(uncached_rt); 2235 + rt6_uncached_list_add(rt); 2227 2236 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2228 - } else { 2229 - uncached_rt = net->ipv6.ip6_null_entry; 2230 - dst_hold(&uncached_rt->dst); 2231 - } 2237 + rcu_read_unlock(); 2232 2238 2233 - return uncached_rt; 2239 + return rt; 2240 + } 2234 2241 } else { 2235 2242 /* Get a percpu copy */ 2236 - 2237 - struct rt6_info *pcpu_rt; 2238 - 2239 2243 local_bh_disable(); 2240 - pcpu_rt = rt6_get_pcpu_route(&res); 2244 + rt = rt6_get_pcpu_route(&res); 2241 2245 2242 - if (!pcpu_rt) 2243 - pcpu_rt = rt6_make_pcpu_route(net, &res); 2246 + if (!rt) 2247 + rt = rt6_make_pcpu_route(net, &res); 2244 2248 2245 2249 local_bh_enable(); 2246 - rcu_read_unlock(); 2247 - 2248 - return pcpu_rt; 2249 2250 } 2251 + out: 2252 + if (!rt) 2253 + rt = net->ipv6.ip6_null_entry; 2254 + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) 2255 + ip6_hold_safe(net, &rt); 2256 + rcu_read_unlock(); 2257 + 2258 + return rt; 2250 2259 } 2251 2260 EXPORT_SYMBOL_GPL(ip6_pol_route); 2252 2261 ··· 2375 2388 return mhash >> 1; 2376 2389 } 2377 2390 2391 + /* Called with rcu held */ 2378 2392 void ip6_route_input(struct sk_buff *skb) 2379 2393 { 2380 2394 const struct ipv6hdr *iph = ipv6_hdr(skb); 2381 2395 struct net *net = dev_net(skb->dev); 2382 - int flags = RT6_LOOKUP_F_HAS_SADDR; 2396 + int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF; 2383 2397 struct ip_tunnel_info *tun_info; 2384 2398 struct flowi6 fl6 = { 2385 2399 .flowi6_iif = skb->dev->ifindex, ··· 2402 2414 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) 2403 2415 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); 2404 2416 skb_dst_drop(skb); 2405 - skb_dst_set(skb, 2406 - ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); 2417 + skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev, 2418 + &fl6, skb, flags)); 2407 2419 } 2408 2420 2409 2421 static struct rt6_info *ip6_pol_route_output(struct net *net, ··· 2415 2427 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); 2416 2428 } 2417 2429 2418 - struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, 2419 - struct flowi6 *fl6, int flags) 2430 + struct dst_entry *ip6_route_output_flags_noref(struct net *net, 2431 + const struct sock *sk, 2432 + struct flowi6 *fl6, int flags) 2420 2433 { 2421 2434 bool any_src; 2422 2435 ··· 2425 2436 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { 2426 2437 struct dst_entry *dst; 2427 2438 2439 + /* This function does not take refcnt on the dst */ 2428 2440 dst = l3mdev_link_scope_lookup(net, fl6); 2429 2441 if (dst) 2430 2442 return dst; ··· 2433 2443 2434 2444 fl6->flowi6_iif = LOOPBACK_IFINDEX; 2435 2445 2446 + flags |= RT6_LOOKUP_F_DST_NOREF; 2436 2447 any_src = ipv6_addr_any(&fl6->saddr); 2437 2448 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || 2438 2449 (fl6->flowi6_oif && any_src)) ··· 2445 2454 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); 2446 2455 2447 2456 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); 2457 + } 2458 + EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref); 2459 + 2460 + struct dst_entry *ip6_route_output_flags(struct net *net, 2461 + const struct sock *sk, 2462 + struct flowi6 *fl6, 2463 + int flags) 2464 + { 2465 + struct dst_entry *dst; 2466 + struct rt6_info *rt6; 2467 + 2468 + rcu_read_lock(); 2469 + dst = ip6_route_output_flags_noref(net, sk, fl6, flags); 2470 + rt6 = (struct rt6_info *)dst; 2471 + /* For dst cached in uncached_list, refcnt is already taken. */ 2472 + if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) { 2473 + dst = &net->ipv6.ip6_null_entry->dst; 2474 + dst_hold(dst); 2475 + } 2476 + rcu_read_unlock(); 2477 + 2478 + return dst; 2448 2479 } 2449 2480 EXPORT_SYMBOL_GPL(ip6_route_output_flags); 2450 2481 ··· 6042 6029 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 6043 6030 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 6044 6031 ip6_template_metrics, true); 6032 + INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached); 6045 6033 6046 6034 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 6047 6035 net->ipv6.fib6_has_custom_rules = false; ··· 6054 6040 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 6055 6041 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 6056 6042 ip6_template_metrics, true); 6043 + INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached); 6057 6044 6058 6045 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 6059 6046 sizeof(*net->ipv6.ip6_blk_hole_entry), ··· 6064 6049 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 6065 6050 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 6066 6051 ip6_template_metrics, true); 6052 + INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached); 6067 6053 #endif 6068 6054 6069 6055 net->ipv6.sysctl.flush_delay = 0;
+3 -4
net/l3mdev/l3mdev.c
··· 118 118 * local and multicast addresses 119 119 * @net: network namespace for device index lookup 120 120 * @fl6: IPv6 flow struct for lookup 121 + * This function does not hold refcnt on the returned dst. 122 + * Caller must hold rcu_read_lock(). 121 123 */ 122 124 123 125 struct dst_entry *l3mdev_link_scope_lookup(struct net *net, ··· 128 126 struct dst_entry *dst = NULL; 129 127 struct net_device *dev; 130 128 129 + WARN_ON_ONCE(!rcu_read_lock_held()); 131 130 if (fl6->flowi6_oif) { 132 - rcu_read_lock(); 133 - 134 131 dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); 135 132 if (dev && netif_is_l3_slave(dev)) 136 133 dev = netdev_master_upper_dev_get_rcu(dev); ··· 137 136 if (dev && netif_is_l3_master(dev) && 138 137 dev->l3mdev_ops->l3mdev_link_scope_lookup) 139 138 dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6); 140 - 141 - rcu_read_unlock(); 142 139 } 143 140 144 141 return dst;