Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ipv4: Fix input route performance regression.

With the routing cache removal we lost the "noref" code paths on
input, and this can kill some routing workloads.

Reinstate the noref path when we hit a cached route in the FIB
nexthops.

With help from Eric Dumazet.

Reported-by: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

+48 -37
+17 -2
include/net/route.h
··· 30 30 #include <net/inet_sock.h> 31 31 #include <linux/in_route.h> 32 32 #include <linux/rtnetlink.h> 33 + #include <linux/rcupdate.h> 33 34 #include <linux/route.h> 34 35 #include <linux/ip.h> 35 36 #include <linux/cache.h> ··· 158 157 return ip_route_output_key(net, fl4); 159 158 } 160 159 161 - extern int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, 162 - u8 tos, struct net_device *devin); 160 + extern int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, 161 + u8 tos, struct net_device *devin); 162 + 163 + static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, 164 + u8 tos, struct net_device *devin) 165 + { 166 + int err; 167 + 168 + rcu_read_lock(); 169 + err = ip_route_input_noref(skb, dst, src, tos, devin); 170 + if (!err) 171 + skb_dst_force(skb); 172 + rcu_read_unlock(); 173 + 174 + return err; 175 + } 163 176 164 177 extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, 165 178 int oif, u32 mark, u8 protocol, int flow_flags);
+1 -1
net/ipv4/arp.c
··· 827 827 } 828 828 829 829 if (arp->ar_op == htons(ARPOP_REQUEST) && 830 - ip_route_input(skb, tip, sip, 0, dev) == 0) { 830 + ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { 831 831 832 832 rt = skb_rtable(skb); 833 833 addr_type = rt->rt_type;
+2 -2
net/ipv4/fib_semantics.c
··· 172 172 if (nexthop_nh->nh_exceptions) 173 173 free_nh_exceptions(nexthop_nh); 174 174 if (nexthop_nh->nh_rth_output) 175 - dst_release(&nexthop_nh->nh_rth_output->dst); 175 + dst_free(&nexthop_nh->nh_rth_output->dst); 176 176 if (nexthop_nh->nh_rth_input) 177 - dst_release(&nexthop_nh->nh_rth_input->dst); 177 + dst_free(&nexthop_nh->nh_rth_input->dst); 178 178 } endfor_nexthops(fi); 179 179 180 180 release_net(fi->fib_net);
+2 -2
net/ipv4/ip_fragment.c
··· 258 258 /* skb dst is stale, drop it, and perform route lookup again */ 259 259 skb_dst_drop(head); 260 260 iph = ip_hdr(head); 261 - err = ip_route_input(head, iph->daddr, iph->saddr, 262 - iph->tos, head->dev); 261 + err = ip_route_input_noref(head, iph->daddr, iph->saddr, 262 + iph->tos, head->dev); 263 263 if (err) 264 264 goto out_rcu_unlock; 265 265
+2 -2
net/ipv4/ip_input.c
··· 339 339 * how the packet travels inside Linux networking. 340 340 */ 341 341 if (!skb_dst(skb)) { 342 - int err = ip_route_input(skb, iph->daddr, iph->saddr, 343 - iph->tos, skb->dev); 342 + int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, 343 + iph->tos, skb->dev); 344 344 if (unlikely(err)) { 345 345 if (err == -EXDEV) 346 346 NET_INC_STATS_BH(dev_net(skb->dev),
+22 -26
net/ipv4/route.c
··· 1199 1199 fnhe->fnhe_stamp = jiffies; 1200 1200 } 1201 1201 1202 - static inline void rt_release_rcu(struct rcu_head *head) 1202 + static inline void rt_free(struct rtable *rt) 1203 1203 { 1204 - struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); 1205 - dst_release(dst); 1204 + call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); 1206 1205 } 1207 1206 1208 1207 static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) ··· 1215 1216 1216 1217 prev = cmpxchg(p, orig, rt); 1217 1218 if (prev == orig) { 1218 - dst_clone(&rt->dst); 1219 1219 if (orig) 1220 - call_rcu_bh(&orig->dst.rcu_head, rt_release_rcu); 1220 + rt_free(orig); 1221 + } else { 1222 + /* Routes we intend to cache in the FIB nexthop have 1223 + * the DST_NOCACHE bit clear. However, if we are 1224 + * unsuccessful at storing this route into the cache 1225 + * we really need to set it. 1226 + */ 1227 + rt->dst.flags |= DST_NOCACHE; 1221 1228 } 1222 1229 } ··· 1250 1245 #ifdef CONFIG_IP_ROUTE_CLASSID 1251 1246 rt->dst.tclassid = nh->nh_tclassid; 1252 1247 #endif 1253 - if (!(rt->dst.flags & DST_HOST)) 1248 + if (!(rt->dst.flags & DST_NOCACHE)) 1254 1249 rt_cache_route(nh, rt); 1255 1250 } ··· 1266 1261 bool nopolicy, bool noxfrm, bool will_cache) 1267 1262 { 1268 1263 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, 1269 - (will_cache ? 0 : DST_HOST) | DST_NOCACHE | 1264 + (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | 1270 1265 (nopolicy ? DST_NOPOLICY : 0) | 1271 1266 (noxfrm ? DST_NOXFRM : 0)); 1272 1267 } ··· 1371 1366 static int __mkroute_input(struct sk_buff *skb, 1372 1367 const struct fib_result *res, 1373 1368 struct in_device *in_dev, 1374 - __be32 daddr, __be32 saddr, u32 tos, 1375 - struct rtable **result) 1369 + __be32 daddr, __be32 saddr, u32 tos) 1376 1370 { 1377 1371 struct rtable *rth; 1378 1372 int err; ··· 1422 1418 if (!itag) { 1423 1419 rth = FIB_RES_NH(*res).nh_rth_input; 1424 1420 if (rt_cache_valid(rth)) { 1425 - dst_hold(&rth->dst); 1421 + skb_dst_set_noref(skb, &rth->dst); 1426 1422 goto out; 1427 1423 } 1428 1424 do_cache = true; ··· 1449 1445 rth->dst.output = ip_output; 1450 1446 1451 1447 rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag); 1448 + skb_dst_set(skb, &rth->dst); 1452 1449 out: 1453 - *result = rth; 1454 1450 err = 0; 1455 1451 cleanup: 1456 1452 return err; ··· 1462 1458 struct in_device *in_dev, 1463 1459 __be32 daddr, __be32 saddr, u32 tos) 1464 1460 { 1465 - struct rtable *rth = NULL; 1466 - int err; 1467 - 1468 1461 #ifdef CONFIG_IP_ROUTE_MULTIPATH 1469 1462 if (res->fi && res->fi->fib_nhs > 1) 1470 1463 fib_select_multipath(res); 1471 1464 #endif 1472 1465 1473 1466 /* create a routing cache entry */ 1474 - err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); 1475 - if (err) 1476 - return err; 1477 - 1478 - skb_dst_set(skb, &rth->dst); 1479 - return 0; 1467 + return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); 1480 1468 } 1481 1469 1482 1470 /* ··· 1584 1588 if (!itag) { 1585 1589 rth = FIB_RES_NH(res).nh_rth_input; 1586 1590 if (rt_cache_valid(rth)) { 1587 - dst_hold(&rth->dst); 1588 - goto set_and_out; 1591 + skb_dst_set_noref(skb, &rth->dst); 1592 + err = 0; 1593 + goto out; 1589 1594 } 1590 1595 do_cache = true; 1591 1596 } ··· 1617 1620 } 1618 1621 if (do_cache) 1619 1622 rt_cache_route(&FIB_RES_NH(res), rth); 1620 - set_and_out: 1621 1623 skb_dst_set(skb, &rth->dst); 1622 1624 err = 0; 1623 1625 goto out; ··· 1654 1658 goto out; 1655 1659 } 1656 1660 1657 - int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, 1658 - u8 tos, struct net_device *dev) 1661 + int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, 1662 + u8 tos, struct net_device *dev) 1659 1663 { 1660 1664 int res; 1661 1665 ··· 1698 1702 rcu_read_unlock(); 1699 1703 return res; 1700 1704 } 1701 - EXPORT_SYMBOL(ip_route_input); 1705 + EXPORT_SYMBOL(ip_route_input_noref); 1702 1706 1703 1707 /* called with rcu_read_lock() */ 1704 1708 static struct rtable *__mkroute_output(const struct fib_result *res,
+2 -2
net/ipv4/xfrm4_input.c
··· 27 27 if (skb_dst(skb) == NULL) { 28 28 const struct iphdr *iph = ip_hdr(skb); 29 29 30 - if (ip_route_input(skb, iph->daddr, iph->saddr, 31 - iph->tos, skb->dev)) 30 + if (ip_route_input_noref(skb, iph->daddr, iph->saddr, 31 + iph->tos, skb->dev)) 32 32 goto drop; 33 33 } 34 34 return dst_input(skb);