Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ipv4: adopt dst_dev, skb_dst_dev and skb_dst_dev_net[_rcu]

Use the new helpers as a first step to deal with
potential dst->dev races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250630121934.3399505-8-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Authored by Eric Dumazet and committed by Jakub Kicinski
a74fc62e 88fe1425

+43 -38
+1 -1
include/net/inet_hashtables.h
··· 481 481 const int sdif, 482 482 bool *refcounted) 483 483 { 484 - struct net *net = dev_net_rcu(skb_dst(skb)->dev); 484 + struct net *net = skb_dst_dev_net_rcu(skb); 485 485 const struct iphdr *iph = ip_hdr(skb); 486 486 struct sock *sk; 487 487
+6 -5
include/net/ip.h
··· 472 472 473 473 rcu_read_lock(); 474 474 475 - net = dev_net_rcu(dst->dev); 475 + net = dev_net_rcu(dst_dev(dst)); 476 476 if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || 477 477 ip_mtu_locked(dst) || 478 478 !forwarding) { ··· 486 486 if (mtu) 487 487 goto out; 488 488 489 - mtu = READ_ONCE(dst->dev->mtu); 489 + mtu = READ_ONCE(dst_dev(dst)->mtu); 490 490 491 491 if (unlikely(ip_mtu_locked(dst))) { 492 492 if (rt->rt_uses_gateway && mtu > 576) ··· 506 506 static inline unsigned int ip_skb_dst_mtu(struct sock *sk, 507 507 const struct sk_buff *skb) 508 508 { 509 + const struct dst_entry *dst = skb_dst(skb); 509 510 unsigned int mtu; 510 511 511 512 if (!sk || !sk_fullsock(sk) || ip_sk_use_pmtu(sk)) { 512 513 bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED; 513 514 514 - return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding); 515 + return ip_dst_mtu_maybe_forward(dst, forwarding); 515 516 } 516 517 517 - mtu = min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); 518 - return mtu - lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu); 518 + mtu = min(READ_ONCE(dst_dev(dst)->mtu), IP_MAX_MTU); 519 + return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 519 520 } 520 521 521 522 struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len,
+1 -1
include/net/route.h
··· 390 390 const struct net *net; 391 391 392 392 rcu_read_lock(); 393 - net = dev_net_rcu(dst->dev); 393 + net = dev_net_rcu(dst_dev(dst)); 394 394 hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); 395 395 rcu_read_unlock(); 396 396 }
+13 -11
net/ipv4/icmp.c
··· 311 311 { 312 312 struct dst_entry *dst = &rt->dst; 313 313 struct inet_peer *peer; 314 + struct net_device *dev; 314 315 bool rc = true; 315 316 316 317 if (!apply_ratelimit) 317 318 return true; 318 319 319 320 /* No rate limit on loopback */ 320 - if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) 321 + dev = dst_dev(dst); 322 + if (dev && (dev->flags & IFF_LOOPBACK)) 321 323 goto out; 322 324 323 325 rcu_read_lock(); 324 326 peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 325 - l3mdev_master_ifindex_rcu(dst->dev)); 327 + l3mdev_master_ifindex_rcu(dev)); 326 328 rc = inet_peer_xrlim_allow(peer, 327 329 READ_ONCE(net->ipv4.sysctl_icmp_ratelimit)); 328 330 rcu_read_unlock(); ··· 468 466 */ 469 467 static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb) 470 468 { 471 - struct net_device *route_lookup_dev = NULL; 469 + struct net_device *dev = skb->dev; 470 + const struct dst_entry *dst; 472 471 473 - if (skb->dev) 474 - route_lookup_dev = skb->dev; 475 - else if (skb_dst(skb)) 476 - route_lookup_dev = skb_dst(skb)->dev; 477 - return route_lookup_dev; 472 + if (dev) 473 + return dev; 474 + dst = skb_dst(skb); 475 + return dst ? dst_dev(dst) : NULL; 478 476 } 479 477 480 478 static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, ··· 871 869 struct net *net; 872 870 u32 info = 0; 873 871 874 - net = dev_net_rcu(skb_dst(skb)->dev); 872 + net = skb_dst_dev_net_rcu(skb); 875 873 876 874 /* 877 875 * Incomplete header ? ··· 1014 1012 struct icmp_bxm icmp_param; 1015 1013 struct net *net; 1016 1014 1017 - net = dev_net_rcu(skb_dst(skb)->dev); 1015 + net = skb_dst_dev_net_rcu(skb); 1018 1016 /* should there be an ICMP stat for ignored echos? */ 1019 1017 if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all)) 1020 1018 return SKB_NOT_DROPPED_YET; ··· 1184 1182 return SKB_NOT_DROPPED_YET; 1185 1183 1186 1184 out_err: 1187 - __ICMP_INC_STATS(dev_net_rcu(skb_dst(skb)->dev), ICMP_MIB_INERRORS); 1185 + __ICMP_INC_STATS(skb_dst_dev_net_rcu(skb), ICMP_MIB_INERRORS); 1188 1186 return SKB_DROP_REASON_PKT_TOO_SMALL; 1189 1187 } 1190 1188
+1 -1
net/ipv4/igmp.c
··· 427 427 428 428 pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); 429 429 430 - return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); 430 + return ip_local_out(skb_dst_dev_net(skb), skb->sk, skb); 431 431 } 432 432 433 433 static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
+1 -1
net/ipv4/ip_fragment.c
··· 476 476 /* Process an incoming IP datagram fragment. */ 477 477 int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) 478 478 { 479 - struct net_device *dev = skb->dev ? : skb_dst(skb)->dev; 479 + struct net_device *dev = skb->dev ? : skb_dst_dev(skb); 480 480 int vif = l3mdev_master_ifindex_rcu(dev); 481 481 struct ipq *qp; 482 482
+3 -3
net/ipv4/ip_output.c
··· 116 116 skb->protocol = htons(ETH_P_IP); 117 117 118 118 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, 119 - net, sk, skb, NULL, skb_dst(skb)->dev, 119 + net, sk, skb, NULL, skb_dst_dev(skb), 120 120 dst_output); 121 121 } 122 122 ··· 199 199 { 200 200 struct dst_entry *dst = skb_dst(skb); 201 201 struct rtable *rt = dst_rtable(dst); 202 - struct net_device *dev = dst->dev; 202 + struct net_device *dev = dst_dev(dst); 203 203 unsigned int hh_len = LL_RESERVED_SPACE(dev); 204 204 struct neighbour *neigh; 205 205 bool is_v6gw = false; ··· 425 425 426 426 int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) 427 427 { 428 - struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; 428 + struct net_device *dev = skb_dst_dev(skb), *indev = skb->dev; 429 429 430 430 skb->dev = dev; 431 431 skb->protocol = htons(ETH_P_IP);
+2 -2
net/ipv4/ip_vti.c
··· 229 229 goto tx_error_icmp; 230 230 } 231 231 232 - tdev = dst->dev; 232 + tdev = dst_dev(dst); 233 233 234 234 if (tdev == dev) { 235 235 dst_release(dst); ··· 259 259 xmit: 260 260 skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); 261 261 skb_dst_set(skb, dst); 262 - skb->dev = skb_dst(skb)->dev; 262 + skb->dev = skb_dst_dev(skb); 263 263 264 264 err = dst_output(tunnel->net, skb->sk, skb); 265 265 if (net_xmit_eval(err) == 0)
+2 -2
net/ipv4/netfilter.c
··· 20 20 /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ 21 21 int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int addr_type) 22 22 { 23 + struct net_device *dev = skb_dst_dev(skb); 23 24 const struct iphdr *iph = ip_hdr(skb); 24 25 struct rtable *rt; 25 26 struct flowi4 fl4 = {}; 26 27 __be32 saddr = iph->saddr; 27 28 __u8 flags; 28 - struct net_device *dev = skb_dst(skb)->dev; 29 29 struct flow_keys flkeys; 30 30 unsigned int hh_len; 31 31 ··· 74 74 #endif 75 75 76 76 /* Change in oif may mean change in hh_len. */ 77 - hh_len = skb_dst(skb)->dev->hard_header_len; 77 + hh_len = skb_dst_dev(skb)->hard_header_len; 78 78 if (skb_headroom(skb) < hh_len && 79 79 pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), 80 80 0, GFP_ATOMIC))
+4 -4
net/ipv4/route.c
··· 413 413 const void *daddr) 414 414 { 415 415 const struct rtable *rt = container_of(dst, struct rtable, dst); 416 - struct net_device *dev = dst->dev; 416 + struct net_device *dev = dst_dev(dst); 417 417 struct neighbour *n; 418 418 419 419 rcu_read_lock(); ··· 440 440 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) 441 441 { 442 442 const struct rtable *rt = container_of(dst, struct rtable, dst); 443 - struct net_device *dev = dst->dev; 443 + struct net_device *dev = dst_dev(dst); 444 444 const __be32 *pkey = daddr; 445 445 446 446 if (rt->rt_gw_family == AF_INET) { ··· 1026 1026 return; 1027 1027 1028 1028 rcu_read_lock(); 1029 - net = dev_net_rcu(dst->dev); 1029 + net = dev_net_rcu(dst_dev(dst)); 1030 1030 if (mtu < net->ipv4.ip_rt_min_pmtu) { 1031 1031 lock = true; 1032 1032 mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); ··· 1326 1326 struct net *net; 1327 1327 1328 1328 rcu_read_lock(); 1329 - net = dev_net_rcu(dst->dev); 1329 + net = dev_net_rcu(dst_dev(dst)); 1330 1330 advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, 1331 1331 net->ipv4.ip_rt_min_advmss); 1332 1332 rcu_read_unlock();
+3 -1
net/ipv4/tcp_fastopen.c
··· 559 559 void tcp_fastopen_active_disable_ofo_check(struct sock *sk) 560 560 { 561 561 struct tcp_sock *tp = tcp_sk(sk); 562 + struct net_device *dev; 562 563 struct dst_entry *dst; 563 564 struct sk_buff *skb; 564 565 ··· 577 576 } else if (tp->syn_fastopen_ch && 578 577 atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { 579 578 dst = sk_dst_get(sk); 580 - if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) 579 + dev = dst ? dst_dev(dst) : NULL; 580 + if (!(dev && (dev->flags & IFF_LOOPBACK))) 581 581 atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); 582 582 dst_release(dst); 583 583 }
+1 -1
net/ipv4/tcp_ipv4.c
··· 788 788 arg.iov[0].iov_base = (unsigned char *)&rep; 789 789 arg.iov[0].iov_len = sizeof(rep.th); 790 790 791 - net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); 791 + net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); 792 792 793 793 /* Invalid TCP option size or twice included auth */ 794 794 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
+4 -4
net/ipv4/tcp_metrics.c
··· 166 166 unsigned int hash) 167 167 { 168 168 struct tcp_metrics_block *tm; 169 - struct net *net; 170 169 bool reclaim = false; 170 + struct net *net; 171 171 172 172 spin_lock_bh(&tcp_metrics_lock); 173 - net = dev_net_rcu(dst->dev); 173 + net = dev_net_rcu(dst_dev(dst)); 174 174 175 175 /* While waiting for the spin-lock the cache might have been populated 176 176 * with this entry and so we have to check again. ··· 273 273 return NULL; 274 274 } 275 275 276 - net = dev_net_rcu(dst->dev); 276 + net = dev_net_rcu(dst_dev(dst)); 277 277 hash ^= net_hash_mix(net); 278 278 hash = hash_32(hash, tcp_metrics_hash_log); 279 279 ··· 318 318 else 319 319 return NULL; 320 320 321 - net = dev_net_rcu(dst->dev); 321 + net = dev_net_rcu(dst_dev(dst)); 322 322 hash ^= net_hash_mix(net); 323 323 hash = hash_32(hash, tcp_metrics_hash_log); 324 324
+1 -1
net/ipv4/xfrm4_output.c
··· 31 31 int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) 32 32 { 33 33 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, 34 - net, sk, skb, skb->dev, skb_dst(skb)->dev, 34 + net, sk, skb, skb->dev, skb_dst_dev(skb), 35 35 __xfrm4_output, 36 36 !(IPCB(skb)->flags & IPSKB_REROUTED)); 37 37 }