Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'net-first-round-to-use-dev_net_rcu'

Eric Dumazet says:

====================
net: first round to use dev_net_rcu()

dev_net(dev) should be protected by either RTNL or RCU.

There is no LOCKDEP support yet for this helper.

Adding it would trigger too many splats.

Instead, add dev_net_rcu() for rcu_read_lock() contexts
and start to use it to fix bugs and clearly document the
safety requirements.

v4: https://lore.kernel.org/CANn89i+AozhFhZNK0Y4e_EqXV1=yKjGuvf43Wa6JJKWMOixWQQ@mail.gmail.com
v3: https://lore.kernel.org/20250203153633.46ce0337@kernel.org/
====================

Link: https://patch.msgid.link/20250205155120.1676781-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+113 -65
+6
include/linux/netdevice.h
··· 2664 2664 } 2665 2665 2666 2666 static inline 2667 + struct net *dev_net_rcu(const struct net_device *dev) 2668 + { 2669 + return read_pnet_rcu(&dev->nd_net); 2670 + } 2671 + 2672 + static inline 2667 2673 void dev_net_set(struct net_device *dev, struct net *net) 2668 2674 { 2669 2675 write_pnet(&dev->nd_net, net);
+10 -3
include/net/ip.h
··· 471 471 bool forwarding) 472 472 { 473 473 const struct rtable *rt = dst_rtable(dst); 474 - struct net *net = dev_net(dst->dev); 475 - unsigned int mtu; 474 + unsigned int mtu, res; 475 + struct net *net; 476 476 477 + rcu_read_lock(); 478 + 479 + net = dev_net_rcu(dst->dev); 477 480 if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || 478 481 ip_mtu_locked(dst) || 479 482 !forwarding) { ··· 500 497 out: 501 498 mtu = min_t(unsigned int, mtu, IP_MAX_MTU); 502 499 503 - return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 500 + res = mtu - lwtunnel_headroom(dst->lwtstate, mtu); 501 + 502 + rcu_read_unlock(); 503 + 504 + return res; 504 505 } 505 506 506 507 static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
+1 -1
include/net/net_namespace.h
··· 398 398 #endif 399 399 } 400 400 401 - static inline struct net *read_pnet_rcu(possible_net_t *pnet) 401 + static inline struct net *read_pnet_rcu(const possible_net_t *pnet) 402 402 { 403 403 #ifdef CONFIG_NET_NS 404 404 return rcu_dereference(pnet->net);
+7 -2
include/net/route.h
··· 382 382 static inline int ip4_dst_hoplimit(const struct dst_entry *dst) 383 383 { 384 384 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); 385 - struct net *net = dev_net(dst->dev); 386 385 387 - if (hoplimit == 0) 386 + if (hoplimit == 0) { 387 + const struct net *net; 388 + 389 + rcu_read_lock(); 390 + net = dev_net_rcu(dst->dev); 388 391 hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); 392 + rcu_read_unlock(); 393 + } 389 394 return hoplimit; 390 395 } 391 396
+11 -10
net/core/flow_dissector.c
··· 1108 1108 FLOW_DISSECTOR_KEY_BASIC, 1109 1109 target_container); 1110 1110 1111 + rcu_read_lock(); 1112 + 1111 1113 if (skb) { 1112 1114 if (!net) { 1113 1115 if (skb->dev) 1114 - net = dev_net(skb->dev); 1116 + net = dev_net_rcu(skb->dev); 1115 1117 else if (skb->sk) 1116 1118 net = sock_net(skb->sk); 1117 1119 } ··· 1124 1122 enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; 1125 1123 struct bpf_prog_array *run_array; 1126 1124 1127 - rcu_read_lock(); 1128 1125 run_array = rcu_dereference(init_net.bpf.run_array[type]); 1129 1126 if (!run_array) 1130 1127 run_array = rcu_dereference(net->bpf.run_array[type]); ··· 1151 1150 prog = READ_ONCE(run_array->items[0].prog); 1152 1151 result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, 1153 1152 hlen, flags); 1154 - if (result == BPF_FLOW_DISSECTOR_CONTINUE) 1155 - goto dissect_continue; 1156 - __skb_flow_bpf_to_target(&flow_keys, flow_dissector, 1157 - target_container); 1158 - rcu_read_unlock(); 1159 - return result == BPF_OK; 1153 + if (result != BPF_FLOW_DISSECTOR_CONTINUE) { 1154 + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, 1155 + target_container); 1156 + rcu_read_unlock(); 1157 + return result == BPF_OK; 1158 + } 1160 1159 } 1161 - dissect_continue: 1162 - rcu_read_unlock(); 1163 1160 } 1161 + 1162 + rcu_read_unlock(); 1164 1163 1165 1164 if (dissector_uses_key(flow_dissector, 1166 1165 FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
+2 -1
net/ipv4/devinet.c
··· 1371 1371 __be32 addr = 0; 1372 1372 unsigned char localnet_scope = RT_SCOPE_HOST; 1373 1373 struct in_device *in_dev; 1374 - struct net *net = dev_net(dev); 1374 + struct net *net; 1375 1375 int master_idx; 1376 1376 1377 1377 rcu_read_lock(); 1378 + net = dev_net_rcu(dev); 1378 1379 in_dev = __in_dev_get_rcu(dev); 1379 1380 if (!in_dev) 1380 1381 goto no_in_dev;
+17 -14
net/ipv4/icmp.c
··· 399 399 400 400 static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) 401 401 { 402 - struct ipcm_cookie ipc; 403 402 struct rtable *rt = skb_rtable(skb); 404 - struct net *net = dev_net(rt->dst.dev); 403 + struct net *net = dev_net_rcu(rt->dst.dev); 405 404 bool apply_ratelimit = false; 405 + struct ipcm_cookie ipc; 406 406 struct flowi4 fl4; 407 407 struct sock *sk; 408 408 struct inet_sock *inet; ··· 608 608 struct sock *sk; 609 609 610 610 if (!rt) 611 - goto out; 611 + return; 612 + 613 + rcu_read_lock(); 612 614 613 615 if (rt->dst.dev) 614 - net = dev_net(rt->dst.dev); 616 + net = dev_net_rcu(rt->dst.dev); 615 617 else if (skb_in->dev) 616 - net = dev_net(skb_in->dev); 618 + net = dev_net_rcu(skb_in->dev); 617 619 else 618 620 goto out; 619 621 ··· 787 785 icmp_xmit_unlock(sk); 788 786 out_bh_enable: 789 787 local_bh_enable(); 790 - out:; 788 + out: 789 + rcu_read_unlock(); 791 790 } 792 791 EXPORT_SYMBOL(__icmp_send); 793 792 ··· 837 834 * avoid additional coding at protocol handlers. 838 835 */ 839 836 if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) { 840 - __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); 837 + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); 841 838 return; 842 839 } 843 840 ··· 871 868 struct net *net; 872 869 u32 info = 0; 873 870 874 - net = dev_net(skb_dst(skb)->dev); 871 + net = dev_net_rcu(skb_dst(skb)->dev); 875 872 876 873 /* 877 874 * Incomplete header ? ··· 982 979 static enum skb_drop_reason icmp_redirect(struct sk_buff *skb) 983 980 { 984 981 if (skb->len < sizeof(struct iphdr)) { 985 - __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); 982 + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); 986 983 return SKB_DROP_REASON_PKT_TOO_SMALL; 987 984 } 988 985 ··· 1014 1011 struct icmp_bxm icmp_param; 1015 1012 struct net *net; 1016 1013 1017 - net = dev_net(skb_dst(skb)->dev); 1014 + net = dev_net_rcu(skb_dst(skb)->dev); 1018 1015 /* should there be an ICMP stat for ignored echos? 
*/ 1019 1016 if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all)) 1020 1017 return SKB_NOT_DROPPED_YET; ··· 1043 1040 1044 1041 bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) 1045 1042 { 1043 + struct net *net = dev_net_rcu(skb->dev); 1046 1044 struct icmp_ext_hdr *ext_hdr, _ext_hdr; 1047 1045 struct icmp_ext_echo_iio *iio, _iio; 1048 - struct net *net = dev_net(skb->dev); 1049 1046 struct inet6_dev *in6_dev; 1050 1047 struct in_device *in_dev; 1051 1048 struct net_device *dev; ··· 1184 1181 return SKB_NOT_DROPPED_YET; 1185 1182 1186 1183 out_err: 1187 - __ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); 1184 + __ICMP_INC_STATS(dev_net_rcu(skb_dst(skb)->dev), ICMP_MIB_INERRORS); 1188 1185 return SKB_DROP_REASON_PKT_TOO_SMALL; 1189 1186 } 1190 1187 ··· 1201 1198 { 1202 1199 enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; 1203 1200 struct rtable *rt = skb_rtable(skb); 1204 - struct net *net = dev_net(rt->dst.dev); 1201 + struct net *net = dev_net_rcu(rt->dst.dev); 1205 1202 struct icmphdr *icmph; 1206 1203 1207 1204 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { ··· 1374 1371 struct iphdr *iph = (struct iphdr *)skb->data; 1375 1372 int offset = iph->ihl<<2; 1376 1373 struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset); 1374 + struct net *net = dev_net_rcu(skb->dev); 1377 1375 int type = icmp_hdr(skb)->type; 1378 1376 int code = icmp_hdr(skb)->code; 1379 - struct net *net = dev_net(skb->dev); 1380 1377 1381 1378 /* 1382 1379 * Use ping_err to handle all icmp errors except those
+21 -9
net/ipv4/route.c
··· 390 390 391 391 static inline bool rt_is_expired(const struct rtable *rth) 392 392 { 393 - return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev)); 393 + bool res; 394 + 395 + rcu_read_lock(); 396 + res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev)); 397 + rcu_read_unlock(); 398 + 399 + return res; 394 400 } 395 401 396 402 void rt_cache_flush(struct net *net) ··· 1008 1002 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) 1009 1003 { 1010 1004 struct dst_entry *dst = &rt->dst; 1011 - struct net *net = dev_net(dst->dev); 1012 1005 struct fib_result res; 1013 1006 bool lock = false; 1007 + struct net *net; 1014 1008 u32 old_mtu; 1015 1009 1016 1010 if (ip_mtu_locked(dst)) ··· 1020 1014 if (old_mtu < mtu) 1021 1015 return; 1022 1016 1017 + rcu_read_lock(); 1018 + net = dev_net_rcu(dst->dev); 1023 1019 if (mtu < net->ipv4.ip_rt_min_pmtu) { 1024 1020 lock = true; 1025 1021 mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); ··· 1029 1021 1030 1022 if (rt->rt_pmtu == mtu && !lock && 1031 1023 time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2)) 1032 - return; 1024 + goto out; 1033 1025 1034 - rcu_read_lock(); 1035 1026 if (fib_lookup(net, fl4, &res, 0) == 0) { 1036 1027 struct fib_nh_common *nhc; 1037 1028 ··· 1044 1037 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, 1045 1038 jiffies + net->ipv4.ip_rt_mtu_expires); 1046 1039 } 1047 - rcu_read_unlock(); 1048 - return; 1040 + goto out; 1049 1041 } 1050 1042 #endif /* CONFIG_IP_ROUTE_MULTIPATH */ 1051 1043 nhc = FIB_RES_NHC(res); 1052 1044 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, 1053 1045 jiffies + net->ipv4.ip_rt_mtu_expires); 1054 1046 } 1047 + out: 1055 1048 rcu_read_unlock(); 1056 1049 } 1057 1050 ··· 1314 1307 1315 1308 static unsigned int ipv4_default_advmss(const struct dst_entry *dst) 1316 1309 { 1317 - struct net *net = dev_net(dst->dev); 1318 1310 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); 1319 - 
unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, 1320 - net->ipv4.ip_rt_min_advmss); 1311 + unsigned int advmss; 1312 + struct net *net; 1313 + 1314 + rcu_read_lock(); 1315 + net = dev_net_rcu(dst->dev); 1316 + advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, 1317 + net->ipv4.ip_rt_min_advmss); 1318 + rcu_read_unlock(); 1321 1319 1322 1320 return min(advmss, IPV4_MAX_PMTU - header_size); 1323 1321 }
+23 -19
net/ipv6/icmp.c
··· 76 76 { 77 77 /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ 78 78 struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset); 79 - struct net *net = dev_net(skb->dev); 79 + struct net *net = dev_net_rcu(skb->dev); 80 80 81 81 if (type == ICMPV6_PKT_TOOBIG) 82 82 ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); ··· 473 473 474 474 if (!skb->dev) 475 475 return; 476 - net = dev_net(skb->dev); 476 + 477 + rcu_read_lock(); 478 + 479 + net = dev_net_rcu(skb->dev); 477 480 mark = IP6_REPLY_MARK(net, skb->mark); 478 481 /* 479 482 * Make sure we respect the rules ··· 499 496 !(type == ICMPV6_PARAMPROB && 500 497 code == ICMPV6_UNK_OPTION && 501 498 (opt_unrec(skb, info)))) 502 - return; 499 + goto out; 503 500 504 501 saddr = NULL; 505 502 } ··· 529 526 if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { 530 527 net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n", 531 528 &hdr->saddr, &hdr->daddr); 532 - return; 529 + goto out; 533 530 } 534 531 535 532 /* ··· 538 535 if (is_ineligible(skb)) { 539 536 net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n", 540 537 &hdr->saddr, &hdr->daddr); 541 - return; 538 + goto out; 542 539 } 543 540 544 541 /* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */ ··· 585 582 np = inet6_sk(sk); 586 583 587 584 if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit)) 588 - goto out; 585 + goto out_unlock; 589 586 590 587 tmp_hdr.icmp6_type = type; 591 588 tmp_hdr.icmp6_code = code; ··· 603 600 604 601 dst = icmpv6_route_lookup(net, skb, sk, &fl6); 605 602 if (IS_ERR(dst)) 606 - goto out; 603 + goto out_unlock; 607 604 608 605 ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); 609 606 ··· 619 616 goto out_dst_release; 620 617 } 621 618 622 - rcu_read_lock(); 623 619 idev = __in6_dev_get(skb->dev); 624 620 625 621 if (ip6_append_data(sk, icmpv6_getfrag, &msg, ··· 632 630 icmpv6_push_pending_frames(sk, 
&fl6, &tmp_hdr, 633 631 len + sizeof(struct icmp6hdr)); 634 632 } 635 - rcu_read_unlock(); 633 + 636 634 out_dst_release: 637 635 dst_release(dst); 638 - out: 636 + out_unlock: 639 637 icmpv6_xmit_unlock(sk); 640 638 out_bh_enable: 641 639 local_bh_enable(); 640 + out: 641 + rcu_read_unlock(); 642 642 } 643 643 EXPORT_SYMBOL(icmp6_send); 644 644 ··· 683 679 skb_pull(skb2, nhs); 684 680 skb_reset_network_header(skb2); 685 681 686 - rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 687 - skb, 0); 682 + rt = rt6_lookup(dev_net_rcu(skb->dev), &ipv6_hdr(skb2)->saddr, 683 + NULL, 0, skb, 0); 688 684 689 685 if (rt && rt->dst.dev) 690 686 skb2->dev = rt->dst.dev; ··· 721 717 722 718 static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) 723 719 { 724 - struct net *net = dev_net(skb->dev); 720 + struct net *net = dev_net_rcu(skb->dev); 725 721 struct sock *sk; 726 722 struct inet6_dev *idev; 727 723 struct ipv6_pinfo *np; ··· 836 832 u8 code, __be32 info) 837 833 { 838 834 struct inet6_skb_parm *opt = IP6CB(skb); 839 - struct net *net = dev_net(skb->dev); 835 + struct net *net = dev_net_rcu(skb->dev); 840 836 const struct inet6_protocol *ipprot; 841 837 enum skb_drop_reason reason; 842 838 int inner_offset; ··· 893 889 static int icmpv6_rcv(struct sk_buff *skb) 894 890 { 895 891 enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; 896 - struct net *net = dev_net(skb->dev); 892 + struct net *net = dev_net_rcu(skb->dev); 897 893 struct net_device *dev = icmp6_dev(skb); 898 894 struct inet6_dev *idev = __in6_dev_get(dev); 899 895 const struct in6_addr *saddr, *daddr; ··· 925 921 skb_set_network_header(skb, nh); 926 922 } 927 923 928 - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INMSGS); 924 + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INMSGS); 929 925 930 926 saddr = &ipv6_hdr(skb)->saddr; 931 927 daddr = &ipv6_hdr(skb)->daddr; ··· 943 939 944 940 type = hdr->icmp6_type; 945 941 946 - ICMP6MSGIN_INC_STATS(dev_net(dev), idev, 
type); 942 + ICMP6MSGIN_INC_STATS(dev_net_rcu(dev), idev, type); 947 943 948 944 switch (type) { 949 945 case ICMPV6_ECHO_REQUEST: ··· 1038 1034 1039 1035 csum_error: 1040 1036 reason = SKB_DROP_REASON_ICMP_CSUM; 1041 - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS); 1037 + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_CSUMERRORS); 1042 1038 discard_it: 1043 - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS); 1039 + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INERRORS); 1044 1040 drop_no_count: 1045 1041 kfree_skb_reason(skb, reason); 1046 1042 return 0;
+9 -5
net/ipv6/ip6_input.c
··· 477 477 static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) 478 478 { 479 479 skb_clear_delivery_time(skb); 480 - rcu_read_lock(); 481 480 ip6_protocol_deliver_rcu(net, skb, 0, false); 482 - rcu_read_unlock(); 483 481 484 482 return 0; 485 483 } ··· 485 487 486 488 int ip6_input(struct sk_buff *skb) 487 489 { 488 - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, 489 - dev_net(skb->dev), NULL, skb, skb->dev, NULL, 490 - ip6_input_finish); 490 + int res; 491 + 492 + rcu_read_lock(); 493 + res = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, 494 + dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL, 495 + ip6_input_finish); 496 + rcu_read_unlock(); 497 + 498 + return res; 491 499 } 492 500 EXPORT_SYMBOL_GPL(ip6_input); 493 501
+6 -1
net/ipv6/route.c
··· 3196 3196 { 3197 3197 struct net_device *dev = dst->dev; 3198 3198 unsigned int mtu = dst_mtu(dst); 3199 - struct net *net = dev_net(dev); 3199 + struct net *net; 3200 3200 3201 3201 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 3202 3202 3203 + rcu_read_lock(); 3204 + 3205 + net = dev_net_rcu(dev); 3203 3206 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 3204 3207 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 3208 + 3209 + rcu_read_unlock(); 3205 3210 3206 3211 /* 3207 3212 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and