Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: don't hold rtnl mutex during netlink dump callbacks

Four years ago, Patrick made a change to hold rtnl mutex during netlink
dump callbacks.

I believe it was a wrong move. This slows down concurrent dumps, making
good old /proc/net/ files faster than rtnetlink in some situations.

This occurred to me because one "ip link show dev ..." was _very_ slow
on a workload adding/removing network devices in the background.

All dump callbacks are able to use RCU locking now, so this patch is
roughly a revert of commits:

1c2d670f366 : [RTNETLINK]: Hold rtnl_mutex during netlink dump callbacks
6313c1e0992 : [RTNETLINK]: Remove unnecessary locking in dump callbacks

This lets writers fight for the rtnl mutex while readers go full speed.

It also takes care of phonet: phonet_route_get() is now called from an RCU
read-side section. I renamed it to phonet_route_get_rcu().

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Eric Dumazet and committed by
David S. Miller
e67f88dd dcfd9cdc

+25 -23
+1 -1
include/net/phonet/pn_dev.h
··· 51 51 int phonet_route_add(struct net_device *dev, u8 daddr); 52 52 int phonet_route_del(struct net_device *dev, u8 daddr); 53 53 void rtm_phonet_notify(int event, struct net_device *dev, u8 dst); 54 - struct net_device *phonet_route_get(struct net *net, u8 daddr); 54 + struct net_device *phonet_route_get_rcu(struct net *net, u8 daddr); 55 55 struct net_device *phonet_route_output(struct net *net, u8 daddr); 56 56 57 57 #define PN_NO_ADDR 0xff
+4 -3
net/bridge/br_netlink.c
··· 120 120 int idx; 121 121 122 122 idx = 0; 123 - for_each_netdev(net, dev) { 124 - struct net_bridge_port *port = br_port_get_rtnl(dev); 123 + rcu_read_lock(); 124 + for_each_netdev_rcu(net, dev) { 125 + struct net_bridge_port *port = br_port_get_rcu(dev); 125 126 126 127 /* not a bridge port */ 127 128 if (!port || idx < cb->args[0]) ··· 136 135 skip: 137 136 ++idx; 138 137 } 139 - 138 + rcu_read_unlock(); 140 139 cb->args[0] = idx; 141 140 142 141 return skb->len;
+2 -1
net/core/fib_rules.c
··· 590 590 int idx = 0; 591 591 struct fib_rule *rule; 592 592 593 - list_for_each_entry(rule, &ops->rules_list, list) { 593 + rcu_read_lock(); 594 + list_for_each_entry_rcu(rule, &ops->rules_list, list) { 594 595 if (idx < cb->args[1]) 595 596 goto skip; 596 597
+5 -7
net/core/rtnetlink.c
··· 1007 1007 s_h = cb->args[0]; 1008 1008 s_idx = cb->args[1]; 1009 1009 1010 + rcu_read_lock(); 1010 1011 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { 1011 1012 idx = 0; 1012 1013 head = &net->dev_index_head[h]; 1013 - hlist_for_each_entry(dev, node, head, index_hlist) { 1014 + hlist_for_each_entry_rcu(dev, node, head, index_hlist) { 1014 1015 if (idx < s_idx) 1015 1016 goto cont; 1016 1017 if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, ··· 1024 1023 } 1025 1024 } 1026 1025 out: 1026 + rcu_read_unlock(); 1027 1027 cb->args[1] = idx; 1028 1028 cb->args[0] = h; 1029 1029 ··· 1881 1879 int min_len; 1882 1880 int family; 1883 1881 int type; 1884 - int err; 1885 1882 1886 1883 type = nlh->nlmsg_type; 1887 1884 if (type > RTM_MAX) ··· 1907 1906 if (dumpit == NULL) 1908 1907 return -EOPNOTSUPP; 1909 1908 1910 - __rtnl_unlock(); 1911 1909 rtnl = net->rtnl; 1912 - err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL); 1913 - rtnl_lock(); 1914 - return err; 1910 + return netlink_dump_start(rtnl, skb, nlh, dumpit, NULL); 1915 1911 } 1916 1912 1917 1913 memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *))); ··· 1978 1980 { 1979 1981 struct sock *sk; 1980 1982 sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX, 1981 - rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); 1983 + rtnetlink_rcv, NULL, THIS_MODULE); 1982 1984 if (!sk) 1983 1985 return -ENOMEM; 1984 1986 net->rtnl = sk;
+6 -4
net/decnet/dn_dev.c
··· 752 752 skip_naddr = cb->args[1]; 753 753 754 754 idx = 0; 755 - for_each_netdev(&init_net, dev) { 755 + rcu_read_lock(); 756 + for_each_netdev_rcu(&init_net, dev) { 756 757 if (idx < skip_ndevs) 757 758 goto cont; 758 759 else if (idx > skip_ndevs) { ··· 762 761 skip_naddr = 0; 763 762 } 764 763 765 - if ((dn_db = rtnl_dereference(dev->dn_ptr)) == NULL) 764 + if ((dn_db = rcu_dereference(dev->dn_ptr)) == NULL) 766 765 goto cont; 767 766 768 - for (ifa = rtnl_dereference(dn_db->ifa_list), dn_idx = 0; ifa; 769 - ifa = rtnl_dereference(ifa->ifa_next), dn_idx++) { 767 + for (ifa = rcu_dereference(dn_db->ifa_list), dn_idx = 0; ifa; 768 + ifa = rcu_dereference(ifa->ifa_next), dn_idx++) { 770 769 if (dn_idx < skip_naddr) 771 770 continue; 772 771 ··· 779 778 idx++; 780 779 } 781 780 done: 781 + rcu_read_unlock(); 782 782 cb->args[0] = idx; 783 783 cb->args[1] = dn_idx; 784 784
+3 -1
net/ipv6/ip6_fib.c
··· 394 394 arg.net = net; 395 395 w->args = &arg; 396 396 397 + rcu_read_lock(); 397 398 for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { 398 399 e = 0; 399 400 head = &net->ipv6.fib_table_hash[h]; 400 - hlist_for_each_entry(tb, node, head, tb6_hlist) { 401 + hlist_for_each_entry_rcu(tb, node, head, tb6_hlist) { 401 402 if (e < s_e) 402 403 goto next; 403 404 res = fib6_dump_table(tb, skb, cb); ··· 409 408 } 410 409 } 411 410 out: 411 + rcu_read_unlock(); 412 412 cb->args[1] = e; 413 413 cb->args[0] = h; 414 414
+1 -5
net/phonet/pn_dev.c
··· 426 426 return 0; 427 427 } 428 428 429 - struct net_device *phonet_route_get(struct net *net, u8 daddr) 429 + struct net_device *phonet_route_get_rcu(struct net *net, u8 daddr) 430 430 { 431 431 struct phonet_net *pnn = phonet_pernet(net); 432 432 struct phonet_routes *routes = &pnn->routes; 433 433 struct net_device *dev; 434 434 435 - ASSERT_RTNL(); /* no need to hold the device */ 436 - 437 435 daddr >>= 2; 438 - rcu_read_lock(); 439 436 dev = rcu_dereference(routes->table[daddr]); 440 - rcu_read_unlock(); 441 437 return dev; 442 438 } 443 439
+3 -1
net/phonet/pn_netlink.c
··· 264 264 struct net *net = sock_net(skb->sk); 265 265 u8 addr, addr_idx = 0, addr_start_idx = cb->args[0]; 266 266 267 + rcu_read_lock(); 267 268 for (addr = 0; addr < 64; addr++) { 268 269 struct net_device *dev; 269 270 270 - dev = phonet_route_get(net, addr << 2); 271 + dev = phonet_route_get_rcu(net, addr << 2); 271 272 if (!dev) 272 273 continue; 273 274 ··· 280 279 } 281 280 282 281 out: 282 + rcu_read_unlock(); 283 283 cb->args[0] = addr_idx; 284 284 cb->args[1] = 0; 285 285