Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rtnetlink: Add per-netns RTNL.

The goal is to break RTNL down into per-netns mutexes.

This patch adds a per-netns mutex and its helper functions, rtnl_net_lock()
and rtnl_net_unlock().

rtnl_net_lock() acquires the global RTNL and per-netns RTNL mutex, and
rtnl_net_unlock() releases them.

We will replace 800+ rtnl_lock() call sites with rtnl_net_lock() and will
finally remove the rtnl_lock() call inside rtnl_net_lock().

When we need to nest per-netns RTNL mutexes, we will use __rtnl_net_lock(),
and the locking order is defined by rtnl_net_lock_cmp_fn() as follows:

1. init_net is first
2. netns address ascending order

Note that the conversion will be done under CONFIG_DEBUG_NET_SMALL_RTNL
with LOCKDEP so that we can carefully add the extra mutex without slowing
down RTNL operations during conversion.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

authored by

Kuniyuki Iwashima and committed by
Paolo Abeni
76aed953 ec763c23

+104
+21
include/linux/rtnetlink.h
··· 92 92 #define rcu_replace_pointer_rtnl(rp, p) \ 93 93 rcu_replace_pointer(rp, p, lockdep_rtnl_is_held()) 94 94 95 + #ifdef CONFIG_DEBUG_NET_SMALL_RTNL 96 + void __rtnl_net_lock(struct net *net); 97 + void __rtnl_net_unlock(struct net *net); 98 + void rtnl_net_lock(struct net *net); 99 + void rtnl_net_unlock(struct net *net); 100 + int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b); 101 + #else 102 + static inline void __rtnl_net_lock(struct net *net) {} 103 + static inline void __rtnl_net_unlock(struct net *net) {} 104 + 105 + static inline void rtnl_net_lock(struct net *net) 106 + { 107 + rtnl_lock(); 108 + } 109 + 110 + static inline void rtnl_net_unlock(struct net *net) 111 + { 112 + rtnl_unlock(); 113 + } 114 + #endif 115 + 95 116 static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) 96 117 { 97 118 return rtnl_dereference(dev->ingress_queue);
+4
include/net/net_namespace.h
··· 188 188 #if IS_ENABLED(CONFIG_SMC) 189 189 struct netns_smc smc; 190 190 #endif 191 + #ifdef CONFIG_DEBUG_NET_SMALL_RTNL 192 + /* Move to a better place when the config guard is removed. */ 193 + struct mutex rtnl_mutex; 194 + #endif 191 195 } __randomize_layout; 192 196 193 197 #include <linux/seq_file_net.h>
+15
net/Kconfig.debug
··· 24 24 help 25 25 Enable extra sanity checks in networking. 26 26 This is mostly used by fuzzers, but is safe to select. 27 + 28 + config DEBUG_NET_SMALL_RTNL 29 + bool "Add extra per-netns mutex inside RTNL" 30 + depends on DEBUG_KERNEL && NET && LOCK_DEBUGGING_SUPPORT 31 + select PROVE_LOCKING 32 + default n 33 + help 34 + rtnl_lock() is being replaced with rtnl_net_lock() that 35 + acquires the global RTNL and a small per-netns RTNL mutex. 36 + 37 + During the conversion, rtnl_net_lock() just adds an extra 38 + mutex in every RTNL scope and slows down the operations. 39 + 40 + Once the conversion completes, rtnl_lock() will be removed 41 + and rtnetlink will gain per-netns scalability.
+6
net/core/net_namespace.c
··· 334 334 idr_init(&net->netns_ids); 335 335 spin_lock_init(&net->nsid_lock); 336 336 mutex_init(&net->ipv4.ra_mutex); 337 + 338 + #ifdef CONFIG_DEBUG_NET_SMALL_RTNL 339 + mutex_init(&net->rtnl_mutex); 340 + lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL); 341 + #endif 342 + 337 343 preinit_net_sysctl(net); 338 344 } 339 345
+58
net/core/rtnetlink.c
··· 179 179 EXPORT_SYMBOL(lockdep_rtnl_is_held); 180 180 #endif /* #ifdef CONFIG_PROVE_LOCKING */ 181 181 182 + #ifdef CONFIG_DEBUG_NET_SMALL_RTNL 183 + void __rtnl_net_lock(struct net *net) 184 + { 185 + ASSERT_RTNL(); 186 + 187 + mutex_lock(&net->rtnl_mutex); 188 + } 189 + EXPORT_SYMBOL(__rtnl_net_lock); 190 + 191 + void __rtnl_net_unlock(struct net *net) 192 + { 193 + ASSERT_RTNL(); 194 + 195 + mutex_unlock(&net->rtnl_mutex); 196 + } 197 + EXPORT_SYMBOL(__rtnl_net_unlock); 198 + 199 + void rtnl_net_lock(struct net *net) 200 + { 201 + rtnl_lock(); 202 + __rtnl_net_lock(net); 203 + } 204 + EXPORT_SYMBOL(rtnl_net_lock); 205 + 206 + void rtnl_net_unlock(struct net *net) 207 + { 208 + __rtnl_net_unlock(net); 209 + rtnl_unlock(); 210 + } 211 + EXPORT_SYMBOL(rtnl_net_unlock); 212 + 213 + static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) 214 + { 215 + if (net_eq(net_a, net_b)) 216 + return 0; 217 + 218 + /* always init_net first */ 219 + if (net_eq(net_a, &init_net)) 220 + return -1; 221 + 222 + if (net_eq(net_b, &init_net)) 223 + return 1; 224 + 225 + /* otherwise lock in ascending order */ 226 + return net_a < net_b ? -1 : 1; 227 + } 228 + 229 + int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) 230 + { 231 + const struct net *net_a, *net_b; 232 + 233 + net_a = container_of(a, struct net, rtnl_mutex.dep_map); 234 + net_b = container_of(b, struct net, rtnl_mutex.dep_map); 235 + 236 + return rtnl_net_cmp_locks(net_a, net_b); 237 + } 238 + #endif 239 + 182 240 static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; 183 241 184 242 static inline int rtm_msgindex(int msgtype)