Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: dst: Switch to rcuref_t reference counting

Under high contention dst_entry::__refcnt becomes a significant bottleneck.

atomic_inc_not_zero() is implemented with a cmpxchg() loop, which goes into
high retry rates on contention.

Switch the reference count to rcuref_t which results in a significant
performance gain. Rename the reference count member to __rcuref to reflect
the change.

The gain depends on the micro-architecture and the number of concurrent
operations and has been measured in the range of +25% to +130% with a
localhost memtier/memcached benchmark which amplifies the problem
massively.

Running the memtier/memcached benchmark over a real (1Gb) network
connection, the conversion on top of the false-sharing fix for struct
dst_entry::__refcnt results in a total gain in the 2%-5% range over the
upstream baseline.

Reported-by: Wangyang Guo <wangyang.guo@intel.com>
Reported-by: Arjan Van De Ven <arjan.van.de.ven@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230307125538.989175656@linutronix.de
Link: https://lore.kernel.org/r/20230323102800.215027837@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Authored by Thomas Gleixner and committed by Jakub Kicinski
bc9d3a9f d288a162

+23 -38
+10 -9
include/net/dst.h
··· 16 16 #include <linux/bug.h> 17 17 #include <linux/jiffies.h> 18 18 #include <linux/refcount.h> 19 + #include <linux/rcuref.h> 19 20 #include <net/neighbour.h> 20 21 #include <asm/processor.h> 21 22 #include <linux/indirect_call_wrapper.h> ··· 62 61 unsigned short trailer_len; /* space to reserve at tail */ 63 62 64 63 /* 65 - * __refcnt wants to be on a different cache line from 64 + * __rcuref wants to be on a different cache line from 66 65 * input/output/ops or performance tanks badly 67 66 */ 68 67 #ifdef CONFIG_64BIT 69 - atomic_t __refcnt; /* 64-bit offset 64 */ 68 + rcuref_t __rcuref; /* 64-bit offset 64 */ 70 69 #endif 71 70 int __use; 72 71 unsigned long lastuse; ··· 76 75 __u32 tclassid; 77 76 #ifndef CONFIG_64BIT 78 77 struct lwtunnel_state *lwtstate; 79 - atomic_t __refcnt; /* 32-bit offset 64 */ 78 + rcuref_t __rcuref; /* 32-bit offset 64 */ 80 79 #endif 81 80 netdevice_tracker dev_tracker; 82 81 83 82 /* 84 83 * Used by rtable and rt6_info. Moves lwtstate into the next cache 85 84 * line on 64bit so that lwtstate does not cause false sharing with 86 - * __refcnt under contention of __refcnt. This also puts the 85 + * __rcuref under contention of __rcuref. This also puts the 87 86 * frequently accessed members of rtable and rt6_info out of the 88 - * __refcnt cache line. 87 + * __rcuref cache line. 
89 88 */ 90 89 struct list_head rt_uncached; 91 90 struct uncached_list *rt_uncached_list; ··· 239 238 { 240 239 /* 241 240 * If your kernel compilation stops here, please check 242 - * the placement of __refcnt in struct dst_entry 241 + * the placement of __rcuref in struct dst_entry 243 242 */ 244 - BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); 245 - WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); 243 + BUILD_BUG_ON(offsetof(struct dst_entry, __rcuref) & 63); 244 + WARN_ON(!rcuref_get(&dst->__rcuref)); 246 245 } 247 246 248 247 static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) ··· 306 305 */ 307 306 static inline bool dst_hold_safe(struct dst_entry *dst) 308 307 { 309 - return atomic_inc_not_zero(&dst->__refcnt); 308 + return rcuref_get(&dst->__rcuref); 310 309 } 311 310 312 311 /**
+1 -1
include/net/sock.h
··· 2131 2131 2132 2132 rcu_read_lock(); 2133 2133 dst = rcu_dereference(sk->sk_dst_cache); 2134 - if (dst && !atomic_inc_not_zero(&dst->__refcnt)) 2134 + if (dst && !rcuref_get(&dst->__rcuref)) 2135 2135 dst = NULL; 2136 2136 rcu_read_unlock(); 2137 2137 return dst;
+1 -1
net/bridge/br_nf_core.c
··· 73 73 { 74 74 struct rtable *rt = &br->fake_rtable; 75 75 76 - atomic_set(&rt->dst.__refcnt, 1); 76 + rcuref_init(&rt->dst.__rcuref, 1); 77 77 rt->dst.dev = br->dev; 78 78 dst_init_metrics(&rt->dst, br_dst_default_metrics, true); 79 79 rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE;
+5 -21
net/core/dst.c
··· 66 66 dst->tclassid = 0; 67 67 #endif 68 68 dst->lwtstate = NULL; 69 - atomic_set(&dst->__refcnt, initial_ref); 69 + rcuref_init(&dst->__rcuref, initial_ref); 70 70 dst->__use = 0; 71 71 dst->lastuse = jiffies; 72 72 dst->flags = flags; ··· 162 162 163 163 void dst_release(struct dst_entry *dst) 164 164 { 165 - if (dst) { 166 - int newrefcnt; 167 - 168 - newrefcnt = atomic_dec_return(&dst->__refcnt); 169 - if (WARN_ONCE(newrefcnt < 0, "dst_release underflow")) 170 - net_warn_ratelimited("%s: dst:%p refcnt:%d\n", 171 - __func__, dst, newrefcnt); 172 - if (!newrefcnt) 173 - call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); 174 - } 165 + if (dst && rcuref_put(&dst->__rcuref)) 166 + call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); 175 167 } 176 168 EXPORT_SYMBOL(dst_release); 177 169 178 170 void dst_release_immediate(struct dst_entry *dst) 179 171 { 180 - if (dst) { 181 - int newrefcnt; 182 - 183 - newrefcnt = atomic_dec_return(&dst->__refcnt); 184 - if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow")) 185 - net_warn_ratelimited("%s: dst:%p refcnt:%d\n", 186 - __func__, dst, newrefcnt); 187 - if (!newrefcnt) 188 - dst_destroy(dst); 189 - } 172 + if (dst && rcuref_put(&dst->__rcuref)) 173 + dst_destroy(dst); 190 174 } 191 175 EXPORT_SYMBOL(dst_release_immediate); 192 176
+1 -1
net/core/rtnetlink.c
··· 843 843 if (dst) { 844 844 ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); 845 845 ci.rta_used = dst->__use; 846 - ci.rta_clntref = atomic_read(&dst->__refcnt); 846 + ci.rta_clntref = rcuref_read(&dst->__rcuref); 847 847 } 848 848 if (expires) { 849 849 unsigned long clock;
+3 -3
net/ipv6/route.c
··· 293 293 294 294 static const struct rt6_info ip6_null_entry_template = { 295 295 .dst = { 296 - .__refcnt = ATOMIC_INIT(1), 296 + .__rcuref = RCUREF_INIT(1), 297 297 .__use = 1, 298 298 .obsolete = DST_OBSOLETE_FORCE_CHK, 299 299 .error = -ENETUNREACH, ··· 307 307 308 308 static const struct rt6_info ip6_prohibit_entry_template = { 309 309 .dst = { 310 - .__refcnt = ATOMIC_INIT(1), 310 + .__rcuref = RCUREF_INIT(1), 311 311 .__use = 1, 312 312 .obsolete = DST_OBSOLETE_FORCE_CHK, 313 313 .error = -EACCES, ··· 319 319 320 320 static const struct rt6_info ip6_blk_hole_entry_template = { 321 321 .dst = { 322 - .__refcnt = ATOMIC_INIT(1), 322 + .__rcuref = RCUREF_INIT(1), 323 323 .__use = 1, 324 324 .obsolete = DST_OBSOLETE_FORCE_CHK, 325 325 .error = -EINVAL,
+2 -2
net/netfilter/ipvs/ip_vs_xmit.c
··· 339 339 spin_unlock_bh(&dest->dst_lock); 340 340 IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", 341 341 &dest->addr.ip, &dest_dst->dst_saddr.ip, 342 - atomic_read(&rt->dst.__refcnt)); 342 + rcuref_read(&rt->dst.__rcuref)); 343 343 } 344 344 if (ret_saddr) 345 345 *ret_saddr = dest_dst->dst_saddr.ip; ··· 507 507 spin_unlock_bh(&dest->dst_lock); 508 508 IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", 509 509 &dest->addr.in6, &dest_dst->dst_saddr.in6, 510 - atomic_read(&rt->dst.__refcnt)); 510 + rcuref_read(&rt->dst.__rcuref)); 511 511 } 512 512 if (ret_saddr) 513 513 *ret_saddr = dest_dst->dst_saddr.in6;