Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: add sk->sk_drop_counters

Some sockets suffer from heavy false sharing on sk->sk_drops,
and fields in the same cache line.

Add sk->sk_drop_counters to:

- Move the drop counter(s) to dedicated cache lines.
- Add basic NUMA awareness to these drop counter(s).

Following patches will use this infrastructure for UDP and RAW sockets.

sk_clone_lock() is not yet ready, it would need to properly
set newsk->sk_drop_counters if we plan to use this for TCP sockets.

v2: used Paolo's suggestion from https://lore.kernel.org/netdev/8f09830a-d83d-43c9-b36b-88ba0a23e9b2@redhat.com/

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250826125031.1578842-4-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

Authored by Eric Dumazet; committed by Paolo Abeni
c51613fa cb4d5a6e

+33 -1
+31 -1
include/net/sock.h
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -102,6 +102,11 @@
 typedef __u32 __bitwise __portpair;
 typedef __u64 __bitwise __addrpair;
 
+struct socket_drop_counters {
+	atomic_t drops0 ____cacheline_aligned_in_smp;
+	atomic_t drops1 ____cacheline_aligned_in_smp;
+};
+
 /**
  * struct sock_common - minimal network layer representation of sockets
  * @skc_daddr: Foreign IPv4 addr
@@ -287,6 +282,7 @@
  * @sk_err_soft: errors that don't cause failure but are the cause of a
  *		persistent failure not just 'timed out'
  * @sk_drops: raw/udp drops counter
+ * @sk_drop_counters: optional pointer to socket_drop_counters
  * @sk_ack_backlog: current listen backlog
  * @sk_max_ack_backlog: listen backlog set in listen()
  * @sk_uid: user id of owner
@@ -455,6 +449,7 @@
 #ifdef CONFIG_XFRM
 	struct xfrm_policy __rcu *sk_policy[2];
 #endif
+	struct socket_drop_counters *sk_drop_counters;
 	__cacheline_group_end(sock_read_rxtx);
 
 	__cacheline_group_begin(sock_write_rxtx);
@@ -2691,7 +2684,17 @@
 
 static inline void sk_drops_add(struct sock *sk, int segs)
 {
-	atomic_add(segs, &sk->sk_drops);
+	struct socket_drop_counters *sdc = sk->sk_drop_counters;
+
+	if (sdc) {
+		int n = numa_node_id() % 2;
+
+		if (n)
+			atomic_add(segs, &sdc->drops1);
+		else
+			atomic_add(segs, &sdc->drops0);
+	} else {
+		atomic_add(segs, &sk->sk_drops);
+	}
 }
 
 static inline void sk_drops_inc(struct sock *sk)
@@ -2712,11 +2694,23 @@
 
 static inline int sk_drops_read(const struct sock *sk)
 {
+	const struct socket_drop_counters *sdc = sk->sk_drop_counters;
+
+	if (sdc) {
+		DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops));
+		return atomic_read(&sdc->drops0) + atomic_read(&sdc->drops1);
+	}
 	return atomic_read(&sk->sk_drops);
 }
 
 static inline void sk_drops_reset(struct sock *sk)
 {
+	struct socket_drop_counters *sdc = sk->sk_drop_counters;
+
+	if (sdc) {
+		atomic_set(&sdc->drops0, 0);
+		atomic_set(&sdc->drops1, 0);
+	}
 	atomic_set(&sk->sk_drops, 0);
 }
 
+2
net/core/sock.c
··· 2505 2505 newsk->sk_wmem_queued = 0; 2506 2506 newsk->sk_forward_alloc = 0; 2507 2507 newsk->sk_reserved_mem = 0; 2508 + DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters); 2508 2509 sk_drops_reset(newsk); 2509 2510 newsk->sk_send_head = NULL; 2510 2511 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; ··· 4458 4457 #ifdef CONFIG_MEMCG 4459 4458 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); 4460 4459 #endif 4460 + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_drop_counters); 4461 4461 4462 4462 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); 4463 4463 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);