Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: align sk_refcnt on 128 bytes boundary

sk->sk_refcnt is dirtied for every TCP/UDP incoming packet.
This is a performance issue if multiple cpus hit a common socket,
or multiple sockets are chained due to SO_REUSEPORT.

By moving sk_refcnt 8 bytes further, the first 128 bytes of a socket
are read-mostly. As they contain the lookup keys, this has
a considerable performance impact, as CPUs can keep them cached.

These 8 bytes are not wasted: we use them as a placeholder
for various fields, depending on the socket type.

Tested:
SYN flood hitting a NIC with 16 RX queues.
TCP listener using 16 sockets with SO_REUSEPORT
and SO_INCOMING_CPU for proper siloing.

Could process 6.0 Mpps SYN instead of 4.2 Mpps

Kernel profile looked like:
11.68% [kernel] [k] sha_transform
6.51% [kernel] [k] __inet_lookup_listener
5.07% [kernel] [k] __inet_lookup_established
4.15% [kernel] [k] memcpy_erms
3.46% [kernel] [k] ipt_do_table
2.74% [kernel] [k] fib_table_lookup
2.54% [kernel] [k] tcp_make_synack
2.34% [kernel] [k] tcp_conn_request
2.05% [kernel] [k] __netif_receive_skb_core
2.03% [kernel] [k] kmem_cache_alloc

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Eric Dumazet and committed by
David S. Miller
8e5eb54d 70da268b

+16 -5
+1 -1
include/net/inet_timewait_sock.h
··· 70 70 #define tw_dport __tw_common.skc_dport 71 71 #define tw_num __tw_common.skc_num 72 72 #define tw_cookie __tw_common.skc_cookie 73 + #define tw_dr __tw_common.skc_tw_dr 73 74 74 75 int tw_timeout; 75 76 volatile unsigned char tw_substate; ··· 89 88 kmemcheck_bitfield_end(flags); 90 89 struct timer_list tw_timer; 91 90 struct inet_bind_bucket *tw_tb; 92 - struct inet_timewait_death_row *tw_dr; 93 91 }; 94 92 #define tw_tclass tw_tos 95 93
+1 -1
include/net/request_sock.h
··· 50 50 struct sock_common __req_common; 51 51 #define rsk_refcnt __req_common.skc_refcnt 52 52 #define rsk_hash __req_common.skc_hash 53 + #define rsk_listener __req_common.skc_listener 53 54 54 55 struct request_sock *dl_next; 55 - struct sock *rsk_listener; 56 56 u16 mss; 57 57 u8 num_retrans; /* number of retransmits */ 58 58 u8 cookie_ts:1; /* syncookie: encode tcpopts in timestamp */
+14 -3
include/net/sock.h
··· 150 150 * @skc_node: main hash linkage for various protocol lookup tables 151 151 * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol 152 152 * @skc_tx_queue_mapping: tx queue number for this connection 153 + * @skc_flags: place holder for sk_flags 154 + * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, 155 + * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings 153 156 * @skc_incoming_cpu: record/match cpu processing incoming packets 154 157 * @skc_refcnt: reference count 155 158 * ··· 204 201 205 202 atomic64_t skc_cookie; 206 203 204 + /* following fields are padding to force 205 + * offset(struct sock, sk_refcnt) == 128 on 64bit arches 206 + * assuming IPV6 is enabled. We use this padding differently 207 + * for different kind of 'sockets' 208 + */ 209 + union { 210 + unsigned long skc_flags; 211 + struct sock *skc_listener; /* request_sock */ 212 + struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */ 213 + }; 207 214 /* 208 215 * fields between dontcopy_begin/dontcopy_end 209 216 * are not copied in sock_copy() ··· 259 246 * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler) 260 247 * @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE) 261 248 * @sk_sndbuf: size of send buffer in bytes 262 - * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, 263 - * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings 264 249 * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets 265 250 * @sk_no_check_rx: allow zero checksum in RX packets 266 251 * @sk_route_caps: route capabilities (e.g. 
%NETIF_F_TSO) ··· 345 334 #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr 346 335 #define sk_cookie __sk_common.skc_cookie 347 336 #define sk_incoming_cpu __sk_common.skc_incoming_cpu 337 + #define sk_flags __sk_common.skc_flags 348 338 349 339 socket_lock_t sk_lock; 350 340 struct sk_buff_head sk_receive_queue; ··· 383 371 #ifdef CONFIG_XFRM 384 372 struct xfrm_policy *sk_policy[2]; 385 373 #endif 386 - unsigned long sk_flags; 387 374 struct dst_entry *sk_rx_dst; 388 375 struct dst_entry __rcu *sk_dst_cache; 389 376 spinlock_t sk_dst_lock;