Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: Save TX flow hash in sock and set in skbuf on xmit

For a connected socket we can precompute the flow hash for setting
in skb->hash on output. This is a performance advantage over
calculating the skb->hash for every packet on the connection. The
computation is done using the common hash algorithm to be consistent
with computations done for packets of the connection in other states
where there is no socket (e.g. time-wait, syn-recv, syn-cookies).

This patch adds sk_txhash to the sock structure. The functions
inet_set_txhash and ip6_set_txhash are added; they are called from the
points in TCP and UDP where the socket moves to the established state.

skb_set_hash_from_sk is a function which sets skb->hash from the
sock txhash value. It is called in the UDP and TCP transmit paths when
transmitting within the context of a socket.

Tested: ran super_netperf with 200 TCP_RR streams over a vxlan
interface (in this case skb_get_hash is called on every TX packet to
create a UDP source port).

Before fix:

95.02% CPU utilization
154/256/505 90/95/99% latencies
1.13042e+06 tps

Time in functions:
0.28% skb_flow_dissect
0.21% __skb_get_hash

After fix:

94.95% CPU utilization
156/254/485 90/95/99% latencies
1.15447e+06 tps

Neither __skb_get_hash nor skb_flow_dissect appears in the perf profile.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Tom Herbert and committed by
David S. Miller
b73c3d0e 5ed20a68

+50
+14
include/net/ip.h
··· 31 31 #include <net/route.h> 32 32 #include <net/snmp.h> 33 33 #include <net/flow.h> 34 + #include <net/flow_keys.h> 34 35 35 36 struct sock; 36 37 ··· 352 351 { 353 352 return csum_tcpudp_nofold(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 354 353 skb->len, proto, 0); 354 + } 355 + 356 + static inline void inet_set_txhash(struct sock *sk) 357 + { 358 + struct inet_sock *inet = inet_sk(sk); 359 + struct flow_keys keys; 360 + 361 + keys.src = inet->inet_saddr; 362 + keys.dst = inet->inet_daddr; 363 + keys.port16[0] = inet->inet_sport; 364 + keys.port16[1] = inet->inet_dport; 365 + 366 + sk->sk_txhash = flow_hash_from_keys(&keys); 355 367 } 356 368 357 369 /*
+15
include/net/ipv6.h
··· 19 19 #include <net/if_inet6.h> 20 20 #include <net/ndisc.h> 21 21 #include <net/flow.h> 22 + #include <net/flow_keys.h> 22 23 #include <net/snmp.h> 23 24 24 25 #define SIN6_LEN_RFC2133 24 ··· 683 682 if (hlimit < 0) 684 683 hlimit = ip6_dst_hoplimit(dst); 685 684 return hlimit; 685 + } 686 + 687 + static inline void ip6_set_txhash(struct sock *sk) 688 + { 689 + struct inet_sock *inet = inet_sk(sk); 690 + struct ipv6_pinfo *np = inet6_sk(sk); 691 + struct flow_keys keys; 692 + 693 + keys.src = (__force __be32)ipv6_addr_hash(&np->saddr); 694 + keys.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr); 695 + keys.port16[0] = inet->inet_sport; 696 + keys.port16[1] = inet->inet_dport; 697 + 698 + sk->sk_txhash = flow_hash_from_keys(&keys); 686 699 } 687 700 688 701 /*
+11
include/net/sock.h
··· 273 273 * @sk_rcvtimeo: %SO_RCVTIMEO setting 274 274 * @sk_sndtimeo: %SO_SNDTIMEO setting 275 275 * @sk_rxhash: flow hash received from netif layer 276 + * @sk_txhash: computed flow hash for use on transmit 276 277 * @sk_filter: socket filtering instructions 277 278 * @sk_protinfo: private area, net family specific, when not using slab 278 279 * @sk_timer: sock cleanup timer ··· 348 347 #ifdef CONFIG_RPS 349 348 __u32 sk_rxhash; 350 349 #endif 350 + __u32 sk_txhash; 351 351 #ifdef CONFIG_NET_RX_BUSY_POLL 352 352 unsigned int sk_napi_id; 353 353 unsigned int sk_ll_usec; ··· 1982 1980 } 1983 1981 } 1984 1982 1983 + static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) 1984 + { 1985 + if (sk->sk_txhash) { 1986 + skb->l4_hash = 1; 1987 + skb->hash = sk->sk_txhash; 1988 + } 1989 + } 1990 + 1985 1991 /* 1986 1992 * Queue a received datagram if it will fit. Stream and sequenced 1987 1993 * protocols can't normally use this as they need to fit buffers in ··· 2004 1994 skb_orphan(skb); 2005 1995 skb->sk = sk; 2006 1996 skb->destructor = sock_wfree; 1997 + skb_set_hash_from_sk(skb, sk); 2007 1998 /* 2008 1999 * We used to take a refcount on sk, but following operation 2009 2000 * is enough to guarantee sk_free() wont free this sock until
+1
net/ipv4/datagram.c
··· 76 76 inet->inet_daddr = fl4->daddr; 77 77 inet->inet_dport = usin->sin_port; 78 78 sk->sk_state = TCP_ESTABLISHED; 79 + inet_set_txhash(sk); 79 80 inet->inet_id = jiffies; 80 81 81 82 sk_dst_set(sk, &rt->dst);
+3
net/ipv4/tcp_ipv4.c
··· 208 208 inet->inet_dport = usin->sin_port; 209 209 inet->inet_daddr = daddr; 210 210 211 + inet_set_txhash(sk); 212 + 211 213 inet_csk(sk)->icsk_ext_hdr_len = 0; 212 214 if (inet_opt) 213 215 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; ··· 1336 1334 newinet->mc_ttl = ip_hdr(skb)->ttl; 1337 1335 newinet->rcv_tos = ip_hdr(skb)->tos; 1338 1336 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1337 + inet_set_txhash(newsk); 1339 1338 if (inet_opt) 1340 1339 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1341 1340 newinet->inet_id = newtp->write_seq ^ jiffies;
+1
net/ipv4/tcp_output.c
··· 916 916 skb_orphan(skb); 917 917 skb->sk = sk; 918 918 skb->destructor = tcp_wfree; 919 + skb_set_hash_from_sk(skb, sk); 919 920 atomic_add(skb->truesize, &sk->sk_wmem_alloc); 920 921 921 922 /* Build TCP header and checksum it. */
+1
net/ipv6/datagram.c
··· 199 199 NULL); 200 200 201 201 sk->sk_state = TCP_ESTABLISHED; 202 + ip6_set_txhash(sk); 202 203 out: 203 204 fl6_sock_release(flowlabel); 204 205 return err;
+4
net/ipv6/tcp_ipv6.c
··· 198 198 sk->sk_v6_daddr = usin->sin6_addr; 199 199 np->flow_label = fl6.flowlabel; 200 200 201 + ip6_set_txhash(sk); 202 + 201 203 /* 202 204 * TCP over IPv4 203 205 */ ··· 1133 1131 newnp->saddr = ireq->ir_v6_loc_addr; 1134 1132 newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; 1135 1133 newsk->sk_bound_dev_if = ireq->ir_iif; 1134 + 1135 + ip6_set_txhash(newsk); 1136 1136 1137 1137 /* Now IPv6 options... 1138 1138