Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: reduce POLLOUT events caused by TCP_NOTSENT_LOWAT

TCP_NOTSENT_LOWAT socket option or sysctl was added in linux-3.12
as a step to enable bigger tcp sndbuf limits.

It works reasonably well, but the following happens:

Once the limit is reached, TCP stack generates
an [E]POLLOUT event for every incoming ACK packet.

This causes a high number of context switches.

This patch implements the strategy David Miller added
in sock_def_write_space():

- If TCP socket has a notsent_lowat constraint of X bytes,
allow sendmsg() to fill up to X bytes, but send [E]POLLOUT
only if number of notsent bytes is below X/2

This considerably reduces TCP_NOTSENT_LOWAT overhead,
while allowing to keep the pipe full.

Tested:
100 ms RTT netem testbed between A and B, 100 concurrent TCP_STREAM

A:/# cat /proc/sys/net/ipv4/tcp_wmem
4096 262144 64000000
A:/# super_netperf 100 -H B -l 1000 -- -K bbr &

A:/# grep TCP /proc/net/sockstat
TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 1364904 # This is about 54 MB of memory per flow :/

A:/# vmstat 5 5
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
r b swpd free buff cache si so bi bo in cs us sy id wa st
0 0 0 256220672 13532 694976 0 0 10 0 28 14 0 1 99 0 0
2 0 0 256320016 13532 698480 0 0 512 0 715901 5927 0 10 90 0 0
0 0 0 256197232 13532 700992 0 0 735 13 771161 5849 0 11 89 0 0
1 0 0 256233824 13532 703320 0 0 512 23 719650 6635 0 11 89 0 0
2 0 0 256226880 13532 705780 0 0 642 4 775650 6009 0 12 88 0 0

A:/# echo 2097152 >/proc/sys/net/ipv4/tcp_notsent_lowat

A:/# grep TCP /proc/net/sockstat
TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 86411 # 3.5 MB per flow

A:/# vmstat 5 5 # check that context switches have not inflated too much.
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
r b swpd free buff cache si so bi bo in cs us sy id wa st
2 0 0 260386512 13592 662148 0 0 10 0 17 14 0 1 99 0 0
0 0 0 260519680 13592 604184 0 0 512 13 726843 12424 0 10 90 0 0
1 1 0 260435424 13592 598360 0 0 512 25 764645 12925 0 10 90 0 0
1 0 0 260855392 13592 578380 0 0 512 7 722943 13624 0 11 88 0 0
1 0 0 260445008 13592 601176 0 0 614 34 772288 14317 0 10 90 0 0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Eric Dumazet; committed by David S. Miller.
Commit hashes: a74f0fa0, 4dc88ce6

+22 -8
+15 -5
include/net/sock.h
··· 1110 1110 unsigned int inuse_idx; 1111 1111 #endif 1112 1112 1113 - bool (*stream_memory_free)(const struct sock *sk); 1113 + bool (*stream_memory_free)(const struct sock *sk, int wake); 1114 1114 bool (*stream_memory_read)(const struct sock *sk); 1115 1115 /* Memory pressure */ 1116 1116 void (*enter_memory_pressure)(struct sock *sk); ··· 1192 1192 #define sk_refcnt_debug_release(sk) do { } while (0) 1193 1193 #endif /* SOCK_REFCNT_DEBUG */ 1194 1194 1195 - static inline bool sk_stream_memory_free(const struct sock *sk) 1195 + static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) 1196 1196 { 1197 1197 if (sk->sk_wmem_queued >= sk->sk_sndbuf) 1198 1198 return false; 1199 1199 1200 1200 return sk->sk_prot->stream_memory_free ? 1201 - sk->sk_prot->stream_memory_free(sk) : true; 1201 + sk->sk_prot->stream_memory_free(sk, wake) : true; 1202 + } 1203 + 1204 + static inline bool sk_stream_memory_free(const struct sock *sk) 1205 + { 1206 + return __sk_stream_memory_free(sk, 0); 1207 + } 1208 + 1209 + static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake) 1210 + { 1211 + return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && 1212 + __sk_stream_memory_free(sk, wake); 1202 1213 } 1203 1214 1204 1215 static inline bool sk_stream_is_writeable(const struct sock *sk) 1205 1216 { 1206 - return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && 1207 - sk_stream_memory_free(sk); 1217 + return __sk_stream_is_writeable(sk, 0); 1208 1218 } 1209 1219 1210 1220 static inline int sk_under_cgroup_hierarchy(struct sock *sk,
+6 -2
include/net/tcp.h
··· 1870 1870 return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat; 1871 1871 } 1872 1872 1873 - static inline bool tcp_stream_memory_free(const struct sock *sk) 1873 + /* @wake is one when sk_stream_write_space() calls us. 1874 + * This sends EPOLLOUT only if notsent_bytes is half the limit. 1875 + * This mimics the strategy used in sock_def_write_space(). 1876 + */ 1877 + static inline bool tcp_stream_memory_free(const struct sock *sk, int wake) 1874 1878 { 1875 1879 const struct tcp_sock *tp = tcp_sk(sk); 1876 1880 u32 notsent_bytes = tp->write_seq - tp->snd_nxt; 1877 1881 1878 - return notsent_bytes < tcp_notsent_lowat(tp); 1882 + return (notsent_bytes << wake) < tcp_notsent_lowat(tp); 1879 1883 } 1880 1884 1881 1885 #ifdef CONFIG_PROC_FS
+1 -1
net/core/stream.c
··· 32 32 struct socket *sock = sk->sk_socket; 33 33 struct socket_wq *wq; 34 34 35 - if (sk_stream_is_writeable(sk) && sock) { 35 + if (__sk_stream_is_writeable(sk, 1) && sock) { 36 36 clear_bit(SOCK_NOSPACE, &sock->flags); 37 37 38 38 rcu_read_lock();