Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: auto corking

With the introduction of TCP Small Queues, TSO auto sizing, and TCP
pacing, we can implement Automatic Corking in the kernel, to help
applications doing small write()/sendmsg() to TCP sockets.

The idea is to change tcp_push() to check whether the current skb payload is
under the skb's optimal size (a multiple of MSS bytes).

If under 'size_goal', and at least one packet is still in Qdisc or
NIC TX queues, set the TCP Small Queue Throttled bit, so that the push
will be delayed up to TX completion time.

This delay might allow the application to coalesce more bytes
in the skb in following write()/sendmsg()/sendfile() system calls.

The exact duration of the delay depends on the dynamics
of the system, and might be zero if no packet for this flow
is actually held in the Qdisc or NIC TX ring.

Using FQ/pacing is a way to increase the probability of
autocorking being triggered.

Add a new sysctl (/proc/sys/net/ipv4/tcp_autocorking) to control
this feature and default it to 1 (enabled)

Add a new SNMP counter : nstat -a | grep TcpExtTCPAutoCorking
This counter is incremented every time we detect that an skb was underused
and its flush was deferred.

Tested:

Interesting effects when using line buffered commands under ssh.

Excellent performance results in terms of CPU usage and total throughput.

lpq83:~# echo 1 >/proc/sys/net/ipv4/tcp_autocorking
lpq83:~# perf stat ./super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128
9410.39

Performance counter stats for './super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128':

35209.439626 task-clock # 2.901 CPUs utilized
2,294 context-switches # 0.065 K/sec
101 CPU-migrations # 0.003 K/sec
4,079 page-faults # 0.116 K/sec
97,923,241,298 cycles # 2.781 GHz [83.31%]
51,832,908,236 stalled-cycles-frontend # 52.93% frontend cycles idle [83.30%]
25,697,986,603 stalled-cycles-backend # 26.24% backend cycles idle [66.70%]
102,225,978,536 instructions # 1.04 insns per cycle
# 0.51 stalled cycles per insn [83.38%]
18,657,696,819 branches # 529.906 M/sec [83.29%]
91,679,646 branch-misses # 0.49% of all branches [83.40%]

12.136204899 seconds time elapsed

lpq83:~# echo 0 >/proc/sys/net/ipv4/tcp_autocorking
lpq83:~# perf stat ./super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128
6624.89

Performance counter stats for './super_netperf 4 -t TCP_STREAM -H lpq84 -- -m 128':
40045.864494 task-clock # 3.301 CPUs utilized
171 context-switches # 0.004 K/sec
53 CPU-migrations # 0.001 K/sec
4,080 page-faults # 0.102 K/sec
111,340,458,645 cycles # 2.780 GHz [83.34%]
61,778,039,277 stalled-cycles-frontend # 55.49% frontend cycles idle [83.31%]
29,295,522,759 stalled-cycles-backend # 26.31% backend cycles idle [66.67%]
108,654,349,355 instructions # 0.98 insns per cycle
# 0.57 stalled cycles per insn [83.34%]
19,552,170,748 branches # 488.244 M/sec [83.34%]
157,875,417 branch-misses # 0.81% of all branches [83.34%]

12.130267788 seconds time elapsed

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Eric Dumazet and committed by
David S. Miller
f54b3111 d8535a0a

+72 -13
+10
Documentation/networking/ip-sysctl.txt
··· 156 156 buffer. Value 0 is special, it means that nothing is reserved. 157 157 Default: 31 158 158 159 + tcp_autocorking - BOOLEAN 160 + Enable TCP auto corking : 161 + When applications do consecutive small write()/sendmsg() system calls, 162 + we try to coalesce these small writes as much as possible, to lower 163 + total amount of sent packets. This is done if at least one prior 164 + packet for the flow is waiting in Qdisc queues or device transmit 165 + queue. Applications can still use TCP_CORK for optimal behavior 166 + when they know how/when to uncork their sockets. 167 + Default : 1 168 + 159 169 tcp_available_congestion_control - STRING 160 170 Shows the available congestion control choices that are registered. 161 171 More congestion control algorithms may be available as modules,
+1
include/net/tcp.h
··· 282 282 extern int sysctl_tcp_challenge_ack_limit; 283 283 extern unsigned int sysctl_tcp_notsent_lowat; 284 284 extern int sysctl_tcp_min_tso_segs; 285 + extern int sysctl_tcp_autocorking; 285 286 286 287 extern atomic_long_t tcp_memory_allocated; 287 288 extern struct percpu_counter tcp_sockets_allocated;
+1
include/uapi/linux/snmp.h
··· 258 258 LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ 259 259 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */ 260 260 LINUX_MIB_BUSYPOLLRXPACKETS, /* BusyPollRxPackets */ 261 + LINUX_MIB_TCPAUTOCORKING, /* TCPAutoCorking */ 261 262 __LINUX_MIB_MAX 262 263 }; 263 264
+1
net/ipv4/proc.c
··· 279 279 SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), 280 280 SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), 281 281 SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), 282 + SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), 282 283 SNMP_MIB_SENTINEL 283 284 }; 284 285
+9
net/ipv4/sysctl_net_ipv4.c
··· 733 733 .extra2 = &gso_max_segs, 734 734 }, 735 735 { 736 + .procname = "tcp_autocorking", 737 + .data = &sysctl_tcp_autocorking, 738 + .maxlen = sizeof(int), 739 + .mode = 0644, 740 + .proc_handler = proc_dointvec_minmax, 741 + .extra1 = &zero, 742 + .extra2 = &one, 743 + }, 744 + { 736 745 .procname = "udp_mem", 737 746 .data = &sysctl_udp_mem, 738 747 .maxlen = sizeof(sysctl_udp_mem),
+50 -13
net/ipv4/tcp.c
··· 285 285 286 286 int sysctl_tcp_min_tso_segs __read_mostly = 2; 287 287 288 + int sysctl_tcp_autocorking __read_mostly = 1; 289 + 288 290 struct percpu_counter tcp_orphan_count; 289 291 EXPORT_SYMBOL_GPL(tcp_orphan_count); 290 292 ··· 621 619 tp->snd_up = tp->write_seq; 622 620 } 623 621 624 - static inline void tcp_push(struct sock *sk, int flags, int mss_now, 625 - int nonagle) 622 + /* If a not yet filled skb is pushed, do not send it if 623 + * we have packets in Qdisc or NIC queues : 624 + * Because TX completion will happen shortly, it gives a chance 625 + * to coalesce future sendmsg() payload into this skb, without 626 + * need for a timer, and with no latency trade off. 627 + * As packets containing data payload have a bigger truesize 628 + * than pure acks (dataless) packets, the last check prevents 629 + * autocorking if we only have an ACK in Qdisc/NIC queues. 630 + */ 631 + static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, 632 + int size_goal) 626 633 { 627 - if (tcp_send_head(sk)) { 628 - struct tcp_sock *tp = tcp_sk(sk); 634 + return skb->len < size_goal && 635 + sysctl_tcp_autocorking && 636 + atomic_read(&sk->sk_wmem_alloc) > skb->truesize; 637 + } 629 638 630 - if (!(flags & MSG_MORE) || forced_push(tp)) 631 - tcp_mark_push(tp, tcp_write_queue_tail(sk)); 639 + static void tcp_push(struct sock *sk, int flags, int mss_now, 640 + int nonagle, int size_goal) 641 + { 642 + struct tcp_sock *tp = tcp_sk(sk); 643 + struct sk_buff *skb; 632 644 633 - tcp_mark_urg(tp, flags); 634 - __tcp_push_pending_frames(sk, mss_now, 635 - (flags & MSG_MORE) ? 
TCP_NAGLE_CORK : nonagle); 645 + if (!tcp_send_head(sk)) 646 + return; 647 + 648 + skb = tcp_write_queue_tail(sk); 649 + if (!(flags & MSG_MORE) || forced_push(tp)) 650 + tcp_mark_push(tp, skb); 651 + 652 + tcp_mark_urg(tp, flags); 653 + 654 + if (tcp_should_autocork(sk, skb, size_goal)) { 655 + 656 + /* avoid atomic op if TSQ_THROTTLED bit is already set */ 657 + if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) { 658 + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); 659 + set_bit(TSQ_THROTTLED, &tp->tsq_flags); 660 + } 661 + return; 636 662 } 663 + 664 + if (flags & MSG_MORE) 665 + nonagle = TCP_NAGLE_CORK; 666 + 667 + __tcp_push_pending_frames(sk, mss_now, nonagle); 637 668 } 638 669 639 670 static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, ··· 969 934 wait_for_sndbuf: 970 935 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 971 936 wait_for_memory: 972 - tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); 937 + tcp_push(sk, flags & ~MSG_MORE, mss_now, 938 + TCP_NAGLE_PUSH, size_goal); 973 939 974 940 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 975 941 goto do_error; ··· 980 944 981 945 out: 982 946 if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) 983 - tcp_push(sk, flags, mss_now, tp->nonagle); 947 + tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); 984 948 return copied; 985 949 986 950 do_error: ··· 1261 1225 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1262 1226 wait_for_memory: 1263 1227 if (copied) 1264 - tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); 1228 + tcp_push(sk, flags & ~MSG_MORE, mss_now, 1229 + TCP_NAGLE_PUSH, size_goal); 1265 1230 1266 1231 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 1267 1232 goto do_error; ··· 1273 1236 1274 1237 out: 1275 1238 if (copied) 1276 - tcp_push(sk, flags, mss_now, tp->nonagle); 1239 + tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); 1277 1240 release_sock(sk); 1278 1241 return copied + copied_syn; 1279 1242