Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: adjust TSO packet sizes based on min_rtt

Back when tcp_tso_autosize() and TCP pacing were introduced,
our focus was really to reduce burst sizes for long distance
flows.

The simple heuristic of using sk_pacing_rate/1024 has worked
well, but can lead to too small packets for hosts in the same
rack/cluster, when thousands of flows compete for the bottleneck.

Neal Cardwell had the idea of making the TSO burst size
a function of both sk_pacing_rate and tcp_min_rtt()

Indeed, for local flows, sending bigger bursts is better
to reduce cpu costs, as occasional losses can be repaired
quite fast.

This patch is based on Neal Cardwell's implementation
done more than two years ago.
bbr is adjusting max_pacing_rate based on measured bandwidth,
while cubic would overestimate max_pacing_rate.

/proc/sys/net/ipv4/tcp_tso_rtt_log can be used to tune or disable
this new feature, in logarithmic steps.

Tested:

100Gbit NIC, two hosts in the same rack, 4K MTU.
600 flows rate-limited to 20000000 bytes per second.

Before patch: (TSO sizes would be limited to 20000000/1024/4096 -> 4 segments per TSO)

~# echo 0 >/proc/sys/net/ipv4/tcp_tso_rtt_log
~# nstat -n;perf stat ./super_netperf 600 -H otrv6 -l 20 -- -K dctcp -q 20000000;nstat|egrep "TcpInSegs|TcpOutSegs|TcpRetransSegs|Delivered"
96005

Performance counter stats for './super_netperf 600 -H otrv6 -l 20 -- -K dctcp -q 20000000':

65,945.29 msec task-clock # 2.845 CPUs utilized
1,314,632 context-switches # 19935.279 M/sec
5,292 cpu-migrations # 80.249 M/sec
940,641 page-faults # 14264.023 M/sec
201,117,030,926 cycles # 3049769.216 GHz (83.45%)
17,699,435,405 stalled-cycles-frontend # 8.80% frontend cycles idle (83.48%)
136,584,015,071 stalled-cycles-backend # 67.91% backend cycles idle (83.44%)
53,809,530,436 instructions # 0.27 insn per cycle
# 2.54 stalled cycles per insn (83.36%)
9,062,315,523 branches # 137422329.563 M/sec (83.22%)
153,008,621 branch-misses # 1.69% of all branches (83.32%)

23.182970846 seconds time elapsed

TcpInSegs 15648792 0.0
TcpOutSegs 58659110 0.0 # Average of 3.7 4K segments per TSO packet
TcpExtTCPDelivered 58654791 0.0
TcpExtTCPDeliveredCE 19 0.0

After patch:

~# echo 9 >/proc/sys/net/ipv4/tcp_tso_rtt_log
~# nstat -n;perf stat ./super_netperf 600 -H otrv6 -l 20 -- -K dctcp -q 20000000;nstat|egrep "TcpInSegs|TcpOutSegs|TcpRetransSegs|Delivered"
96046

Performance counter stats for './super_netperf 600 -H otrv6 -l 20 -- -K dctcp -q 20000000':

48,982.58 msec task-clock # 2.104 CPUs utilized
186,014 context-switches # 3797.599 M/sec
3,109 cpu-migrations # 63.472 M/sec
941,180 page-faults # 19214.814 M/sec
153,459,763,868 cycles # 3132982.807 GHz (83.56%)
12,069,861,356 stalled-cycles-frontend # 7.87% frontend cycles idle (83.32%)
120,485,917,953 stalled-cycles-backend # 78.51% backend cycles idle (83.24%)
36,803,672,106 instructions # 0.24 insn per cycle
# 3.27 stalled cycles per insn (83.18%)
5,947,266,275 branches # 121417383.427 M/sec (83.64%)
87,984,616 branch-misses # 1.48% of all branches (83.43%)

23.281200256 seconds time elapsed

TcpInSegs 1434706 0.0
TcpOutSegs 58883378 0.0 # Average of 41 4K segments per TSO packet
TcpExtTCPDelivered 58878971 0.0
TcpExtTCPDeliveredCE 9664 0.0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Link: https://lore.kernel.org/r/20220309015757.2532973-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Eric Dumazet and committed by
Jakub Kicinski
65466904 b0de0cf4

+54 -13
+23
Documentation/networking/ip-sysctl.rst
··· 878 878 879 879 Default: 2 880 880 881 + tcp_tso_rtt_log - INTEGER 882 + Adjustment of TSO packet sizes based on min_rtt 883 + 884 + Starting from linux-5.18, TCP autosizing can be tweaked 885 + for flows having small RTT. 886 + 887 + Old autosizing was splitting the pacing budget to send 1024 TSO 888 + per second. 889 + 890 + tso_packet_size = sk->sk_pacing_rate / 1024; 891 + 892 + With the new mechanism, we increase this TSO sizing using: 893 + 894 + distance = min_rtt_usec / (2^tcp_tso_rtt_log) 895 + tso_packet_size += gso_max_size >> distance; 896 + 897 + This means that flows between very close hosts can use bigger 898 + TSO packets, reducing their cpu costs. 899 + 900 + If you want to use the old autosizing, set this sysctl to 0. 901 + 902 + Default: 9 (2^9 = 512 usec) 903 + 881 904 tcp_pacing_ss_ratio - INTEGER 882 905 sk->sk_pacing_rate is set by TCP stack using a ratio applied 883 906 to current rate. (current_rate = cwnd * mss / srtt)
+2 -1
include/net/netns/ipv4.h
··· 127 127 u8 sysctl_tcp_synack_retries; 128 128 u8 sysctl_tcp_syncookies; 129 129 u8 sysctl_tcp_migrate_req; 130 + u8 sysctl_tcp_comp_sack_nr; 130 131 int sysctl_tcp_reordering; 131 132 u8 sysctl_tcp_retries1; 132 133 u8 sysctl_tcp_retries2; ··· 161 160 int sysctl_tcp_challenge_ack_limit; 162 161 int sysctl_tcp_min_rtt_wlen; 163 162 u8 sysctl_tcp_min_tso_segs; 163 + u8 sysctl_tcp_tso_rtt_log; 164 164 u8 sysctl_tcp_autocorking; 165 165 u8 sysctl_tcp_reflect_tos; 166 - u8 sysctl_tcp_comp_sack_nr; 167 166 int sysctl_tcp_invalid_ratelimit; 168 167 int sysctl_tcp_pacing_ss_ratio; 169 168 int sysctl_tcp_pacing_ca_ratio;
+7
net/ipv4/sysctl_net_ipv4.c
··· 1272 1272 .extra1 = SYSCTL_ONE, 1273 1273 }, 1274 1274 { 1275 + .procname = "tcp_tso_rtt_log", 1276 + .data = &init_net.ipv4.sysctl_tcp_tso_rtt_log, 1277 + .maxlen = sizeof(u8), 1278 + .mode = 0644, 1279 + .proc_handler = proc_dou8vec_minmax, 1280 + }, 1281 + { 1275 1282 .procname = "tcp_min_rtt_wlen", 1276 1283 .data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen, 1277 1284 .maxlen = sizeof(int),
+1
net/ipv4/tcp_ipv4.c
··· 3137 3137 /* rfc5961 challenge ack rate limiting */ 3138 3138 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; 3139 3139 net->ipv4.sysctl_tcp_min_tso_segs = 2; 3140 + net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ 3140 3141 net->ipv4.sysctl_tcp_min_rtt_wlen = 300; 3141 3142 net->ipv4.sysctl_tcp_autocorking = 1; 3142 3143 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
+21 -12
net/ipv4/tcp_output.c
··· 1951 1951 } 1952 1952 1953 1953 /* Return how many segs we'd like on a TSO packet, 1954 - * to send one TSO packet per ms 1954 + * depending on current pacing rate, and how close the peer is. 1955 + * 1956 + * Rationale is: 1957 + * - For close peers, we rather send bigger packets to reduce 1958 + * cpu costs, because occasional losses will be repaired fast. 1959 + * - For long distance/rtt flows, we would like to get ACK clocking 1960 + * with 1 ACK per ms. 1961 + * 1962 + * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting 1963 + * in bigger TSO bursts. We cut the RTT-based allowance in half 1964 + * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance 1965 + * is below 1500 bytes after 6 * ~500 usec = 3ms. 1955 1966 */ 1956 1967 static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, 1957 1968 int min_tso_segs) 1958 1969 { 1959 - u32 bytes, segs; 1970 + unsigned long bytes; 1971 + u32 r; 1960 1972 1961 - bytes = min_t(unsigned long, 1962 - sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), 1963 - sk->sk_gso_max_size); 1973 + bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift); 1964 1974 1965 - /* Goal is to send at least one packet per ms, 1966 - * not one big TSO packet every 100 ms. 1967 - * This preserves ACK clocking and is consistent 1968 - * with tcp_tso_should_defer() heuristic. 1969 - */ 1970 - segs = max_t(u32, bytes / mss_now, min_tso_segs); 1975 + r = tcp_min_rtt(tcp_sk(sk)) >> sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log; 1976 + if (r < BITS_PER_TYPE(sk->sk_gso_max_size)) 1977 + bytes += sk->sk_gso_max_size >> r; 1971 1978 1972 - return segs; 1979 + bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size); 1980 + 1981 + return max_t(u32, bytes / mss_now, min_tso_segs); 1973 1982 } 1974 1983 1975 1984 /* Return the number of segments we want in the skb we are transmitting.