Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: switch rcv_rtt_est and rcvq_space to high resolution timestamps

Some devices or distributions use HZ=100 or HZ=250

TCP receive buffer autotuning has poor behavior caused by this choice.
Since autotuning happens after 4 ms or 10 ms, short distance flows
get their receive buffer tuned to a very high value, but after an initial
period during which it was frozen at a (too small) initial value.

With the introduction of tp->tcp_mstamp, we can switch to high resolution
timestamps almost for free (at the expense of 8 additional bytes per
TCP structure)

Note that some TCP stacks use usec TCP timestamps where this
patch makes even more sense: many TCP flows have < 500 usec RTT.
Hopefully this finer TS option can be standardized soon.

Tested:
HZ=100 kernel
./netperf -H lpaa24 -t TCP_RR -l 1000 -- -r 10000,10000 &

Peer without patch :
lpaa24:~# ss -tmi dst lpaa23
...
skmem:(r0,rb8388608,...)
rcv_rtt:10 rcv_space:3210000 minrtt:0.017

Peer with the patch :
lpaa23:~# ss -tmi dst lpaa24
...
skmem:(r0,rb428800,...)
rcv_rtt:0.069 rcv_space:30000 minrtt:0.017

We can see saner RCVBUF, and more precise rcv_rtt information.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Eric Dumazet and committed by
David S. Miller
645f4c6f a6db50b8

+24 -18
+6 -6
include/linux/tcp.h
··· 333 333 334 334 /* Receiver side RTT estimation */ 335 335 struct { 336 - u32 rtt; 337 - u32 seq; 338 - u32 time; 336 + u32 rtt_us; 337 + u32 seq; 338 + struct skb_mstamp time; 339 339 } rcv_rtt_est; 340 340 341 341 /* Receiver queue space */ 342 342 struct { 343 - int space; 344 - u32 seq; 345 - u32 time; 343 + int space; 344 + u32 seq; 345 + struct skb_mstamp time; 346 346 } rcvq_space; 347 347 348 348 /* TCP-specific MTU probe information. */
+1 -1
net/ipv4/tcp.c
··· 2853 2853 info->tcpi_snd_ssthresh = tp->snd_ssthresh; 2854 2854 info->tcpi_advmss = tp->advmss; 2855 2855 2856 - info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; 2856 + info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3; 2857 2857 info->tcpi_rcv_space = tp->rcvq_space.space; 2858 2858 2859 2859 info->tcpi_total_retrans = tp->total_retrans;
+17 -11
net/ipv4/tcp_input.c
··· 442 442 tcp_sndbuf_expand(sk); 443 443 444 444 tp->rcvq_space.space = tp->rcv_wnd; 445 - tp->rcvq_space.time = tcp_time_stamp; 445 + skb_mstamp_get(&tp->tcp_mstamp); 446 + tp->rcvq_space.time = tp->tcp_mstamp; 446 447 tp->rcvq_space.seq = tp->copied_seq; 447 448 448 449 maxwin = tcp_full_space(sk); ··· 519 518 */ 520 519 static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) 521 520 { 522 - u32 new_sample = tp->rcv_rtt_est.rtt; 521 + u32 new_sample = tp->rcv_rtt_est.rtt_us; 523 522 long m = sample; 524 523 525 524 if (m == 0) ··· 549 548 new_sample = m << 3; 550 549 } 551 550 552 - if (tp->rcv_rtt_est.rtt != new_sample) 553 - tp->rcv_rtt_est.rtt = new_sample; 551 + tp->rcv_rtt_est.rtt_us = new_sample; 554 552 } 555 553 556 554 static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) 557 555 { 558 - if (tp->rcv_rtt_est.time == 0) 556 + u32 delta_us; 557 + 558 + if (tp->rcv_rtt_est.time.v64 == 0) 559 559 goto new_measure; 560 560 if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) 561 561 return; 562 - tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1); 562 + delta_us = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcv_rtt_est.time); 563 + tcp_rcv_rtt_update(tp, delta_us, 1); 563 564 564 565 new_measure: 565 566 tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; 566 - tp->rcv_rtt_est.time = tcp_time_stamp; 567 + tp->rcv_rtt_est.time = tp->tcp_mstamp; 567 568 } 568 569 569 570 static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, ··· 575 572 if (tp->rx_opt.rcv_tsecr && 576 573 (TCP_SKB_CB(skb)->end_seq - 577 574 TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) 578 - tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); 575 + tcp_rcv_rtt_update(tp, 576 + jiffies_to_usecs(tcp_time_stamp - 577 + tp->rx_opt.rcv_tsecr), 578 + 0); 579 579 } 580 580 581 581 /* ··· 591 585 int time; 592 586 int copied; 593 587 594 - time = tcp_time_stamp - tp->rcvq_space.time; 595 - if (time < (tp->rcv_rtt_est.rtt >> 3) || 
tp->rcv_rtt_est.rtt == 0) 588 + time = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcvq_space.time); 589 + if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) 596 590 return; 597 591 598 592 /* Number of bytes copied to user in last RTT */ ··· 648 642 649 643 new_measure: 650 644 tp->rcvq_space.seq = tp->copied_seq; 651 - tp->rcvq_space.time = tcp_time_stamp; 645 + tp->rcvq_space.time = tp->tcp_mstamp; 652 646 } 653 647 654 648 /* There is something which you must keep in mind when you analyze the