Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: invalidate rate samples during SACK reneging

Mark tcp_sock during a SACK reneging event and invalidate rate samples
while marked. Such rate samples may overestimate bw by including packets
that were SACKed before reneging.

< ack 6001 win 10000 sack 7001:38001
< ack 7001 win 0 sack 8001:38001 // Reneg detected
> seq 7001:8001 // RTO, SACK cleared.
< ack 38001 win 10000

In the above example the rate sample taken after the last ack will count
7001-38001 as delivered while the actual delivery rate likely could
be much lower i.e. 7001-8001.

This patch adds a new field tcp_sock.is_sack_reneg and marks it when we
declare SACK reneging and enter TCP_CA_Loss, and unmarks it after
the last rate sample was taken before moving back to TCP_CA_Open. This
patch also invalidates rate samples taken while tcp_sock.is_sack_reneg
is set.

Fixes: b9f64820fb22 ("tcp: track data delivery rate for a TCP connection")
Signed-off-by: Yousuk Seung <ysseung@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Yousuk Seung and committed by David S. Miller
d4761754 195bd525

+19 -7
+2 -1
include/linux/tcp.h
··· 224 224 rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ 225 225 fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ 226 226 fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ 227 - unused:3; 227 + is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ 228 + unused:2; 228 229 u8 nonagle : 4,/* Disable Nagle algorithm? */ 229 230 thin_lto : 1,/* Use linear timeouts for thin streams */ 230 231 unused1 : 1,
+1 -1
include/net/tcp.h
··· 1055 1055 void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, 1056 1056 struct rate_sample *rs); 1057 1057 void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, 1058 - struct rate_sample *rs); 1058 + bool is_sack_reneg, struct rate_sample *rs); 1059 1059 void tcp_rate_check_app_limited(struct sock *sk); 1060 1060 1061 1061 /* These functions determine how the current flow behaves in respect of SACK
+1
net/ipv4/tcp.c
··· 2412 2412 tp->snd_cwnd_cnt = 0; 2413 2413 tp->window_clamp = 0; 2414 2414 tcp_set_ca_state(sk, TCP_CA_Open); 2415 + tp->is_sack_reneg = 0; 2415 2416 tcp_clear_retrans(tp); 2416 2417 inet_csk_delack_init(sk); 2417 2418 /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
+8 -2
net/ipv4/tcp_input.c
··· 1942 1942 if (is_reneg) { 1943 1943 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); 1944 1944 tp->sacked_out = 0; 1945 + /* Mark SACK reneging until we recover from this loss event. */ 1946 + tp->is_sack_reneg = 1; 1945 1947 } 1946 1948 tcp_clear_all_retrans_hints(tp); 1947 1949 ··· 2367 2365 return true; 2368 2366 } 2369 2367 tcp_set_ca_state(sk, TCP_CA_Open); 2368 + tp->is_sack_reneg = 0; 2370 2369 return false; 2371 2370 } 2372 2371 ··· 2401 2398 NET_INC_STATS(sock_net(sk), 2402 2399 LINUX_MIB_TCPSPURIOUSRTOS); 2403 2400 inet_csk(sk)->icsk_retransmits = 0; 2404 - if (frto_undo || tcp_is_sack(tp)) 2401 + if (frto_undo || tcp_is_sack(tp)) { 2405 2402 tcp_set_ca_state(sk, TCP_CA_Open); 2403 + tp->is_sack_reneg = 0; 2404 + } 2406 2405 return true; 2407 2406 } 2408 2407 return false; ··· 3501 3496 struct tcp_sacktag_state sack_state; 3502 3497 struct rate_sample rs = { .prior_delivered = 0 }; 3503 3498 u32 prior_snd_una = tp->snd_una; 3499 + bool is_sack_reneg = tp->is_sack_reneg; 3504 3500 u32 ack_seq = TCP_SKB_CB(skb)->seq; 3505 3501 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3506 3502 bool is_dupack = false; ··· 3618 3612 3619 3613 delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ 3620 3614 lost = tp->lost - lost; /* freshly marked lost */ 3621 - tcp_rate_gen(sk, delivered, lost, sack_state.rate); 3615 + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); 3622 3616 tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); 3623 3617 tcp_xmit_recovery(sk, rexmit); 3624 3618 return 1;
+7 -3
net/ipv4/tcp_rate.c
··· 106 106 107 107 /* Update the connection delivery information and generate a rate sample. */ 108 108 void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, 109 - struct rate_sample *rs) 109 + bool is_sack_reneg, struct rate_sample *rs) 110 110 { 111 111 struct tcp_sock *tp = tcp_sk(sk); 112 112 u32 snd_us, ack_us; ··· 124 124 125 125 rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ 126 126 rs->losses = lost; /* freshly marked lost */ 127 - /* Return an invalid sample if no timing information is available. */ 128 - if (!rs->prior_mstamp) { 127 + /* Return an invalid sample if no timing information is available or 128 + * in recovery from loss with SACK reneging. Rate samples taken during 129 + * a SACK reneging event may overestimate bw by including packets that 130 + * were SACKed before the reneg. 131 + */ 132 + if (!rs->prior_mstamp || is_sack_reneg) { 129 133 rs->delivered = -1; 130 134 rs->interval_us = -1; 131 135 return;