Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'tcp-undo-congestion'

Yuchung Cheng says:

====================
undo congestion window on spurious SYN or SYNACK timeout

Linux TCP currently uses an initial congestion window of 1 packet
if multiple SYN or SYNACK timeouts occur, per RFC6298. However such
timeouts are often spurious on wireless or cellular networks that
experience high delay variances (e.g. ramping up dormant radios or
local link retransmission). Another case is when the underlying
path is longer than the default SYN timeout (e.g. 1 second). In
these cases starting the transfer with a minimal congestion window
is detrimental to the performance for short flows.

One naive approach is to simply ignore SYN or SYNACK timeouts and
always use a larger or default initial window. This approach however
risks pouring gas on the fire when the network is already highly
congested. This is particularly true in data centers, where applications
could start thousands to millions of connections over a single or
multiple hosts resulting in high SYN drops (e.g. incast).

This patch-set detects spurious SYN and SYNACK timeouts upon
completing the handshake via the widely-supported TCP timestamp
options. Upon such events the sender reverts to the default
initial window to start the data transfer so it gets best of both
worlds. This patch-set supports this feature for both active and
passive connections, whether Fast Open or regular.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+84 -49
-12
net/ipv4/tcp.c
··· 457 457 } 458 458 EXPORT_SYMBOL(tcp_init_sock); 459 459 460 - void tcp_init_transfer(struct sock *sk, int bpf_op) 461 - { 462 - struct inet_connection_sock *icsk = inet_csk(sk); 463 - 464 - tcp_mtup_init(sk); 465 - icsk->icsk_af_ops->rebuild_header(sk); 466 - tcp_init_metrics(sk); 467 - tcp_call_bpf(sk, bpf_op, 0, NULL); 468 - tcp_init_congestion_control(sk); 469 - tcp_init_buffer_space(sk); 470 - } 471 - 472 460 static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) 473 461 { 474 462 struct sk_buff *skb = tcp_write_queue_tail(sk);
+72 -27
net/ipv4/tcp_input.c
··· 2252 2252 */ 2253 2253 static inline bool tcp_packet_delayed(const struct tcp_sock *tp) 2254 2254 { 2255 - return !tp->retrans_stamp || 2255 + return tp->retrans_stamp && 2256 2256 tcp_tsopt_ecr_before(tp, tp->retrans_stamp); 2257 2257 } 2258 2258 ··· 3521 3521 { 3522 3522 struct tcp_sock *tp = tcp_sk(sk); 3523 3523 3524 - if (rexmit == REXMIT_NONE) 3524 + if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT) 3525 3525 return; 3526 3526 3527 3527 if (unlikely(rexmit == 2)) { ··· 5647 5647 } 5648 5648 EXPORT_SYMBOL(tcp_rcv_established); 5649 5649 5650 + void tcp_init_transfer(struct sock *sk, int bpf_op) 5651 + { 5652 + struct inet_connection_sock *icsk = inet_csk(sk); 5653 + struct tcp_sock *tp = tcp_sk(sk); 5654 + 5655 + tcp_mtup_init(sk); 5656 + icsk->icsk_af_ops->rebuild_header(sk); 5657 + tcp_init_metrics(sk); 5658 + 5659 + /* Initialize the congestion window to start the transfer. 5660 + * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been 5661 + * retransmitted. In light of RFC6298 more aggressive 1sec 5662 + * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK 5663 + * retransmission has occurred. 5664 + */ 5665 + if (tp->total_retrans > 1 && tp->undo_marker) 5666 + tp->snd_cwnd = 1; 5667 + else 5668 + tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); 5669 + tp->snd_cwnd_stamp = tcp_jiffies32; 5670 + 5671 + tcp_call_bpf(sk, bpf_op, 0, NULL); 5672 + tcp_init_congestion_control(sk); 5673 + tcp_init_buffer_space(sk); 5674 + } 5675 + 5650 5676 void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) 5651 5677 { 5652 5678 struct tcp_sock *tp = tcp_sk(sk); ··· 5774 5748 #endif 5775 5749 } 5776 5750 5751 + static void tcp_try_undo_spurious_syn(struct sock *sk) 5752 + { 5753 + struct tcp_sock *tp = tcp_sk(sk); 5754 + u32 syn_stamp; 5755 + 5756 + /* undo_marker is set when SYN or SYNACK times out. The timeout is 5757 + * spurious if the ACK's timestamp option echo value matches the 5758 + * original SYN timestamp. 
5759 + */ 5760 + syn_stamp = tp->retrans_stamp; 5761 + if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp && 5762 + syn_stamp == tp->rx_opt.rcv_tsecr) 5763 + tp->undo_marker = 0; 5764 + } 5765 + 5777 5766 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5778 5767 const struct tcphdr *th) 5779 5768 { ··· 5856 5815 tcp_ecn_rcv_synack(tp, th); 5857 5816 5858 5817 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 5818 + tcp_try_undo_spurious_syn(sk); 5859 5819 tcp_ack(sk, skb, FLAG_SLOWPATH); 5860 5820 5861 5821 /* Ok.. it's good. Set up sequence numbers and ··· 6015 5973 return 1; 6016 5974 } 6017 5975 5976 + static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) 5977 + { 5978 + tcp_try_undo_loss(sk, false); 5979 + inet_csk(sk)->icsk_retransmits = 0; 5980 + 5981 + /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, 5982 + * we no longer need req so release it. 5983 + */ 5984 + reqsk_fastopen_remove(sk, tcp_sk(sk)->fastopen_rsk, false); 5985 + 5986 + /* Re-arm the timer because data may have been sent out. 5987 + * This is similar to the regular data transmission case 5988 + * when new data has just been ack'ed. 5989 + * 5990 + * (TFO) - we could try to be more aggressive and 5991 + * retransmitting any data sooner based on when they 5992 + * are sent out. 5993 + */ 5994 + tcp_rearm_rto(sk); 5995 + } 5996 + 6018 5997 /* 6019 5998 * This function implements the receiving procedure of RFC 793 for 6020 5999 * all states except ESTABLISHED and TIME_WAIT. ··· 6132 6069 if (!tp->srtt_us) 6133 6070 tcp_synack_rtt_meas(sk, req); 6134 6071 6135 - /* Once we leave TCP_SYN_RECV, we no longer need req 6136 - * so release it. 6137 - */ 6138 6072 if (req) { 6139 - inet_csk(sk)->icsk_retransmits = 0; 6140 - reqsk_fastopen_remove(sk, req, false); 6141 - /* Re-arm the timer because data may have been sent out. 6142 - * This is similar to the regular data transmission case 6143 - * when new data has just been ack'ed. 
6144 - * 6145 - * (TFO) - we could try to be more aggressive and 6146 - * retransmitting any data sooner based on when they 6147 - * are sent out. 6148 - */ 6149 - tcp_rearm_rto(sk); 6073 + tcp_rcv_synrecv_state_fastopen(sk); 6150 6074 } else { 6075 + tcp_try_undo_spurious_syn(sk); 6076 + tp->retrans_stamp = 0; 6151 6077 tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); 6152 6078 tp->copied_seq = tp->rcv_nxt; 6153 6079 } ··· 6171 6119 case TCP_FIN_WAIT1: { 6172 6120 int tmo; 6173 6121 6174 - /* If we enter the TCP_FIN_WAIT1 state and we are a 6175 - * Fast Open socket and this is the first acceptable 6176 - * ACK we have received, this would have acknowledged 6177 - * our SYNACK so stop the SYNACK timer. 6178 - */ 6179 - if (req) { 6180 - /* We no longer need the request sock. */ 6181 - reqsk_fastopen_remove(sk, req, false); 6182 - tcp_rearm_rto(sk); 6183 - } 6122 + if (req) 6123 + tcp_rcv_synrecv_state_fastopen(sk); 6124 + 6184 6125 if (tp->snd_una != tp->write_seq) 6185 6126 break; 6186 6127 ··· 6348 6303 req->cookie_ts = 0; 6349 6304 tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; 6350 6305 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; 6351 - tcp_rsk(req)->snt_synack = tcp_clock_us(); 6306 + tcp_rsk(req)->snt_synack = 0; 6352 6307 tcp_rsk(req)->last_oow_ack_time = 0; 6353 6308 req->mss = rx_opt->mss_clamp; 6354 6309 req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
-10
net/ipv4/tcp_metrics.c
··· 512 512 513 513 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; 514 514 } 515 - /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been 516 - * retransmitted. In light of RFC6298 more aggressive 1sec 517 - * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK 518 - * retransmission has occurred. 519 - */ 520 - if (tp->total_retrans > 1) 521 - tp->snd_cwnd = 1; 522 - else 523 - tp->snd_cwnd = tcp_init_cwnd(tp, dst); 524 - tp->snd_cwnd_stamp = tcp_jiffies32; 525 515 } 526 516 527 517 bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
+5
net/ipv4/tcp_minisocks.c
··· 522 522 newtp->rx_opt.ts_recent_stamp = 0; 523 523 newtp->tcp_header_len = sizeof(struct tcphdr); 524 524 } 525 + if (req->num_timeout) { 526 + newtp->undo_marker = treq->snt_isn; 527 + newtp->retrans_stamp = div_u64(treq->snt_synack, 528 + USEC_PER_SEC / TCP_TS_HZ); 529 + } 525 530 newtp->tsoffset = treq->ts_off; 526 531 #ifdef CONFIG_TCP_MD5SIG 527 532 newtp->md5sig_info = NULL; /*XXX*/
+4
net/ipv4/tcp_output.c
··· 3247 3247 skb->skb_mstamp_ns = cookie_init_timestamp(req); 3248 3248 else 3249 3249 #endif 3250 + { 3250 3251 skb->skb_mstamp_ns = tcp_clock_ns(); 3252 + if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ 3253 + tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); 3254 + } 3251 3255 3252 3256 #ifdef CONFIG_TCP_MD5SIG 3253 3257 rcu_read_lock();
+3
net/ipv4/tcp_timer.c
··· 393 393 tcp_write_err(sk); 394 394 return; 395 395 } 396 + /* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */ 397 + if (icsk->icsk_retransmits == 1) 398 + tcp_enter_loss(sk); 396 399 /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error 397 400 * returned from rtx_syn_ack() to make it more persistent like 398 401 * regular retransmit because if the child socket has been accepted