Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: better retrans tracking for defer-accept

For passive TCP connections using the TCP_DEFER_ACCEPT facility,
we incorrectly increment req->retrans each time the timeout triggers,
even though no SYNACK is sent.

SYNACKs are not sent for TCP_DEFER_ACCEPT requests that were established
(for which we received the ACK from the client). Only the last SYNACK is
sent so that we can again receive an ACK from the client, to move the req
into the accept queue. We plan to change this later to avoid the useless
retransmit (and a potential problem, as this SYNACK could be lost).

TCP_INFO later gives wrong information to the user, claiming imaginary
retransmits.

Decouple req->retrans field into two independent fields :

num_retrans : number of retransmits
num_timeout : number of timeouts

num_timeout is the counter that is incremented at each timeout,
regardless of actual SYNACK being sent or not, and used to
compute the exponential timeout.

Introduce inet_rtx_syn_ack() helper to increment num_retrans
only if ->rtx_syn_ack() succeeded.

Use inet_rtx_syn_ack() from tcp_check_req() to increment num_retrans
when we re-send a SYNACK in answer to a (retransmitted) SYN.
Prior to this patch, we were not counting these retransmits.

Change tcp_v[46]_rtx_synack() to increment TCP_MIB_RETRANSSEGS
only if a synack packet was successfully queued.

Reported-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Julian Anastasov <ja@ssi.bg>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Elliott Hughes <enh@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Eric Dumazet and committed by
David S. Miller
e6c022a4 25b1e679

+56 -35
+8 -4
include/net/request_sock.h
··· 49 49 struct request_sock *req); 50 50 }; 51 51 52 + extern int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req); 53 + 52 54 /* struct request_sock - mini sock to represent a connection request 53 55 */ 54 56 struct request_sock { 55 57 struct request_sock *dl_next; /* Must be first member! */ 56 58 u16 mss; 57 - u8 retrans; 58 - u8 cookie_ts; /* syncookie: encode tcpopts in timestamp */ 59 + u8 num_retrans; /* number of retransmits */ 60 + u8 cookie_ts:1; /* syncookie: encode tcpopts in timestamp */ 61 + u8 num_timeout:7; /* number of timeouts */ 59 62 /* The following two fields can be easily recomputed I think -AK */ 60 63 u32 window_clamp; /* window clamp at creation time */ 61 64 u32 rcv_wnd; /* rcv_wnd offered first time */ ··· 234 231 { 235 232 struct listen_sock *lopt = queue->listen_opt; 236 233 237 - if (req->retrans == 0) 234 + if (req->num_timeout == 0) 238 235 --lopt->qlen_young; 239 236 240 237 return --lopt->qlen; ··· 272 269 struct listen_sock *lopt = queue->listen_opt; 273 270 274 271 req->expires = jiffies + timeout; 275 - req->retrans = 0; 272 + req->num_retrans = 0; 273 + req->num_timeout = 0; 276 274 req->sk = NULL; 277 275 req->dl_next = lopt->syn_table[hash]; 278 276
+1 -2
net/dccp/minisocks.c
··· 174 174 * To protect against Request floods, increment retrans 175 175 * counter (backoff, monitored by dccp_response_timer). 176 176 */ 177 - req->retrans++; 178 - req->rsk_ops->rtx_syn_ack(sk, req, NULL); 177 + inet_rtx_syn_ack(sk, req); 179 178 } 180 179 /* Network Duplicate, discard packet */ 181 180 return NULL;
+18 -7
net/ipv4/inet_connection_sock.c
··· 521 521 int *expire, int *resend) 522 522 { 523 523 if (!rskq_defer_accept) { 524 - *expire = req->retrans >= thresh; 524 + *expire = req->num_timeout >= thresh; 525 525 *resend = 1; 526 526 return; 527 527 } 528 - *expire = req->retrans >= thresh && 529 - (!inet_rsk(req)->acked || req->retrans >= max_retries); 528 + *expire = req->num_timeout >= thresh && 529 + (!inet_rsk(req)->acked || req->num_timeout >= max_retries); 530 530 /* 531 531 * Do not resend while waiting for data after ACK, 532 532 * start to resend on end of deferring period to give 533 533 * last chance for data or ACK to create established socket. 534 534 */ 535 535 *resend = !inet_rsk(req)->acked || 536 - req->retrans >= rskq_defer_accept - 1; 536 + req->num_timeout >= rskq_defer_accept - 1; 537 537 } 538 + 539 + int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) 540 + { 541 + int err = req->rsk_ops->rtx_syn_ack(parent, req, NULL); 542 + 543 + if (!err) 544 + req->num_retrans++; 545 + return err; 546 + } 547 + EXPORT_SYMBOL(inet_rtx_syn_ack); 538 548 539 549 void inet_csk_reqsk_queue_prune(struct sock *parent, 540 550 const unsigned long interval, ··· 609 599 req->rsk_ops->syn_ack_timeout(parent, req); 610 600 if (!expire && 611 601 (!resend || 612 - !req->rsk_ops->rtx_syn_ack(parent, req, NULL) || 602 + !inet_rtx_syn_ack(parent, req) || 613 603 inet_rsk(req)->acked)) { 614 604 unsigned long timeo; 615 605 616 - if (req->retrans++ == 0) 606 + if (req->num_timeout++ == 0) 617 607 lopt->qlen_young--; 618 - timeo = min((timeout << req->retrans), max_rto); 608 + timeo = min(timeout << req->num_timeout, 609 + max_rto); 619 610 req->expires = now + timeo; 620 611 reqp = &req->dl_next; 621 612 continue;
+1 -1
net/ipv4/inet_diag.c
··· 620 620 r->idiag_family = sk->sk_family; 621 621 r->idiag_state = TCP_SYN_RECV; 622 622 r->idiag_timer = 1; 623 - r->idiag_retrans = req->retrans; 623 + r->idiag_retrans = req->num_retrans; 624 624 625 625 r->id.idiag_if = sk->sk_bound_dev_if; 626 626 sock_diag_save_cookie(req, r->id.idiag_cookie);
+1 -1
net/ipv4/syncookies.c
··· 340 340 } 341 341 342 342 req->expires = 0UL; 343 - req->retrans = 0; 343 + req->num_retrans = 0; 344 344 345 345 /* 346 346 * We need to lookup the route here to get at the correct
+1 -1
net/ipv4/tcp_input.c
··· 5991 5991 */ 5992 5992 if (req) { 5993 5993 tcp_synack_rtt_meas(sk, req); 5994 - tp->total_retrans = req->retrans; 5994 + tp->total_retrans = req->num_retrans; 5995 5995 5996 5996 reqsk_fastopen_remove(sk, req, false); 5997 5997 } else {
+10 -6
net/ipv4/tcp_ipv4.c
··· 877 877 } 878 878 879 879 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, 880 - struct request_values *rvp) 880 + struct request_values *rvp) 881 881 { 882 - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); 883 - return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false); 882 + int res = tcp_v4_send_synack(sk, NULL, req, rvp, 0, false); 883 + 884 + if (!res) 885 + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); 886 + return res; 884 887 } 885 888 886 889 /* ··· 1389 1386 struct sock *child; 1390 1387 int err; 1391 1388 1392 - req->retrans = 0; 1389 + req->num_retrans = 0; 1390 + req->num_timeout = 0; 1393 1391 req->sk = NULL; 1394 1392 1395 1393 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); ··· 1744 1740 1745 1741 tcp_initialize_rcv_mss(newsk); 1746 1742 tcp_synack_rtt_meas(newsk, req); 1747 - newtp->total_retrans = req->retrans; 1743 + newtp->total_retrans = req->num_retrans; 1748 1744 1749 1745 #ifdef CONFIG_TCP_MD5SIG 1750 1746 /* Copy over the MD5 key from the original socket */ ··· 2642 2638 0, 0, /* could print option size, but that is af dependent. */ 2643 2639 1, /* timers active (only the expire timer) */ 2644 2640 jiffies_delta_to_clock_t(delta), 2645 - req->retrans, 2641 + req->num_timeout, 2646 2642 from_kuid_munged(seq_user_ns(f), uid), 2647 2643 0, /* non standard timer */ 2648 2644 0, /* open_requests have no inode */
+4 -4
net/ipv4/tcp_minisocks.c
··· 552 552 * it can be estimated (approximately) 553 553 * from another data. 554 554 */ 555 - tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); 555 + tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); 556 556 paws_reject = tcp_paws_reject(&tmp_opt, th->rst); 557 557 } 558 558 } ··· 581 581 * Note that even if there is new data in the SYN packet 582 582 * they will be thrown away too. 583 583 */ 584 - req->rsk_ops->rtx_syn_ack(sk, req, NULL); 584 + inet_rtx_syn_ack(sk, req); 585 585 return NULL; 586 586 } 587 587 ··· 695 695 /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */ 696 696 if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr) 697 697 tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr; 698 - else if (req->retrans) /* don't take RTT sample if retrans && ~TS */ 698 + else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */ 699 699 tcp_rsk(req)->snt_synack = 0; 700 700 701 701 /* For Fast Open no more processing is needed (sk is the ··· 705 705 return sk; 706 706 707 707 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ 708 - if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && 708 + if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && 709 709 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { 710 710 inet_rsk(req)->acked = 1; 711 711 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
+4 -4
net/ipv4/tcp_timer.c
··· 318 318 req = tcp_sk(sk)->fastopen_rsk; 319 319 req->rsk_ops->syn_ack_timeout(sk, req); 320 320 321 - if (req->retrans >= max_retries) { 321 + if (req->num_timeout >= max_retries) { 322 322 tcp_write_err(sk); 323 323 return; 324 324 } ··· 327 327 * regular retransmit because if the child socket has been accepted 328 328 * it's not good to give up too easily. 329 329 */ 330 - req->rsk_ops->rtx_syn_ack(sk, req, NULL); 331 - req->retrans++; 330 + inet_rtx_syn_ack(sk, req); 331 + req->num_timeout++; 332 332 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 333 - TCP_TIMEOUT_INIT << req->retrans, TCP_RTO_MAX); 333 + TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); 334 334 } 335 335 336 336 /*
+1 -1
net/ipv6/syncookies.c
··· 214 214 ireq6->iif = inet6_iif(skb); 215 215 216 216 req->expires = 0UL; 217 - req->retrans = 0; 217 + req->num_retrans = 0; 218 218 ireq->ecn_ok = ecn_ok; 219 219 ireq->snd_wscale = tcp_opt.snd_wscale; 220 220 ireq->sack_ok = tcp_opt.sack_ok;
+7 -4
net/ipv6/tcp_ipv6.c
··· 495 495 struct request_values *rvp) 496 496 { 497 497 struct flowi6 fl6; 498 + int res; 498 499 499 - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); 500 - return tcp_v6_send_synack(sk, NULL, &fl6, req, rvp, 0); 500 + res = tcp_v6_send_synack(sk, NULL, &fl6, req, rvp, 0); 501 + if (!res) 502 + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); 503 + return res; 501 504 } 502 505 503 506 static void tcp_v6_reqsk_destructor(struct request_sock *req) ··· 1367 1364 1368 1365 tcp_initialize_rcv_mss(newsk); 1369 1366 tcp_synack_rtt_meas(newsk, req); 1370 - newtp->total_retrans = req->retrans; 1367 + newtp->total_retrans = req->num_retrans; 1371 1368 1372 1369 newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; 1373 1370 newinet->inet_rcv_saddr = LOOPBACK4_IPV6; ··· 1869 1866 0,0, /* could print option size, but that is af dependent. */ 1870 1867 1, /* timers active (only the expire timer) */ 1871 1868 jiffies_to_clock_t(ttd), 1872 - req->retrans, 1869 + req->num_timeout, 1873 1870 from_kuid_munged(seq_user_ns(seq), uid), 1874 1871 0, /* non standard timer */ 1875 1872 0, /* open_requests have no inode */