Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: retire FACK loss detection

FACK loss detection has been disabled by default, and its successor,
RACK, subsumes FACK and handles reordering better.
This patch removes FACK to simplify TCP loss recovery.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Yuchung Cheng and committed by David S. Miller
713bafea e4ec1384

+12 -77
+1 -2
Documentation/networking/ip-sysctl.txt
··· 289 289 Default: 1 (fallback enabled) 290 290 291 291 tcp_fack - BOOLEAN 292 - Enable FACK congestion avoidance and fast retransmission. 293 - The value is not used, if tcp_sack is not enabled. 292 + This is a legacy option, it has no effect anymore. 294 293 295 294 tcp_fin_timeout - INTEGER 296 295 The length of time an orphaned (no longer referenced by any
-1
include/linux/tcp.h
··· 85 85 86 86 /*These are used to set the sack_ok field in struct tcp_options_received */ 87 87 #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ 88 - #define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/ 89 88 #define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/ 90 89 91 90 struct tcp_options_received {
+1 -13
include/net/tcp.h
··· 384 384 void tcp_init_metrics(struct sock *sk); 385 385 void tcp_metrics_init(void); 386 386 bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); 387 - void tcp_disable_fack(struct tcp_sock *tp); 388 387 void tcp_close(struct sock *sk, long timeout); 389 388 void tcp_init_sock(struct sock *sk); 390 389 void tcp_init_transfer(struct sock *sk, int bpf_op); ··· 775 776 }; 776 777 __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ 777 778 778 - __u8 sacked; /* State flags for SACK/FACK. */ 779 + __u8 sacked; /* State flags for SACK. */ 779 780 #define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ 780 781 #define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ 781 782 #define TCPCB_LOST 0x04 /* SKB is lost */ ··· 1065 1066 * 1066 1067 * tcp_is_sack - SACK enabled 1067 1068 * tcp_is_reno - No SACK 1068 - * tcp_is_fack - FACK enabled, implies SACK enabled 1069 1069 */ 1070 1070 static inline int tcp_is_sack(const struct tcp_sock *tp) 1071 1071 { ··· 1074 1076 static inline bool tcp_is_reno(const struct tcp_sock *tp) 1075 1077 { 1076 1078 return !tcp_is_sack(tp); 1077 - } 1078 - 1079 - static inline bool tcp_is_fack(const struct tcp_sock *tp) 1080 - { 1081 - return tp->rx_opt.sack_ok & TCP_FACK_ENABLED; 1082 - } 1083 - 1084 - static inline void tcp_enable_fack(struct tcp_sock *tp) 1085 - { 1086 - tp->rx_opt.sack_ok |= TCP_FACK_ENABLED; 1087 1079 } 1088 1080 1089 1081 static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
-1
include/uapi/linux/snmp.h
··· 191 191 LINUX_MIB_TCPRENORECOVERY, /* TCPRenoRecovery */ 192 192 LINUX_MIB_TCPSACKRECOVERY, /* TCPSackRecovery */ 193 193 LINUX_MIB_TCPSACKRENEGING, /* TCPSACKReneging */ 194 - LINUX_MIB_TCPFACKREORDER, /* TCPFACKReorder */ 195 194 LINUX_MIB_TCPSACKREORDER, /* TCPSACKReorder */ 196 195 LINUX_MIB_TCPRENOREORDER, /* TCPRenoReorder */ 197 196 LINUX_MIB_TCPTSREORDER, /* TCPTSReorder */
-1
net/ipv4/proc.c
··· 212 212 SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), 213 213 SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), 214 214 SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), 215 - SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER), 216 215 SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER), 217 216 SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER), 218 217 SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER),
-2
net/ipv4/tcp.c
··· 2509 2509 return -EINVAL; 2510 2510 2511 2511 tp->rx_opt.sack_ok |= TCP_SACK_SEEN; 2512 - if (sock_net(sk)->ipv4.sysctl_tcp_fack) 2513 - tcp_enable_fack(tp); 2514 2512 break; 2515 2513 case TCPOPT_TIMESTAMP: 2516 2514 if (opt.opt_val != 0)
+7 -46
net/ipv4/tcp_input.c
··· 842 842 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 843 843 } 844 844 845 - /* 846 - * Packet counting of FACK is based on in-order assumptions, therefore TCP 847 - * disables it when reordering is detected 848 - */ 849 - void tcp_disable_fack(struct tcp_sock *tp) 850 - { 851 - /* RFC3517 uses different metric in lost marker => reset on change */ 852 - if (tcp_is_fack(tp)) 853 - tp->lost_skb_hint = NULL; 854 - tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED; 855 - } 856 - 857 845 /* Take a notice that peer is sending D-SACKs */ 858 846 static void tcp_dsack_seen(struct tcp_sock *tp) 859 847 { ··· 869 881 tp->sacked_out, 870 882 tp->undo_marker ? tp->undo_retrans : 0); 871 883 #endif 872 - tcp_disable_fack(tp); 873 884 } 874 885 875 886 tp->rack.reord = 1; ··· 878 891 mib_idx = LINUX_MIB_TCPTSREORDER; 879 892 else if (tcp_is_reno(tp)) 880 893 mib_idx = LINUX_MIB_TCPRENOREORDER; 881 - else if (tcp_is_fack(tp)) 882 - mib_idx = LINUX_MIB_TCPFACKREORDER; 883 894 else 884 895 mib_idx = LINUX_MIB_TCPSACKREORDER; 885 896 ··· 955 970 * 3. Loss detection event of two flavors: 956 971 * A. Scoreboard estimator decided the packet is lost. 957 972 * A'. Reno "three dupacks" marks head of queue lost. 958 - * A''. Its FACK modification, head until snd.fack is lost. 959 973 * B. SACK arrives sacking SND.NXT at the moment, when the 960 974 * segment was retransmitted. 961 975 * 4. D-SACK added new rule: D-SACK changes any tag to S. ··· 1232 1248 fack_count += pcount; 1233 1249 1234 1250 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ 1235 - if (!tcp_is_fack(tp) && tp->lost_skb_hint && 1251 + if (tp->lost_skb_hint && 1236 1252 before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) 1237 1253 tp->lost_cnt_hint += pcount; 1238 1254 ··· 2035 2051 * counter when SACK is enabled (without SACK, sacked_out is used for 2036 2052 * that purpose). 
2037 2053 * 2038 - * Instead, with FACK TCP uses fackets_out that includes both SACKed 2039 - * segments up to the highest received SACK block so far and holes in 2040 - * between them. 2041 - * 2042 2054 * With reordering, holes may still be in flight, so RFC3517 recovery 2043 2055 * uses pure sacked_out (total number of SACKed segments) even though 2044 2056 * it violates the RFC that uses duplicate ACKs, often these are equal ··· 2044 2064 */ 2045 2065 static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) 2046 2066 { 2047 - return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; 2067 + return tp->sacked_out + 1; 2048 2068 } 2049 2069 2050 - /* Linux NewReno/SACK/FACK/ECN state machine. 2070 + /* Linux NewReno/SACK/ECN state machine. 2051 2071 * -------------------------------------- 2052 2072 * 2053 2073 * "Open" Normal state, no dubious events, fast path. ··· 2112 2132 * dynamically measured and adjusted. This is implemented in 2113 2133 * tcp_rack_mark_lost. 2114 2134 * 2115 - * FACK (Disabled by default. Subsumbed by RACK): 2116 - * It is the simplest heuristics. As soon as we decided 2117 - * that something is lost, we decide that _all_ not SACKed 2118 - * packets until the most forward SACK are lost. I.e. 2119 - * lost_out = fackets_out - sacked_out and left_out = fackets_out. 2120 - * It is absolutely correct estimate, if network does not reorder 2121 - * packets. And it loses any connection to reality when reordering 2122 - * takes place. We use FACK by default until reordering 2123 - * is suspected on the path to this destination. 2124 - * 2125 2135 * If the receiver does not support SACK: 2126 2136 * 2127 2137 * NewReno (RFC6582): in Recovery we assume that one segment ··· 2160 2190 } 2161 2191 2162 2192 /* Detect loss in event "A" above by marking head of queue up as lost. 
2163 - * For FACK or non-SACK(Reno) senders, the first "packets" number of segments 2193 + * For non-SACK(Reno) senders, the first "packets" number of segments 2164 2194 * are considered lost. For RFC3517 SACK, a segment is considered lost if it 2165 2195 * has at least tp->reordering SACKed seqments above it; "packets" refers to 2166 2196 * the maximum SACKed segments to pass before reaching this limit. ··· 2196 2226 break; 2197 2227 2198 2228 oldcnt = cnt; 2199 - if (tcp_is_fack(tp) || tcp_is_reno(tp) || 2229 + if (tcp_is_reno(tp) || 2200 2230 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 2201 2231 cnt += tcp_skb_pcount(skb); 2202 2232 2203 2233 if (cnt > packets) { 2204 - if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || 2234 + if (tcp_is_sack(tp) || 2205 2235 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || 2206 2236 (oldcnt >= packets)) 2207 2237 break; ··· 2232 2262 2233 2263 if (tcp_is_reno(tp)) { 2234 2264 tcp_mark_head_lost(sk, 1, 1); 2235 - } else if (tcp_is_fack(tp)) { 2236 - int lost = tp->fackets_out - tp->reordering; 2237 - if (lost <= 0) 2238 - lost = 1; 2239 - tcp_mark_head_lost(sk, lost, 0); 2240 2265 } else { 2241 2266 int sacked_upto = tp->sacked_out - tp->reordering; 2242 2267 if (sacked_upto >= 0) ··· 3164 3199 if (reord < prior_fackets && reord <= tp->fackets_out) 3165 3200 tcp_update_reordering(sk, tp->fackets_out - reord, 0); 3166 3201 3167 - delta = tcp_is_fack(tp) ? pkts_acked : 3168 - prior_sacked - tp->sacked_out; 3202 + delta = prior_sacked - tp->sacked_out; 3169 3203 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); 3170 3204 } 3171 3205 ··· 5671 5707 } else { 5672 5708 tp->tcp_header_len = sizeof(struct tcphdr); 5673 5709 } 5674 - 5675 - if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_fack) 5676 - tcp_enable_fack(tp); 5677 5710 5678 5711 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 5679 5712 tcp_initialize_rcv_mss(sk);
+1 -3
net/ipv4/tcp_metrics.c
··· 470 470 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 471 471 } 472 472 val = tcp_metric_get(tm, TCP_METRIC_REORDERING); 473 - if (val && tp->reordering != val) { 474 - tcp_disable_fack(tp); 473 + if (val && tp->reordering != val) 475 474 tp->reordering = val; 476 - } 477 475 478 476 crtt = tcp_metric_get(tm, TCP_METRIC_RTT); 479 477 rcu_read_unlock();
+1 -4
net/ipv4/tcp_minisocks.c
··· 509 509 keepalive_time_when(newtp)); 510 510 511 511 newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; 512 - if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { 513 - if (sock_net(sk)->ipv4.sysctl_tcp_fack) 514 - tcp_enable_fack(newtp); 515 - } 512 + newtp->rx_opt.sack_ok = ireq->sack_ok; 516 513 newtp->window_clamp = req->rsk_window_clamp; 517 514 newtp->rcv_ssthresh = req->rsk_rcv_wnd; 518 515 newtp->rcv_wnd = req->rsk_rcv_wnd;
+1 -4
net/ipv4/tcp_output.c
··· 1257 1257 1258 1258 if (tp->lost_skb_hint && 1259 1259 before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && 1260 - (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))) 1260 + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 1261 1261 tp->lost_cnt_hint -= decr; 1262 1262 1263 1263 tcp_verify_left_out(tp); ··· 2961 2961 * retransmitted data is acknowledged. It tries to continue 2962 2962 * resending the rest of the retransmit queue, until either 2963 2963 * we've sent it all or the congestion window limit is reached. 2964 - * If doing SACK, the first ACK which comes back for a timeout 2965 - * based retransmit packet might feed us FACK information again. 2966 - * If so, we use it to avoid unnecessarily retransmissions. 2967 2964 */ 2968 2965 void tcp_xmit_retransmit_queue(struct sock *sk) 2969 2966 {