Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'tcp-remove-obsolete-rfc3517-rfc6675-code'

Neal Cardwell says:

====================
tcp: remove obsolete RFC3517/RFC6675 code

RACK-TLP loss detection has been enabled as the default loss detection
algorithm for Linux TCP since 2018, in:

commit b38a51fec1c1 ("tcp: disable RFC6675 loss detection")

In case users ran into unexpected bugs or performance regressions,
that commit allowed Linux system administrators to revert to using
RFC3517/RFC6675 loss recovery by setting net.ipv4.tcp_recovery to 0.

In the seven years since 2018, our team has not heard reports of
anyone reverting Linux TCP to use RFC3517/RFC6675 loss recovery, and
we can't find any record in web searches of such a revert.

RACK-TLP was published as a standards-track RFC, RFC8985, in February
2021.

Several other major TCP implementations have default-enabled RACK-TLP
at this point as well.

RACK-TLP offers several significant performance advantages over
RFC3517/RFC6675 loss recovery, including much better performance in
the common cases of tail drops, lost retransmissions, and reordering.

It is now time to remove the obsolete and unused RFC3517/RFC6675 loss
recovery code. This will allow a substantial simplification of the
Linux TCP code base, and removes 12 bytes of state in every tcp_sock
for 64-bit machines (8 bytes on 32-bit machines).

To keep each commit at a reasonable size, this patch series is split
into 3 commits:

(1) Removes the core RFC3517/RFC6675 logic.

(2) Removes the RFC3517/RFC6675 hint state and the first layer of logic
that updated it.

(3) Removes the emptied-out tcp_clear_retrans_hints_partial() helper function
and all of its call sites.
====================

Link: https://patch.msgid.link/20250615001435.2390793-1-ncardwell.sw@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+16 -170
+5 -3
Documentation/networking/ip-sysctl.rst
··· 645 645 features. 646 646 647 647 ========= ============================================================= 648 - RACK: 0x1 enables the RACK loss detection for fast detection of lost 649 - retransmissions and tail drops. It also subsumes and disables 650 - RFC6675 recovery for SACK connections. 648 + RACK: 0x1 enables RACK loss detection, for fast detection of lost 649 + retransmissions and tail drops, and resilience to 650 + reordering. currently, setting this bit to 0 has no 651 + effect, since RACK is the only supported loss detection 652 + algorithm. 651 653 652 654 RACK: 0x2 makes RACK's reordering window static (min_rtt/4). 653 655
-2
Documentation/networking/net_cachelines/tcp_sock.rst
··· 115 115 u32 sacked_out read_mostly read_mostly tcp_left_out(tx);tcp_packets_in_flight(tx/rx);tcp_clean_rtx_queue(rx) 116 116 struct hrtimer pacing_timer 117 117 struct hrtimer compressed_ack_timer 118 - struct sk_buff* lost_skb_hint read_mostly tcp_clean_rtx_queue 119 118 struct sk_buff* retransmit_skb_hint read_mostly tcp_clean_rtx_queue 120 119 struct rb_root out_of_order_queue read_mostly tcp_data_queue,tcp_fast_path_check 121 120 struct sk_buff* ooo_last_skb ··· 122 123 struct tcp_sack_block[4] selective_acks 123 124 struct tcp_sack_block[4] recv_sack_cache 124 125 struct sk_buff* highest_sack read_write tcp_event_new_data_sent 125 - int lost_cnt_hint 126 126 u32 prior_ssthresh 127 127 u32 high_seq 128 128 u32 retrans_stamp
-3
include/linux/tcp.h
··· 208 208 u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ 209 209 u16 gso_segs; /* Max number of segs per GSO packet */ 210 210 /* from STCP, retrans queue hinting */ 211 - struct sk_buff *lost_skb_hint; 212 211 struct sk_buff *retransmit_skb_hint; 213 212 __cacheline_group_end(tcp_sock_read_tx); 214 213 ··· 417 418 struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ 418 419 419 420 struct tcp_sack_block recv_sack_cache[4]; 420 - 421 - int lost_cnt_hint; 422 421 423 422 u32 prior_ssthresh; /* ssthresh saved at recovery start */ 424 423 u32 high_seq; /* snd_nxt at onset of congestion */
-6
include/net/tcp.h
··· 1811 1811 } 1812 1812 1813 1813 /* from STCP */ 1814 - static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) 1815 - { 1816 - tp->lost_skb_hint = NULL; 1817 - } 1818 - 1819 1814 static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) 1820 1815 { 1821 - tcp_clear_retrans_hints_partial(tp); 1822 1816 tp->retransmit_skb_hint = NULL; 1823 1817 } 1824 1818
+1 -2
net/ipv4/tcp.c
··· 5053 5053 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering); 5054 5054 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); 5055 5055 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); 5056 - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, lost_skb_hint); 5057 5056 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); 5058 - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 40); 5057 + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 32); 5059 5058 5060 5059 /* TXRX read-mostly hotpath cache lines */ 5061 5060 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset);
+10 -148
net/ipv4/tcp_input.c
··· 1451 1451 tp->sacked_out += pcount; 1452 1452 /* Out-of-order packets delivered */ 1453 1453 state->sack_delivered += pcount; 1454 - 1455 - /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ 1456 - if (tp->lost_skb_hint && 1457 - before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) 1458 - tp->lost_cnt_hint += pcount; 1459 1454 } 1460 1455 1461 1456 /* D-SACK. We can detect redundant retransmission in S|R and plain R ··· 1491 1496 tcp_skb_timestamp_us(skb)); 1492 1497 tcp_rate_skb_delivered(sk, skb, state->rate); 1493 1498 1494 - if (skb == tp->lost_skb_hint) 1495 - tp->lost_cnt_hint += pcount; 1496 - 1497 1499 TCP_SKB_CB(prev)->end_seq += shifted; 1498 1500 TCP_SKB_CB(skb)->seq += shifted; 1499 1501 ··· 1523 1531 1524 1532 if (skb == tp->retransmit_skb_hint) 1525 1533 tp->retransmit_skb_hint = prev; 1526 - if (skb == tp->lost_skb_hint) { 1527 - tp->lost_skb_hint = prev; 1528 - tp->lost_cnt_hint -= tcp_skb_pcount(prev); 1529 - } 1530 1534 1531 1535 TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1532 1536 TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor; ··· 2139 2151 tp->undo_retrans = -1; 2140 2152 } 2141 2153 2142 - static bool tcp_is_rack(const struct sock *sk) 2143 - { 2144 - return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & 2145 - TCP_RACK_LOSS_DETECTION; 2146 - } 2147 - 2148 2154 /* If we detect SACK reneging, forget all SACK information 2149 2155 * and reset tags completely, otherwise preserve SACKs. If receiver 2150 2156 * dropped its ofo queue, we will know this due to reneging detection. 
··· 2164 2182 skb_rbtree_walk_from(skb) { 2165 2183 if (is_reneg) 2166 2184 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; 2167 - else if (tcp_is_rack(sk) && skb != head && 2168 - tcp_rack_skb_timeout(tp, skb, 0) > 0) 2185 + else if (skb != head && tcp_rack_skb_timeout(tp, skb, 0) > 0) 2169 2186 continue; /* Don't mark recently sent ones lost yet */ 2170 2187 tcp_mark_skb_lost(sk, skb); 2171 2188 } ··· 2245 2264 return false; 2246 2265 } 2247 2266 2248 - /* Heurestics to calculate number of duplicate ACKs. There's no dupACKs 2249 - * counter when SACK is enabled (without SACK, sacked_out is used for 2250 - * that purpose). 2251 - * 2252 - * With reordering, holes may still be in flight, so RFC3517 recovery 2253 - * uses pure sacked_out (total number of SACKed segments) even though 2254 - * it violates the RFC that uses duplicate ACKs, often these are equal 2255 - * but when e.g. out-of-window ACKs or packet duplication occurs, 2256 - * they differ. Since neither occurs due to loss, TCP should really 2257 - * ignore them. 2258 - */ 2259 - static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) 2260 - { 2261 - return tp->sacked_out + 1; 2262 - } 2263 - 2264 2267 /* Linux NewReno/SACK/ECN state machine. 2265 2268 * -------------------------------------- 2266 2269 * ··· 2297 2332 * 2298 2333 * If the receiver supports SACK: 2299 2334 * 2300 - * RFC6675/3517: It is the conventional algorithm. A packet is 2301 - * considered lost if the number of higher sequence packets 2302 - * SACKed is greater than or equal the DUPACK thoreshold 2303 - * (reordering). This is implemented in tcp_mark_head_lost and 2304 - * tcp_update_scoreboard. 2305 - * 2306 - * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm 2335 + * RACK (RFC8985): RACK is a newer loss detection algorithm 2307 2336 * (2017-) that checks timing instead of counting DUPACKs. 
2308 2337 * Essentially a packet is considered lost if it's not S/ACKed 2309 2338 * after RTT + reordering_window, where both metrics are ··· 2312 2353 * is lost (NewReno). This heuristics are the same in NewReno 2313 2354 * and SACK. 2314 2355 * 2315 - * Really tricky (and requiring careful tuning) part of algorithm 2316 - * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). 2356 + * The really tricky (and requiring careful tuning) part of the algorithm 2357 + * is hidden in the RACK code in tcp_recovery.c and tcp_xmit_retransmit_queue(). 2317 2358 * The first determines the moment _when_ we should reduce CWND and, 2318 2359 * hence, slow down forward transmission. In fact, it determines the moment 2319 2360 * when we decide that hole is caused by loss, rather than by a reorder. ··· 2340 2381 { 2341 2382 struct tcp_sock *tp = tcp_sk(sk); 2342 2383 2343 - /* Trick#1: The loss is proven. */ 2344 - if (tp->lost_out) 2345 - return true; 2346 - 2347 - /* Not-A-Trick#2 : Classic rule... */ 2348 - if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering) 2349 - return true; 2350 - 2351 - return false; 2352 - } 2353 - 2354 - /* Detect loss in event "A" above by marking head of queue up as lost. 2355 - * For RFC3517 SACK, a segment is considered lost if it 2356 - * has at least tp->reordering SACKed seqments above it; "packets" refers to 2357 - * the maximum SACKed segments to pass before reaching this limit. 2358 - */ 2359 - static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) 2360 - { 2361 - struct tcp_sock *tp = tcp_sk(sk); 2362 - struct sk_buff *skb; 2363 - int cnt; 2364 - /* Use SACK to deduce losses of new sequences sent during recovery */ 2365 - const u32 loss_high = tp->snd_nxt; 2366 - 2367 - WARN_ON(packets > tp->packets_out); 2368 - skb = tp->lost_skb_hint; 2369 - if (skb) { 2370 - /* Head already handled? 
*/ 2371 - if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una)) 2372 - return; 2373 - cnt = tp->lost_cnt_hint; 2374 - } else { 2375 - skb = tcp_rtx_queue_head(sk); 2376 - cnt = 0; 2377 - } 2378 - 2379 - skb_rbtree_walk_from(skb) { 2380 - /* TODO: do this better */ 2381 - /* this is not the most efficient way to do this... */ 2382 - tp->lost_skb_hint = skb; 2383 - tp->lost_cnt_hint = cnt; 2384 - 2385 - if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) 2386 - break; 2387 - 2388 - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) 2389 - cnt += tcp_skb_pcount(skb); 2390 - 2391 - if (cnt > packets) 2392 - break; 2393 - 2394 - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) 2395 - tcp_mark_skb_lost(sk, skb); 2396 - 2397 - if (mark_head) 2398 - break; 2399 - } 2400 - tcp_verify_left_out(tp); 2401 - } 2402 - 2403 - /* Account newly detected lost packet(s) */ 2404 - 2405 - static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) 2406 - { 2407 - struct tcp_sock *tp = tcp_sk(sk); 2408 - 2409 - if (tcp_is_sack(tp)) { 2410 - int sacked_upto = tp->sacked_out - tp->reordering; 2411 - if (sacked_upto >= 0) 2412 - tcp_mark_head_lost(sk, sacked_upto, 0); 2413 - else if (fast_rexmit) 2414 - tcp_mark_head_lost(sk, 1, 1); 2415 - } 2384 + /* Has loss detection marked at least one packet lost? */ 2385 + return tp->lost_out != 0; 2416 2386 } 2417 2387 2418 2388 static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when) ··· 2769 2881 tcp_mark_skb_lost(sk, skb); 2770 2882 } 2771 2883 2772 - tcp_clear_retrans_hints_partial(tp); 2773 - 2774 2884 if (!tp->lost_out) 2775 2885 return; 2776 2886 ··· 2876 2990 *rexmit = REXMIT_LOST; 2877 2991 } 2878 2992 2879 - static bool tcp_force_fast_retransmit(struct sock *sk) 2880 - { 2881 - struct tcp_sock *tp = tcp_sk(sk); 2882 - 2883 - return after(tcp_highest_sack_seq(tp), 2884 - tp->snd_una + tp->reordering * tp->mss_cache); 2885 - } 2886 - 2887 2993 /* Undo during fast recovery after partial ACK. 
*/ 2888 - static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una, 2889 - bool *do_lost) 2994 + static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) 2890 2995 { 2891 2996 struct tcp_sock *tp = tcp_sk(sk); 2892 2997 ··· 2902 3025 tcp_undo_cwnd_reduction(sk, true); 2903 3026 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); 2904 3027 tcp_try_keep_open(sk); 2905 - } else { 2906 - /* Partial ACK arrived. Force fast retransmit. */ 2907 - *do_lost = tcp_force_fast_retransmit(sk); 2908 3028 } 2909 3029 return false; 2910 3030 } ··· 2915 3041 2916 3042 if (unlikely(tcp_is_reno(tp))) { 2917 3043 tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED); 2918 - } else if (tcp_is_rack(sk)) { 3044 + } else { 2919 3045 u32 prior_retrans = tp->retrans_out; 2920 3046 2921 3047 if (tcp_rack_mark_lost(sk)) ··· 2942 3068 { 2943 3069 struct inet_connection_sock *icsk = inet_csk(sk); 2944 3070 struct tcp_sock *tp = tcp_sk(sk); 2945 - int fast_rexmit = 0, flag = *ack_flag; 3071 + int flag = *ack_flag; 2946 3072 bool ece_ack = flag & FLAG_ECE; 2947 - bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && 2948 - tcp_force_fast_retransmit(sk)); 2949 3073 2950 3074 if (!tp->packets_out && tp->sacked_out) 2951 3075 tp->sacked_out = 0; ··· 2992 3120 if (!(flag & FLAG_SND_UNA_ADVANCED)) { 2993 3121 if (tcp_is_reno(tp)) 2994 3122 tcp_add_reno_sack(sk, num_dupack, ece_ack); 2995 - } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost)) 3123 + } else if (tcp_try_undo_partial(sk, prior_snd_una)) 2996 3124 return; 2997 3125 2998 3126 if (tcp_try_undo_dsack(sk)) ··· 3047 3175 3048 3176 /* Otherwise enter Recovery state */ 3049 3177 tcp_enter_recovery(sk, ece_ack); 3050 - fast_rexmit = 1; 3051 3178 } 3052 3179 3053 - if (!tcp_is_rack(sk) && do_lost) 3054 - tcp_update_scoreboard(sk, fast_rexmit); 3055 3180 *rexmit = REXMIT_LOST; 3056 3181 } 3057 3182 ··· 3304 3435 next = skb_rb_next(skb); 3305 3436 if (unlikely(skb == tp->retransmit_skb_hint)) 3306 3437 
tp->retransmit_skb_hint = NULL; 3307 - if (unlikely(skb == tp->lost_skb_hint)) 3308 - tp->lost_skb_hint = NULL; 3309 3438 tcp_highest_sack_replace(sk, skb, next); 3310 3439 tcp_rtx_queue_unlink_and_free(skb, sk); 3311 3440 } ··· 3361 3494 if (flag & FLAG_RETRANS_DATA_ACKED) 3362 3495 flag &= ~FLAG_ORIG_SACK_ACKED; 3363 3496 } else { 3364 - int delta; 3365 - 3366 3497 /* Non-retransmitted hole got filled? That's reordering */ 3367 3498 if (before(reord, prior_fack)) 3368 3499 tcp_check_sack_reordering(sk, reord, 0); 3369 - 3370 - delta = prior_sacked - tp->sacked_out; 3371 - tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); 3372 3500 } 3373 3501 } else if (skb && rtt_update && sack_rtt_us >= 0 && 3374 3502 sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
-6
net/ipv4/tcp_output.c
··· 1554 1554 if (tcp_is_reno(tp) && decr > 0) 1555 1555 tp->sacked_out -= min_t(u32, tp->sacked_out, decr); 1556 1556 1557 - if (tp->lost_skb_hint && 1558 - before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && 1559 - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 1560 - tp->lost_cnt_hint -= decr; 1561 - 1562 1557 tcp_verify_left_out(tp); 1563 1558 } 1564 1559 ··· 3247 3252 TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor; 3248 3253 3249 3254 /* changed transmit queue under us so clear hints */ 3250 - tcp_clear_retrans_hints_partial(tp); 3251 3255 if (next_skb == tp->retransmit_skb_hint) 3252 3256 tp->retransmit_skb_hint = skb; 3253 3257