Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: add SACK compression

When TCP receives an out-of-order packet, it immediately sends
a SACK packet, generating network load but also forcing the
sender to send 1-MSS pathological packets, increasing its
RTX queue length/depth, and thus processing time.

Wifi networks suffer from this aggressive behavior, but generally
speaking, all these SACK packets add fuel to the fire when networks
are under congestion.

This patch adds a high resolution timer and tp->compressed_ack counter.

Instead of sending a SACK, we program this timer with a small delay,
based on RTT and capped to 1 ms :

delay = min ( 5 % of RTT, 1 ms)

If subsequent SACKs need to be sent while the timer has not yet
expired, we simply increment tp->compressed_ack.

When timer expires, a SACK is sent with the latest information.
Whenever an ACK is sent (if data is sent, or if in-order
data is received) timer is canceled.

Note that tcp_sack_new_ofo_skb() is able to force a SACK to be sent
if the sack blocks need to be shuffled, even if the timer has not
expired.

A new SNMP counter is added in the following patch.

Two other patches add sysctls to allow changing the 1,000,000 and 44
values that this commit hard-coded.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Toke Høiland-Jørgensen <toke@toke.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Eric Dumazet and committed by
David S. Miller
5d9f4262 a3893637

+68 -7
+2
include/linux/tcp.h
··· 218 218 reord:1; /* reordering detected */ 219 219 } rack; 220 220 u16 advmss; /* Advertised MSS */ 221 + u8 compressed_ack; 221 222 u32 chrono_start; /* Start time in jiffies of a TCP chrono */ 222 223 u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ 223 224 u8 chrono_type:2, /* current chronograph type */ ··· 298 297 u32 sacked_out; /* SACK'd packets */ 299 298 300 299 struct hrtimer pacing_timer; 300 + struct hrtimer compressed_ack_timer; 301 301 302 302 /* from STCP, retrans queue hinting */ 303 303 struct sk_buff* lost_skb_hint;
+3
include/net/tcp.h
··· 561 561 if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1) 562 562 __sock_put(sk); 563 563 564 + if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1) 565 + __sock_put(sk); 566 + 564 567 inet_csk_clear_xmit_timers(sk); 565 568 } 566 569
+1
net/ipv4/tcp.c
··· 2595 2595 dst_release(sk->sk_rx_dst); 2596 2596 sk->sk_rx_dst = NULL; 2597 2597 tcp_saved_syn_free(tp); 2598 + tp->compressed_ack = 0; 2598 2599 2599 2600 /* Clean up fastopen related fields */ 2600 2601 tcp_free_fastopen_req(tp);
+30 -7
net/ipv4/tcp_input.c
··· 4249 4249 * If the sack array is full, forget about the last one. 4250 4250 */ 4251 4251 if (this_sack >= TCP_NUM_SACKS) { 4252 + if (tp->compressed_ack) 4253 + tcp_send_ack(sk); 4252 4254 this_sack--; 4253 4255 tp->rx_opt.num_sacks--; 4254 4256 sp--; ··· 5083 5081 static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) 5084 5082 { 5085 5083 struct tcp_sock *tp = tcp_sk(sk); 5084 + unsigned long rtt, delay; 5086 5085 5087 5086 /* More than one full frame received... */ 5088 5087 if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ··· 5095 5092 (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || 5096 5093 __tcp_select_window(sk) >= tp->rcv_wnd)) || 5097 5094 /* We ACK each frame or... */ 5098 - tcp_in_quickack_mode(sk) || 5099 - /* We have out of order data. */ 5100 - (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) { 5101 - /* Then ack it now */ 5095 + tcp_in_quickack_mode(sk)) { 5096 + send_now: 5102 5097 tcp_send_ack(sk); 5103 - } else { 5104 - /* Else, send delayed ack. */ 5105 - tcp_send_delayed_ack(sk); 5098 + return; 5106 5099 } 5100 + 5101 + if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) { 5102 + tcp_send_delayed_ack(sk); 5103 + return; 5104 + } 5105 + 5106 + if (!tcp_is_sack(tp) || tp->compressed_ack >= 44) 5107 + goto send_now; 5108 + tp->compressed_ack++; 5109 + 5110 + if (hrtimer_is_queued(&tp->compressed_ack_timer)) 5111 + return; 5112 + 5113 + /* compress ack timer : 5 % of rtt, but no more than 1 ms */ 5114 + 5115 + rtt = tp->rcv_rtt_est.rtt_us; 5116 + if (tp->srtt_us && tp->srtt_us < rtt) 5117 + rtt = tp->srtt_us; 5118 + 5119 + delay = min_t(unsigned long, NSEC_PER_MSEC, 5120 + rtt * (NSEC_PER_USEC >> 3)/20); 5121 + sock_hold(sk); 5122 + hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), 5123 + HRTIMER_MODE_REL_PINNED_SOFT); 5107 5124 } 5108 5125 5109 5126 static inline void tcp_ack_snd_check(struct sock *sk)
+7
net/ipv4/tcp_output.c
··· 162 162 /* Account for an ACK we sent. */ 163 163 static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) 164 164 { 165 + struct tcp_sock *tp = tcp_sk(sk); 166 + 167 + if (unlikely(tp->compressed_ack)) { 168 + tp->compressed_ack = 0; 169 + if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) 170 + __sock_put(sk); 171 + } 165 172 tcp_dec_quickack_mode(sk, pkts); 166 173 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); 167 174 }
+25
net/ipv4/tcp_timer.c
··· 708 708 sock_put(sk); 709 709 } 710 710 711 + static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) 712 + { 713 + struct tcp_sock *tp = container_of(timer, struct tcp_sock, compressed_ack_timer); 714 + struct sock *sk = (struct sock *)tp; 715 + 716 + bh_lock_sock(sk); 717 + if (!sock_owned_by_user(sk)) { 718 + if (tp->compressed_ack) 719 + tcp_send_ack(sk); 720 + } else { 721 + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, 722 + &sk->sk_tsq_flags)) 723 + sock_hold(sk); 724 + } 725 + bh_unlock_sock(sk); 726 + 727 + sock_put(sk); 728 + 729 + return HRTIMER_NORESTART; 730 + } 731 + 711 732 void tcp_init_xmit_timers(struct sock *sk) 712 733 { 713 734 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, ··· 736 715 hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC, 737 716 HRTIMER_MODE_ABS_PINNED_SOFT); 738 717 tcp_sk(sk)->pacing_timer.function = tcp_pace_kick; 718 + 719 + hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC, 720 + HRTIMER_MODE_REL_PINNED_SOFT); 721 + tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick; 739 722 }