Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dccp ccid-2: Perform congestion-window validation

CCID-2's cwnd increases like TCP during slow-start, which has implications for
* the local Sequence Window value (should be > cwnd),
* the Ack Ratio value.
Hence exponential growth that does not reflect the actual network
conditions can quickly lead to instability.

This patch adds congestion-window validation (RFC2861) to CCID-2:
* cwnd is constrained if the sender is application limited;
* cwnd is reduced after a long idle period, as suggested in the '90 paper
by Van Jacobson and in RFC 2581 (sec. 4.1);
* cwnd is never reduced below the RFC 3390 initial window.

As marked in the comments, the code is actually almost a direct copy of the
TCP congestion-window-validation algorithms. By continuing this work, it may
in future be possible to use the TCP code (not possible at the moment).

The mechanism can be turned off using the ccid2_do_cwv module parameter. Sampling
of the currently-used window (moving maximum) is nevertheless always performed;
it is used to determine the expected window, which can be exploited to regulate
DCCP's Sequence Window value.

This patch also sets slow-start-after-idle (RFC 4341, 5.1), i.e. it behaves like
TCP when net.ipv4.tcp_slow_start_after_idle = 1.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>

+91 -3
+81 -3
net/dccp/ccids/ccid2.c
··· 153 153 sock_put(sk); 154 154 } 155 155 156 + /* 157 + * Congestion window validation (RFC 2861). 158 + */ 159 + static int ccid2_do_cwv = 1; 160 + module_param(ccid2_do_cwv, bool, 0644); 161 + MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation"); 162 + 163 + /** 164 + * ccid2_update_used_window - Track how much of cwnd is actually used 165 + * This is done in addition to CWV. The sender needs to have an idea of how many 166 + * packets may be in flight, to set the local Sequence Window value accordingly 167 + * (RFC 4340, 7.5.2). The CWV mechanism is exploited to keep track of the 168 + * maximum-used window. We use an EWMA low-pass filter to filter out noise. 169 + */ 170 + static void ccid2_update_used_window(struct ccid2_hc_tx_sock *hc, u32 new_wnd) 171 + { 172 + hc->tx_expected_wnd = (3 * hc->tx_expected_wnd + new_wnd) / 4; 173 + } 174 + 175 + /* This borrows the code of tcp_cwnd_application_limited() */ 176 + static void ccid2_cwnd_application_limited(struct sock *sk, const u32 now) 177 + { 178 + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 179 + /* don't reduce cwnd below the initial window (IW) */ 180 + u32 init_win = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache), 181 + win_used = max(hc->tx_cwnd_used, init_win); 182 + 183 + if (win_used < hc->tx_cwnd) { 184 + hc->tx_ssthresh = max(hc->tx_ssthresh, 185 + (hc->tx_cwnd >> 1) + (hc->tx_cwnd >> 2)); 186 + hc->tx_cwnd = (hc->tx_cwnd + win_used) >> 1; 187 + } 188 + hc->tx_cwnd_used = 0; 189 + hc->tx_cwnd_stamp = now; 190 + } 191 + 192 + /* This borrows the code of tcp_cwnd_restart() */ 193 + static void ccid2_cwnd_restart(struct sock *sk, const u32 now) 194 + { 195 + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 196 + u32 cwnd = hc->tx_cwnd, restart_cwnd, 197 + iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache); 198 + 199 + hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2)); 200 + 201 + /* don't reduce cwnd below the initial window (IW) */ 
202 + restart_cwnd = min(cwnd, iwnd); 203 + cwnd >>= (now - hc->tx_lsndtime) / hc->tx_rto; 204 + hc->tx_cwnd = max(cwnd, restart_cwnd); 205 + 206 + hc->tx_cwnd_stamp = now; 207 + hc->tx_cwnd_used = 0; 208 + } 209 + 156 210 static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) 157 211 { 158 212 struct dccp_sock *dp = dccp_sk(sk); 159 213 struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); 214 + const u32 now = ccid2_time_stamp; 160 215 struct ccid2_seq *next; 161 216 162 - hc->tx_pipe++; 217 + /* slow-start after idle periods (RFC 2581, RFC 2861) */ 218 + if (ccid2_do_cwv && !hc->tx_pipe && 219 + (s32)(now - hc->tx_lsndtime) >= hc->tx_rto) 220 + ccid2_cwnd_restart(sk, now); 221 + 222 + hc->tx_lsndtime = now; 223 + hc->tx_pipe += 1; 224 + 225 + /* see whether cwnd was fully used (RFC 2861), update expected window */ 226 + if (ccid2_cwnd_network_limited(hc)) { 227 + ccid2_update_used_window(hc, hc->tx_cwnd); 228 + hc->tx_cwnd_used = 0; 229 + hc->tx_cwnd_stamp = now; 230 + } else { 231 + if (hc->tx_pipe > hc->tx_cwnd_used) 232 + hc->tx_cwnd_used = hc->tx_pipe; 233 + 234 + ccid2_update_used_window(hc, hc->tx_cwnd_used); 235 + 236 + if (ccid2_do_cwv && (s32)(now - hc->tx_cwnd_stamp) >= hc->tx_rto) 237 + ccid2_cwnd_application_limited(sk, now); 238 + } 163 239 164 240 hc->tx_seqh->ccid2s_seq = dp->dccps_gss; 165 241 hc->tx_seqh->ccid2s_acked = 0; 166 - hc->tx_seqh->ccid2s_sent = ccid2_time_stamp; 242 + hc->tx_seqh->ccid2s_sent = now; 167 243 168 244 next = hc->tx_seqh->ccid2s_next; 169 245 /* check if we need to alloc more space */ ··· 670 594 671 595 /* Use larger initial windows (RFC 4341, section 5). */ 672 596 hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); 597 + hc->tx_expected_wnd = hc->tx_cwnd; 673 598 674 599 /* Make sure that Ack Ratio is enabled and within bounds. 
*/ 675 600 max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2); ··· 683 606 684 607 hc->tx_rto = DCCP_TIMEOUT_INIT; 685 608 hc->tx_rpdupack = -1; 686 - hc->tx_last_cong = ccid2_time_stamp; 609 + hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_time_stamp; 610 + hc->tx_cwnd_used = 0; 687 611 setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, 688 612 (unsigned long)sk); 689 613 INIT_LIST_HEAD(&hc->tx_av_chunks);
+10
net/dccp/ccids/ccid2.h
··· 53 53 * @tx_rttvar: moving average/maximum of @mdev_max 54 54 * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) 55 55 * @tx_rtt_seq: to decay RTTVAR at most once per flight 56 + * @tx_cwnd_used: actually used cwnd, W_used of RFC 2861 57 + * @tx_expected_wnd: moving average of @tx_cwnd_used 58 + * @tx_cwnd_stamp: to track idle periods in CWV 59 + * @tx_lsndtime: last time (in jiffies) a data packet was sent 56 60 * @tx_rpseq: last consecutive seqno 57 61 * @tx_rpdupack: dupacks since rpseq 58 62 * @tx_av_chunks: list of Ack Vectors received on current skb ··· 79 75 tx_rto; 80 76 u64 tx_rtt_seq:48; 81 77 struct timer_list tx_rtotimer; 78 + 79 + /* Congestion Window validation (optional, RFC 2861) */ 80 + u32 tx_cwnd_used, 81 + tx_expected_wnd, 82 + tx_cwnd_stamp, 83 + tx_lsndtime; 82 84 83 85 u64 tx_rpseq; 84 86 int tx_rpdupack;