Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'tcp_cubic-various-fixes'

Eric Dumazet says:

====================
tcp_cubic: various fixes

This patch series converts tcp_cubic to usec clock resolution
for Hystart logic.

This makes Hystart more relevant for data-center flows.
Prior to this series, Hystart was either not kicking in, or was
kicking in without good reason, since the 1 ms clock resolution was too coarse.

Last patch also fixes an issue with Hystart vs TCP pacing.

v2: removed a last-minute debug chunk from last patch
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+51 -31
net/ipv4/tcp_cubic.c
··· 40 40 41 41 /* Number of delay samples for detecting the increase of delay */ 42 42 #define HYSTART_MIN_SAMPLES 8 43 - #define HYSTART_DELAY_MIN (4U<<3) 44 - #define HYSTART_DELAY_MAX (16U<<3) 43 + #define HYSTART_DELAY_MIN (4000U) /* 4 ms */ 44 + #define HYSTART_DELAY_MAX (16000U) /* 16 ms */ 45 45 #define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) 46 46 47 47 static int fast_convergence __read_mostly = 1; ··· 53 53 static int hystart __read_mostly = 1; 54 54 static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; 55 55 static int hystart_low_window __read_mostly = 16; 56 - static int hystart_ack_delta __read_mostly = 2; 56 + static int hystart_ack_delta_us __read_mostly = 2000; 57 57 58 58 static u32 cube_rtt_scale __read_mostly; 59 59 static u32 beta_scale __read_mostly; ··· 77 77 " 1: packet-train 2: delay 3: both packet-train and delay"); 78 78 module_param(hystart_low_window, int, 0644); 79 79 MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); 80 - module_param(hystart_ack_delta, int, 0644); 81 - MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)"); 80 + module_param(hystart_ack_delta_us, int, 0644); 81 + MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)"); 82 82 83 83 /* BIC TCP Parameters */ 84 84 struct bictcp { ··· 89 89 u32 bic_origin_point;/* origin point of bic function */ 90 90 u32 bic_K; /* time to origin point 91 91 from the beginning of the current epoch */ 92 - u32 delay_min; /* min delay (msec << 3) */ 92 + u32 delay_min; /* min delay (usec) */ 93 93 u32 epoch_start; /* beginning of an epoch */ 94 94 u32 ack_cnt; /* number of acks */ 95 95 u32 tcp_cwnd; /* estimated tcp cwnd */ ··· 117 117 ca->found = 0; 118 118 } 119 119 120 - static inline u32 bictcp_clock(void) 120 + static inline u32 bictcp_clock_us(const struct sock *sk) 121 121 { 122 - #if HZ < 1000 123 - return 
ktime_to_ms(ktime_get_real()); 124 - #else 125 - return jiffies_to_msecs(jiffies); 126 - #endif 122 + return tcp_sk(sk)->tcp_mstamp; 127 123 } 128 124 129 125 static inline void bictcp_hystart_reset(struct sock *sk) ··· 127 131 struct tcp_sock *tp = tcp_sk(sk); 128 132 struct bictcp *ca = inet_csk_ca(sk); 129 133 130 - ca->round_start = ca->last_ack = bictcp_clock(); 134 + ca->round_start = ca->last_ack = bictcp_clock_us(sk); 131 135 ca->end_seq = tp->snd_nxt; 132 - ca->curr_rtt = 0; 136 + ca->curr_rtt = ~0U; 133 137 ca->sample_cnt = 0; 134 138 } 135 139 ··· 272 276 */ 273 277 274 278 t = (s32)(tcp_jiffies32 - ca->epoch_start); 275 - t += msecs_to_jiffies(ca->delay_min >> 3); 279 + t += usecs_to_jiffies(ca->delay_min); 276 280 /* change the unit from HZ to bictcp_HZ */ 277 281 t <<= BICTCP_HZ; 278 282 do_div(t, HZ); ··· 376 380 { 377 381 struct tcp_sock *tp = tcp_sk(sk); 378 382 struct bictcp *ca = inet_csk_ca(sk); 379 - 380 - if (ca->found & hystart_detect) 381 - return; 383 + u32 threshold; 382 384 383 385 if (hystart_detect & HYSTART_ACK_TRAIN) { 384 - u32 now = bictcp_clock(); 386 + u32 now = bictcp_clock_us(sk); 385 387 386 388 /* first detection parameter - ack-train detection */ 387 - if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { 389 + if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) { 388 390 ca->last_ack = now; 389 - if ((s32)(now - ca->round_start) > ca->delay_min >> 4) { 390 - ca->found |= HYSTART_ACK_TRAIN; 391 + 392 + threshold = ca->delay_min; 393 + /* Hystart ack train triggers if we get ack past 394 + * ca->delay_min/2. 395 + * Pacing might have delayed packets up to RTT/2 396 + * during slow start. 
397 + */ 398 + if (sk->sk_pacing_status == SK_PACING_NONE) 399 + threshold >>= 1; 400 + 401 + if ((s32)(now - ca->round_start) > threshold) { 402 + ca->found = 1; 391 403 NET_INC_STATS(sock_net(sk), 392 404 LINUX_MIB_TCPHYSTARTTRAINDETECT); 393 405 NET_ADD_STATS(sock_net(sk), ··· 409 405 if (hystart_detect & HYSTART_DELAY) { 410 406 /* obtain the minimum delay of more than sampling packets */ 411 407 if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { 412 - if (ca->curr_rtt == 0 || ca->curr_rtt > delay) 408 + if (ca->curr_rtt > delay) 413 409 ca->curr_rtt = delay; 414 410 415 411 ca->sample_cnt++; 416 412 } else { 417 413 if (ca->curr_rtt > ca->delay_min + 418 414 HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { 419 - ca->found |= HYSTART_DELAY; 415 + ca->found = 1; 420 416 NET_INC_STATS(sock_net(sk), 421 417 LINUX_MIB_TCPHYSTARTDELAYDETECT); 422 418 NET_ADD_STATS(sock_net(sk), ··· 428 424 } 429 425 } 430 426 431 - /* Track delayed acknowledgment ratio using sliding window 432 - * ratio = (15*ratio + sample) / 16 433 - */ 434 427 static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) 435 428 { 436 429 const struct tcp_sock *tp = tcp_sk(sk); ··· 442 441 if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ) 443 442 return; 444 443 445 - delay = (sample->rtt_us << 3) / USEC_PER_MSEC; 444 + delay = sample->rtt_us; 446 445 if (delay == 0) 447 446 delay = 1; 448 447 449 448 /* first time call or link delay decreases */ 450 - if (ca->delay_min == 0 || ca->delay_min > delay) 451 - ca->delay_min = delay; 449 + if (ca->delay_min == 0 || ca->delay_min > delay) { 450 + unsigned long rate = READ_ONCE(sk->sk_pacing_rate); 451 + 452 + /* Account for TSO/GRO delays. 453 + * Otherwise short RTT flows could get too small ssthresh, 454 + * since during slow start we begin with small TSO packets 455 + * and could lower ca->delay_min too much. 
456 + * Ideally even with a very small RTT we would like to have 457 + * at least one TSO packet being sent and received by GRO, 458 + * and another one in qdisc layer. 459 + * We apply another 100% factor because @rate is doubled at 460 + * this point. 461 + * We cap the cushion to 1ms. 462 + */ 463 + if (rate) 464 + delay += min_t(u64, USEC_PER_MSEC, 465 + div64_ul((u64)GSO_MAX_SIZE * 466 + 4 * USEC_PER_SEC, rate)); 467 + if (ca->delay_min == 0 || ca->delay_min > delay) 468 + ca->delay_min = delay; 469 + } 452 470 453 471 /* hystart triggers when cwnd is larger than some threshold */ 454 - if (hystart && tcp_in_slow_start(tp) && 472 + if (!ca->found && hystart && tcp_in_slow_start(tp) && 455 473 tp->snd_cwnd >= hystart_low_window) 456 474 hystart_update(sk, delay); 457 475 }