Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: accecn: AccECN option send control

Instead of sending the option in every ACK, limit sending to
those ACKs where the option is necessary:
- Handshake
- "Change-triggered ACK" + the ACK following it. The
2nd ACK is necessary to unambiguously indicate which
of the ECN byte counters is increasing. The first
ACK has two counters increasing due to the ecnfield
edge.
- ACKs with CE to allow CEP delta validations to take
advantage of the option.
- Force option to be sent at least once per 2^22
bytes. The check is done using the bit edges of the
byte counters (avoids need for extra variables).
- AccECN option beacon to send a few times per RTT even if
nothing in the ECN state requires that. The default is 3
times per RTT, and its period can be set via
sysctl_tcp_ecn_option_beacon.

Below are the pahole outcomes before and after this patch,
in which the group size of tcp_sock_write_tx is increased
from 89 to 97 due to the new u64 accecn_opt_tstamp member:

[BEFORE THIS PATCH]
struct tcp_sock {
[...]
u64 tcp_wstamp_ns; /* 2488 8 */
struct list_head tsorted_sent_queue; /* 2496 16 */

[...]
__cacheline_group_end__tcp_sock_write_tx[0]; /* 2521 0 */
__cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */
u8 nonagle:4; /* 2521: 0 1 */
u8 rate_app_limited:1; /* 2521: 4 1 */
/* XXX 3 bits hole, try to pack */

/* Force alignment to the next boundary: */
u8 :0;
u8 received_ce_pending:4;/* 2522: 0 1 */
u8 unused2:4; /* 2522: 4 1 */
u8 accecn_minlen:2; /* 2523: 0 1 */
u8 est_ecnfield:2; /* 2523: 2 1 */
u8 unused3:4; /* 2523: 4 1 */

[...]
__cacheline_group_end__tcp_sock_write_txrx[0]; /* 2628 0 */

[...]
/* size: 3200, cachelines: 50, members: 171 */
}

[AFTER THIS PATCH]
struct tcp_sock {
[...]
u64 tcp_wstamp_ns; /* 2488 8 */
u64 accecn_opt_tstamp; /* 2496 8 */
struct list_head tsorted_sent_queue; /* 2504 16 */

[...]
__cacheline_group_end__tcp_sock_write_tx[0]; /* 2529 0 */
__cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2529 0 */
u8 nonagle:4; /* 2529: 0 1 */
u8 rate_app_limited:1; /* 2529: 4 1 */
/* XXX 3 bits hole, try to pack */

/* Force alignment to the next boundary: */
u8 :0;
u8 received_ce_pending:4;/* 2530: 0 1 */
u8 unused2:4; /* 2530: 4 1 */
u8 accecn_minlen:2; /* 2531: 0 1 */
u8 est_ecnfield:2; /* 2531: 2 1 */
u8 accecn_opt_demand:2; /* 2531: 4 1 */
u8 prev_ecnfield:2; /* 2531: 6 1 */

[...]
__cacheline_group_end__tcp_sock_write_txrx[0]; /* 2636 0 */

[...]
/* size: 3200, cachelines: 50, members: 173 */
}

Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Co-developed-by: Ilpo Järvinen <ij@kernel.org>
Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250916082434.100722-8-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

authored by

Chia-Yu Chang and committed by
Paolo Abeni
aa55a7dd b5e74132

+107 -9
+6
Documentation/networking/ip-sysctl.rst
··· 487 487 488 488 Default: 2 489 489 490 + tcp_ecn_option_beacon - INTEGER 491 + Control Accurate ECN (AccECN) option sending frequency per RTT and it 492 + takes effect only when tcp_ecn_option is set to 2. 493 + 494 + Default: 3 (AccECN will be send at least 3 times per RTT) 495 + 490 496 tcp_ecn_fallback - BOOLEAN 491 497 If the kernel detects that ECN connection misbehaves, enable fall 492 498 back to non-ECN. Currently, this knob implements the fallback
+3
Documentation/networking/net_cachelines/tcp_sock.rst
··· 109 109 u8:2 syn_ect_rcv read_mostly read_write 110 110 u8:2 accecn_minlen write_mostly read_write 111 111 u8:2 est_ecnfield read_write 112 + u8:2 accecn_opt_demand read_mostly read_write 113 + u8:2 prev_ecnfield read_write 114 + u64 accecn_opt_tstamp read_write 112 115 u8:4 accecn_fail_mode 113 116 u32 lost read_mostly tcp_ack 114 117 u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx)
+3 -1
include/linux/tcp.h
··· 275 275 u32 mdev_us; /* medium deviation */ 276 276 u32 rtt_seq; /* sequence number to update rttvar */ 277 277 u64 tcp_wstamp_ns; /* departure time for next sent data packet */ 278 + u64 accecn_opt_tstamp; /* Last AccECN option sent timestamp */ 278 279 struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ 279 280 struct sk_buff *highest_sack; /* skb just after the highest 280 281 * skb with SACKed bit set ··· 297 296 unused2:4; 298 297 u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ 299 298 est_ecnfield:2,/* ECN field for AccECN delivered estimates */ 300 - unused3:4; 299 + accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */ 300 + prev_ecnfield:2; /* ECN bits from the previous segment */ 301 301 __be32 pred_flags; 302 302 u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ 303 303 u64 tcp_mstamp; /* most recent packet received/sent */
+1
include/net/netns/ipv4.h
··· 149 149 150 150 u8 sysctl_tcp_ecn; 151 151 u8 sysctl_tcp_ecn_option; 152 + u8 sysctl_tcp_ecn_option_beacon; 152 153 u8 sysctl_tcp_ecn_fallback; 153 154 154 155 u8 sysctl_ip_default_ttl;
+3
include/net/tcp.h
··· 100 100 /* Maximal number of window scale according to RFC1323 */ 101 101 #define TCP_MAX_WSCALE 14U 102 102 103 + /* Default sending frequency of accurate ECN option per RTT */ 104 + #define TCP_ACCECN_OPTION_BEACON 3 105 + 103 106 /* urg_data states */ 104 107 #define TCP_URG_VALID 0x0100 105 108 #define TCP_URG_NOTYET 0x0200
+52
include/net/tcp_ecn.h
··· 176 176 } 177 177 } 178 178 179 + /* Demand the minimum # to send AccECN optnio */ 180 + static inline void tcp_accecn_opt_demand_min(struct sock *sk, 181 + u8 opt_demand_min) 182 + { 183 + struct tcp_sock *tp = tcp_sk(sk); 184 + u8 opt_demand; 185 + 186 + opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand); 187 + tp->accecn_opt_demand = opt_demand; 188 + } 189 + 179 190 /* Maps IP ECN field ECT/CE code point to AccECN option field number, given 180 191 * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). 181 192 */ ··· 267 256 u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; 268 257 u8 is_ce = INET_ECN_is_ce(ecnfield); 269 258 struct tcp_sock *tp = tcp_sk(sk); 259 + bool ecn_edge; 270 260 271 261 if (!INET_ECN_is_not_ect(ecnfield)) { 272 262 u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); ··· 286 274 287 275 if (len > 0) { 288 276 u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); 277 + u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1]; 278 + u32 bytes_mask = GENMASK_U32(31, 22); 279 + 289 280 tp->received_ecn_bytes[ecnfield - 1] += len; 290 281 tp->accecn_minlen = max_t(u8, tp->accecn_minlen, 291 282 minlen); 283 + 284 + /* Send AccECN option at least once per 2^22-byte 285 + * increase in any ECN byte counter. 286 + */ 287 + if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) & 288 + bytes_mask) { 289 + tcp_accecn_opt_demand_min(sk, 1); 290 + } 291 + } 292 + } 293 + 294 + ecn_edge = tp->prev_ecnfield != ecnfield; 295 + if (ecn_edge || is_ce) { 296 + tp->prev_ecnfield = ecnfield; 297 + /* Demand Accurate ECN change-triggered ACKs. Two ACK are 298 + * demanded to indicate unambiguously the ecnfield value 299 + * in the latter ACK. 
300 + */ 301 + if (tcp_ecn_mode_accecn(tp)) { 302 + if (ecn_edge) 303 + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; 304 + tp->accecn_opt_demand = 2; 292 305 } 293 306 } 294 307 } ··· 386 349 __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); 387 350 __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); 388 351 tp->accecn_minlen = 0; 352 + tp->accecn_opt_demand = 0; 389 353 tp->est_ecnfield = 0; 390 354 } 391 355 ··· 469 431 default: 470 432 tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); 471 433 tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; 434 + tp->accecn_opt_demand = 2; 472 435 if (INET_ECN_is_ce(ip_dsfield) && 473 436 tcp_accecn_validate_syn_feedback(sk, ace, 474 437 tp->syn_ect_snt)) { ··· 490 451 } else { 491 452 tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & 492 453 INET_ECN_MASK; 454 + tp->prev_ecnfield = tp->syn_ect_rcv; 493 455 tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); 494 456 } 495 457 } ··· 580 540 tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv); 581 541 else if (inet_rsk(req)->ecn_ok) 582 542 th->ece = 1; 543 + } 544 + 545 + static inline bool tcp_accecn_option_beacon_check(const struct sock *sk) 546 + { 547 + u32 ecn_beacon = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon); 548 + const struct tcp_sock *tp = tcp_sk(sk); 549 + 550 + if (!ecn_beacon) 551 + return false; 552 + 553 + return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * ecn_beacon >= 554 + (tp->srtt_us >> 3); 583 555 } 584 556 585 557 #endif /* _LINUX_TCP_ECN_H */
+9
net/ipv4/sysctl_net_ipv4.c
··· 741 741 .extra2 = SYSCTL_TWO, 742 742 }, 743 743 { 744 + .procname = "tcp_ecn_option_beacon", 745 + .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon, 746 + .maxlen = sizeof(u8), 747 + .mode = 0644, 748 + .proc_handler = proc_dou8vec_minmax, 749 + .extra1 = SYSCTL_ZERO, 750 + .extra2 = SYSCTL_THREE, 751 + }, 752 + { 744 753 .procname = "tcp_ecn_fallback", 745 754 .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, 746 755 .maxlen = sizeof(u8),
+4 -1
net/ipv4/tcp.c
··· 3410 3410 tp->delivered_ce = 0; 3411 3411 tp->accecn_fail_mode = 0; 3412 3412 tcp_accecn_init_counters(tp); 3413 + tp->prev_ecnfield = 0; 3414 + tp->accecn_opt_tstamp = 0; 3413 3415 if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) 3414 3416 icsk->icsk_ca_ops->release(sk); 3415 3417 memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); ··· 5136 5134 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); 5137 5135 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); 5138 5136 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); 5137 + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp); 5139 5138 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); 5140 5139 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); 5141 5140 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); 5142 5141 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); 5143 - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89); 5142 + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 97); 5144 5143 5145 5144 /* TXRX read-write hotpath cache lines */ 5146 5145 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
+3 -1
net/ipv4/tcp_input.c
··· 6121 6121 * RFC 5961 4.2 : Send a challenge ack 6122 6122 */ 6123 6123 if (th->syn) { 6124 - if (tcp_ecn_mode_accecn(tp)) 6124 + if (tcp_ecn_mode_accecn(tp)) { 6125 6125 accecn_reflector = true; 6126 + tcp_accecn_opt_demand_min(sk, 1); 6127 + } 6126 6128 if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && 6127 6129 TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && 6128 6130 TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt &&
+1
net/ipv4/tcp_ipv4.c
··· 3562 3562 { 3563 3563 net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; 3564 3564 net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; 3565 + net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; 3565 3566 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3566 3567 3567 3568 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
+2
net/ipv4/tcp_minisocks.c
··· 463 463 tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); 464 464 tp->syn_ect_snt = treq->syn_ect_snt; 465 465 tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); 466 + tp->prev_ecnfield = treq->syn_ect_rcv; 467 + tp->accecn_opt_demand = 1; 466 468 tcp_ecn_received_counters_payload(sk, skb); 467 469 } else { 468 470 tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
+20 -6
net/ipv4/tcp_output.c
··· 705 705 *ptr++ = htonl(((e0b & 0xffffff) << 8) | 706 706 TCPOPT_NOP); 707 707 } 708 - if (tp) 708 + if (tp) { 709 709 tp->accecn_minlen = 0; 710 + tp->accecn_opt_tstamp = tp->tcp_mstamp; 711 + if (tp->accecn_opt_demand) 712 + tp->accecn_opt_demand--; 713 + } 710 714 } 711 715 712 716 if (unlikely(OPTION_SACK_ADVERTISE & options)) { ··· 1153 1149 opts->num_sack_blocks = 0; 1154 1150 } 1155 1151 1156 - if (tcp_ecn_mode_accecn(tp) && 1157 - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) { 1158 - opts->use_synack_ecn_bytes = 0; 1159 - size += tcp_options_fit_accecn(opts, tp->accecn_minlen, 1160 - MAX_TCP_OPTION_SPACE - size); 1152 + if (tcp_ecn_mode_accecn(tp)) { 1153 + int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); 1154 + 1155 + if (ecn_opt && 1156 + (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || 1157 + tcp_accecn_option_beacon_check(sk))) { 1158 + opts->use_synack_ecn_bytes = 0; 1159 + size += tcp_options_fit_accecn(opts, tp->accecn_minlen, 1160 + MAX_TCP_OPTION_SPACE - size); 1161 + } 1161 1162 } 1162 1163 1163 1164 if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, ··· 2872 2863 sent_pkts = 0; 2873 2864 2874 2865 tcp_mstamp_refresh(tp); 2866 + 2867 + /* AccECN option beacon depends on mstamp, it may change mss */ 2868 + if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk)) 2869 + mss_now = tcp_current_mss(sk); 2870 + 2875 2871 if (!push_one) { 2876 2872 /* Do MTU probing. */ 2877 2873 result = tcp_mtu_probe(sk);