Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: accecn: AccECN option failure handling

The AccECN option may fail in various ways; handle these cases:
- Attempt to negotiate the use of AccECN on the 1st retransmitted SYN
- From the 2nd retransmitted SYN, stop AccECN negotiation
- Remove option from SYN/ACK rexmits to handle blackholes
- If no option arrives in SYN/ACK, assume Option is not usable
- If an option arrives later, re-enable it
- If option is zeroed, disable AccECN option processing

This patch uses existing padding bits in tcp_request_sock and
holes in tcp_sock without increasing their size.

Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250916082434.100722-9-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

authored by

Chia-Yu Chang and committed by
Paolo Abeni
b40671b5 aa55a7dd

+111 -9
+3 -1
include/linux/tcp.h
··· 173 173 u8 syn_ect_snt: 2, 174 174 syn_ect_rcv: 2, 175 175 accecn_fail_mode:4; 176 + u8 saw_accecn_opt :2; 176 177 #ifdef CONFIG_TCP_AO 177 178 u8 ao_keyid; 178 179 u8 ao_rcv_next; ··· 408 407 syn_fastopen_child:1; /* created TFO passive child socket */ 409 408 410 409 u8 keepalive_probes; /* num of allowed keep alive probes */ 411 - u8 accecn_fail_mode:4; /* AccECN failure handling */ 410 + u8 accecn_fail_mode:4, /* AccECN failure handling */ 411 + saw_accecn_opt:2; /* An AccECN option was seen */ 412 412 u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ 413 413 414 414 /* RTT measurement */
+48 -3
include/net/tcp_ecn.h
··· 91 91 tp->accecn_fail_mode |= mode; 92 92 } 93 93 94 + #define TCP_ACCECN_OPT_NOT_SEEN 0x0 95 + #define TCP_ACCECN_OPT_EMPTY_SEEN 0x1 96 + #define TCP_ACCECN_OPT_COUNTER_SEEN 0x2 97 + #define TCP_ACCECN_OPT_FAIL_SEEN 0x3 98 + 94 99 static inline u8 tcp_accecn_ace(const struct tcphdr *th) 95 100 { 96 101 return (th->ae << 2) | (th->cwr << 1) | th->ece; ··· 149 144 } 150 145 151 146 return true; 147 + } 148 + 149 + static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp, 150 + u8 saw_opt) 151 + { 152 + tp->saw_accecn_opt = saw_opt; 153 + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) 154 + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); 152 155 } 153 156 154 157 /* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */ ··· 441 428 } 442 429 } 443 430 431 + static inline u8 tcp_accecn_option_init(const struct sk_buff *skb, 432 + u8 opt_offset) 433 + { 434 + u8 *ptr = skb_transport_header(skb) + opt_offset; 435 + unsigned int optlen = ptr[1] - 2; 436 + 437 + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) 438 + return TCP_ACCECN_OPT_FAIL_SEEN; 439 + ptr += 2; 440 + 441 + /* Detect option zeroing: an AccECN connection "MAY check that the 442 + * initial value of the EE0B field or the EE1B field is non-zero" 443 + */ 444 + if (optlen < TCPOLEN_ACCECN_PERFIELD) 445 + return TCP_ACCECN_OPT_EMPTY_SEEN; 446 + if (get_unaligned_be24(ptr) == 0) 447 + return TCP_ACCECN_OPT_FAIL_SEEN; 448 + if (optlen < TCPOLEN_ACCECN_PERFIELD * 3) 449 + return TCP_ACCECN_OPT_COUNTER_SEEN; 450 + ptr += TCPOLEN_ACCECN_PERFIELD * 2; 451 + if (get_unaligned_be24(ptr) == 0) 452 + return TCP_ACCECN_OPT_FAIL_SEEN; 453 + 454 + return TCP_ACCECN_OPT_COUNTER_SEEN; 455 + } 456 + 444 457 /* See Table 2 of the AccECN draft */ 445 - static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, 446 - u8 ip_dsfield) 458 + static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, 459 + const struct 
tcphdr *th, u8 ip_dsfield) 447 460 { 448 461 struct tcp_sock *tp = tcp_sk(sk); 449 462 u8 ace = tcp_accecn_ace(th); ··· 508 469 default: 509 470 tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); 510 471 tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; 511 - tp->accecn_opt_demand = 2; 472 + if (tp->rx_opt.accecn && 473 + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { 474 + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); 475 + 476 + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); 477 + tp->accecn_opt_demand = 2; 478 + } 512 479 if (INET_ECN_is_ce(ip_dsfield) && 513 480 tcp_accecn_validate_syn_feedback(sk, ace, 514 481 tp->syn_ect_snt)) {
+2
include/uapi/linux/tcp.h
··· 323 323 __u32 tcpi_received_e1_bytes; 324 324 __u32 tcpi_received_e0_bytes; 325 325 __u32 tcpi_received_ce_bytes; 326 + __u16 tcpi_accecn_fail_mode; 327 + __u16 tcpi_accecn_opt_seen; 326 328 }; 327 329 328 330 /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
+3
net/ipv4/tcp.c
··· 3409 3409 tp->delivered = 0; 3410 3410 tp->delivered_ce = 0; 3411 3411 tp->accecn_fail_mode = 0; 3412 + tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; 3412 3413 tcp_accecn_init_counters(tp); 3413 3414 tp->prev_ecnfield = 0; 3414 3415 tp->accecn_opt_tstamp = 0; ··· 4288 4287 if (tp->rto_stamp) 4289 4288 info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; 4290 4289 4290 + info->tcpi_accecn_fail_mode = tp->accecn_fail_mode; 4291 + info->tcpi_accecn_opt_seen = tp->saw_accecn_opt; 4291 4292 info->tcpi_received_ce = tp->received_ce; 4292 4293 info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; 4293 4294 info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx];
+33 -2
net/ipv4/tcp_input.c
··· 398 398 unsigned int i; 399 399 u8 *ptr; 400 400 401 + if (tcp_accecn_opt_fail_recv(tp)) 402 + return false; 403 + 401 404 if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { 405 + if (!tp->saw_accecn_opt) { 406 + /* Too late to enable after this point due to 407 + * potential counter wraps 408 + */ 409 + if (tp->bytes_sent >= (1 << 23) - 1) { 410 + u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN; 411 + 412 + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); 413 + } 414 + return false; 415 + } 416 + 402 417 if (estimate_ecnfield) { 403 418 u8 ecnfield = estimate_ecnfield - 1; 404 419 ··· 429 414 return false; 430 415 order1 = (ptr[0] == TCPOPT_ACCECN1); 431 416 ptr += 2; 417 + 418 + if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { 419 + tp->saw_accecn_opt = tcp_accecn_option_init(skb, 420 + tp->rx_opt.accecn); 421 + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) 422 + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); 423 + } 432 424 433 425 res = !!estimate_ecnfield; 434 426 for (i = 0; i < 3; i++) { ··· 6145 6123 if (th->syn) { 6146 6124 if (tcp_ecn_mode_accecn(tp)) { 6147 6125 accecn_reflector = true; 6148 - tcp_accecn_opt_demand_min(sk, 1); 6126 + if (tp->rx_opt.accecn && 6127 + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { 6128 + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); 6129 + 6130 + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); 6131 + tcp_accecn_opt_demand_min(sk, 1); 6132 + } 6149 6133 } 6150 6134 if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && 6151 6135 TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && ··· 6634 6606 */ 6635 6607 6636 6608 if (tcp_ecn_mode_any(tp)) 6637 - tcp_ecn_rcv_synack(sk, th, TCP_SKB_CB(skb)->ip_dsfield); 6609 + tcp_ecn_rcv_synack(sk, skb, th, 6610 + TCP_SKB_CB(skb)->ip_dsfield); 6638 6611 6639 6612 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 6640 6613 tcp_try_undo_spurious_syn(sk); ··· 7206 7177 tcp_rsk(req)->snt_tsval_first = 0; 7207 7178 tcp_rsk(req)->last_oow_ack_time = 0; 7208 7179 
tcp_rsk(req)->accecn_ok = 0; 7180 + tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; 7181 + tcp_rsk(req)->accecn_fail_mode = 0; 7209 7182 tcp_rsk(req)->syn_ect_rcv = 0; 7210 7183 tcp_rsk(req)->syn_ect_snt = 0; 7211 7184 req->mss = rx_opt->mss_clamp;
+14
net/ipv4/tcp_minisocks.c
··· 463 463 tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); 464 464 tp->syn_ect_snt = treq->syn_ect_snt; 465 465 tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); 466 + tp->saw_accecn_opt = treq->saw_accecn_opt; 466 467 tp->prev_ecnfield = treq->syn_ect_rcv; 467 468 tp->accecn_opt_demand = 1; 468 469 tcp_ecn_received_counters_payload(sk, skb); ··· 679 678 bool own_req; 680 679 681 680 tmp_opt.saw_tstamp = 0; 681 + tmp_opt.accecn = 0; 682 682 if (th->doff > (sizeof(struct tcphdr)>>2)) { 683 683 tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); 684 684 ··· 856 854 */ 857 855 if (!(flg & TCP_FLAG_ACK)) 858 856 return NULL; 857 + 858 + if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn && 859 + tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { 860 + u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn); 861 + 862 + tcp_rsk(req)->saw_accecn_opt = saw_opt; 863 + if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) { 864 + u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV; 865 + 866 + tcp_rsk(req)->accecn_fail_mode |= fail_mode; 867 + } 868 + } 859 869 860 870 /* For Fast Open no more processing is needed (sk is the 861 871 * child socket).
+8 -3
net/ipv4/tcp_output.c
··· 985 985 } 986 986 } 987 987 988 - /* Simultaneous open SYN/ACK needs AccECN option but not SYN */ 988 + /* Simultaneous open SYN/ACK needs AccECN option but not SYN. 989 + * It is attempted to negotiate the use of AccECN also on the first 990 + * retransmitted SYN, as mentioned in "3.1.4.1. Retransmitted SYNs" 991 + * of AccECN draft. 992 + */ 989 993 if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && 990 994 tcp_ecn_mode_accecn(tp) && 995 + inet_csk(sk)->icsk_retransmits < 2 && 991 996 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && 992 997 remaining >= TCPOLEN_ACCECN_BASE)) { 993 998 opts->use_synack_ecn_bytes = 1; ··· 1081 1076 1082 1077 if (treq->accecn_ok && 1083 1078 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && 1084 - remaining >= TCPOLEN_ACCECN_BASE) { 1079 + req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) { 1085 1080 opts->use_synack_ecn_bytes = 1; 1086 1081 remaining -= tcp_options_fit_accecn(opts, 0, remaining); 1087 1082 } ··· 1161 1156 if (tcp_ecn_mode_accecn(tp)) { 1162 1157 int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); 1163 1158 1164 - if (ecn_opt && 1159 + if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) && 1165 1160 (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || 1166 1161 tcp_accecn_option_beacon_check(sk))) { 1167 1162 opts->use_synack_ecn_bytes = 0;