Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tcp: TCP Fast Open Server - header & support functions

This patch adds all the necessary data structures and support
functions to implement the TFO server side. It also documents a number
of flags for the sysctl_tcp_fastopen knob, and adds a few Linux
extension MIBs.

In addition, it includes the following:

1. A new TCP_FASTOPEN socket option that an application must set to
supply the max backlog allowed in order to enable TFO on its listener.

2. A number of key data structures:
"fastopen_rsk" in tcp_sock - for a big socket to access its
request_sock for retransmission and ack processing purpose. It is
non-NULL iff 3WHS not completed.

"fastopenq" in request_sock_queue - points to a per Fast Open
listener data structure "fastopen_queue" to keep track of qlen (# of
outstanding Fast Open requests) and max_qlen, among other things.

"listener" in tcp_request_sock - to point to the original listener
for book-keeping purpose, i.e., to maintain qlen against max_qlen
as part of defense against IP spoofing attack.

3. Various data structures and functions, many in tcp_fastopen.c, to
support server-side Fast Open cookie operations, including
/proc/sys/net/ipv4/tcp_fastopen_key to allow manual rekeying.

Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Jerry Chu and committed by
David S. Miller
10467163 2a35cfa5

+275 -19
+21 -6
Documentation/networking/ip-sysctl.txt
··· 467 467 tcp_fastopen - INTEGER 468 468 Enable TCP Fast Open feature (draft-ietf-tcpm-fastopen) to send data 469 469 in the opening SYN packet. To use this feature, the client application 470 - must not use connect(). Instead, it should use sendmsg() or sendto() 471 - with MSG_FASTOPEN flag which performs a TCP handshake automatically. 470 + must use sendmsg() or sendto() with MSG_FASTOPEN flag rather than 471 + connect() to perform a TCP handshake automatically. 472 472 473 - The values (bitmap) are: 474 - 1: Enables sending data in the opening SYN on the client 475 - 5: Enables sending data in the opening SYN on the client regardless 476 - of cookie availability. 473 + The values (bitmap) are 474 + 1: Enables sending data in the opening SYN on the client. 475 + 2: Enables TCP Fast Open on the server side, i.e., allowing data in 476 + a SYN packet to be accepted and passed to the application before 477 + 3-way hand shake finishes. 478 + 4: Send data in the opening SYN regardless of cookie availability and 479 + without a cookie option. 480 + 0x100: Accept SYN data w/o validating the cookie. 481 + 0x200: Accept data-in-SYN w/o any cookie option present. 482 + 0x400/0x800: Enable Fast Open on all listeners regardless of the 483 + TCP_FASTOPEN socket option. The two different flags designate two 484 + different ways of setting max_qlen without the TCP_FASTOPEN socket 485 + option. 477 486 478 487 Default: 0 488 + 489 + Note that the client & server side Fast Open flags (1 and 2 490 + respectively) must be also enabled before the rest of flags can take 491 + effect. 492 + 493 + See include/net/tcp.h and the code for more details. 479 494 480 495 tcp_syn_retries - INTEGER 481 496 Number of times initial SYNs for an active TCP connection attempt
+4
include/linux/snmp.h
··· 241 241 LINUX_MIB_TCPCHALLENGEACK, /* TCPChallengeACK */ 242 242 LINUX_MIB_TCPSYNCHALLENGE, /* TCPSYNChallenge */ 243 243 LINUX_MIB_TCPFASTOPENACTIVE, /* TCPFastOpenActive */ 244 + LINUX_MIB_TCPFASTOPENPASSIVE, /* TCPFastOpenPassive*/ 245 + LINUX_MIB_TCPFASTOPENPASSIVEFAIL, /* TCPFastOpenPassiveFail */ 246 + LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */ 247 + LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ 244 248 __LINUX_MIB_MAX 245 249 }; 246 250
+42 -3
include/linux/tcp.h
··· 110 110 #define TCP_REPAIR_QUEUE 20 111 111 #define TCP_QUEUE_SEQ 21 112 112 #define TCP_REPAIR_OPTIONS 22 113 + #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ 113 114 114 115 struct tcp_repair_opt { 115 116 __u32 opt_code; ··· 247 246 /* TCP Fast Open */ 248 247 #define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */ 249 248 #define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */ 249 + #define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */ 250 250 251 251 /* TCP Fast Open Cookie as stored in memory */ 252 252 struct tcp_fastopen_cookie { ··· 314 312 /* Only used by TCP MD5 Signature so far. */ 315 313 const struct tcp_request_sock_ops *af_specific; 316 314 #endif 315 + struct sock *listener; /* needed for TFO */ 317 316 u32 rcv_isn; 318 317 u32 snt_isn; 319 318 u32 snt_synack; /* synack sent time */ 319 + u32 rcv_nxt; /* the ack # by SYNACK. For 320 + * FastOpen it's the seq# 321 + * after data-in-SYN. 322 + */ 320 323 }; 321 324 322 325 static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) ··· 512 505 struct tcp_md5sig_info __rcu *md5sig_info; 513 506 #endif 514 507 515 - /* TCP fastopen related information */ 516 - struct tcp_fastopen_request *fastopen_req; 517 - 518 508 /* When the cookie options are generated and exchanged, then this 519 509 * object holds a reference to them (cookie_values->kref). Also 520 510 * contains related tcp_cookie_transactions fields. 521 511 */ 522 512 struct tcp_cookie_values *cookie_values; 513 + 514 + /* TCP fastopen related information */ 515 + struct tcp_fastopen_request *fastopen_req; 516 + /* fastopen_rsk points to request_sock that resulted in this big 517 + * socket. Used to retransmit SYNACKs etc. 
518 + */ 519 + struct request_sock *fastopen_rsk; 523 520 }; 524 521 525 522 enum tsq_flags { ··· 561 550 static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) 562 551 { 563 552 return (struct tcp_timewait_sock *)sk; 553 + } 554 + 555 + static inline bool tcp_passive_fastopen(const struct sock *sk) 556 + { 557 + return (sk->sk_state == TCP_SYN_RECV && 558 + tcp_sk(sk)->fastopen_rsk != NULL); 559 + } 560 + 561 + static inline bool fastopen_cookie_present(struct tcp_fastopen_cookie *foc) 562 + { 563 + return foc->len != -1; 564 + } 565 + 566 + static inline int fastopen_init_queue(struct sock *sk, int backlog) 567 + { 568 + struct request_sock_queue *queue = 569 + &inet_csk(sk)->icsk_accept_queue; 570 + 571 + if (queue->fastopenq == NULL) { 572 + queue->fastopenq = kzalloc( 573 + sizeof(struct fastopen_queue), 574 + sk->sk_allocation); 575 + if (queue->fastopenq == NULL) 576 + return -ENOMEM; 577 + spin_lock_init(&queue->fastopenq->lock); 578 + } 579 + queue->fastopenq->max_qlen = backlog; 580 + return 0; 564 581 } 565 582 566 583 #endif /* __KERNEL__ */
+36
include/net/request_sock.h
··· 106 106 struct request_sock *syn_table[0]; 107 107 }; 108 108 109 + /* 110 + * For a TCP Fast Open listener - 111 + * lock - protects the access to all the reqsk, which is co-owned by 112 + * the listener and the child socket. 113 + * qlen - pending TFO requests (still in TCP_SYN_RECV). 114 + * max_qlen - max TFO reqs allowed before TFO is disabled. 115 + * 116 + * XXX (TFO) - ideally these fields can be made as part of "listen_sock" 117 + * structure above. But there is some implementation difficulty due to 118 + * listen_sock being part of request_sock_queue hence will be freed when 119 + * a listener is stopped. But TFO related fields may continue to be 120 + * accessed even after a listener is closed, until its sk_refcnt drops 121 + * to 0 implying no more outstanding TFO reqs. One solution is to keep 122 + * listen_opt around until sk_refcnt drops to 0. But there is some other 123 + * complexity that needs to be resolved. E.g., a listener can be disabled 124 + * temporarily through shutdown()->tcp_disconnect(), and re-enabled later. 125 + */ 126 + struct fastopen_queue { 127 + struct request_sock *rskq_rst_head; /* Keep track of past TFO */ 128 + struct request_sock *rskq_rst_tail; /* requests that caused RST. 129 + * This is part of the defense 130 + * against spoofing attack. 131 + */ 132 + spinlock_t lock; 133 + int qlen; /* # of pending (TCP_SYN_RECV) reqs */ 134 + int max_qlen; /* != 0 iff TFO is currently enabled */ 135 + }; 136 + 109 137 /** struct request_sock_queue - queue of request_socks 110 138 * 111 139 * @rskq_accept_head - FIFO head of established children ··· 157 129 u8 rskq_defer_accept; 158 130 /* 3 bytes hole, try to pack */ 159 131 struct listen_sock *listen_opt; 132 + struct fastopen_queue *fastopenq; /* This is non-NULL iff TFO has been 133 + * enabled on this listener. Check 134 + * max_qlen != 0 in fastopen_queue 135 + * to determine if TFO is enabled 136 + * right at this moment. 
137 + */ 160 138 }; 161 139 162 140 extern int reqsk_queue_alloc(struct request_sock_queue *queue, ··· 170 136 171 137 extern void __reqsk_queue_destroy(struct request_sock_queue *queue); 172 138 extern void reqsk_queue_destroy(struct request_sock_queue *queue); 139 + extern void reqsk_fastopen_remove(struct sock *sk, 140 + struct request_sock *req, bool reset); 173 141 174 142 static inline struct request_sock * 175 143 reqsk_queue_yank_acceptq(struct request_sock_queue *queue)
+39 -7
include/net/tcp.h
··· 224 224 225 225 /* Bit Flags for sysctl_tcp_fastopen */ 226 226 #define TFO_CLIENT_ENABLE 1 227 + #define TFO_SERVER_ENABLE 2 227 228 #define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */ 229 + 230 + /* Process SYN data but skip cookie validation */ 231 + #define TFO_SERVER_COOKIE_NOT_CHKED 0x100 232 + /* Accept SYN data w/o any cookie option */ 233 + #define TFO_SERVER_COOKIE_NOT_REQD 0x200 234 + 235 + /* Force enable TFO on all listeners, i.e., not requiring the 236 + * TCP_FASTOPEN socket option. SOCKOPT1/2 determine how to set max_qlen. 237 + */ 238 + #define TFO_SERVER_WO_SOCKOPT1 0x400 239 + #define TFO_SERVER_WO_SOCKOPT2 0x800 240 + /* Always create TFO child sockets on a TFO listener even when 241 + * cookie/data not present. (For testing purpose!) 242 + */ 243 + #define TFO_SERVER_ALWAYS 0x1000 228 244 229 245 extern struct inet_timewait_death_row tcp_death_row; 230 246 ··· 437 421 extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check); 438 422 extern bool tcp_remember_stamp(struct sock *sk); 439 423 extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw); 440 - extern void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, 441 - struct tcp_fastopen_cookie *cookie, 442 - int *syn_loss, unsigned long *last_syn_loss); 443 - extern void tcp_fastopen_cache_set(struct sock *sk, u16 mss, 444 - struct tcp_fastopen_cookie *cookie, 445 - bool syn_lost); 446 424 extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst); 447 425 extern void tcp_disable_fack(struct tcp_sock *tp); 448 426 extern void tcp_close(struct sock *sk, long timeout); ··· 547 537 extern void tcp_cwnd_application_limited(struct sock *sk); 548 538 extern void tcp_resume_early_retransmit(struct sock *sk); 549 539 extern void tcp_rearm_rto(struct sock *sk); 540 + extern void tcp_reset(struct sock *sk); 550 541 551 542 /* tcp_timer.c */ 552 543 extern void tcp_init_xmit_timers(struct sock *); ··· 597 586 extern int 
tcp_mss_to_mtu(struct sock *sk, int mss); 598 587 extern void tcp_mtup_init(struct sock *sk); 599 588 extern void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt); 589 + extern void tcp_init_buffer_space(struct sock *sk); 600 590 601 591 static inline void tcp_bound_rto(const struct sock *sk) 602 592 { ··· 1116 1104 req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ 1117 1105 req->cookie_ts = 0; 1118 1106 tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; 1107 + tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; 1119 1108 req->mss = rx_opt->mss_clamp; 1120 1109 req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; 1121 1110 ireq->tstamp_ok = rx_opt->tstamp_ok; ··· 1321 1308 extern int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, 1322 1309 const struct tcp_md5sig_key *key); 1323 1310 1311 + /* From tcp_fastopen.c */ 1312 + extern void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, 1313 + struct tcp_fastopen_cookie *cookie, 1314 + int *syn_loss, unsigned long *last_syn_loss); 1315 + extern void tcp_fastopen_cache_set(struct sock *sk, u16 mss, 1316 + struct tcp_fastopen_cookie *cookie, 1317 + bool syn_lost); 1324 1318 struct tcp_fastopen_request { 1325 1319 /* Fast Open cookie. 
Size 0 means a cookie request */ 1326 1320 struct tcp_fastopen_cookie cookie; 1327 1321 struct msghdr *data; /* data in MSG_FASTOPEN */ 1328 1322 u16 copied; /* queued in tcp_connect() */ 1329 1323 }; 1330 - 1331 1324 void tcp_free_fastopen_req(struct tcp_sock *tp); 1325 + 1326 + extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; 1327 + int tcp_fastopen_reset_cipher(void *key, unsigned int len); 1328 + void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc); 1329 + 1330 + #define TCP_FASTOPEN_KEY_LENGTH 16 1331 + 1332 + /* Fastopen key context */ 1333 + struct tcp_fastopen_context { 1334 + struct crypto_cipher __rcu *tfm; 1335 + __u8 key[TCP_FASTOPEN_KEY_LENGTH]; 1336 + struct rcu_head rcu; 1337 + }; 1332 1338 1333 1339 /* write queue abstraction */ 1334 1340 static inline void tcp_write_queue_purge(struct sock *sk)
+4
net/ipv4/proc.c
··· 263 263 SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK), 264 264 SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE), 265 265 SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE), 266 + SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE), 267 + SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL), 268 + SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), 269 + SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), 266 270 SNMP_MIB_SENTINEL 267 271 }; 268 272
+45
net/ipv4/sysctl_net_ipv4.c
··· 232 232 return 0; 233 233 } 234 234 235 + int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer, 236 + size_t *lenp, loff_t *ppos) 237 + { 238 + ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; 239 + struct tcp_fastopen_context *ctxt; 240 + int ret; 241 + u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */ 242 + 243 + tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); 244 + if (!tbl.data) 245 + return -ENOMEM; 246 + 247 + rcu_read_lock(); 248 + ctxt = rcu_dereference(tcp_fastopen_ctx); 249 + if (ctxt) 250 + memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); 251 + rcu_read_unlock(); 252 + 253 + snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x", 254 + user_key[0], user_key[1], user_key[2], user_key[3]); 255 + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); 256 + 257 + if (write && ret == 0) { 258 + if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1, 259 + user_key + 2, user_key + 3) != 4) { 260 + ret = -EINVAL; 261 + goto bad_key; 262 + } 263 + tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); 264 + } 265 + 266 + bad_key: 267 + pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", 268 + user_key[0], user_key[1], user_key[2], user_key[3], 269 + (char *)tbl.data, ret); 270 + kfree(tbl.data); 271 + return ret; 272 + } 273 + 235 274 static struct ctl_table ipv4_table[] = { 236 275 { 237 276 .procname = "tcp_timestamps", ··· 423 384 .maxlen = sizeof(int), 424 385 .mode = 0644, 425 386 .proc_handler = proc_dointvec, 387 + }, 388 + { 389 + .procname = "tcp_fastopen_key", 390 + .mode = 0600, 391 + .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), 392 + .proc_handler = proc_tcp_fastopen_key, 426 393 }, 427 394 { 428 395 .procname = "tcp_tw_recycle",
+82 -1
net/ipv4/tcp_fastopen.c
··· 1 + #include <linux/err.h> 1 2 #include <linux/init.h> 2 3 #include <linux/kernel.h> 4 + #include <linux/list.h> 5 + #include <linux/tcp.h> 6 + #include <linux/rcupdate.h> 7 + #include <linux/rculist.h> 8 + #include <net/inetpeer.h> 9 + #include <net/tcp.h> 3 10 4 - int sysctl_tcp_fastopen; 11 + int sysctl_tcp_fastopen __read_mostly; 12 + 13 + struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; 14 + 15 + static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); 16 + 17 + static void tcp_fastopen_ctx_free(struct rcu_head *head) 18 + { 19 + struct tcp_fastopen_context *ctx = 20 + container_of(head, struct tcp_fastopen_context, rcu); 21 + crypto_free_cipher(ctx->tfm); 22 + kfree(ctx); 23 + } 24 + 25 + int tcp_fastopen_reset_cipher(void *key, unsigned int len) 26 + { 27 + int err; 28 + struct tcp_fastopen_context *ctx, *octx; 29 + 30 + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 31 + if (!ctx) 32 + return -ENOMEM; 33 + ctx->tfm = crypto_alloc_cipher("aes", 0, 0); 34 + 35 + if (IS_ERR(ctx->tfm)) { 36 + err = PTR_ERR(ctx->tfm); 37 + error: kfree(ctx); 38 + pr_err("TCP: TFO aes cipher alloc error: %d\n", err); 39 + return err; 40 + } 41 + err = crypto_cipher_setkey(ctx->tfm, key, len); 42 + if (err) { 43 + pr_err("TCP: TFO cipher key error: %d\n", err); 44 + crypto_free_cipher(ctx->tfm); 45 + goto error; 46 + } 47 + memcpy(ctx->key, key, len); 48 + 49 + spin_lock(&tcp_fastopen_ctx_lock); 50 + 51 + octx = rcu_dereference_protected(tcp_fastopen_ctx, 52 + lockdep_is_held(&tcp_fastopen_ctx_lock)); 53 + rcu_assign_pointer(tcp_fastopen_ctx, ctx); 54 + spin_unlock(&tcp_fastopen_ctx_lock); 55 + 56 + if (octx) 57 + call_rcu(&octx->rcu, tcp_fastopen_ctx_free); 58 + return err; 59 + } 60 + 61 + /* Computes the fastopen cookie for the peer. 62 + * The peer address is a 128 bits long (pad with zeros for IPv4). 63 + * 64 + * The caller must check foc->len to determine if a valid cookie 65 + * has been generated successfully. 
66 + */ 67 + void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc) 68 + { 69 + __be32 peer_addr[4] = { addr, 0, 0, 0 }; 70 + struct tcp_fastopen_context *ctx; 71 + 72 + rcu_read_lock(); 73 + ctx = rcu_dereference(tcp_fastopen_ctx); 74 + if (ctx) { 75 + crypto_cipher_encrypt_one(ctx->tfm, 76 + foc->val, 77 + (__u8 *)peer_addr); 78 + foc->len = TCP_FASTOPEN_COOKIE_SIZE; 79 + } 80 + rcu_read_unlock(); 81 + } 5 82 6 83 static int __init tcp_fastopen_init(void) 7 84 { 85 + __u8 key[TCP_FASTOPEN_KEY_LENGTH]; 86 + 87 + get_random_bytes(key, sizeof(key)); 88 + tcp_fastopen_reset_cipher(key, sizeof(key)); 8 89 return 0; 9 90 } 10 91
+2 -2
net/ipv4/tcp_input.c
··· 378 378 /* 4. Try to fixup all. It is made immediately after connection enters 379 379 * established state. 380 380 */ 381 - static void tcp_init_buffer_space(struct sock *sk) 381 + void tcp_init_buffer_space(struct sock *sk) 382 382 { 383 383 struct tcp_sock *tp = tcp_sk(sk); 384 384 int maxwin; ··· 4038 4038 } 4039 4039 4040 4040 /* When we get a reset we do this. */ 4041 - static void tcp_reset(struct sock *sk) 4041 + void tcp_reset(struct sock *sk) 4042 4042 { 4043 4043 /* We want the right error as BSD sees it (and indeed as we do). */ 4044 4044 switch (sk->sk_state) {