// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <net/tcp.h>
#include <net/tcp_ecn.h>
#include <net/xfrm.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
#include <net/psp.h>

static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
	if (seq == s_win)
		return true;
	if (after(end_seq, s_win) && before(seq, e_win))
		return true;
	return seq == e_win && seq == end_seq;
}

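/* Worked example for the acceptance test above (illustrative, not part of
 * the original source): for a receive window covering [s_win, e_win) =
 * [120, 220):
 *
 *	tcp_in_window(100, 150, 120, 220) -> true  (segment overlaps window)
 *	tcp_in_window(120, 120, 120, 220) -> true  (zero-length probe at s_win)
 *	tcp_in_window( 60, 100, 120, 220) -> false (entirely left of window)
 */
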
static enum tcp_tw_status
tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
				  const struct sk_buff *skb, int mib_idx)
{
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

	if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
				  &tcptw->tw_last_oow_ack_time)) {
		/* Send ACK. Note, we do not put the bucket,
		 * it will be released by caller.
		 */
		return TCP_TW_ACK_OOW;
	}

	/* We are rate-limiting, so just release the tw sock and drop skb. */
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}

static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq,
				u32 rcv_nxt)
{
#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao;

	ao = rcu_dereference(tcptw->ao_info);
	if (unlikely(ao && seq < rcv_nxt))
		WRITE_ONCE(ao->rcv_sne, ao->rcv_sne + 1);
#endif
	WRITE_ONCE(tcptw->tw_rcv_nxt, seq);
}

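/* Worked example for the wrap detection above (illustrative, not part of
 * the original source): with the old rcv_nxt = 0xfffffff0 and a new
 * in-window seq = 0x00000010 that has just wrapped, seq < rcv_nxt holds as
 * unsigned 32-bit values, so the TCP-AO sequence number extension (SNE)
 * is bumped, keeping the 64-bit extended sequence space monotonic.
 */
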
/*
 * * The main purpose of the TIME-WAIT state is to close the connection
 *   gracefully, when one of the ends sits in LAST-ACK or CLOSING,
 *   retransmitting FIN (and, probably, a tail of data) and one or more
 *   of our ACKs are lost.
 * * What is the TIME-WAIT timeout? It is associated with maximal packet
 *   lifetime in the internet, which leads to the wrong conclusion that
 *   it is set to catch "old duplicate segments" wandering out of their path.
 *   It is not quite correct. This timeout is calculated so that it exceeds
 *   the maximal retransmission timeout by enough to allow the loss of one
 *   (or more) segments sent by the peer and of our ACKs. This time may be
 *   calculated from RTO.
 * * When a TIME-WAIT socket receives an RST, it means that the other end
 *   finally closed and we are allowed to kill TIME-WAIT too.
 * * The second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
 *   with this semantics, we MUST NOT kill the TIME-WAIT state with RSTs.
 * * If we invented some more clever way to catch duplicates
 *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
 * When you compare it to RFCs, please, read section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) the TW bucket
 * is _not_ stateless. This means that, strictly speaking, we must
 * spinlock it. I do not want to! Well, the probability of misbehaviour
 * is ridiculously low and, it seems, we could use some mb() tricks
 * to avoid misreading sequence numbers, states etc.  --ANK
 *
 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
 */
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
			   const struct tcphdr *th, u32 *tw_isn,
			   enum skb_drop_reason *drop_reason)
{
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
	u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt);
	struct tcp_options_received tmp_opt;
	enum skb_drop_reason psp_drop;
	bool paws_reject = false;
	int ts_recent_stamp;

	/* Instead of dropping immediately, wait to see what value is
	 * returned. We will accept a non psp-encapsulated syn in the
	 * case where TCP_TW_SYN is returned.
	 */
	psp_drop = psp_twsk_rx_policy_check(tw, skb);

	tmp_opt.saw_tstamp = 0;
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) {
		tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);

		if (tmp_opt.saw_tstamp) {
			if (tmp_opt.rcv_tsecr)
				tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
			tmp_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
			tmp_opt.ts_recent_stamp = ts_recent_stamp;
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
		}
	}

	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) {
		/* Just repeat all the checks of tcp_rcv_state_process() */

		if (psp_drop)
			goto out_put;

		/* Out of window, send ACK */
		if (paws_reject ||
		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				   rcv_nxt,
				   rcv_nxt + tcptw->tw_rcv_wnd))
			return tcp_timewait_check_oow_rate_limit(
				tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);

		if (th->rst)
			goto kill;

		if (th->syn && !before(TCP_SKB_CB(skb)->seq, rcv_nxt))
			return TCP_TW_RST;

		/* Dup ACK? */
		if (!th->ack ||
		    !after(TCP_SKB_CB(skb)->end_seq, rcv_nxt) ||
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
			inet_twsk_put(tw);
			return TCP_TW_SUCCESS;
		}

		/* New data or FIN. If new data arrive after half-duplex close,
		 * reset.
		 */
		if (!th->fin ||
		    TCP_SKB_CB(skb)->end_seq != rcv_nxt + 1)
			return TCP_TW_RST;

		/* FIN arrived, enter true time-wait state. */
		WRITE_ONCE(tw->tw_substate, TCP_TIME_WAIT);
		twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq,
				    rcv_nxt);

		if (tmp_opt.saw_tstamp) {
			u64 ts = tcp_clock_ms();

			WRITE_ONCE(tw->tw_entry_stamp, ts);
			WRITE_ONCE(tcptw->tw_ts_recent_stamp,
				   div_u64(ts, MSEC_PER_SEC));
			WRITE_ONCE(tcptw->tw_ts_recent,
				   tmp_opt.rcv_tsval);
		}

		inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
		return TCP_TW_ACK;
	}

	/*
	 * Now real TIME-WAIT state.
	 *
	 * RFC 1122:
	 * "When a connection is [...] on TIME-WAIT state [...]
	 * [a TCP] MAY accept a new SYN from the remote TCP to
	 * reopen the connection directly, if it:
	 *
	 * (1)  assigns its initial sequence number for the new
	 * connection to be larger than the largest sequence
	 * number it used on the previous connection incarnation,
	 * and
	 *
	 * (2)  returns to TIME-WAIT state if the SYN turns out
	 * to be an old duplicate".
	 */

	if (!paws_reject &&
	    (TCP_SKB_CB(skb)->seq == rcv_nxt &&
	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
		/* In window segment, it may be only reset or bare ack. */

		if (psp_drop)
			goto out_put;

		if (th->rst) {
			/* This is TIME_WAIT assassination, in two flavors.
			 * Oh well... nobody has a sufficient solution to this
			 * protocol bug yet.
			 */
			if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) {
kill:
				inet_twsk_deschedule_put(tw);
				return TCP_TW_SUCCESS;
			}
		} else {
			inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
		}

		if (tmp_opt.saw_tstamp) {
			WRITE_ONCE(tcptw->tw_ts_recent,
				   tmp_opt.rcv_tsval);
			WRITE_ONCE(tcptw->tw_ts_recent_stamp,
				   ktime_get_seconds());
		}

		inet_twsk_put(tw);
		return TCP_TW_SUCCESS;
	}

	/* Out of window segment.

	   All such segments are ACKed immediately.

	   The only exception is a new SYN. We accept it if it is
	   not an old duplicate and we are in no danger of being killed
	   by delayed old duplicates. The RFC check (that the SYN carries
	   a newer sequence number) only works at rates below ~40Mbit/sec.
	   However, if PAWS works, it is reliable, and we may even relax
	   the silly seq space cutoff.

	   RED-PEN: we violate the main RFC requirement: if this SYN turns
	   out to be an old duplicate (i.e. we receive an RST in reply to
	   the SYN-ACK), we must return the socket to time-wait state.
	   That is not good, but not fatal yet.
	 */

	if (th->syn && !th->rst && !th->ack && !paws_reject &&
	    (after(TCP_SKB_CB(skb)->seq, rcv_nxt) ||
	     (tmp_opt.saw_tstamp &&
	      (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) {
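		/* Descriptive note (added for clarity): the ISN for the
		 * reopened connection is placed above the old incarnation's
		 * sequence space, i.e. tw_snd_nxt plus the largest unscaled
		 * window (65535) plus a margin of 2. Zero is skipped because
		 * the caller treats a zero tw_isn as "unset" and would then
		 * generate a fresh ISN instead of reusing this one.
		 */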
		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;

		if (isn == 0)
			isn++;
		*tw_isn = isn;
		return TCP_TW_SYN;
	}

	if (psp_drop)
		goto out_put;

	if (paws_reject) {
		*drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS;
		__NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED);
	}

	if (!th->rst) {
		/* In this case we must reset the TIMEWAIT timer.
		 *
		 * If it is an ACKless SYN, it may be either an old duplicate
		 * or a new good SYN with a random sequence number < rcv_nxt.
		 * Do not reschedule in the latter case.
		 */
		if (paws_reject || th->ack)
			inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);

		return tcp_timewait_check_oow_rate_limit(
				tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
	}

out_put:
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}
EXPORT_IPV6_MOD(tcp_timewait_state_process);
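
/* Illustrative sketch, not kernel code: callers such as the do_time_wait
 * path in tcp_v4_rcv() dispatch on the status returned above roughly as
 * follows (names abridged):
 *
 *	switch (tcp_timewait_state_process(tw, skb, th, &isn, &reason)) {
 *	case TCP_TW_SYN:
 *		look up a listener and retry skb as a fresh SYN,
 *		reusing the port with the ISN chosen above;
 *	case TCP_TW_ACK:
 *	case TCP_TW_ACK_OOW:
 *		send an ACK on behalf of the timewait socket, then put it;
 *	case TCP_TW_RST:
 *		send a reset and kill the timewait socket;
 *	case TCP_TW_SUCCESS:
 *		nothing more to do, the tw reference was already
 *		released where needed;
 *	}
 */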

static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw)
{
#ifdef CONFIG_TCP_MD5SIG
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;

	/*
	 * The timewait bucket does not have the key DB from the
	 * sock structure. We just make a quick copy of the
	 * md5 key being used (if indeed we are using one)
	 * so the timewait ack generating code has the key.
	 */
	tcptw->tw_md5_key = NULL;
	if (!static_branch_unlikely(&tcp_md5_needed.key))
		return;

	key = tp->af_specific->md5_lookup(sk, sk);
	if (key) {
		tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
		if (!tcptw->tw_md5_key)
			return;
		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key))
			goto out_free;
	}
	return;
out_free:
	WARN_ON_ONCE(1);
	kfree(tcptw->tw_md5_key);
	tcptw->tw_md5_key = NULL;
#endif
}

/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	struct inet_timewait_sock *tw;

	tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state);

	if (tw) {
		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);

		tw->tw_mark = sk->sk_mark;
		tw->tw_priority = READ_ONCE(sk->sk_priority);
		tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
		/* refreshed when we enter true TIME-WAIT state */
		tw->tw_entry_stamp = tcp_time_stamp_ms(tp);
		tcptw->tw_rcv_nxt = tp->rcv_nxt;
		tcptw->tw_snd_nxt = tp->snd_nxt;
		tcptw->tw_rcv_wnd = tcp_receive_window(tp);
		tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
		tcptw->tw_ts_offset = tp->tsoffset;
		tw->tw_usec_ts = tp->tcp_usec_ts;
		tcptw->tw_last_oow_ack_time = 0;
		tcptw->tw_tx_delay = tp->tcp_tx_delay;
		tw->tw_txhash = sk->sk_txhash;
		tw->tw_tx_queue_mapping = sk->sk_tx_queue_mapping;
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
		tw->tw_rx_queue_mapping = sk->sk_rx_queue_mapping;
#endif
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == PF_INET6) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			tw->tw_v6_daddr = sk->sk_v6_daddr;
			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
			tw->tw_tclass = np->tclass;
			tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
			tw->tw_ipv6only = sk->sk_ipv6only;
		}
#endif

		tcp_time_wait_init(sk, tcptw);
		tcp_ao_time_wait(tcptw, tp);

		/* Get the TIME_WAIT timeout firing. */
		if (timeo < rto)
			timeo = rto;

		if (state == TCP_TIME_WAIT)
			timeo = TCP_TIMEWAIT_LEN;

		/* Linkage updates.
		 * Note that access to tw after this point is illegal.
		 */
		inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo);
	} else {
		/* Sorry, if we're out of memory, just CLOSE this
		 * socket up. We've got bigger problems than
		 * non-graceful socket closings.
		 */
		NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW);
	}

	tcp_update_metrics(sk);
	tcp_done(sk);
}
EXPORT_SYMBOL(tcp_time_wait);
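
/* A worked example of the timeout math above (illustrative, not part of
 * the original source): rto is (RTO << 2) - (RTO >> 1) = 4*RTO - RTO/2,
 * i.e. 3.5 * RTO. With icsk_rto at 200ms that is 800 - 100 = 700ms, so a
 * requested FIN-WAIT-2 timeout shorter than 700ms is raised to 700ms,
 * while a socket entering TCP_TIME_WAIT proper always uses the fixed
 * TCP_TIMEWAIT_LEN.
 */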

void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
	if (static_branch_unlikely(&tcp_md5_needed.key)) {
		struct tcp_timewait_sock *twsk = tcp_twsk(sk);

		if (twsk->tw_md5_key) {
			kfree(twsk->tw_md5_key);
			static_branch_slow_dec_deferred(&tcp_md5_needed);
		}
	}
#endif
	tcp_ao_destroy_sock(sk, true);
	psp_twsk_assoc_free(inet_twsk(sk));
}

void tcp_twsk_purge(struct list_head *net_exit_list)
{
	bool purged_once = false;
	struct net *net;

	list_for_each_entry(net, net_exit_list, exit_list) {
		if (net->ipv4.tcp_death_row.hashinfo->pernet) {
			/* Even if tw_refcount == 1, we must clean up kernel reqsk */
			inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo);
		} else if (!purged_once) {
			inet_twsk_purge(&tcp_hashinfo);
			purged_once = true;
		}
	}
}

/* Warning: This function is called without sk_listener being locked.
 * Be sure to read socket fields once, as their value could change under us.
 */
void tcp_openreq_init_rwin(struct request_sock *req,
			   const struct sock *sk_listener,
			   const struct dst_entry *dst)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	const struct tcp_sock *tp = tcp_sk(sk_listener);
	int full_space = tcp_full_space(sk_listener);
	u32 window_clamp;
	__u8 rcv_wscale;
	u32 rcv_wnd;
	int mss;

	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
	window_clamp = READ_ONCE(tp->window_clamp);
	/* Set this up on the first call only */
	req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);

	/* limit the window selection if the user enforces a smaller rx buffer */
	if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK &&
	    (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
		req->rsk_window_clamp = full_space;

	rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req);
	if (rcv_wnd == 0)
		rcv_wnd = dst_metric(dst, RTAX_INITRWND);
	else if (full_space < rcv_wnd * mss)
		full_space = rcv_wnd * mss;
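
	/* Descriptive note (added for clarity; an assumption worth checking
	 * against the BPF and routing docs): both the BPF hook above and
	 * RTAX_INITRWND express the initial receive window in segments
	 * (MSS units), which is why rcv_wnd is multiplied by mss before
	 * being compared with, and possibly enlarging, full_space.
	 */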

	/* tcp_full_space because it is guaranteed to be the first packet */
	tcp_select_initial_window(sk_listener, full_space,
		mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
		&req->rsk_rcv_wnd,
		&req->rsk_window_clamp,
		ireq->wscale_ok,
		&rcv_wscale,
		rcv_wnd);
	ireq->rcv_wscale = rcv_wscale;
}

static void tcp_ecn_openreq_child(struct sock *sk,
				  const struct request_sock *req,
				  const struct sk_buff *skb)
{
	const struct tcp_request_sock *treq = tcp_rsk(req);
	struct tcp_sock *tp = tcp_sk(sk);

	if (treq->accecn_ok) {
		tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
		tp->syn_ect_snt = treq->syn_ect_snt;
		tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
		tp->saw_accecn_opt = treq->saw_accecn_opt;
		tp->prev_ecnfield = treq->syn_ect_rcv;
		tp->accecn_opt_demand = 1;
		tcp_ecn_received_counters_payload(sk, skb);
	} else {
		tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
				     TCP_ECN_MODE_RFC3168 :
				     TCP_ECN_DISABLED);
	}
}

void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
	bool ca_got_dst = false;

	if (ca_key != TCP_CA_UNSPEC) {
		const struct tcp_congestion_ops *ca;

		rcu_read_lock();
		ca = tcp_ca_find_key(ca_key);
		if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
			icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
			icsk->icsk_ca_ops = ca;
			ca_got_dst = true;
		}
		rcu_read_unlock();
	}

	/* If no valid choice made yet, assign current system default ca. */
	if (!ca_got_dst &&
	    (!icsk->icsk_ca_setsockopt ||
	     !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner)))
		tcp_assign_congestion_control(sk);

	tcp_set_ca_state(sk, TCP_CA_Open);
}
EXPORT_IPV6_MOD_GPL(tcp_ca_openreq_child);

static void smc_check_reset_syn_req(const struct tcp_sock *oldtp,
				    struct request_sock *req,
				    struct tcp_sock *newtp)
{
#if IS_ENABLED(CONFIG_SMC)
	struct inet_request_sock *ireq;

	if (static_branch_unlikely(&tcp_have_smc)) {
		ireq = inet_rsk(req);
		if (oldtp->syn_smc && !ireq->smc_ok)
			newtp->syn_smc = 0;
	}
#endif
}

/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could avoid lots of memory writes here. tp of the listening
 * socket contains all the necessary default parameters.
 */
struct sock *tcp_create_openreq_child(const struct sock *sk,
				      struct request_sock *req,
				      struct sk_buff *skb)
{
	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct tcp_request_sock *treq = tcp_rsk(req);
	struct inet_connection_sock *newicsk;
	const struct tcp_sock *oldtp;
	struct tcp_sock *newtp;
	u32 seq;

	if (!newsk)
		return NULL;

	newicsk = inet_csk(newsk);
	newtp = tcp_sk(newsk);
	oldtp = tcp_sk(sk);

	smc_check_reset_syn_req(oldtp, req, newtp);

	/* Now setup tcp_sock */
	newtp->pred_flags = 0;

	seq = treq->rcv_isn + 1;
	newtp->rcv_wup = seq;
	WRITE_ONCE(newtp->copied_seq, seq);
	WRITE_ONCE(newtp->rcv_nxt, seq);
	newtp->segs_in = 1;

	seq = treq->snt_isn + 1;
	newtp->snd_sml = newtp->snd_una = seq;
	WRITE_ONCE(newtp->snd_nxt, seq);
	newtp->snd_up = seq;

	INIT_LIST_HEAD(&newtp->tsq_node);
	INIT_LIST_HEAD(&newtp->tsorted_sent_queue);

	tcp_init_wl(newtp, treq->rcv_isn);

	minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
	newicsk->icsk_ack.lrcvtime = tcp_jiffies32;

	newtp->lsndtime = tcp_jiffies32;
	newsk->sk_txhash = READ_ONCE(treq->txhash);
	newtp->total_retrans = req->num_retrans;

	tcp_init_xmit_timers(newsk);
	WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1);

	if (sock_flag(newsk, SOCK_KEEPOPEN))
		tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp));

	newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
	newtp->rx_opt.sack_ok = ireq->sack_ok;
	newtp->window_clamp = req->rsk_window_clamp;
	newtp->rcv_ssthresh = req->rsk_rcv_wnd;
	newtp->rcv_wnd = req->rsk_rcv_wnd;
	newtp->rx_opt.wscale_ok = ireq->wscale_ok;
	if (newtp->rx_opt.wscale_ok) {
		newtp->rx_opt.snd_wscale = ireq->snd_wscale;
		newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
	} else {
		newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
		newtp->window_clamp = min(newtp->window_clamp, 65535U);
	}
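	/* Descriptive note (added for clarity): skb here is the ACK that
	 * completes the handshake, so unlike the window field of a SYN
	 * (which is never scaled, RFC 7323 section 2.2) its window field
	 * is subject to the negotiated send scale applied below.
	 */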
	newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale;
	newtp->max_window = newtp->snd_wnd;

	if (newtp->rx_opt.tstamp_ok) {
		newtp->tcp_usec_ts = treq->req_usec_ts;
		newtp->rx_opt.ts_recent = req->ts_recent;
		newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
		newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
	} else {
		newtp->tcp_usec_ts = 0;
		newtp->rx_opt.ts_recent_stamp = 0;
		newtp->tcp_header_len = sizeof(struct tcphdr);
	}
	if (req->num_timeout) {
		newtp->total_rto = req->num_timeout;
		newtp->undo_marker = treq->snt_isn;
		if (newtp->tcp_usec_ts) {
			newtp->retrans_stamp = treq->snt_synack;
			newtp->total_rto_time = (u32)(tcp_clock_us() -
						      newtp->retrans_stamp) / USEC_PER_MSEC;
		} else {
			newtp->retrans_stamp = div_u64(treq->snt_synack,
						       USEC_PER_SEC / TCP_TS_HZ);
			newtp->total_rto_time = tcp_clock_ms() -
						newtp->retrans_stamp;
		}
		newtp->total_rto_recoveries = 1;
	}
	newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
	newtp->md5sig_info = NULL;	/*XXX*/
#endif
#ifdef CONFIG_TCP_AO
	newtp->ao_info = NULL;

	if (tcp_rsk_used_ao(req)) {
		struct tcp_ao_key *ao_key;

		ao_key = treq->af_specific->ao_lookup(sk, req, tcp_rsk(req)->ao_keyid, -1);
		if (ao_key)
			newtp->tcp_header_len += tcp_ao_len_aligned(ao_key);
	}
#endif
	if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
		newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
	newtp->rx_opt.mss_clamp = req->mss;
	tcp_ecn_openreq_child(newsk, req, skb);
	newtp->fastopen_req = NULL;
	RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);

	newtp->bpf_chg_cc_inprogress = 0;
	tcp_bpf_clone(sk, newsk);

	__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);

	xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1);

	return newsk;
}
EXPORT_SYMBOL(tcp_create_openreq_child);

/*
 * Process an incoming packet for SYN_RECV sockets represented as a
 * request_sock. Normally sk is the listener socket but for TFO it
 * points to the child socket.
 *
 * XXX (TFO) - The current impl contains a special check for ACK
 * validation here, and another inside tcp_v4_reqsk_send_ack(). Can we
 * do better?
 *
 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
 *
 * Note: If @fastopen is true, this can be called from process context.
 * Otherwise, this is from BH context.
 */

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
			   struct request_sock *req,
			   bool fastopen, bool *req_stolen,
			   enum skb_drop_reason *drop_reason)
{
	struct tcp_options_received tmp_opt;
	struct sock *child;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
	bool tsecr_reject = false;
	bool paws_reject = false;
	bool own_req;

	tmp_opt.saw_tstamp = 0;
	tmp_opt.accecn = 0;
	if (th->doff > (sizeof(struct tcphdr)>>2)) {
		tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.ts_recent = req->ts_recent;
			if (tmp_opt.rcv_tsecr) {
				if (inet_rsk(req)->tstamp_ok && !fastopen)
					tsecr_reject = !between(tmp_opt.rcv_tsecr,
							tcp_rsk(req)->snt_tsval_first,
							READ_ONCE(tcp_rsk(req)->snt_tsval_last));
				tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
			}
			/* We do not store the true stamp, but it is not
			 * required, it can be estimated (approximately)
			 * from other data.
			 */
			tmp_opt.ts_recent_stamp = ktime_get_seconds() -
						  tcp_reqsk_timeout(req) / HZ;
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
		}
	}
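
	/* Worked example for the TSecr check above (illustrative, not part
	 * of the original source): if our SYN-ACK and its retransmissions
	 * carried TSval values 1000..1003, then snt_tsval_first = 1000 and
	 * snt_tsval_last = 1003, and a completing ACK whose TSecr echoes
	 * anything outside [1000, 1003] sets tsecr_reject, since it cannot
	 * be echoing this handshake.
	 */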

	/* Check for pure retransmitted SYN. */
	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
	    flg == TCP_FLAG_SYN &&
	    !paws_reject) {
		/*
		 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
		 * this case on figure 6 and figure 8, but formal
		 * protocol description says NOTHING.
		 * To be more exact, it says that we should send ACK,
		 * because this segment (at least, if it has no data)
		 * is out of window.
		 *
		 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
		 * describe SYN-RECV state. All the description
		 * is wrong, we cannot believe it and should rely
		 * only on common sense and implementation experience.
		 *
		 * Enforce "SYN-ACK" according to figure 8, figure 6
		 * of RFC793, fixed by RFC1122.
		 *
		 * Note that even if there is new data in the SYN packet
		 * it will be thrown away too.
		 *
		 * Reset the timer after retransmitting the SYNACK, similar
		 * to the idea of fast retransmit in recovery.
		 */
		if (!tcp_oow_rate_limited(sock_net(sk), skb,
					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
					  &tcp_rsk(req)->last_oow_ack_time) &&
		    !tcp_rtx_synack(sk, req)) {
			unsigned long expires = jiffies;

			expires += tcp_reqsk_timeout(req);
			if (!fastopen)
				mod_timer_pending(&req->rsk_timer, expires);
			else
				req->rsk_timer.expires = expires;
		}
		return NULL;
	}

	/* Further reproduces section "SEGMENT ARRIVES"
	   for state SYN-RECEIVED of RFC793.
	   It is broken, however: it fails only in the case of crossed SYNs.

	   You would think that SYN crossing is impossible here, since
	   we should have a SYN_SENT socket (from connect()) on our end,
	   but this is not true if the crossed SYNs were sent to both
	   ends by a malicious third party. We must defend against this,
	   and to do that we first verify the ACK (as per RFC793, page
	   36) and reset if it is invalid. Is this a true full defense?
	   To convince ourselves, let us consider a way in which the ACK
	   test can still pass in this 'malicious crossed SYNs' case.
	   Malicious sender sends identical SYNs (and thus identical sequence
	   numbers) to both A and B:

	   A: gets SYN, seq=7
	   B: gets SYN, seq=7

	   By our good fortune, both A and B select the same initial
	   send sequence number of seven :-)

	   A: sends SYN|ACK, seq=7, ack_seq=8
	   B: sends SYN|ACK, seq=7, ack_seq=8

	   So we are now A eating this SYN|ACK, ACK test passes. So
	   does sequence test, SYN is truncated, and thus we consider
	   it a bare ACK.

	   If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
	   bare ACK. Otherwise, we create an established connection. Both
	   ends (listening sockets) accept the new incoming connection and try
	   to talk to each other. 8-)

	   Note: This case is both harmless, and rare. The possibility is about
	   the same as us discovering intelligent life on another planet
	   tomorrow.

	   But generally, we should (RFC lies!) accept ACK
	   from SYNACK both here and in tcp_rcv_state_process().
	   tcp_rcv_state_process() does not, hence, we do not too.

	   Note that the case is absolutely generic:
	   we cannot optimize anything here without
	   violating protocol. All the checks must be made
	   before attempt to create socket.
	 */

	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
	 * and the incoming segment acknowledges something not yet
	 * sent (the segment carries an unacceptable ACK) ...
	 * a reset is sent."
	 *
	 * Invalid ACK: reset will be sent by listening socket.
	 * Note that the ACK validity check for a Fast Open socket is done
	 * elsewhere and is checked directly against the child socket rather
	 * than req because user data may have been sent out.
	 */
	if ((flg & TCP_FLAG_ACK) && !fastopen &&
	    (TCP_SKB_CB(skb)->ack_seq !=
	     tcp_rsk(req)->snt_isn + 1))
		return sk;

	/* RFC793: "first check sequence number". */

	if (paws_reject || tsecr_reject ||
	    !tcp_in_window(TCP_SKB_CB(skb)->seq,
			   TCP_SKB_CB(skb)->end_seq,
			   tcp_rsk(req)->rcv_nxt,
			   tcp_rsk(req)->rcv_nxt +
			   tcp_synack_window(req))) {
		/* Out of window: send ACK and drop. */
		if (!(flg & TCP_FLAG_RST) &&
		    !tcp_oow_rate_limited(sock_net(sk), skb,
					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
					  &tcp_rsk(req)->last_oow_ack_time))
			req->rsk_ops->send_ack(sk, skb, req);
		if (paws_reject) {
			SKB_DR_SET(*drop_reason, TCP_RFC7323_PAWS);
			NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
		} else if (tsecr_reject) {
			SKB_DR_SET(*drop_reason, TCP_RFC7323_TSECR);
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED);
		} else {
			SKB_DR_SET(*drop_reason, TCP_OVERWINDOW);
		}
		return NULL;
	}

	/* In sequence, PAWS is OK. */

	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
		/* Truncate SYN, it is out of window starting
		   at tcp_rsk(req)->rcv_isn + 1. */
		flg &= ~TCP_FLAG_SYN;
	}

	/* RFC793: "second check the RST bit" and
	 * "fourth, check the SYN bit"
	 */
	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
		TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
		goto embryonic_reset;
	}

	/* ACK sequence verified above, just make sure ACK is
	 * set. If ACK not set, just silently drop the packet.
	 *
	 * XXX (TFO) - if we ever allow "data after SYN", the
	 * following check needs to be removed.
	 */
	if (!(flg & TCP_FLAG_ACK))
		return NULL;

	if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn &&
	    tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
		u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn);

		tcp_rsk(req)->saw_accecn_opt = saw_opt;
		if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) {
			u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV;

			tcp_rsk(req)->accecn_fail_mode |= fail_mode;
		}
	}

	/* For Fast Open no more processing is needed (sk is the
	 * child socket).
	 */
	if (fastopen)
		return sk;

	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
	if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) &&
	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
		inet_rsk(req)->acked = 1;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
		return NULL;
	}

	/* OK, ACK is valid, create big socket and
	 * feed this segment to it. It will repeat all
	 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
	 * ESTABLISHED STATE. If it will be dropped after
	 * socket is created, wait for troubles.
	 */
	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
							 req, &own_req);
	if (!child)
		goto listen_overflow;

	if (own_req && tmp_opt.saw_tstamp &&
	    !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
		tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval;

	if (own_req && rsk_drop_req(req)) {
		reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
		inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req);
		return child;
	}

	sock_rps_save_rxhash(child, skb);
	tcp_synack_rtt_meas(child, req);
	*req_stolen = !own_req;
	return inet_csk_complete_hashdance(sk, child, req, own_req);

listen_overflow:
	SKB_DR_SET(*drop_reason, TCP_LISTEN_OVERFLOW);
	if (sk != req->rsk_listener)
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);

	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) {
		inet_rsk(req)->acked = 1;
		return NULL;
	}

embryonic_reset:
	if (!(flg & TCP_FLAG_RST)) {
		/* Received a bad SYN pkt - for TFO we try not to reset
		 * the local connection unless it's really necessary to
		 * avoid becoming vulnerable to outside attack aiming at
		 * resetting legit local connections.
		 */
		req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_INVALID_SYN);
	} else if (fastopen) { /* received a valid RST pkt */
		reqsk_fastopen_remove(sk, req, true);
		tcp_reset(sk, skb);
	}
	if (!fastopen) {
		bool unlinked = inet_csk_reqsk_queue_drop(sk, req);

		if (unlinked)
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
		*req_stolen = !unlinked;
	}
	return NULL;
}
EXPORT_IPV6_MOD(tcp_check_req);
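
/* Illustrative sketch, not kernel code: how a receive-path caller is
 * expected to interpret the three possible outcomes of tcp_check_req()
 * (modelled loosely on tcp_v4_rcv(); details are an assumption worth
 * checking against the actual caller):
 *
 *	nsk = tcp_check_req(sk, skb, req, false, &req_stolen, &reason);
 *	if (!nsk)
 *		drop the skb (reason says why);
 *	else if (nsk == sk)
 *		fall through to regular listener processing, which
 *		answers an unacceptable ACK with a reset;
 *	else
 *		hand the skb to the new child via tcp_child_process().
 */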

/*
 * Queue the segment on the new socket if the new socket is active,
 * otherwise we just short-circuit this and continue with
 * the new socket.
 *
 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
 * when entering. But other states are possible due to a race condition
 * where, after __inet_lookup_established() fails but before the listener
 * lock is obtained, other packets cause the same connection to
 * be created.
 */

enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child,
				       struct sk_buff *skb)
	__releases(&((child)->sk_lock.slock))
{
	enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
	int state = child->sk_state;

	/* record sk_napi_id and sk_rx_queue_mapping of child. */
	sk_mark_napi_id_set(child, skb);

	tcp_segs_in(tcp_sk(child), skb);
	if (!sock_owned_by_user(child)) {
		reason = tcp_rcv_state_process(child, skb);
		/* Wakeup parent, send SIGIO */
		if (state == TCP_SYN_RECV && child->sk_state != state)
			parent->sk_data_ready(parent);
	} else {
		/* Alas, it is possible again, because we do lookup
		 * in main socket hash table and lock on listening
		 * socket does not protect us more.
		 */
		__sk_add_backlog(child, skb);
	}

	bh_unlock_sock(child);
	sock_put(child);
	return reason;
}
EXPORT_IPV6_MOD(tcp_child_process);