net/ipv4/tcp_ipv4.c at v6.5

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / net / ipv4 / tcp_ipv4.c
at v6.5 3408 lines 92 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   4 *		operating system.  INET is implemented using the  BSD Socket
   5 *		interface as the means of communication with the user level.
   6 *
   7 *		Implementation of the Transmission Control Protocol(TCP).
   8 *
   9 *		IPv4 specific functions
  10 *
  11 *		code split from:
  12 *		linux/ipv4/tcp.c
  13 *		linux/ipv4/tcp_input.c
  14 *		linux/ipv4/tcp_output.c
  15 *
  16 *		See tcp.c for author information
  17 */
  18
  19/*
  20 * Changes:
  21 *		David S. Miller	:	New socket lookup architecture.
  22 *					This code is dedicated to John Dyson.
  23 *		David S. Miller :	Change semantics of established hash,
  24 *					half is devoted to TIME_WAIT sockets
  25 *					and the rest go in the other half.
  26 *		Andi Kleen :		Add support for syncookies and fixed
  27 *					some bugs: ip options weren't passed to
  28 *					the TCP layer, missed a check for an
  29 *					ACK bit.
  30 *		Andi Kleen :		Implemented fast path mtu discovery.
  31 *	     				Fixed many serious bugs in the
  32 *					request_sock handling and moved
  33 *					most of it into the af independent code.
  34 *					Added tail drop and some other bugfixes.
  35 *					Added new listen semantics.
  36 *		Mike McLagan	:	Routing by source
  37 *	Juan Jose Ciarlante:		ip_dynaddr bits
  38 *		Andi Kleen:		various fixes.
  39 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
  40 *					coma.
  41 *	Andi Kleen		:	Fix new listen.
  42 *	Andi Kleen		:	Fix accept error reporting.
  43 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
  44 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
  45 *					a single port at the same time.
  46 */
  47
  48#define pr_fmt(fmt) "TCP: " fmt
  49
  50#include <linux/bottom_half.h>
  51#include <linux/types.h>
  52#include <linux/fcntl.h>
  53#include <linux/module.h>
  54#include <linux/random.h>
  55#include <linux/cache.h>
  56#include <linux/jhash.h>
  57#include <linux/init.h>
  58#include <linux/times.h>
  59#include <linux/slab.h>
  60
  61#include <net/net_namespace.h>
  62#include <net/icmp.h>
  63#include <net/inet_hashtables.h>
  64#include <net/tcp.h>
  65#include <net/transp_v6.h>
  66#include <net/ipv6.h>
  67#include <net/inet_common.h>
  68#include <net/timewait_sock.h>
  69#include <net/xfrm.h>
  70#include <net/secure_seq.h>
  71#include <net/busy_poll.h>
  72
  73#include <linux/inet.h>
  74#include <linux/ipv6.h>
  75#include <linux/stddef.h>
  76#include <linux/proc_fs.h>
  77#include <linux/seq_file.h>
  78#include <linux/inetdevice.h>
  79#include <linux/btf_ids.h>
  80
  81#include <crypto/hash.h>
  82#include <linux/scatterlist.h>
  83
  84#include <trace/events/tcp.h>
  85
#ifdef CONFIG_TCP_MD5SIG
/* Forward declaration of a file-local helper: the RST and ACK reply builders
 * below call it to write an MD5 signature into the reply's option space.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

/* TCP hash table state; exported to other kernel code. */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

/* Per-CPU socket used as the sending socket for the RST/ACK replies that
 * tcp_v4_send_reset() and tcp_v4_send_ack() emit.
 */
static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
  95
  96static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  97{
  98	return secure_tcp_seq(ip_hdr(skb)->daddr,
  99			      ip_hdr(skb)->saddr,
 100			      tcp_hdr(skb)->dest,
 101			      tcp_hdr(skb)->source);
 102}
 103
 104static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 105{
 106	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 107}
 108
 109int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 110{
 111	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
 112	const struct inet_timewait_sock *tw = inet_twsk(sktw);
 113	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 114	struct tcp_sock *tp = tcp_sk(sk);
 115
 116	if (reuse == 2) {
 117		/* Still does not detect *everything* that goes through
 118		 * lo, since we require a loopback src or dst address
 119		 * or direct binding to 'lo' interface.
 120		 */
 121		bool loopback = false;
 122		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 123			loopback = true;
 124#if IS_ENABLED(CONFIG_IPV6)
 125		if (tw->tw_family == AF_INET6) {
 126			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 127			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
 128			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 129			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
 130				loopback = true;
 131		} else
 132#endif
 133		{
 134			if (ipv4_is_loopback(tw->tw_daddr) ||
 135			    ipv4_is_loopback(tw->tw_rcv_saddr))
 136				loopback = true;
 137		}
 138		if (!loopback)
 139			reuse = 0;
 140	}
 141
 142	/* With PAWS, it is safe from the viewpoint
 143	   of data integrity. Even without PAWS it is safe provided sequence
 144	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 145
 146	   Actually, the idea is close to VJ's one, only timestamp cache is
 147	   held not per host, but per port pair and TW bucket is used as state
 148	   holder.
 149
 150	   If TW bucket has been already destroyed we fall back to VJ's scheme
 151	   and use initial timestamp retrieved from peer table.
 152	 */
 153	if (tcptw->tw_ts_recent_stamp &&
 154	    (!twp || (reuse && time_after32(ktime_get_seconds(),
 155					    tcptw->tw_ts_recent_stamp)))) {
 156		/* In case of repair and re-using TIME-WAIT sockets we still
 157		 * want to be sure that it is safe as above but honor the
 158		 * sequence numbers and time stamps set as part of the repair
 159		 * process.
 160		 *
 161		 * Without this check re-using a TIME-WAIT socket with TCP
 162		 * repair would accumulate a -1 on the repair assigned
 163		 * sequence number. The first time it is reused the sequence
 164		 * is -1, the second time -2, etc. This fixes that issue
 165		 * without appearing to create any others.
 166		 */
 167		if (likely(!tp->repair)) {
 168			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
 169
 170			if (!seq)
 171				seq = 1;
 172			WRITE_ONCE(tp->write_seq, seq);
 173			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
 174			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 175		}
 176		sock_hold(sktw);
 177		return 1;
 178	}
 179
 180	return 0;
 181}
 182EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 183
 184static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 185			      int addr_len)
 186{
 187	/* This check is replicated from tcp_v4_connect() and intended to
 188	 * prevent BPF program called below from accessing bytes that are out
 189	 * of the bound specified by user in addr_len.
 190	 */
 191	if (addr_len < sizeof(struct sockaddr_in))
 192		return -EINVAL;
 193
 194	sock_owned_by_me(sk);
 195
 196	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 197}
 198
/* This will initiate an outgoing connection.
 *
 * @sk:       socket to connect
 * @uaddr:    destination, interpreted as a struct sockaddr_in
 * @addr_len: length of @uaddr supplied by the caller
 *
 * Routes towards the destination (or towards the first hop when a
 * source-route IP option is set on the socket), records the chosen
 * addresses and destination port on @sk, moves it to SYN-SENT, hashes it
 * via inet_hash_connect() and finally calls tcp_connect().
 *
 * Returns 0 on success, or a negative errno: -EINVAL for a short address
 * (or a zero destination with source routing), -EAFNOSUPPORT for a non
 * AF_INET family, -ENETUNREACH for multicast/broadcast routes, or whatever
 * the routing/hash/connect helpers report.  When
 * tcp_fastopen_defer_connect() returns true, the value it stored in err is
 * returned as is.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	/* With a source-route option the route lookup targets the option's
	 * first-hop address instead of the final destination.
	 */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	/* Remember the ports used for this lookup; they are handed to
	 * ip_route_newports() once the final source port is known.
	 */
	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP connections to multicast or broadcast routes are refused. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	/* Without source routing, use the destination the route lookup
	 * settled on.
	 */
	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	/* No source address set yet: adopt the one chosen by routing. */
	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	/* Account for the length of any IP options configured on the socket. */
	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Refresh the route now that the final source port is known. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		/* Keep the failure path from calling ip_rt_put() on an
		 * error pointer.
		 */
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	/* The route has been handed to sk_setup_caps(); clear rt so the
	 * failure path does not put it.
	 */
	rt = NULL;

	/* In repair mode the sequence number and timestamp offset are left
	 * as they are; otherwise pick them here (a write_seq that is already
	 * non-zero is kept).
	 */
	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	/* A true return means the connect is handled (or deferred) by the
	 * fast open path; err then holds the value to return.
	 */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
 342
 343/*
 344 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 345 * It can be called through tcp_release_cb() if socket was owned by user
 346 * at the time tcp_v4_err() was called to handle ICMP message.
 347 */
 348void tcp_v4_mtu_reduced(struct sock *sk)
 349{
 350	struct inet_sock *inet = inet_sk(sk);
 351	struct dst_entry *dst;
 352	u32 mtu;
 353
 354	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 355		return;
 356	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
 357	dst = inet_csk_update_pmtu(sk, mtu);
 358	if (!dst)
 359		return;
 360
 361	/* Something is about to be wrong... Remember soft error
 362	 * for the case, if this connection will not able to recover.
 363	 */
 364	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 365		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
 366
 367	mtu = dst_mtu(dst);
 368
 369	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 370	    ip_sk_accept_pmtu(sk) &&
 371	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 372		tcp_sync_mss(sk, mtu);
 373
 374		/* Resend the TCP packet because it's
 375		 * clear that the old packet has been
 376		 * dropped. This is the new "fast" path mtu
 377		 * discovery.
 378		 */
 379		tcp_simple_retransmit(sk);
 380	} /* else let the usual retransmit timer handle it */
 381}
 382EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 383
 384static void do_redirect(struct sk_buff *skb, struct sock *sk)
 385{
 386	struct dst_entry *dst = __sk_dst_check(sk, 0);
 387
 388	if (dst)
 389		dst->ops->redirect(dst, sk, skb);
 390}
 391
 392
 393/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 394void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 395{
 396	struct request_sock *req = inet_reqsk(sk);
 397	struct net *net = sock_net(sk);
 398
 399	/* ICMPs are not backlogged, hence we cannot get
 400	 * an established socket here.
 401	 */
 402	if (seq != tcp_rsk(req)->snt_isn) {
 403		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 404	} else if (abort) {
 405		/*
 406		 * Still in SYN_RECV, just remove it silently.
 407		 * There is no good way to pass the error to the newly
 408		 * created socket, and POSIX does not want network
 409		 * errors returned from accept().
 410		 */
 411		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 412		tcp_listendrop(req->rsk_listener);
 413	}
 414	reqsk_put(req);
 415}
 416EXPORT_SYMBOL(tcp_req_err);
 417
 418/* TCP-LD (RFC 6069) logic */
 419void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
 420{
 421	struct inet_connection_sock *icsk = inet_csk(sk);
 422	struct tcp_sock *tp = tcp_sk(sk);
 423	struct sk_buff *skb;
 424	s32 remaining;
 425	u32 delta_us;
 426
 427	if (sock_owned_by_user(sk))
 428		return;
 429
 430	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 431	    !icsk->icsk_backoff)
 432		return;
 433
 434	skb = tcp_rtx_queue_head(sk);
 435	if (WARN_ON_ONCE(!skb))
 436		return;
 437
 438	icsk->icsk_backoff--;
 439	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
 440	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 441
 442	tcp_mstamp_refresh(tp);
 443	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 444	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
 445
 446	if (remaining > 0) {
 447		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 448					  remaining, TCP_RTO_MAX);
 449	} else {
 450		/* RTO revert clocked out retransmission.
 451		 * Will retransmit now.
 452		 */
 453		tcp_retransmit_timer(sk);
 454	}
 455}
 456EXPORT_SYMBOL(tcp_ld_RTO_revert);
 457
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 * @skb:  ICMP error; skb->data points at the IP header of the quoted
 *        (offending) packet, followed by the start of its TCP header.
 * @info: extra value passed through from the ICMP layer; for
 *        ICMP_FRAG_NEEDED it is stored as the socket's mtu_info.
 *
 * Returns -ENOENT when no matching socket is found, 0 otherwise.
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	/* The quoted packet is one we sent, so its destination is the remote
	 * end and its source is the local end of the connection.
	 */
	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	/* TIME-WAIT sockets ignore ICMP errors; just drop the reference. */
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	/* Request sockets are handled (and released) by tcp_req_err(). */
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Except for listeners, the quoted sequence number must fall inside
	 * the window of data sent but not yet acknowledged.
	 */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	/* First stage: map the ICMP type/code to an errno in err, or finish
	 * right here for messages that never produce a socket error.
	 */
	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Defer the MTU update; the extra reference
				 * is taken only when the deferred flag was
				 * not already set.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	/* Second stage: connections still being set up are terminated by the
	 * error; every other state falls through to the handling below.
	 */
	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk)) {
			WRITE_ONCE(sk->sk_err, err);

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			WRITE_ONCE(sk->sk_err_soft, err);
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	/* Common exit for every path taken after bh_lock_sock(): unlock and
	 * drop the socket reference.
	 */
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
 640
 641void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 642{
 643	struct tcphdr *th = tcp_hdr(skb);
 644
 645	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 646	skb->csum_start = skb_transport_header(skb) - skb->head;
 647	skb->csum_offset = offsetof(struct tcphdr, check);
 648}
 649
 650/* This routine computes an IPv4 TCP checksum. */
 651void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 652{
 653	const struct inet_sock *inet = inet_sk(sk);
 654
 655	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 656}
 657EXPORT_SYMBOL(tcp_v4_send_check);
 658
 659/*
 660 *	This routine will send an RST to the other tcp.
 661 *
 662 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 663 *		      for reset.
 664 *	Answer: if a packet caused RST, it is not for a socket
 665 *		existing in our system, if it is matched to a socket,
 666 *		it is just duplicate segment or bug in other side's TCP.
 667 *		So that we build reply only basing on parameters
 668 *		arrived with segment.
 669 *	Exception: precedence violation. We do not implement it in any case.
 670 */
 671
 672#ifdef CONFIG_TCP_MD5SIG
 673#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
 674#else
 675#define OPTION_BYTES sizeof(__be32)
 676#endif
 677
 678static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 679{
 680	const struct tcphdr *th = tcp_hdr(skb);
 681	struct {
 682		struct tcphdr th;
 683		__be32 opt[OPTION_BYTES / sizeof(__be32)];
 684	} rep;
 685	struct ip_reply_arg arg;
 686#ifdef CONFIG_TCP_MD5SIG
 687	struct tcp_md5sig_key *key = NULL;
 688	const __u8 *hash_location = NULL;
 689	unsigned char newhash[16];
 690	int genhash;
 691	struct sock *sk1 = NULL;
 692#endif
 693	u64 transmit_time = 0;
 694	struct sock *ctl_sk;
 695	struct net *net;
 696	u32 txhash = 0;
 697
 698	/* Never send a reset in response to a reset. */
 699	if (th->rst)
 700		return;
 701
 702	/* If sk not NULL, it means we did a successful lookup and incoming
 703	 * route had to be correct. prequeue might have dropped our dst.
 704	 */
 705	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 706		return;
 707
 708	/* Swap the send and the receive. */
 709	memset(&rep, 0, sizeof(rep));
 710	rep.th.dest   = th->source;
 711	rep.th.source = th->dest;
 712	rep.th.doff   = sizeof(struct tcphdr) / 4;
 713	rep.th.rst    = 1;
 714
 715	if (th->ack) {
 716		rep.th.seq = th->ack_seq;
 717	} else {
 718		rep.th.ack = 1;
 719		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 720				       skb->len - (th->doff << 2));
 721	}
 722
 723	memset(&arg, 0, sizeof(arg));
 724	arg.iov[0].iov_base = (unsigned char *)&rep;
 725	arg.iov[0].iov_len  = sizeof(rep.th);
 726
 727	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 728#ifdef CONFIG_TCP_MD5SIG
 729	rcu_read_lock();
 730	hash_location = tcp_parse_md5sig_option(th);
 731	if (sk && sk_fullsock(sk)) {
 732		const union tcp_md5_addr *addr;
 733		int l3index;
 734
 735		/* sdif set, means packet ingressed via a device
 736		 * in an L3 domain and inet_iif is set to it.
 737		 */
 738		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 739		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 740		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 741	} else if (hash_location) {
 742		const union tcp_md5_addr *addr;
 743		int sdif = tcp_v4_sdif(skb);
 744		int dif = inet_iif(skb);
 745		int l3index;
 746
 747		/*
 748		 * active side is lost. Try to find listening socket through
 749		 * source port, and then find md5 key through listening socket.
 750		 * we are not loose security here:
 751		 * Incoming packet is checked with md5 hash with finding key,
 752		 * no RST generated if md5 hash doesn't match.
 753		 */
 754		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
 755					     NULL, 0, ip_hdr(skb)->saddr,
 756					     th->source, ip_hdr(skb)->daddr,
 757					     ntohs(th->source), dif, sdif);
 758		/* don't send rst if it can't find key */
 759		if (!sk1)
 760			goto out;
 761
 762		/* sdif set, means packet ingressed via a device
 763		 * in an L3 domain and dif is set to it.
 764		 */
 765		l3index = sdif ? dif : 0;
 766		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 767		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
 768		if (!key)
 769			goto out;
 770
 771
 772		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 773		if (genhash || memcmp(hash_location, newhash, 16) != 0)
 774			goto out;
 775
 776	}
 777
 778	if (key) {
 779		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 780				   (TCPOPT_NOP << 16) |
 781				   (TCPOPT_MD5SIG << 8) |
 782				   TCPOLEN_MD5SIG);
 783		/* Update length and the length the header thinks exists */
 784		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 785		rep.th.doff = arg.iov[0].iov_len / 4;
 786
 787		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 788				     key, ip_hdr(skb)->saddr,
 789				     ip_hdr(skb)->daddr, &rep.th);
 790	}
 791#endif
 792	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
 793	if (rep.opt[0] == 0) {
 794		__be32 mrst = mptcp_reset_option(skb);
 795
 796		if (mrst) {
 797			rep.opt[0] = mrst;
 798			arg.iov[0].iov_len += sizeof(mrst);
 799			rep.th.doff = arg.iov[0].iov_len / 4;
 800		}
 801	}
 802
 803	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 804				      ip_hdr(skb)->saddr, /* XXX */
 805				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 806	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 807	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 808
 809	/* When socket is gone, all binding information is lost.
 810	 * routing might fail in this case. No choice here, if we choose to force
 811	 * input interface, we will misroute in case of asymmetric route.
 812	 */
 813	if (sk) {
 814		arg.bound_dev_if = sk->sk_bound_dev_if;
 815		if (sk_fullsock(sk))
 816			trace_tcp_send_reset(sk, skb);
 817	}
 818
 819	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 820		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 821
 822	arg.tos = ip_hdr(skb)->tos;
 823	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 824	local_bh_disable();
 825	ctl_sk = this_cpu_read(ipv4_tcp_sk);
 826	sock_net_set(ctl_sk, net);
 827	if (sk) {
 828		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 829				   inet_twsk(sk)->tw_mark : sk->sk_mark;
 830		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 831				   inet_twsk(sk)->tw_priority : sk->sk_priority;
 832		transmit_time = tcp_transmit_time(sk);
 833		xfrm_sk_clone_policy(ctl_sk, sk);
 834		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
 835			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
 836	} else {
 837		ctl_sk->sk_mark = 0;
 838		ctl_sk->sk_priority = 0;
 839	}
 840	ip_send_unicast_reply(ctl_sk,
 841			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 842			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 843			      &arg, arg.iov[0].iov_len,
 844			      transmit_time, txhash);
 845
 846	xfrm_sk_free_policy(ctl_sk);
 847	sock_net_set(ctl_sk, &init_net);
 848	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 849	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 850	local_bh_enable();
 851
 852#ifdef CONFIG_TCP_MD5SIG
 853out:
 854	rcu_read_unlock();
 855#endif
 856}
 857
 858/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 859   outside socket context is ugly, certainly. What can I do?
 860 */
 861
 862static void tcp_v4_send_ack(const struct sock *sk,
 863			    struct sk_buff *skb, u32 seq, u32 ack,
 864			    u32 win, u32 tsval, u32 tsecr, int oif,
 865			    struct tcp_md5sig_key *key,
 866			    int reply_flags, u8 tos, u32 txhash)
 867{
 868	const struct tcphdr *th = tcp_hdr(skb);
 869	struct {
 870		struct tcphdr th;
 871		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 872#ifdef CONFIG_TCP_MD5SIG
 873			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 874#endif
 875			];
 876	} rep;
 877	struct net *net = sock_net(sk);
 878	struct ip_reply_arg arg;
 879	struct sock *ctl_sk;
 880	u64 transmit_time;
 881
 882	memset(&rep.th, 0, sizeof(struct tcphdr));
 883	memset(&arg, 0, sizeof(arg));
 884
 885	arg.iov[0].iov_base = (unsigned char *)&rep;
 886	arg.iov[0].iov_len  = sizeof(rep.th);
 887	if (tsecr) {
 888		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 889				   (TCPOPT_TIMESTAMP << 8) |
 890				   TCPOLEN_TIMESTAMP);
 891		rep.opt[1] = htonl(tsval);
 892		rep.opt[2] = htonl(tsecr);
 893		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 894	}
 895
 896	/* Swap the send and the receive. */
 897	rep.th.dest    = th->source;
 898	rep.th.source  = th->dest;
 899	rep.th.doff    = arg.iov[0].iov_len / 4;
 900	rep.th.seq     = htonl(seq);
 901	rep.th.ack_seq = htonl(ack);
 902	rep.th.ack     = 1;
 903	rep.th.window  = htons(win);
 904
 905#ifdef CONFIG_TCP_MD5SIG
 906	if (key) {
 907		int offset = (tsecr) ? 3 : 0;
 908
 909		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 910					  (TCPOPT_NOP << 16) |
 911					  (TCPOPT_MD5SIG << 8) |
 912					  TCPOLEN_MD5SIG);
 913		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 914		rep.th.doff = arg.iov[0].iov_len/4;
 915
 916		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 917				    key, ip_hdr(skb)->saddr,
 918				    ip_hdr(skb)->daddr, &rep.th);
 919	}
 920#endif
 921	arg.flags = reply_flags;
 922	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 923				      ip_hdr(skb)->saddr, /* XXX */
 924				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 925	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 926	if (oif)
 927		arg.bound_dev_if = oif;
 928	arg.tos = tos;
 929	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 930	local_bh_disable();
 931	ctl_sk = this_cpu_read(ipv4_tcp_sk);
 932	sock_net_set(ctl_sk, net);
 933	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 934			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
 935	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 936			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
 937	transmit_time = tcp_transmit_time(sk);
 938	ip_send_unicast_reply(ctl_sk,
 939			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 940			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 941			      &arg, arg.iov[0].iov_len,
 942			      transmit_time, txhash);
 943
 944	sock_net_set(ctl_sk, &init_net);
 945	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 946	local_bh_enable();
 947}
 948
 949static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 950{
 951	struct inet_timewait_sock *tw = inet_twsk(sk);
 952	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 953
 954	tcp_v4_send_ack(sk, skb,
 955			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 956			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 957			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 958			tcptw->tw_ts_recent,
 959			tw->tw_bound_dev_if,
 960			tcp_twsk_md5_key(tcptw),
 961			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 962			tw->tw_tos,
 963			tw->tw_txhash
 964			);
 965
 966	inet_twsk_put(tw);
 967}
 968
 969static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 970				  struct request_sock *req)
 971{
 972	const union tcp_md5_addr *addr;
 973	int l3index;
 974
 975	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 976	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 977	 */
 978	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 979					     tcp_sk(sk)->snd_nxt;
 980
 981	/* RFC 7323 2.3
 982	 * The window field (SEG.WND) of every outgoing segment, with the
 983	 * exception of <SYN> segments, MUST be right-shifted by
 984	 * Rcv.Wind.Shift bits:
 985	 */
 986	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 987	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 988	tcp_v4_send_ack(sk, skb, seq,
 989			tcp_rsk(req)->rcv_nxt,
 990			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 991			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 992			READ_ONCE(req->ts_recent),
 993			0,
 994			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
 995			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 996			ip_hdr(skb)->tos,
 997			READ_ONCE(tcp_rsk(req)->txhash));
 998}
 999
1000/*
1001 *	Send a SYN-ACK after having received a SYN.
1002 *	This still operates on a request_sock only, not on a big
1003 *	socket.
1004 */
1005static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1006			      struct flowi *fl,
1007			      struct request_sock *req,
1008			      struct tcp_fastopen_cookie *foc,
1009			      enum tcp_synack_type synack_type,
1010			      struct sk_buff *syn_skb)
1011{
1012	const struct inet_request_sock *ireq = inet_rsk(req);
1013	struct flowi4 fl4;
1014	int err = -1;
1015	struct sk_buff *skb;
1016	u8 tos;
1017
1018	/* First, grab a route. */
1019	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1020		return -1;
1021
1022	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1023
1024	if (skb) {
1025		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1026
1027		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1028				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1029				(inet_sk(sk)->tos & INET_ECN_MASK) :
1030				inet_sk(sk)->tos;
1031
1032		if (!INET_ECN_is_capable(tos) &&
1033		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1034			tos |= INET_ECN_ECT_0;
1035
1036		rcu_read_lock();
1037		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1038					    ireq->ir_rmt_addr,
1039					    rcu_dereference(ireq->ireq_opt),
1040					    tos);
1041		rcu_read_unlock();
1042		err = net_xmit_eval(err);
1043	}
1044
1045	return err;
1046}
1047
1048/*
1049 *	IPv4 request_sock destructor.
1050 */
1051static void tcp_v4_reqsk_destructor(struct request_sock *req)
1052{
1053	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1054}
1055
1056#ifdef CONFIG_TCP_MD5SIG
1057/*
1058 * RFC2385 MD5 checksumming requires a mapping of
1059 * IP address->MD5 Key.
1060 * We need to maintain these in the sk structure.
1061 */
1062
/* Static key, initially false, bumped by tcp_md5_do_add() and
 * tcp_md5_key_copy() when a socket gets its md5sig_info allocated.
 */
DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_SYMBOL(tcp_md5_needed);
1065
1066static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1067{
1068	if (!old)
1069		return true;
1070
1071	/* l3index always overrides non-l3index */
1072	if (old->l3index && new->l3index == 0)
1073		return false;
1074	if (old->l3index == 0 && new->l3index)
1075		return true;
1076
1077	return old->prefixlen < new->prefixlen;
1078}
1079
1080/* Find the Key structure for an address.  */
1081struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1082					   const union tcp_md5_addr *addr,
1083					   int family)
1084{
1085	const struct tcp_sock *tp = tcp_sk(sk);
1086	struct tcp_md5sig_key *key;
1087	const struct tcp_md5sig_info *md5sig;
1088	__be32 mask;
1089	struct tcp_md5sig_key *best_match = NULL;
1090	bool match;
1091
1092	/* caller either holds rcu_read_lock() or socket lock */
1093	md5sig = rcu_dereference_check(tp->md5sig_info,
1094				       lockdep_sock_is_held(sk));
1095	if (!md5sig)
1096		return NULL;
1097
1098	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1099				 lockdep_sock_is_held(sk)) {
1100		if (key->family != family)
1101			continue;
1102		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1103			continue;
1104		if (family == AF_INET) {
1105			mask = inet_make_mask(key->prefixlen);
1106			match = (key->addr.a4.s_addr & mask) ==
1107				(addr->a4.s_addr & mask);
1108#if IS_ENABLED(CONFIG_IPV6)
1109		} else if (family == AF_INET6) {
1110			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1111						  key->prefixlen);
1112#endif
1113		} else {
1114			match = false;
1115		}
1116
1117		if (match && better_md5_match(best_match, key))
1118			best_match = key;
1119	}
1120	return best_match;
1121}
1122EXPORT_SYMBOL(__tcp_md5_do_lookup);
1123
1124static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1125						      const union tcp_md5_addr *addr,
1126						      int family, u8 prefixlen,
1127						      int l3index, u8 flags)
1128{
1129	const struct tcp_sock *tp = tcp_sk(sk);
1130	struct tcp_md5sig_key *key;
1131	unsigned int size = sizeof(struct in_addr);
1132	const struct tcp_md5sig_info *md5sig;
1133
1134	/* caller either holds rcu_read_lock() or socket lock */
1135	md5sig = rcu_dereference_check(tp->md5sig_info,
1136				       lockdep_sock_is_held(sk));
1137	if (!md5sig)
1138		return NULL;
1139#if IS_ENABLED(CONFIG_IPV6)
1140	if (family == AF_INET6)
1141		size = sizeof(struct in6_addr);
1142#endif
1143	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1144				 lockdep_sock_is_held(sk)) {
1145		if (key->family != family)
1146			continue;
1147		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1148			continue;
1149		if (key->l3index != l3index)
1150			continue;
1151		if (!memcmp(&key->addr, addr, size) &&
1152		    key->prefixlen == prefixlen)
1153			return key;
1154	}
1155	return NULL;
1156}
1157
1158struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1159					 const struct sock *addr_sk)
1160{
1161	const union tcp_md5_addr *addr;
1162	int l3index;
1163
1164	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1165						 addr_sk->sk_bound_dev_if);
1166	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1167	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1168}
1169EXPORT_SYMBOL(tcp_v4_md5_lookup);
1170
1171static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1172{
1173	struct tcp_sock *tp = tcp_sk(sk);
1174	struct tcp_md5sig_info *md5sig;
1175
1176	md5sig = kmalloc(sizeof(*md5sig), gfp);
1177	if (!md5sig)
1178		return -ENOMEM;
1179
1180	sk_gso_disable(sk);
1181	INIT_HLIST_HEAD(&md5sig->head);
1182	rcu_assign_pointer(tp->md5sig_info, md5sig);
1183	return 0;
1184}
1185
1186/* This can be called on a newly created socket, from other files */
1187static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1188			    int family, u8 prefixlen, int l3index, u8 flags,
1189			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1190{
1191	/* Add Key to the list */
1192	struct tcp_md5sig_key *key;
1193	struct tcp_sock *tp = tcp_sk(sk);
1194	struct tcp_md5sig_info *md5sig;
1195
1196	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1197	if (key) {
1198		/* Pre-existing entry - just update that one.
1199		 * Note that the key might be used concurrently.
1200		 * data_race() is telling kcsan that we do not care of
1201		 * key mismatches, since changing MD5 key on live flows
1202		 * can lead to packet drops.
1203		 */
1204		data_race(memcpy(key->key, newkey, newkeylen));
1205
1206		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1207		 * Also note that a reader could catch new key->keylen value
1208		 * but old key->key[], this is the reason we use __GFP_ZERO
1209		 * at sock_kmalloc() time below these lines.
1210		 */
1211		WRITE_ONCE(key->keylen, newkeylen);
1212
1213		return 0;
1214	}
1215
1216	md5sig = rcu_dereference_protected(tp->md5sig_info,
1217					   lockdep_sock_is_held(sk));
1218
1219	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1220	if (!key)
1221		return -ENOMEM;
1222	if (!tcp_alloc_md5sig_pool()) {
1223		sock_kfree_s(sk, key, sizeof(*key));
1224		return -ENOMEM;
1225	}
1226
1227	memcpy(key->key, newkey, newkeylen);
1228	key->keylen = newkeylen;
1229	key->family = family;
1230	key->prefixlen = prefixlen;
1231	key->l3index = l3index;
1232	key->flags = flags;
1233	memcpy(&key->addr, addr,
1234	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1235								 sizeof(struct in_addr));
1236	hlist_add_head_rcu(&key->node, &md5sig->head);
1237	return 0;
1238}
1239
1240int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1241		   int family, u8 prefixlen, int l3index, u8 flags,
1242		   const u8 *newkey, u8 newkeylen)
1243{
1244	struct tcp_sock *tp = tcp_sk(sk);
1245
1246	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1247		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1248			return -ENOMEM;
1249
1250		if (!static_branch_inc(&tcp_md5_needed.key)) {
1251			struct tcp_md5sig_info *md5sig;
1252
1253			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1254			rcu_assign_pointer(tp->md5sig_info, NULL);
1255			kfree_rcu(md5sig, rcu);
1256			return -EUSERS;
1257		}
1258	}
1259
1260	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1261				newkey, newkeylen, GFP_KERNEL);
1262}
1263EXPORT_SYMBOL(tcp_md5_do_add);
1264
1265int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1266		     int family, u8 prefixlen, int l3index,
1267		     struct tcp_md5sig_key *key)
1268{
1269	struct tcp_sock *tp = tcp_sk(sk);
1270
1271	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1272		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1273			return -ENOMEM;
1274
1275		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1276			struct tcp_md5sig_info *md5sig;
1277
1278			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1279			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1280			rcu_assign_pointer(tp->md5sig_info, NULL);
1281			kfree_rcu(md5sig, rcu);
1282			return -EUSERS;
1283		}
1284	}
1285
1286	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1287				key->flags, key->key, key->keylen,
1288				sk_gfp_mask(sk, GFP_ATOMIC));
1289}
1290EXPORT_SYMBOL(tcp_md5_key_copy);
1291
1292int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1293		   u8 prefixlen, int l3index, u8 flags)
1294{
1295	struct tcp_md5sig_key *key;
1296
1297	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1298	if (!key)
1299		return -ENOENT;
1300	hlist_del_rcu(&key->node);
1301	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1302	kfree_rcu(key, rcu);
1303	return 0;
1304}
1305EXPORT_SYMBOL(tcp_md5_do_del);
1306
1307static void tcp_clear_md5_list(struct sock *sk)
1308{
1309	struct tcp_sock *tp = tcp_sk(sk);
1310	struct tcp_md5sig_key *key;
1311	struct hlist_node *n;
1312	struct tcp_md5sig_info *md5sig;
1313
1314	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1315
1316	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1317		hlist_del_rcu(&key->node);
1318		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1319		kfree_rcu(key, rcu);
1320	}
1321}
1322
1323static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1324				 sockptr_t optval, int optlen)
1325{
1326	struct tcp_md5sig cmd;
1327	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1328	const union tcp_md5_addr *addr;
1329	u8 prefixlen = 32;
1330	int l3index = 0;
1331	u8 flags;
1332
1333	if (optlen < sizeof(cmd))
1334		return -EINVAL;
1335
1336	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1337		return -EFAULT;
1338
1339	if (sin->sin_family != AF_INET)
1340		return -EINVAL;
1341
1342	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1343
1344	if (optname == TCP_MD5SIG_EXT &&
1345	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1346		prefixlen = cmd.tcpm_prefixlen;
1347		if (prefixlen > 32)
1348			return -EINVAL;
1349	}
1350
1351	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1352	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1353		struct net_device *dev;
1354
1355		rcu_read_lock();
1356		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1357		if (dev && netif_is_l3_master(dev))
1358			l3index = dev->ifindex;
1359
1360		rcu_read_unlock();
1361
1362		/* ok to reference set/not set outside of rcu;
1363		 * right now device MUST be an L3 master
1364		 */
1365		if (!dev || !l3index)
1366			return -EINVAL;
1367	}
1368
1369	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1370
1371	if (!cmd.tcpm_keylen)
1372		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1373
1374	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1375		return -EINVAL;
1376
1377	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1378			      cmd.tcpm_key, cmd.tcpm_keylen);
1379}
1380
1381static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1382				   __be32 daddr, __be32 saddr,
1383				   const struct tcphdr *th, int nbytes)
1384{
1385	struct tcp4_pseudohdr *bp;
1386	struct scatterlist sg;
1387	struct tcphdr *_th;
1388
1389	bp = hp->scratch;
1390	bp->saddr = saddr;
1391	bp->daddr = daddr;
1392	bp->pad = 0;
1393	bp->protocol = IPPROTO_TCP;
1394	bp->len = cpu_to_be16(nbytes);
1395
1396	_th = (struct tcphdr *)(bp + 1);
1397	memcpy(_th, th, sizeof(*th));
1398	_th->check = 0;
1399
1400	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1401	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1402				sizeof(*bp) + sizeof(*th));
1403	return crypto_ahash_update(hp->md5_req);
1404}
1405
1406static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1407			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1408{
1409	struct tcp_md5sig_pool *hp;
1410	struct ahash_request *req;
1411
1412	hp = tcp_get_md5sig_pool();
1413	if (!hp)
1414		goto clear_hash_noput;
1415	req = hp->md5_req;
1416
1417	if (crypto_ahash_init(req))
1418		goto clear_hash;
1419	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1420		goto clear_hash;
1421	if (tcp_md5_hash_key(hp, key))
1422		goto clear_hash;
1423	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1424	if (crypto_ahash_final(req))
1425		goto clear_hash;
1426
1427	tcp_put_md5sig_pool();
1428	return 0;
1429
1430clear_hash:
1431	tcp_put_md5sig_pool();
1432clear_hash_noput:
1433	memset(md5_hash, 0, 16);
1434	return 1;
1435}
1436
1437int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1438			const struct sock *sk,
1439			const struct sk_buff *skb)
1440{
1441	struct tcp_md5sig_pool *hp;
1442	struct ahash_request *req;
1443	const struct tcphdr *th = tcp_hdr(skb);
1444	__be32 saddr, daddr;
1445
1446	if (sk) { /* valid for establish/request sockets */
1447		saddr = sk->sk_rcv_saddr;
1448		daddr = sk->sk_daddr;
1449	} else {
1450		const struct iphdr *iph = ip_hdr(skb);
1451		saddr = iph->saddr;
1452		daddr = iph->daddr;
1453	}
1454
1455	hp = tcp_get_md5sig_pool();
1456	if (!hp)
1457		goto clear_hash_noput;
1458	req = hp->md5_req;
1459
1460	if (crypto_ahash_init(req))
1461		goto clear_hash;
1462
1463	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1464		goto clear_hash;
1465	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1466		goto clear_hash;
1467	if (tcp_md5_hash_key(hp, key))
1468		goto clear_hash;
1469	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1470	if (crypto_ahash_final(req))
1471		goto clear_hash;
1472
1473	tcp_put_md5sig_pool();
1474	return 0;
1475
1476clear_hash:
1477	tcp_put_md5sig_pool();
1478clear_hash_noput:
1479	memset(md5_hash, 0, 16);
1480	return 1;
1481}
1482EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1483
1484#endif
1485
1486static void tcp_v4_init_req(struct request_sock *req,
1487			    const struct sock *sk_listener,
1488			    struct sk_buff *skb)
1489{
1490	struct inet_request_sock *ireq = inet_rsk(req);
1491	struct net *net = sock_net(sk_listener);
1492
1493	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1494	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1495	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1496}
1497
1498static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1499					  struct sk_buff *skb,
1500					  struct flowi *fl,
1501					  struct request_sock *req)
1502{
1503	tcp_v4_init_req(req, sk, skb);
1504
1505	if (security_inet_conn_request(sk, skb, req))
1506		return NULL;
1507
1508	return inet_csk_route_req(sk, &fl->u.ip4, req);
1509}
1510
/* Generic request_sock operations for IPv4 TCP. Handed to
 * tcp_conn_request() and tcp_get_syncookie_mss() together with
 * tcp_request_sock_ipv4_ops.
 */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,	/* frees saved IP options */
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};
1520
/* IPv4 specific TCP request_sock callbacks. The MD5 and syncookie hooks are
 * only present when the matching config option is enabled.
 */
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,	/* also initialises the request */
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};
1535
1536int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1537{
1538	/* Never answer to SYNs send to broadcast or multicast */
1539	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1540		goto drop;
1541
1542	return tcp_conn_request(&tcp_request_sock_ops,
1543				&tcp_request_sock_ipv4_ops, sk, skb);
1544
1545drop:
1546	tcp_listendrop(sk);
1547	return 0;
1548}
1549EXPORT_SYMBOL(tcp_v4_conn_request);
1550
1551
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * @sk is the listener, @req the request sock the child is built from and
 * @dst an optional route (non-NULL in the syncookie case, see below).
 * *@own_req is set from inet_ehash_nolisten() and tells the caller whether
 * the new child was inserted into the established hash.
 *
 * Returns the new child socket, or NULL on failure. All failure paths that
 * go through the exit labels account a listen drop on @sk.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	/* Copy addressing and IP level state from the request and the skb. */
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	/* The child shares the request's options pointer from here on; the
	 * paths below NULL one of the two pointers before returning.
	 */
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/* The child gets a full /32 key for this specific peer. */
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* The child keeps the options; drop the request's pointer. */
		ireq->ireq_opt = NULL;
	} else {
		/* The request keeps the options; drop the child's pointer. */
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* Detach the shared options pointer before the child is torn down. */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1670
/* With CONFIG_SYN_COOKIES, a non-SYN segment is passed to cookie_v4_check()
 * and its result is returned. In every other case @sk is returned unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1681
1682u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1683			 struct tcphdr *th, u32 *cookie)
1684{
1685	u16 mss = 0;
1686#ifdef CONFIG_SYN_COOKIES
1687	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1688				    &tcp_request_sock_ipv4_ops, sk, th);
1689	if (mss) {
1690		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1691		tcp_synq_overflow(sk);
1692	}
1693#endif
1694	return mss;
1695}
1696
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Every return path of this function returns 0.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Drop the cached rx dst when the packet came in on
			 * another interface or the dst check fails.
			 */
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			/* The cookie check produced a different socket: let
			 * it process the skb; reset on its behalf on failure.
			 */
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb_reason(skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1776
1777int tcp_v4_early_demux(struct sk_buff *skb)
1778{
1779	struct net *net = dev_net(skb->dev);
1780	const struct iphdr *iph;
1781	const struct tcphdr *th;
1782	struct sock *sk;
1783
1784	if (skb->pkt_type != PACKET_HOST)
1785		return 0;
1786
1787	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1788		return 0;
1789
1790	iph = ip_hdr(skb);
1791	th = tcp_hdr(skb);
1792
1793	if (th->doff < sizeof(struct tcphdr) / 4)
1794		return 0;
1795
1796	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1797				       iph->saddr, th->source,
1798				       iph->daddr, ntohs(th->dest),
1799				       skb->skb_iif, inet_sdif(skb));
1800	if (sk) {
1801		skb->sk = sk;
1802		skb->destructor = sock_edemux;
1803		if (sk_fullsock(sk)) {
1804			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1805
1806			if (dst)
1807				dst = dst_check(dst, 0);
1808			if (dst &&
1809			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1810				skb_dst_set_noref(skb, dst);
1811		}
1812	}
1813	return 0;
1814}
1815
/* Queue @skb on @sk's backlog, first trying to coalesce it into the last
 * backlog skb.
 *
 * Returns true when the skb was NOT queued (bad checksum or backlog limit
 * hit); in that case *@reason is set and the socket has already been
 * unlocked with bh_unlock_sock(). Returns false when the skb was queued or
 * coalesced into the tail.
 */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 limit, tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	/* Coalesce only when the new skb directly follows the tail in
	 * sequence space, both carry the same DS field, ACK is set on both,
	 * neither has SYN/RST/URG, ECE/CWR agree, and the TCP header length
	 * and option bytes are identical.
	 */
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
#ifdef CONFIG_TLS_DEVICE
	    tail->decrypted != skb->decrypted ||
#endif
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	/* Strip the TCP header for the merge; it is pushed back below if
	 * skb_try_coalesce() refuses.
	 */
	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	/* From here on shinfo refers to the tail skb. */
	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		/* Only move ack_seq and window forward, never backwards. */
		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64 * 1024;

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
1941
1942int tcp_filter(struct sock *sk, struct sk_buff *skb)
1943{
1944	struct tcphdr *th = (struct tcphdr *)skb->data;
1945
1946	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1947}
1948EXPORT_SYMBOL(tcp_filter);
1949
1950static void tcp_v4_restore_cb(struct sk_buff *skb)
1951{
1952	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1953		sizeof(struct inet_skb_parm));
1954}
1955
/* Fill TCP_SKB_CB(skb) from the IP header @iph and TCP header @th.
 * The IP control block is first moved to header.h4 inside the TCP control
 * block; tcp_v4_restore_cb() performs the reverse move.
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* SYN and FIN each take one unit of sequence space, plus payload. */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	/* Set when either a software or a hardware rx timestamp is present. */
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
1977
1978/*
1979 *	From tcp_input.c
1980 */
1981
/*
 * tcp_v4_rcv - receive entry point for an IPv4 TCP segment.
 *
 * Validates the TCP header length and checksum, looks up the socket the
 * segment belongs to and dispatches it:
 *  - TCP_TIME_WAIT sockets are handled at do_time_wait;
 *  - TCP_NEW_SYN_RECV request sockets are resolved to their listener (or a
 *    migrated reuseport socket) and fed to tcp_check_req();
 *  - listeners are processed directly through tcp_v4_do_rcv();
 *  - any other socket is processed under bh_lock_sock_nested(), or queued
 *    with tcp_add_backlog() when the socket is currently owned by the user.
 *
 * Returns the result of tcp_v4_do_rcv() when the segment was processed
 * directly, and 0 otherwise (dropped, queued on the backlog or consumed by
 * a child socket).
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	enum skb_drop_reason drop_reason;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;	/* true when a reference on sk must be dropped on exit */
	struct sock *sk;
	int ret;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	/* A data offset smaller than the fixed header is malformed. */
	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Reload the header pointers after the pulls above. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
			       skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		/* From here on sk is the listener that owns the request. */
		sk = req->rsk_listener;
		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		else
			drop_reason = tcp_inbound_md5_hash(sk, skb,
						   &iph->saddr, &iph->daddr,
						   AF_INET, dif, sdif);
		if (unlikely(drop_reason)) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			/* The listener is gone: try to migrate the request to
			 * another reuseport socket, otherwise drop the request
			 * and redo the lookup for this segment.
			 */
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		} else {
			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		nf_reset_ct(skb);
		if (nsk == sk) {
			/* tcp_check_req() returned the listener itself: fall
			 * through to the regular processing below.
			 */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
			goto discard_and_relse;
		}
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto discard_and_relse;
	}

	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
					   &iph->daddr, AF_INET, dif, sdif);
	if (drop_reason)
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb)) {
		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		goto discard_and_relse;
	}
	/* Header pointers are reloaded after the socket filter has run. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	/* Listeners are processed without taking the socket spinlock. */
	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		/* Socket is busy in process context: defer to the backlog.
		 * NOTE(review): the failure path jumps away without calling
		 * bh_unlock_sock() here, so tcp_add_backlog() presumably
		 * unlocks on failure - confirm against its definition.
		 */
		if (tcp_add_backlog(sk, skb, &drop_reason))
			goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	/* csum_error and bad_packet are also entered by goto from above; a
	 * segment with a valid checksum and no socket is answered with a reset.
	 */
	if (tcp_checksum_complete(skb)) {
csum_error:
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
	/* Discard frame. */
	kfree_skb_reason(skb, drop_reason);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* A SYN hit the timewait socket: if a listener exists, drop
		 * the timewait socket and process the SYN on the listener.
		 */
		struct sock *sk2 = inet_lookup_listener(net,
							net->ipv4.tcp_death_row.hashinfo,
							skb, __tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
2236
2237static struct timewait_sock_ops tcp_timewait_sock_ops = {
2238	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2239	.twsk_unique	= tcp_twsk_unique,
2240	.twsk_destructor= tcp_twsk_destructor,
2241};
2242
2243void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2244{
2245	struct dst_entry *dst = skb_dst(skb);
2246
2247	if (dst && dst_hold_safe(dst)) {
2248		rcu_assign_pointer(sk->sk_rx_dst, dst);
2249		sk->sk_rx_dst_ifindex = skb->skb_iif;
2250	}
2251}
2252EXPORT_SYMBOL(inet_sk_rx_dst_set);
2253
2254const struct inet_connection_sock_af_ops ipv4_specific = {
2255	.queue_xmit	   = ip_queue_xmit,
2256	.send_check	   = tcp_v4_send_check,
2257	.rebuild_header	   = inet_sk_rebuild_header,
2258	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2259	.conn_request	   = tcp_v4_conn_request,
2260	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2261	.net_header_len	   = sizeof(struct iphdr),
2262	.setsockopt	   = ip_setsockopt,
2263	.getsockopt	   = ip_getsockopt,
2264	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2265	.sockaddr_len	   = sizeof(struct sockaddr_in),
2266	.mtu_reduced	   = tcp_v4_mtu_reduced,
2267};
2268EXPORT_SYMBOL(ipv4_specific);
2269
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 TCP-MD5 helpers; tcp_v4_init_sock() installs them as af_specific. */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_parse = tcp_v4_parse_md5_keys,
	.md5_lookup = tcp_v4_md5_lookup,
	.calc_md5_hash = tcp_v4_md5_hash_skb,
};
#endif
2277
2278/* NOTE: A lot of things set to zero explicitly by call to
2279 *       sk_alloc() so need not be done here.
2280 */
2281static int tcp_v4_init_sock(struct sock *sk)
2282{
2283	struct inet_connection_sock *icsk = inet_csk(sk);
2284
2285	tcp_init_sock(sk);
2286
2287	icsk->icsk_af_ops = &ipv4_specific;
2288
2289#ifdef CONFIG_TCP_MD5SIG
2290	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2291#endif
2292
2293	return 0;
2294}
2295
/* Release the TCP specific resources attached to a socket that is being
 * destroyed: timers, congestion control and ULP state, queued data, MD5
 * keys, the bound port and fastopen/saved-SYN state.  Finally the
 * allocated-sockets accounting is decremented.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		/* The info block itself is freed through kfree_rcu(). */
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
		static_branch_slow_dec_deferred(&tcp_md5_needed);
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/* A fastopen request must not be attached any more at this point. */
	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
2341
2342#ifdef CONFIG_PROC_FS
2343/* Proc filesystem TCP sock list dumping. */
2344
2345static unsigned short seq_file_family(const struct seq_file *seq);
2346
2347static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2348{
2349	unsigned short family = seq_file_family(seq);
2350
2351	/* AF_UNSPEC is used as a match all */
2352	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2353		net_eq(sock_net(sk), seq_file_net(seq)));
2354}
2355
2356/* Find a non empty bucket (starting from st->bucket)
2357 * and return the first sk from it.
2358 */
2359static void *listening_get_first(struct seq_file *seq)
2360{
2361	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2362	struct tcp_iter_state *st = seq->private;
2363
2364	st->offset = 0;
2365	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2366		struct inet_listen_hashbucket *ilb2;
2367		struct hlist_nulls_node *node;
2368		struct sock *sk;
2369
2370		ilb2 = &hinfo->lhash2[st->bucket];
2371		if (hlist_nulls_empty(&ilb2->nulls_head))
2372			continue;
2373
2374		spin_lock(&ilb2->lock);
2375		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2376			if (seq_sk_match(seq, sk))
2377				return sk;
2378		}
2379		spin_unlock(&ilb2->lock);
2380	}
2381
2382	return NULL;
2383}
2384
2385/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2386 * If "cur" is the last one in the st->bucket,
2387 * call listening_get_first() to return the first sk of the next
2388 * non empty bucket.
2389 */
2390static void *listening_get_next(struct seq_file *seq, void *cur)
2391{
2392	struct tcp_iter_state *st = seq->private;
2393	struct inet_listen_hashbucket *ilb2;
2394	struct hlist_nulls_node *node;
2395	struct inet_hashinfo *hinfo;
2396	struct sock *sk = cur;
2397
2398	++st->num;
2399	++st->offset;
2400
2401	sk = sk_nulls_next(sk);
2402	sk_nulls_for_each_from(sk, node) {
2403		if (seq_sk_match(seq, sk))
2404			return sk;
2405	}
2406
2407	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2408	ilb2 = &hinfo->lhash2[st->bucket];
2409	spin_unlock(&ilb2->lock);
2410	++st->bucket;
2411	return listening_get_first(seq);
2412}
2413
2414static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2415{
2416	struct tcp_iter_state *st = seq->private;
2417	void *rc;
2418
2419	st->bucket = 0;
2420	st->offset = 0;
2421	rc = listening_get_first(seq);
2422
2423	while (rc && *pos) {
2424		rc = listening_get_next(seq, rc);
2425		--*pos;
2426	}
2427	return rc;
2428}
2429
2430static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2431				const struct tcp_iter_state *st)
2432{
2433	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2434}
2435
2436/*
2437 * Get first established socket starting from bucket given in st->bucket.
2438 * If st->bucket is zero, the very first socket in the hash is returned.
2439 */
2440static void *established_get_first(struct seq_file *seq)
2441{
2442	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2443	struct tcp_iter_state *st = seq->private;
2444
2445	st->offset = 0;
2446	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2447		struct sock *sk;
2448		struct hlist_nulls_node *node;
2449		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2450
2451		/* Lockless fast path for the common case of empty buckets */
2452		if (empty_bucket(hinfo, st))
2453			continue;
2454
2455		spin_lock_bh(lock);
2456		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2457			if (seq_sk_match(seq, sk))
2458				return sk;
2459		}
2460		spin_unlock_bh(lock);
2461	}
2462
2463	return NULL;
2464}
2465
2466static void *established_get_next(struct seq_file *seq, void *cur)
2467{
2468	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2469	struct tcp_iter_state *st = seq->private;
2470	struct hlist_nulls_node *node;
2471	struct sock *sk = cur;
2472
2473	++st->num;
2474	++st->offset;
2475
2476	sk = sk_nulls_next(sk);
2477
2478	sk_nulls_for_each_from(sk, node) {
2479		if (seq_sk_match(seq, sk))
2480			return sk;
2481	}
2482
2483	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2484	++st->bucket;
2485	return established_get_first(seq);
2486}
2487
2488static void *established_get_idx(struct seq_file *seq, loff_t pos)
2489{
2490	struct tcp_iter_state *st = seq->private;
2491	void *rc;
2492
2493	st->bucket = 0;
2494	rc = established_get_first(seq);
2495
2496	while (rc && pos) {
2497		rc = established_get_next(seq, rc);
2498		--pos;
2499	}
2500	return rc;
2501}
2502
2503static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2504{
2505	void *rc;
2506	struct tcp_iter_state *st = seq->private;
2507
2508	st->state = TCP_SEQ_STATE_LISTENING;
2509	rc	  = listening_get_idx(seq, &pos);
2510
2511	if (!rc) {
2512		st->state = TCP_SEQ_STATE_ESTABLISHED;
2513		rc	  = established_get_idx(seq, pos);
2514	}
2515
2516	return rc;
2517}
2518
/* Resume a walk at the position saved in st->state, st->bucket and
 * st->offset: re-enter the saved bucket and skip up to st->offset matching
 * sockets, stopping early if the walk leaves that bucket.  When the
 * listening table yields nothing, the search falls through to the
 * established table starting at bucket 0.
 *
 * st->num is restored before returning because the get_next helpers used
 * for skipping increment it.  Returns the socket reached or NULL.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > hinfo->lhash2_mask)
			break;
		rc = listening_get_first(seq);
		/* Skip forward only while still inside the saved bucket. */
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		/* Listening table exhausted: continue with established. */
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > hinfo->ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
2552
2553void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2554{
2555	struct tcp_iter_state *st = seq->private;
2556	void *rc;
2557
2558	if (*pos && *pos == st->last_pos) {
2559		rc = tcp_seek_last_pos(seq);
2560		if (rc)
2561			goto out;
2562	}
2563
2564	st->state = TCP_SEQ_STATE_LISTENING;
2565	st->num = 0;
2566	st->bucket = 0;
2567	st->offset = 0;
2568	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2569
2570out:
2571	st->last_pos = *pos;
2572	return rc;
2573}
2574EXPORT_SYMBOL(tcp_seq_start);
2575
2576void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2577{
2578	struct tcp_iter_state *st = seq->private;
2579	void *rc = NULL;
2580
2581	if (v == SEQ_START_TOKEN) {
2582		rc = tcp_get_idx(seq, 0);
2583		goto out;
2584	}
2585
2586	switch (st->state) {
2587	case TCP_SEQ_STATE_LISTENING:
2588		rc = listening_get_next(seq, v);
2589		if (!rc) {
2590			st->state = TCP_SEQ_STATE_ESTABLISHED;
2591			st->bucket = 0;
2592			st->offset = 0;
2593			rc	  = established_get_first(seq);
2594		}
2595		break;
2596	case TCP_SEQ_STATE_ESTABLISHED:
2597		rc = established_get_next(seq, v);
2598		break;
2599	}
2600out:
2601	++*pos;
2602	st->last_pos = *pos;
2603	return rc;
2604}
2605EXPORT_SYMBOL(tcp_seq_next);
2606
2607void tcp_seq_stop(struct seq_file *seq, void *v)
2608{
2609	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2610	struct tcp_iter_state *st = seq->private;
2611
2612	switch (st->state) {
2613	case TCP_SEQ_STATE_LISTENING:
2614		if (v != SEQ_START_TOKEN)
2615			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2616		break;
2617	case TCP_SEQ_STATE_ESTABLISHED:
2618		if (v)
2619			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2620		break;
2621	}
2622}
2623EXPORT_SYMBOL(tcp_seq_stop);
2624
2625static void get_openreq4(const struct request_sock *req,
2626			 struct seq_file *f, int i)
2627{
2628	const struct inet_request_sock *ireq = inet_rsk(req);
2629	long delta = req->rsk_timer.expires - jiffies;
2630
2631	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2632		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2633		i,
2634		ireq->ir_loc_addr,
2635		ireq->ir_num,
2636		ireq->ir_rmt_addr,
2637		ntohs(ireq->ir_rmt_port),
2638		TCP_SYN_RECV,
2639		0, 0, /* could print option size, but that is af dependent. */
2640		1,    /* timers active (only the expire timer) */
2641		jiffies_delta_to_clock_t(delta),
2642		req->num_timeout,
2643		from_kuid_munged(seq_user_ns(f),
2644				 sock_i_uid(req->rsk_listener)),
2645		0,  /* non standard timer */
2646		0, /* open_requests have no inode */
2647		0,
2648		req);
2649}
2650
2651static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2652{
2653	int timer_active;
2654	unsigned long timer_expires;
2655	const struct tcp_sock *tp = tcp_sk(sk);
2656	const struct inet_connection_sock *icsk = inet_csk(sk);
2657	const struct inet_sock *inet = inet_sk(sk);
2658	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2659	__be32 dest = inet->inet_daddr;
2660	__be32 src = inet->inet_rcv_saddr;
2661	__u16 destp = ntohs(inet->inet_dport);
2662	__u16 srcp = ntohs(inet->inet_sport);
2663	int rx_queue;
2664	int state;
2665
2666	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2667	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2668	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2669		timer_active	= 1;
2670		timer_expires	= icsk->icsk_timeout;
2671	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2672		timer_active	= 4;
2673		timer_expires	= icsk->icsk_timeout;
2674	} else if (timer_pending(&sk->sk_timer)) {
2675		timer_active	= 2;
2676		timer_expires	= sk->sk_timer.expires;
2677	} else {
2678		timer_active	= 0;
2679		timer_expires = jiffies;
2680	}
2681
2682	state = inet_sk_state_load(sk);
2683	if (state == TCP_LISTEN)
2684		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2685	else
2686		/* Because we don't lock the socket,
2687		 * we might find a transient negative value.
2688		 */
2689		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2690				      READ_ONCE(tp->copied_seq), 0);
2691
2692	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2693			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2694		i, src, srcp, dest, destp, state,
2695		READ_ONCE(tp->write_seq) - tp->snd_una,
2696		rx_queue,
2697		timer_active,
2698		jiffies_delta_to_clock_t(timer_expires - jiffies),
2699		icsk->icsk_retransmits,
2700		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2701		icsk->icsk_probes_out,
2702		sock_i_ino(sk),
2703		refcount_read(&sk->sk_refcnt), sk,
2704		jiffies_to_clock_t(icsk->icsk_rto),
2705		jiffies_to_clock_t(icsk->icsk_ack.ato),
2706		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2707		tcp_snd_cwnd(tp),
2708		state == TCP_LISTEN ?
2709		    fastopenq->max_qlen :
2710		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2711}
2712
2713static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2714			       struct seq_file *f, int i)
2715{
2716	long delta = tw->tw_timer.expires - jiffies;
2717	__be32 dest, src;
2718	__u16 destp, srcp;
2719
2720	dest  = tw->tw_daddr;
2721	src   = tw->tw_rcv_saddr;
2722	destp = ntohs(tw->tw_dport);
2723	srcp  = ntohs(tw->tw_sport);
2724
2725	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2726		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2727		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2728		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2729		refcount_read(&tw->tw_refcnt), tw);
2730}
2731
2732#define TMPSZ 150
2733
2734static int tcp4_seq_show(struct seq_file *seq, void *v)
2735{
2736	struct tcp_iter_state *st;
2737	struct sock *sk = v;
2738
2739	seq_setwidth(seq, TMPSZ - 1);
2740	if (v == SEQ_START_TOKEN) {
2741		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2742			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2743			   "inode");
2744		goto out;
2745	}
2746	st = seq->private;
2747
2748	if (sk->sk_state == TCP_TIME_WAIT)
2749		get_timewait4_sock(v, seq, st->num);
2750	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2751		get_openreq4(v, seq, st->num);
2752	else
2753		get_tcp4_sock(v, seq, st->num);
2754out:
2755	seq_pad(seq, '\n');
2756	return 0;
2757}
2758
2759#ifdef CONFIG_BPF_SYSCALL
/* State of the BPF TCP iterator: the bucket cursor it shares with the
 * tcp_iter_state helpers plus a batch of referenced sockets collected
 * while a bucket lock was held.
 */
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;	/* state/bucket/offset cursor */
	unsigned int cur_sk;		/* index in batch[] of the current socket */
	unsigned int end_sk;		/* one past the last filled batch[] slot */
	unsigned int max_sk;		/* number of slots allocated in batch[] */
	struct sock **batch;		/* sockets held for the current bucket */
	bool st_bucket_done;		/* set when the batch covered the whole bucket */
};
2768
/* Context filled in by tcp_prog_seq_show() and handed to the BPF program
 * through bpf_iter_run_prog().
 */
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);	/* iterator metadata */
	__bpf_md_ptr(struct sock_common *, sk_common);	/* socket shown; NULL on the final stop() call */
	uid_t uid __aligned(8);				/* uid computed by the caller */
};
2774
2775static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2776			     struct sock_common *sk_common, uid_t uid)
2777{
2778	struct bpf_iter__tcp ctx;
2779
2780	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2781	ctx.meta = meta;
2782	ctx.sk_common = sk_common;
2783	ctx.uid = uid;
2784	return bpf_iter_run_prog(prog, &ctx);
2785}
2786
2787static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2788{
2789	while (iter->cur_sk < iter->end_sk)
2790		sock_gen_put(iter->batch[iter->cur_sk++]);
2791}
2792
2793static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2794				      unsigned int new_batch_sz)
2795{
2796	struct sock **new_batch;
2797
2798	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2799			     GFP_USER | __GFP_NOWARN);
2800	if (!new_batch)
2801		return -ENOMEM;
2802
2803	bpf_iter_tcp_put_batch(iter);
2804	kvfree(iter->batch);
2805	iter->batch = new_batch;
2806	iter->max_sk = new_batch_sz;
2807
2808	return 0;
2809}
2810
2811static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2812						 struct sock *start_sk)
2813{
2814	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2815	struct bpf_tcp_iter_state *iter = seq->private;
2816	struct tcp_iter_state *st = &iter->state;
2817	struct hlist_nulls_node *node;
2818	unsigned int expected = 1;
2819	struct sock *sk;
2820
2821	sock_hold(start_sk);
2822	iter->batch[iter->end_sk++] = start_sk;
2823
2824	sk = sk_nulls_next(start_sk);
2825	sk_nulls_for_each_from(sk, node) {
2826		if (seq_sk_match(seq, sk)) {
2827			if (iter->end_sk < iter->max_sk) {
2828				sock_hold(sk);
2829				iter->batch[iter->end_sk++] = sk;
2830			}
2831			expected++;
2832		}
2833	}
2834	spin_unlock(&hinfo->lhash2[st->bucket].lock);
2835
2836	return expected;
2837}
2838
2839static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2840						   struct sock *start_sk)
2841{
2842	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2843	struct bpf_tcp_iter_state *iter = seq->private;
2844	struct tcp_iter_state *st = &iter->state;
2845	struct hlist_nulls_node *node;
2846	unsigned int expected = 1;
2847	struct sock *sk;
2848
2849	sock_hold(start_sk);
2850	iter->batch[iter->end_sk++] = start_sk;
2851
2852	sk = sk_nulls_next(start_sk);
2853	sk_nulls_for_each_from(sk, node) {
2854		if (seq_sk_match(seq, sk)) {
2855			if (iter->end_sk < iter->max_sk) {
2856				sock_hold(sk);
2857				iter->batch[iter->end_sk++] = sk;
2858			}
2859			expected++;
2860		}
2861	}
2862	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2863
2864	return expected;
2865}
2866
/* Fill iter->batch with referenced sockets from the next bucket to visit
 * and return the first of them, or NULL when the iteration is complete.
 *
 * If the batch array is too small for the bucket, it is reallocated once
 * (to 1.5 times the number of matching sockets) and the bucket is batched
 * again.  If that still does not fit, or the reallocation fails, the
 * partial batch is returned with st_bucket_done left false.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* The st->bucket is done.  Directly advance to the next
	 * bucket instead of having the tcp_seek_last_pos() to skip
	 * one by one in the current bucket and eventually find out
	 * it has to advance to the next bucket.
	 */
	if (iter->st_bucket_done) {
		st->offset = 0;
		st->bucket++;
		/* Past the last listening bucket: switch to established. */
		if (st->state == TCP_SEQ_STATE_LISTENING &&
		    st->bucket > hinfo->lhash2_mask) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
		}
	}

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;

	sk = tcp_seek_last_pos(seq);
	if (!sk)
		return NULL; /* Done */

	/* Both batch helpers release the bucket lock before returning. */
	if (st->state == TCP_SEQ_STATE_LISTENING)
		expected = bpf_iter_tcp_listening_batch(seq, sk);
	else
		expected = bpf_iter_tcp_established_batch(seq, sk);

	/* Every matching socket of the bucket fit into the batch. */
	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	/* Grow the batch once; realloc also releases the sockets held so far. */
	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
2918
2919static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2920{
2921	/* bpf iter does not support lseek, so it always
2922	 * continue from where it was stop()-ped.
2923	 */
2924	if (*pos)
2925		return bpf_iter_tcp_batch(seq);
2926
2927	return SEQ_START_TOKEN;
2928}
2929
2930static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2931{
2932	struct bpf_tcp_iter_state *iter = seq->private;
2933	struct tcp_iter_state *st = &iter->state;
2934	struct sock *sk;
2935
2936	/* Whenever seq_next() is called, the iter->cur_sk is
2937	 * done with seq_show(), so advance to the next sk in
2938	 * the batch.
2939	 */
2940	if (iter->cur_sk < iter->end_sk) {
2941		/* Keeping st->num consistent in tcp_iter_state.
2942		 * bpf_iter_tcp does not use st->num.
2943		 * meta.seq_num is used instead.
2944		 */
2945		st->num++;
2946		/* Move st->offset to the next sk in the bucket such that
2947		 * the future start() will resume at st->offset in
2948		 * st->bucket.  See tcp_seek_last_pos().
2949		 */
2950		st->offset++;
2951		sock_gen_put(iter->batch[iter->cur_sk++]);
2952	}
2953
2954	if (iter->cur_sk < iter->end_sk)
2955		sk = iter->batch[iter->cur_sk];
2956	else
2957		sk = bpf_iter_tcp_batch(seq);
2958
2959	++*pos;
2960	/* Keeping st->last_pos consistent in tcp_iter_state.
2961	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
2962	 */
2963	st->last_pos = *pos;
2964	return sk;
2965}
2966
2967static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2968{
2969	struct bpf_iter_meta meta;
2970	struct bpf_prog *prog;
2971	struct sock *sk = v;
2972	uid_t uid;
2973	int ret;
2974
2975	if (v == SEQ_START_TOKEN)
2976		return 0;
2977
2978	if (sk_fullsock(sk))
2979		lock_sock(sk);
2980
2981	if (unlikely(sk_unhashed(sk))) {
2982		ret = SEQ_SKIP;
2983		goto unlock;
2984	}
2985
2986	if (sk->sk_state == TCP_TIME_WAIT) {
2987		uid = 0;
2988	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2989		const struct request_sock *req = v;
2990
2991		uid = from_kuid_munged(seq_user_ns(seq),
2992				       sock_i_uid(req->rsk_listener));
2993	} else {
2994		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2995	}
2996
2997	meta.seq = seq;
2998	prog = bpf_iter_get_info(&meta, false);
2999	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3000
3001unlock:
3002	if (sk_fullsock(sk))
3003		release_sock(sk);
3004	return ret;
3005
3006}
3007
3008static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3009{
3010	struct bpf_tcp_iter_state *iter = seq->private;
3011	struct bpf_iter_meta meta;
3012	struct bpf_prog *prog;
3013
3014	if (!v) {
3015		meta.seq = seq;
3016		prog = bpf_iter_get_info(&meta, true);
3017		if (prog)
3018			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3019	}
3020
3021	if (iter->cur_sk < iter->end_sk) {
3022		bpf_iter_tcp_put_batch(iter);
3023		iter->st_bucket_done = false;
3024	}
3025}
3026
3027static const struct seq_operations bpf_iter_tcp_seq_ops = {
3028	.show		= bpf_iter_tcp_seq_show,
3029	.start		= bpf_iter_tcp_seq_start,
3030	.next		= bpf_iter_tcp_seq_next,
3031	.stop		= bpf_iter_tcp_seq_stop,
3032};
3033#endif
3034static unsigned short seq_file_family(const struct seq_file *seq)
3035{
3036	const struct tcp_seq_afinfo *afinfo;
3037
3038#ifdef CONFIG_BPF_SYSCALL
3039	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
3040	if (seq->op == &bpf_iter_tcp_seq_ops)
3041		return AF_UNSPEC;
3042#endif
3043
3044	/* Iterated from proc fs */
3045	afinfo = pde_data(file_inode(seq->file));
3046	return afinfo->family;
3047}
3048
3049static const struct seq_operations tcp4_seq_ops = {
3050	.show		= tcp4_seq_show,
3051	.start		= tcp_seq_start,
3052	.next		= tcp_seq_next,
3053	.stop		= tcp_seq_stop,
3054};
3055
3056static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3057	.family		= AF_INET,
3058};
3059
3060static int __net_init tcp4_proc_init_net(struct net *net)
3061{
3062	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3063			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3064		return -ENOMEM;
3065	return 0;
3066}
3067
/* Remove the "tcp" proc entry of a network namespace. */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
3072
3073static struct pernet_operations tcp4_net_ops = {
3074	.init = tcp4_proc_init_net,
3075	.exit = tcp4_proc_exit_net,
3076};
3077
/* Register the per-netns operations that manage the "tcp" proc entry.
 * Returns the result of register_pernet_subsys().
 */
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
3082
/* Unregister the per-netns operations registered by tcp4_proc_init(). */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
3087#endif /* CONFIG_PROC_FS */
3088
3089/* @wake is one when sk_stream_write_space() calls us.
3090 * This sends EPOLLOUT only if notsent_bytes is half the limit.
3091 * This mimics the strategy used in sock_def_write_space().
3092 */
3093bool tcp_stream_memory_free(const struct sock *sk, int wake)
3094{
3095	const struct tcp_sock *tp = tcp_sk(sk);
3096	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3097			    READ_ONCE(tp->snd_nxt);
3098
3099	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3100}
3101EXPORT_SYMBOL(tcp_stream_memory_free);
3102
/* Protocol descriptor for TCP over IPv4: ties the generic socket layer
 * hooks to their TCP implementations and describes the memory accounting
 * and object sizes used for TCP sockets.
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.splice_eof		= tcp_splice_eof,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	/* Memory pressure handling and write-space accounting. */
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,

	.memory_allocated	= &tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,

	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	/* The wmem/rmem limits are located by their offset in struct net. */
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= NULL,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
3154
3155static void __net_exit tcp_sk_exit(struct net *net)
3156{
3157	if (net->ipv4.tcp_congestion_control)
3158		bpf_module_put(net->ipv4.tcp_congestion_control,
3159			       net->ipv4.tcp_congestion_control->owner);
3160}
3161
3162static void __net_init tcp_set_hashinfo(struct net *net)
3163{
3164	struct inet_hashinfo *hinfo;
3165	unsigned int ehash_entries;
3166	struct net *old_net;
3167
3168	if (net_eq(net, &init_net))
3169		goto fallback;
3170
3171	old_net = current->nsproxy->net_ns;
3172	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3173	if (!ehash_entries)
3174		goto fallback;
3175
3176	ehash_entries = roundup_pow_of_two(ehash_entries);
3177	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3178	if (!hinfo) {
3179		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3180			"for a netns, fallback to the global one\n",
3181			ehash_entries);
3182fallback:
3183		hinfo = &tcp_hashinfo;
3184		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3185	}
3186
3187	net->ipv4.tcp_death_row.hashinfo = hinfo;
3188	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3189	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3190}
3191
3192static int __net_init tcp_sk_init(struct net *net)
3193{
3194	net->ipv4.sysctl_tcp_ecn = 2;
3195	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3196
3197	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3198	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3199	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3200	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3201	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3202
3203	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3204	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3205	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3206
3207	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3208	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3209	net->ipv4.sysctl_tcp_syncookies = 1;
3210	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3211	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3212	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3213	net->ipv4.sysctl_tcp_orphan_retries = 0;
3214	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3215	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3216	net->ipv4.sysctl_tcp_tw_reuse = 2;
3217	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3218
3219	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3220	tcp_set_hashinfo(net);
3221
3222	net->ipv4.sysctl_tcp_sack = 1;
3223	net->ipv4.sysctl_tcp_window_scaling = 1;
3224	net->ipv4.sysctl_tcp_timestamps = 1;
3225	net->ipv4.sysctl_tcp_early_retrans = 3;
3226	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3227	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3228	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3229	net->ipv4.sysctl_tcp_max_reordering = 300;
3230	net->ipv4.sysctl_tcp_dsack = 1;
3231	net->ipv4.sysctl_tcp_app_win = 31;
3232	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3233	net->ipv4.sysctl_tcp_frto = 2;
3234	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3235	/* This limits the percentage of the congestion window which we
3236	 * will allow a single TSO frame to consume.  Building TSO frames
3237	 * which are too large can cause TCP streams to be bursty.
3238	 */
3239	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3240	/* Default TSQ limit of 16 TSO segments */
3241	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3242
3243	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3244	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3245
3246	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3247	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3248	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3249	net->ipv4.sysctl_tcp_autocorking = 1;
3250	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3251	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3252	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3253	if (net != &init_net) {
3254		memcpy(net->ipv4.sysctl_tcp_rmem,
3255		       init_net.ipv4.sysctl_tcp_rmem,
3256		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3257		memcpy(net->ipv4.sysctl_tcp_wmem,
3258		       init_net.ipv4.sysctl_tcp_wmem,
3259		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3260	}
3261	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3262	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3263	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3264	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3265	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3266	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3267
3268	/* Set default values for PLB */
3269	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3270	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3271	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3272	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3273	/* Default congestion threshold for PLB to mark a round is 50% */
3274	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3275
3276	/* Reno is always built in */
3277	if (!net_eq(net, &init_net) &&
3278	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3279			       init_net.ipv4.tcp_congestion_control->owner))
3280		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3281	else
3282		net->ipv4.tcp_congestion_control = &tcp_reno;
3283
3284	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3285	net->ipv4.sysctl_tcp_shrink_window = 0;
3286
3287	return 0;
3288}
3289
3290static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3291{
3292	struct net *net;
3293
3294	tcp_twsk_purge(net_exit_list, AF_INET);
3295
3296	list_for_each_entry(net, net_exit_list, exit_list) {
3297		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3298		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3299		tcp_fastopen_ctx_destroy(net);
3300	}
3301}
3302
3303static struct pernet_operations __net_initdata tcp_sk_ops = {
3304       .init	   = tcp_sk_init,
3305       .exit	   = tcp_sk_exit,
3306       .exit_batch = tcp_sk_exit_batch,
3307};
3308
3309#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Entry point of the "tcp" BPF iterator, taking the iterator metadata,
 * the socket being visited and a uid.
 * NOTE(review): presumably these arguments mirror the fields of
 * struct bpf_iter__tcp - confirm against its definition.
 */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)
3312
#define INIT_BATCH_SZ 16

/* Set up the private state of a tcp BPF iterator.
 *
 * Initialises the netns part of the seq state, then sizes the batch for
 * INIT_BATCH_SZ entries.  If sizing the batch fails, the netns part is
 * torn down again.  Returns 0 on success or the error of the step that
 * failed.
 */
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_tcp_iter_state *state = priv_data;
	int ret;

	ret = bpf_iter_init_seq_net(priv_data, aux);
	if (!ret) {
		ret = bpf_iter_tcp_realloc_batch(state, INIT_BATCH_SZ);
		if (ret)
			bpf_iter_fini_seq_net(priv_data);
	}

	return ret;
}
3332
3333static void bpf_iter_fini_tcp(void *priv_data)
3334{
3335	struct bpf_tcp_iter_state *iter = priv_data;
3336
3337	bpf_iter_fini_seq_net(priv_data);
3338	kvfree(iter->batch);
3339}
3340
3341static const struct bpf_iter_seq_info tcp_seq_info = {
3342	.seq_ops		= &bpf_iter_tcp_seq_ops,
3343	.init_seq_private	= bpf_iter_init_tcp,
3344	.fini_seq_private	= bpf_iter_fini_tcp,
3345	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3346};
3347
3348static const struct bpf_func_proto *
3349bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3350			    const struct bpf_prog *prog)
3351{
3352	switch (func_id) {
3353	case BPF_FUNC_setsockopt:
3354		return &bpf_sk_setsockopt_proto;
3355	case BPF_FUNC_getsockopt:
3356		return &bpf_sk_getsockopt_proto;
3357	default:
3358		return NULL;
3359	}
3360}
3361
3362static struct bpf_iter_reg tcp_reg_info = {
3363	.target			= "tcp",
3364	.ctx_arg_info_size	= 1,
3365	.ctx_arg_info		= {
3366		{ offsetof(struct bpf_iter__tcp, sk_common),
3367		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3368	},
3369	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3370	.seq_info		= &tcp_seq_info,
3371};
3372
3373static void __init bpf_iter_register(void)
3374{
3375	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3376	if (bpf_iter_reg_target(&tcp_reg_info))
3377		pr_warn("Warning: could not register bpf iterator tcp\n");
3378}
3379
3380#endif
3381
3382void __init tcp_v4_init(void)
3383{
3384	int cpu, res;
3385
3386	for_each_possible_cpu(cpu) {
3387		struct sock *sk;
3388
3389		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3390					   IPPROTO_TCP, &init_net);
3391		if (res)
3392			panic("Failed to create the TCP control socket.\n");
3393		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3394
3395		/* Please enforce IP_DF and IPID==0 for RST and
3396		 * ACK sent in SYN-RECV and TIME-WAIT state.
3397		 */
3398		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3399
3400		per_cpu(ipv4_tcp_sk, cpu) = sk;
3401	}
3402	if (register_pernet_subsys(&tcp_sk_ops))
3403		panic("Failed to create the TCP control socket.\n");
3404
3405#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3406	bpf_iter_register();
3407#endif
3408}
Configure Feed

Configure Feed