net/ipv4/tcp_ipv4.c at v5.11-rc2

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / net / ipv4 / tcp_ipv4.c
at v5.11-rc2 3032 lines 81 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   4 *		operating system.  INET is implemented using the  BSD Socket
   5 *		interface as the means of communication with the user level.
   6 *
   7 *		Implementation of the Transmission Control Protocol(TCP).
   8 *
   9 *		IPv4 specific functions
  10 *
  11 *		code split from:
  12 *		linux/ipv4/tcp.c
  13 *		linux/ipv4/tcp_input.c
  14 *		linux/ipv4/tcp_output.c
  15 *
  16 *		See tcp.c for author information
  17 */
  18
  19/*
  20 * Changes:
  21 *		David S. Miller	:	New socket lookup architecture.
  22 *					This code is dedicated to John Dyson.
  23 *		David S. Miller :	Change semantics of established hash,
  24 *					half is devoted to TIME_WAIT sockets
  25 *					and the rest go in the other half.
  26 *		Andi Kleen :		Add support for syncookies and fixed
  27 *					some bugs: ip options weren't passed to
  28 *					the TCP layer, missed a check for an
  29 *					ACK bit.
  30 *		Andi Kleen :		Implemented fast path mtu discovery.
  31 *	     				Fixed many serious bugs in the
  32 *					request_sock handling and moved
  33 *					most of it into the af independent code.
  34 *					Added tail drop and some other bugfixes.
  35 *					Added new listen semantics.
  36 *		Mike McLagan	:	Routing by source
  37 *	Juan Jose Ciarlante:		ip_dynaddr bits
  38 *		Andi Kleen:		various fixes.
  39 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
  40 *					coma.
  41 *	Andi Kleen		:	Fix new listen.
  42 *	Andi Kleen		:	Fix accept error reporting.
  43 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
  44 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
  45 *					a single port at the same time.
  46 */
  47
  48#define pr_fmt(fmt) "TCP: " fmt
  49
  50#include <linux/bottom_half.h>
  51#include <linux/types.h>
  52#include <linux/fcntl.h>
  53#include <linux/module.h>
  54#include <linux/random.h>
  55#include <linux/cache.h>
  56#include <linux/jhash.h>
  57#include <linux/init.h>
  58#include <linux/times.h>
  59#include <linux/slab.h>
  60
  61#include <net/net_namespace.h>
  62#include <net/icmp.h>
  63#include <net/inet_hashtables.h>
  64#include <net/tcp.h>
  65#include <net/transp_v6.h>
  66#include <net/ipv6.h>
  67#include <net/inet_common.h>
  68#include <net/timewait_sock.h>
  69#include <net/xfrm.h>
  70#include <net/secure_seq.h>
  71#include <net/busy_poll.h>
  72
  73#include <linux/inet.h>
  74#include <linux/ipv6.h>
  75#include <linux/stddef.h>
  76#include <linux/proc_fs.h>
  77#include <linux/seq_file.h>
  78#include <linux/inetdevice.h>
  79#include <linux/btf_ids.h>
  80
  81#include <crypto/hash.h>
  82#include <linux/scatterlist.h>
  83
  84#include <trace/events/tcp.h>
  85
  86#ifdef CONFIG_TCP_MD5SIG
  87static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  88			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
  89#endif
  90
  91struct inet_hashinfo tcp_hashinfo;
  92EXPORT_SYMBOL(tcp_hashinfo);
  93
  94static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  95{
  96	return secure_tcp_seq(ip_hdr(skb)->daddr,
  97			      ip_hdr(skb)->saddr,
  98			      tcp_hdr(skb)->dest,
  99			      tcp_hdr(skb)->source);
 100}
 101
 102static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 103{
 104	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 105}
 106
 107int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 108{
 109	const struct inet_timewait_sock *tw = inet_twsk(sktw);
 110	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 111	struct tcp_sock *tp = tcp_sk(sk);
 112	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
 113
 114	if (reuse == 2) {
 115		/* Still does not detect *everything* that goes through
 116		 * lo, since we require a loopback src or dst address
 117		 * or direct binding to 'lo' interface.
 118		 */
 119		bool loopback = false;
 120		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 121			loopback = true;
 122#if IS_ENABLED(CONFIG_IPV6)
 123		if (tw->tw_family == AF_INET6) {
 124			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 125			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
 126			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 127			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
 128				loopback = true;
 129		} else
 130#endif
 131		{
 132			if (ipv4_is_loopback(tw->tw_daddr) ||
 133			    ipv4_is_loopback(tw->tw_rcv_saddr))
 134				loopback = true;
 135		}
 136		if (!loopback)
 137			reuse = 0;
 138	}
 139
 140	/* With PAWS, it is safe from the viewpoint
 141	   of data integrity. Even without PAWS it is safe provided sequence
 142	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 143
 144	   Actually, the idea is close to VJ's one, only timestamp cache is
 145	   held not per host, but per port pair and TW bucket is used as state
 146	   holder.
 147
 148	   If TW bucket has been already destroyed we fall back to VJ's scheme
 149	   and use initial timestamp retrieved from peer table.
 150	 */
 151	if (tcptw->tw_ts_recent_stamp &&
 152	    (!twp || (reuse && time_after32(ktime_get_seconds(),
 153					    tcptw->tw_ts_recent_stamp)))) {
 154		/* In case of repair and re-using TIME-WAIT sockets we still
 155		 * want to be sure that it is safe as above but honor the
 156		 * sequence numbers and time stamps set as part of the repair
 157		 * process.
 158		 *
 159		 * Without this check re-using a TIME-WAIT socket with TCP
 160		 * repair would accumulate a -1 on the repair assigned
 161		 * sequence number. The first time it is reused the sequence
 162		 * is -1, the second time -2, etc. This fixes that issue
 163		 * without appearing to create any others.
 164		 */
 165		if (likely(!tp->repair)) {
 166			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
 167
 168			if (!seq)
 169				seq = 1;
 170			WRITE_ONCE(tp->write_seq, seq);
 171			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
 172			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 173		}
 174		sock_hold(sktw);
 175		return 1;
 176	}
 177
 178	return 0;
 179}
 180EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 181
 182static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 183			      int addr_len)
 184{
 185	/* This check is replicated from tcp_v4_connect() and intended to
 186	 * prevent BPF program called below from accessing bytes that are out
 187	 * of the bound specified by user in addr_len.
 188	 */
 189	if (addr_len < sizeof(struct sockaddr_in))
 190		return -EINVAL;
 191
 192	sock_owned_by_me(sk);
 193
 194	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 195}
 196
 197/* This will initiate an outgoing connection. */
 198int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 199{
 200	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 201	struct inet_sock *inet = inet_sk(sk);
 202	struct tcp_sock *tp = tcp_sk(sk);
 203	__be16 orig_sport, orig_dport;
 204	__be32 daddr, nexthop;
 205	struct flowi4 *fl4;
 206	struct rtable *rt;
 207	int err;
 208	struct ip_options_rcu *inet_opt;
 209	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 210
 211	if (addr_len < sizeof(struct sockaddr_in))
 212		return -EINVAL;
 213
 214	if (usin->sin_family != AF_INET)
 215		return -EAFNOSUPPORT;
 216
 217	nexthop = daddr = usin->sin_addr.s_addr;
 218	inet_opt = rcu_dereference_protected(inet->inet_opt,
 219					     lockdep_sock_is_held(sk));
 220	if (inet_opt && inet_opt->opt.srr) {
 221		if (!daddr)
 222			return -EINVAL;
 223		nexthop = inet_opt->opt.faddr;
 224	}
 225
 226	orig_sport = inet->inet_sport;
 227	orig_dport = usin->sin_port;
 228	fl4 = &inet->cork.fl.u.ip4;
 229	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 230			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 231			      IPPROTO_TCP,
 232			      orig_sport, orig_dport, sk);
 233	if (IS_ERR(rt)) {
 234		err = PTR_ERR(rt);
 235		if (err == -ENETUNREACH)
 236			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 237		return err;
 238	}
 239
 240	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 241		ip_rt_put(rt);
 242		return -ENETUNREACH;
 243	}
 244
 245	if (!inet_opt || !inet_opt->opt.srr)
 246		daddr = fl4->daddr;
 247
 248	if (!inet->inet_saddr)
 249		inet->inet_saddr = fl4->saddr;
 250	sk_rcv_saddr_set(sk, inet->inet_saddr);
 251
 252	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 253		/* Reset inherited state */
 254		tp->rx_opt.ts_recent	   = 0;
 255		tp->rx_opt.ts_recent_stamp = 0;
 256		if (likely(!tp->repair))
 257			WRITE_ONCE(tp->write_seq, 0);
 258	}
 259
 260	inet->inet_dport = usin->sin_port;
 261	sk_daddr_set(sk, daddr);
 262
 263	inet_csk(sk)->icsk_ext_hdr_len = 0;
 264	if (inet_opt)
 265		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 266
 267	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 268
 269	/* Socket identity is still unknown (sport may be zero).
 270	 * However we set state to SYN-SENT and not releasing socket
 271	 * lock select source port, enter ourselves into the hash tables and
 272	 * complete initialization after this.
 273	 */
 274	tcp_set_state(sk, TCP_SYN_SENT);
 275	err = inet_hash_connect(tcp_death_row, sk);
 276	if (err)
 277		goto failure;
 278
 279	sk_set_txhash(sk);
 280
 281	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 282			       inet->inet_sport, inet->inet_dport, sk);
 283	if (IS_ERR(rt)) {
 284		err = PTR_ERR(rt);
 285		rt = NULL;
 286		goto failure;
 287	}
 288	/* OK, now commit destination to socket.  */
 289	sk->sk_gso_type = SKB_GSO_TCPV4;
 290	sk_setup_caps(sk, &rt->dst);
 291	rt = NULL;
 292
 293	if (likely(!tp->repair)) {
 294		if (!tp->write_seq)
 295			WRITE_ONCE(tp->write_seq,
 296				   secure_tcp_seq(inet->inet_saddr,
 297						  inet->inet_daddr,
 298						  inet->inet_sport,
 299						  usin->sin_port));
 300		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 301						 inet->inet_saddr,
 302						 inet->inet_daddr);
 303	}
 304
 305	inet->inet_id = prandom_u32();
 306
 307	if (tcp_fastopen_defer_connect(sk, &err))
 308		return err;
 309	if (err)
 310		goto failure;
 311
 312	err = tcp_connect(sk);
 313
 314	if (err)
 315		goto failure;
 316
 317	return 0;
 318
 319failure:
 320	/*
 321	 * This unhashes the socket and releases the local port,
 322	 * if necessary.
 323	 */
 324	tcp_set_state(sk, TCP_CLOSE);
 325	ip_rt_put(rt);
 326	sk->sk_route_caps = 0;
 327	inet->inet_dport = 0;
 328	return err;
 329}
 330EXPORT_SYMBOL(tcp_v4_connect);
 331
 332/*
 333 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 334 * It can be called through tcp_release_cb() if socket was owned by user
 335 * at the time tcp_v4_err() was called to handle ICMP message.
 336 */
 337void tcp_v4_mtu_reduced(struct sock *sk)
 338{
 339	struct inet_sock *inet = inet_sk(sk);
 340	struct dst_entry *dst;
 341	u32 mtu;
 342
 343	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 344		return;
 345	mtu = tcp_sk(sk)->mtu_info;
 346	dst = inet_csk_update_pmtu(sk, mtu);
 347	if (!dst)
 348		return;
 349
 350	/* Something is about to be wrong... Remember soft error
 351	 * for the case, if this connection will not able to recover.
 352	 */
 353	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 354		sk->sk_err_soft = EMSGSIZE;
 355
 356	mtu = dst_mtu(dst);
 357
 358	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 359	    ip_sk_accept_pmtu(sk) &&
 360	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 361		tcp_sync_mss(sk, mtu);
 362
 363		/* Resend the TCP packet because it's
 364		 * clear that the old packet has been
 365		 * dropped. This is the new "fast" path mtu
 366		 * discovery.
 367		 */
 368		tcp_simple_retransmit(sk);
 369	} /* else let the usual retransmit timer handle it */
 370}
 371EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 372
 373static void do_redirect(struct sk_buff *skb, struct sock *sk)
 374{
 375	struct dst_entry *dst = __sk_dst_check(sk, 0);
 376
 377	if (dst)
 378		dst->ops->redirect(dst, sk, skb);
 379}
 380
 381
 382/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 383void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 384{
 385	struct request_sock *req = inet_reqsk(sk);
 386	struct net *net = sock_net(sk);
 387
 388	/* ICMPs are not backlogged, hence we cannot get
 389	 * an established socket here.
 390	 */
 391	if (seq != tcp_rsk(req)->snt_isn) {
 392		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 393	} else if (abort) {
 394		/*
 395		 * Still in SYN_RECV, just remove it silently.
 396		 * There is no good way to pass the error to the newly
 397		 * created socket, and POSIX does not want network
 398		 * errors returned from accept().
 399		 */
 400		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 401		tcp_listendrop(req->rsk_listener);
 402	}
 403	reqsk_put(req);
 404}
 405EXPORT_SYMBOL(tcp_req_err);
 406
 407/* TCP-LD (RFC 6069) logic */
 408void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
 409{
 410	struct inet_connection_sock *icsk = inet_csk(sk);
 411	struct tcp_sock *tp = tcp_sk(sk);
 412	struct sk_buff *skb;
 413	s32 remaining;
 414	u32 delta_us;
 415
 416	if (sock_owned_by_user(sk))
 417		return;
 418
 419	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 420	    !icsk->icsk_backoff)
 421		return;
 422
 423	skb = tcp_rtx_queue_head(sk);
 424	if (WARN_ON_ONCE(!skb))
 425		return;
 426
 427	icsk->icsk_backoff--;
 428	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
 429	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 430
 431	tcp_mstamp_refresh(tp);
 432	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 433	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
 434
 435	if (remaining > 0) {
 436		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 437					  remaining, TCP_RTO_MAX);
 438	} else {
 439		/* RTO revert clocked out retransmission.
 440		 * Will retransmit now.
 441		 */
 442		tcp_retransmit_timer(sk);
 443	}
 444}
 445EXPORT_SYMBOL(tcp_ld_RTO_revert);
 446
 447/*
 448 * This routine is called by the ICMP module when it gets some
 449 * sort of error condition.  If err < 0 then the socket should
 450 * be closed and the error returned to the user.  If err > 0
 451 * it's just the icmp type << 8 | icmp code.  After adjustment
 452 * header points to the first 8 bytes of the tcp header.  We need
 453 * to find the appropriate port.
 454 *
 455 * The locking strategy used here is very "optimistic". When
 456 * someone else accesses the socket the ICMP is just dropped
 457 * and for some paths there is no check at all.
 458 * A more general error queue to queue errors for later handling
 459 * is probably better.
 460 *
 461 */
 462
 463int tcp_v4_err(struct sk_buff *skb, u32 info)
 464{
 465	const struct iphdr *iph = (const struct iphdr *)skb->data;
 466	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
 467	struct tcp_sock *tp;
 468	struct inet_sock *inet;
 469	const int type = icmp_hdr(skb)->type;
 470	const int code = icmp_hdr(skb)->code;
 471	struct sock *sk;
 472	struct request_sock *fastopen;
 473	u32 seq, snd_una;
 474	int err;
 475	struct net *net = dev_net(skb->dev);
 476
 477	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 478				       th->dest, iph->saddr, ntohs(th->source),
 479				       inet_iif(skb), 0);
 480	if (!sk) {
 481		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 482		return -ENOENT;
 483	}
 484	if (sk->sk_state == TCP_TIME_WAIT) {
 485		inet_twsk_put(inet_twsk(sk));
 486		return 0;
 487	}
 488	seq = ntohl(th->seq);
 489	if (sk->sk_state == TCP_NEW_SYN_RECV) {
 490		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 491				     type == ICMP_TIME_EXCEEDED ||
 492				     (type == ICMP_DEST_UNREACH &&
 493				      (code == ICMP_NET_UNREACH ||
 494				       code == ICMP_HOST_UNREACH)));
 495		return 0;
 496	}
 497
 498	bh_lock_sock(sk);
 499	/* If too many ICMPs get dropped on busy
 500	 * servers this needs to be solved differently.
 501	 * We do take care of PMTU discovery (RFC1191) special case :
 502	 * we can receive locally generated ICMP messages while socket is held.
 503	 */
 504	if (sock_owned_by_user(sk)) {
 505		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 506			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 507	}
 508	if (sk->sk_state == TCP_CLOSE)
 509		goto out;
 510
 511	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 512		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 513		goto out;
 514	}
 515
 516	tp = tcp_sk(sk);
 517	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 518	fastopen = rcu_dereference(tp->fastopen_rsk);
 519	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 520	if (sk->sk_state != TCP_LISTEN &&
 521	    !between(seq, snd_una, tp->snd_nxt)) {
 522		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 523		goto out;
 524	}
 525
 526	switch (type) {
 527	case ICMP_REDIRECT:
 528		if (!sock_owned_by_user(sk))
 529			do_redirect(skb, sk);
 530		goto out;
 531	case ICMP_SOURCE_QUENCH:
 532		/* Just silently ignore these. */
 533		goto out;
 534	case ICMP_PARAMETERPROB:
 535		err = EPROTO;
 536		break;
 537	case ICMP_DEST_UNREACH:
 538		if (code > NR_ICMP_UNREACH)
 539			goto out;
 540
 541		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 542			/* We are not interested in TCP_LISTEN and open_requests
 543			 * (SYN-ACKs send out by Linux are always <576bytes so
 544			 * they should go through unfragmented).
 545			 */
 546			if (sk->sk_state == TCP_LISTEN)
 547				goto out;
 548
 549			tp->mtu_info = info;
 550			if (!sock_owned_by_user(sk)) {
 551				tcp_v4_mtu_reduced(sk);
 552			} else {
 553				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 554					sock_hold(sk);
 555			}
 556			goto out;
 557		}
 558
 559		err = icmp_err_convert[code].errno;
 560		/* check if this ICMP message allows revert of backoff.
 561		 * (see RFC 6069)
 562		 */
 563		if (!fastopen &&
 564		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
 565			tcp_ld_RTO_revert(sk, seq);
 566		break;
 567	case ICMP_TIME_EXCEEDED:
 568		err = EHOSTUNREACH;
 569		break;
 570	default:
 571		goto out;
 572	}
 573
 574	switch (sk->sk_state) {
 575	case TCP_SYN_SENT:
 576	case TCP_SYN_RECV:
 577		/* Only in fast or simultaneous open. If a fast open socket is
 578		 * already accepted it is treated as a connected one below.
 579		 */
 580		if (fastopen && !fastopen->sk)
 581			break;
 582
 583		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
 584
 585		if (!sock_owned_by_user(sk)) {
 586			sk->sk_err = err;
 587
 588			sk->sk_error_report(sk);
 589
 590			tcp_done(sk);
 591		} else {
 592			sk->sk_err_soft = err;
 593		}
 594		goto out;
 595	}
 596
 597	/* If we've already connected we will keep trying
 598	 * until we time out, or the user gives up.
 599	 *
 600	 * rfc1122 4.2.3.9 allows to consider as hard errors
 601	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 602	 * but it is obsoleted by pmtu discovery).
 603	 *
 604	 * Note, that in modern internet, where routing is unreliable
 605	 * and in each dark corner broken firewalls sit, sending random
 606	 * errors ordered by their masters even this two messages finally lose
 607	 * their original sense (even Linux sends invalid PORT_UNREACHs)
 608	 *
 609	 * Now we are in compliance with RFCs.
 610	 *							--ANK (980905)
 611	 */
 612
 613	inet = inet_sk(sk);
 614	if (!sock_owned_by_user(sk) && inet->recverr) {
 615		sk->sk_err = err;
 616		sk->sk_error_report(sk);
 617	} else	{ /* Only an error on timeout */
 618		sk->sk_err_soft = err;
 619	}
 620
 621out:
 622	bh_unlock_sock(sk);
 623	sock_put(sk);
 624	return 0;
 625}
 626
 627void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 628{
 629	struct tcphdr *th = tcp_hdr(skb);
 630
 631	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 632	skb->csum_start = skb_transport_header(skb) - skb->head;
 633	skb->csum_offset = offsetof(struct tcphdr, check);
 634}
 635
 636/* This routine computes an IPv4 TCP checksum. */
 637void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 638{
 639	const struct inet_sock *inet = inet_sk(sk);
 640
 641	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 642}
 643EXPORT_SYMBOL(tcp_v4_send_check);
 644
 645/*
 646 *	This routine will send an RST to the other tcp.
 647 *
 648 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 649 *		      for reset.
 650 *	Answer: if a packet caused RST, it is not for a socket
 651 *		existing in our system, if it is matched to a socket,
 652 *		it is just duplicate segment or bug in other side's TCP.
 653 *		So that we build reply only basing on parameters
 654 *		arrived with segment.
 655 *	Exception: precedence violation. We do not implement it in any case.
 656 */
 657
 658static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 659{
 660	const struct tcphdr *th = tcp_hdr(skb);
 661	struct {
 662		struct tcphdr th;
 663#ifdef CONFIG_TCP_MD5SIG
 664		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 665#endif
 666	} rep;
 667	struct ip_reply_arg arg;
 668#ifdef CONFIG_TCP_MD5SIG
 669	struct tcp_md5sig_key *key = NULL;
 670	const __u8 *hash_location = NULL;
 671	unsigned char newhash[16];
 672	int genhash;
 673	struct sock *sk1 = NULL;
 674#endif
 675	u64 transmit_time = 0;
 676	struct sock *ctl_sk;
 677	struct net *net;
 678
 679	/* Never send a reset in response to a reset. */
 680	if (th->rst)
 681		return;
 682
 683	/* If sk not NULL, it means we did a successful lookup and incoming
 684	 * route had to be correct. prequeue might have dropped our dst.
 685	 */
 686	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 687		return;
 688
 689	/* Swap the send and the receive. */
 690	memset(&rep, 0, sizeof(rep));
 691	rep.th.dest   = th->source;
 692	rep.th.source = th->dest;
 693	rep.th.doff   = sizeof(struct tcphdr) / 4;
 694	rep.th.rst    = 1;
 695
 696	if (th->ack) {
 697		rep.th.seq = th->ack_seq;
 698	} else {
 699		rep.th.ack = 1;
 700		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 701				       skb->len - (th->doff << 2));
 702	}
 703
 704	memset(&arg, 0, sizeof(arg));
 705	arg.iov[0].iov_base = (unsigned char *)&rep;
 706	arg.iov[0].iov_len  = sizeof(rep.th);
 707
 708	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 709#ifdef CONFIG_TCP_MD5SIG
 710	rcu_read_lock();
 711	hash_location = tcp_parse_md5sig_option(th);
 712	if (sk && sk_fullsock(sk)) {
 713		const union tcp_md5_addr *addr;
 714		int l3index;
 715
 716		/* sdif set, means packet ingressed via a device
 717		 * in an L3 domain and inet_iif is set to it.
 718		 */
 719		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 720		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 721		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 722	} else if (hash_location) {
 723		const union tcp_md5_addr *addr;
 724		int sdif = tcp_v4_sdif(skb);
 725		int dif = inet_iif(skb);
 726		int l3index;
 727
 728		/*
 729		 * active side is lost. Try to find listening socket through
 730		 * source port, and then find md5 key through listening socket.
 731		 * we are not loose security here:
 732		 * Incoming packet is checked with md5 hash with finding key,
 733		 * no RST generated if md5 hash doesn't match.
 734		 */
 735		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 736					     ip_hdr(skb)->saddr,
 737					     th->source, ip_hdr(skb)->daddr,
 738					     ntohs(th->source), dif, sdif);
 739		/* don't send rst if it can't find key */
 740		if (!sk1)
 741			goto out;
 742
 743		/* sdif set, means packet ingressed via a device
 744		 * in an L3 domain and dif is set to it.
 745		 */
 746		l3index = sdif ? dif : 0;
 747		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 748		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
 749		if (!key)
 750			goto out;
 751
 752
 753		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 754		if (genhash || memcmp(hash_location, newhash, 16) != 0)
 755			goto out;
 756
 757	}
 758
 759	if (key) {
 760		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 761				   (TCPOPT_NOP << 16) |
 762				   (TCPOPT_MD5SIG << 8) |
 763				   TCPOLEN_MD5SIG);
 764		/* Update length and the length the header thinks exists */
 765		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 766		rep.th.doff = arg.iov[0].iov_len / 4;
 767
 768		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 769				     key, ip_hdr(skb)->saddr,
 770				     ip_hdr(skb)->daddr, &rep.th);
 771	}
 772#endif
 773	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 774				      ip_hdr(skb)->saddr, /* XXX */
 775				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 776	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 777	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 778
 779	/* When socket is gone, all binding information is lost.
 780	 * routing might fail in this case. No choice here, if we choose to force
 781	 * input interface, we will misroute in case of asymmetric route.
 782	 */
 783	if (sk) {
 784		arg.bound_dev_if = sk->sk_bound_dev_if;
 785		if (sk_fullsock(sk))
 786			trace_tcp_send_reset(sk, skb);
 787	}
 788
 789	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 790		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 791
 792	arg.tos = ip_hdr(skb)->tos;
 793	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 794	local_bh_disable();
 795	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 796	if (sk) {
 797		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 798				   inet_twsk(sk)->tw_mark : sk->sk_mark;
 799		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 800				   inet_twsk(sk)->tw_priority : sk->sk_priority;
 801		transmit_time = tcp_transmit_time(sk);
 802	}
 803	ip_send_unicast_reply(ctl_sk,
 804			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 805			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 806			      &arg, arg.iov[0].iov_len,
 807			      transmit_time);
 808
 809	ctl_sk->sk_mark = 0;
 810	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 811	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 812	local_bh_enable();
 813
 814#ifdef CONFIG_TCP_MD5SIG
 815out:
 816	rcu_read_unlock();
 817#endif
 818}
 819
 820/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 821   outside socket context is ugly, certainly. What can I do?
 822 */
 823
 824static void tcp_v4_send_ack(const struct sock *sk,
 825			    struct sk_buff *skb, u32 seq, u32 ack,
 826			    u32 win, u32 tsval, u32 tsecr, int oif,
 827			    struct tcp_md5sig_key *key,
 828			    int reply_flags, u8 tos)
 829{
 830	const struct tcphdr *th = tcp_hdr(skb);
 831	struct {
 832		struct tcphdr th;
 833		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 834#ifdef CONFIG_TCP_MD5SIG
 835			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 836#endif
 837			];
 838	} rep;
 839	struct net *net = sock_net(sk);
 840	struct ip_reply_arg arg;
 841	struct sock *ctl_sk;
 842	u64 transmit_time;
 843
 844	memset(&rep.th, 0, sizeof(struct tcphdr));
 845	memset(&arg, 0, sizeof(arg));
 846
 847	arg.iov[0].iov_base = (unsigned char *)&rep;
 848	arg.iov[0].iov_len  = sizeof(rep.th);
 849	if (tsecr) {
 850		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 851				   (TCPOPT_TIMESTAMP << 8) |
 852				   TCPOLEN_TIMESTAMP);
 853		rep.opt[1] = htonl(tsval);
 854		rep.opt[2] = htonl(tsecr);
 855		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 856	}
 857
 858	/* Swap the send and the receive. */
 859	rep.th.dest    = th->source;
 860	rep.th.source  = th->dest;
 861	rep.th.doff    = arg.iov[0].iov_len / 4;
 862	rep.th.seq     = htonl(seq);
 863	rep.th.ack_seq = htonl(ack);
 864	rep.th.ack     = 1;
 865	rep.th.window  = htons(win);
 866
 867#ifdef CONFIG_TCP_MD5SIG
 868	if (key) {
 869		int offset = (tsecr) ? 3 : 0;
 870
 871		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 872					  (TCPOPT_NOP << 16) |
 873					  (TCPOPT_MD5SIG << 8) |
 874					  TCPOLEN_MD5SIG);
 875		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 876		rep.th.doff = arg.iov[0].iov_len/4;
 877
 878		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 879				    key, ip_hdr(skb)->saddr,
 880				    ip_hdr(skb)->daddr, &rep.th);
 881	}
 882#endif
 883	arg.flags = reply_flags;
 884	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 885				      ip_hdr(skb)->saddr, /* XXX */
 886				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 887	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 888	if (oif)
 889		arg.bound_dev_if = oif;
 890	arg.tos = tos;
 891	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 892	local_bh_disable();
 893	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 894	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 895			   inet_twsk(sk)->tw_mark : sk->sk_mark;
 896	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 897			   inet_twsk(sk)->tw_priority : sk->sk_priority;
 898	transmit_time = tcp_transmit_time(sk);
 899	ip_send_unicast_reply(ctl_sk,
 900			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 901			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 902			      &arg, arg.iov[0].iov_len,
 903			      transmit_time);
 904
 905	ctl_sk->sk_mark = 0;
 906	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 907	local_bh_enable();
 908}
 909
 910static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 911{
 912	struct inet_timewait_sock *tw = inet_twsk(sk);
 913	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 914
 915	tcp_v4_send_ack(sk, skb,
 916			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 917			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 918			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 919			tcptw->tw_ts_recent,
 920			tw->tw_bound_dev_if,
 921			tcp_twsk_md5_key(tcptw),
 922			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 923			tw->tw_tos
 924			);
 925
 926	inet_twsk_put(tw);
 927}
 928
 929static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 930				  struct request_sock *req)
 931{
 932	const union tcp_md5_addr *addr;
 933	int l3index;
 934
 935	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 936	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 937	 */
 938	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 939					     tcp_sk(sk)->snd_nxt;
 940
 941	/* RFC 7323 2.3
 942	 * The window field (SEG.WND) of every outgoing segment, with the
 943	 * exception of <SYN> segments, MUST be right-shifted by
 944	 * Rcv.Wind.Shift bits:
 945	 */
 946	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
 947	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 948	tcp_v4_send_ack(sk, skb, seq,
 949			tcp_rsk(req)->rcv_nxt,
 950			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 951			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 952			req->ts_recent,
 953			0,
 954			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
 955			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 956			ip_hdr(skb)->tos);
 957}
 958
 959/*
 960 *	Send a SYN-ACK after having received a SYN.
 961 *	This still operates on a request_sock only, not on a big
 962 *	socket.
 963 */
 964static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 965			      struct flowi *fl,
 966			      struct request_sock *req,
 967			      struct tcp_fastopen_cookie *foc,
 968			      enum tcp_synack_type synack_type,
 969			      struct sk_buff *syn_skb)
 970{
 971	const struct inet_request_sock *ireq = inet_rsk(req);
 972	struct flowi4 fl4;
 973	int err = -1;
 974	struct sk_buff *skb;
 975	u8 tos;
 976
 977	/* First, grab a route. */
 978	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 979		return -1;
 980
 981	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
 982
 983	if (skb) {
 984		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 985
 986		tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
 987				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
 988				(inet_sk(sk)->tos & INET_ECN_MASK) :
 989				inet_sk(sk)->tos;
 990
 991		if (!INET_ECN_is_capable(tos) &&
 992		    tcp_bpf_ca_needs_ecn((struct sock *)req))
 993			tos |= INET_ECN_ECT_0;
 994
 995		rcu_read_lock();
 996		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 997					    ireq->ir_rmt_addr,
 998					    rcu_dereference(ireq->ireq_opt),
 999					    tos);
1000		rcu_read_unlock();
1001		err = net_xmit_eval(err);
1002	}
1003
1004	return err;
1005}
1006
1007/*
1008 *	IPv4 request_sock destructor.
1009 */
1010static void tcp_v4_reqsk_destructor(struct request_sock *req)
1011{
1012	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1013}
1014
1015#ifdef CONFIG_TCP_MD5SIG
1016/*
1017 * RFC2385 MD5 checksumming requires a mapping of
1018 * IP address->MD5 Key.
1019 * We need to maintain these in the sk structure.
1020 */
1021
/* Static branch key, initially false, exported for use outside this file.
 * NOTE(review): presumably flipped on once MD5 keys are in use so that the
 * lookup fast path can be skipped otherwise - the enabling site is not in
 * this part of the file; confirm.
 */
DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
1024
1025/* Find the Key structure for an address.  */
1026struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1027					   const union tcp_md5_addr *addr,
1028					   int family)
1029{
1030	const struct tcp_sock *tp = tcp_sk(sk);
1031	struct tcp_md5sig_key *key;
1032	const struct tcp_md5sig_info *md5sig;
1033	__be32 mask;
1034	struct tcp_md5sig_key *best_match = NULL;
1035	bool match;
1036
1037	/* caller either holds rcu_read_lock() or socket lock */
1038	md5sig = rcu_dereference_check(tp->md5sig_info,
1039				       lockdep_sock_is_held(sk));
1040	if (!md5sig)
1041		return NULL;
1042
1043	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1044				 lockdep_sock_is_held(sk)) {
1045		if (key->family != family)
1046			continue;
1047		if (key->l3index && key->l3index != l3index)
1048			continue;
1049		if (family == AF_INET) {
1050			mask = inet_make_mask(key->prefixlen);
1051			match = (key->addr.a4.s_addr & mask) ==
1052				(addr->a4.s_addr & mask);
1053#if IS_ENABLED(CONFIG_IPV6)
1054		} else if (family == AF_INET6) {
1055			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1056						  key->prefixlen);
1057#endif
1058		} else {
1059			match = false;
1060		}
1061
1062		if (match && (!best_match ||
1063			      key->prefixlen > best_match->prefixlen))
1064			best_match = key;
1065	}
1066	return best_match;
1067}
1068EXPORT_SYMBOL(__tcp_md5_do_lookup);
1069
1070static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1071						      const union tcp_md5_addr *addr,
1072						      int family, u8 prefixlen,
1073						      int l3index)
1074{
1075	const struct tcp_sock *tp = tcp_sk(sk);
1076	struct tcp_md5sig_key *key;
1077	unsigned int size = sizeof(struct in_addr);
1078	const struct tcp_md5sig_info *md5sig;
1079
1080	/* caller either holds rcu_read_lock() or socket lock */
1081	md5sig = rcu_dereference_check(tp->md5sig_info,
1082				       lockdep_sock_is_held(sk));
1083	if (!md5sig)
1084		return NULL;
1085#if IS_ENABLED(CONFIG_IPV6)
1086	if (family == AF_INET6)
1087		size = sizeof(struct in6_addr);
1088#endif
1089	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1090				 lockdep_sock_is_held(sk)) {
1091		if (key->family != family)
1092			continue;
1093		if (key->l3index && key->l3index != l3index)
1094			continue;
1095		if (!memcmp(&key->addr, addr, size) &&
1096		    key->prefixlen == prefixlen)
1097			return key;
1098	}
1099	return NULL;
1100}
1101
1102struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1103					 const struct sock *addr_sk)
1104{
1105	const union tcp_md5_addr *addr;
1106	int l3index;
1107
1108	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1109						 addr_sk->sk_bound_dev_if);
1110	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1111	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1112}
1113EXPORT_SYMBOL(tcp_v4_md5_lookup);
1114
1115/* This can be called on a newly created socket, from other files */
1116int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1117		   int family, u8 prefixlen, int l3index,
1118		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1119{
1120	/* Add Key to the list */
1121	struct tcp_md5sig_key *key;
1122	struct tcp_sock *tp = tcp_sk(sk);
1123	struct tcp_md5sig_info *md5sig;
1124
1125	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1126	if (key) {
1127		/* Pre-existing entry - just update that one.
1128		 * Note that the key might be used concurrently.
1129		 * data_race() is telling kcsan that we do not care of
1130		 * key mismatches, since changing MD5 key on live flows
1131		 * can lead to packet drops.
1132		 */
1133		data_race(memcpy(key->key, newkey, newkeylen));
1134
1135		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1136		 * Also note that a reader could catch new key->keylen value
1137		 * but old key->key[], this is the reason we use __GFP_ZERO
1138		 * at sock_kmalloc() time below these lines.
1139		 */
1140		WRITE_ONCE(key->keylen, newkeylen);
1141
1142		return 0;
1143	}
1144
1145	md5sig = rcu_dereference_protected(tp->md5sig_info,
1146					   lockdep_sock_is_held(sk));
1147	if (!md5sig) {
1148		md5sig = kmalloc(sizeof(*md5sig), gfp);
1149		if (!md5sig)
1150			return -ENOMEM;
1151
1152		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1153		INIT_HLIST_HEAD(&md5sig->head);
1154		rcu_assign_pointer(tp->md5sig_info, md5sig);
1155	}
1156
1157	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1158	if (!key)
1159		return -ENOMEM;
1160	if (!tcp_alloc_md5sig_pool()) {
1161		sock_kfree_s(sk, key, sizeof(*key));
1162		return -ENOMEM;
1163	}
1164
1165	memcpy(key->key, newkey, newkeylen);
1166	key->keylen = newkeylen;
1167	key->family = family;
1168	key->prefixlen = prefixlen;
1169	key->l3index = l3index;
1170	memcpy(&key->addr, addr,
1171	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1172				      sizeof(struct in_addr));
1173	hlist_add_head_rcu(&key->node, &md5sig->head);
1174	return 0;
1175}
1176EXPORT_SYMBOL(tcp_md5_do_add);
1177
1178int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1179		   u8 prefixlen, int l3index)
1180{
1181	struct tcp_md5sig_key *key;
1182
1183	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1184	if (!key)
1185		return -ENOENT;
1186	hlist_del_rcu(&key->node);
1187	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1188	kfree_rcu(key, rcu);
1189	return 0;
1190}
1191EXPORT_SYMBOL(tcp_md5_do_del);
1192
1193static void tcp_clear_md5_list(struct sock *sk)
1194{
1195	struct tcp_sock *tp = tcp_sk(sk);
1196	struct tcp_md5sig_key *key;
1197	struct hlist_node *n;
1198	struct tcp_md5sig_info *md5sig;
1199
1200	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1201
1202	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1203		hlist_del_rcu(&key->node);
1204		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1205		kfree_rcu(key, rcu);
1206	}
1207}
1208
1209static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1210				 sockptr_t optval, int optlen)
1211{
1212	struct tcp_md5sig cmd;
1213	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1214	const union tcp_md5_addr *addr;
1215	u8 prefixlen = 32;
1216	int l3index = 0;
1217
1218	if (optlen < sizeof(cmd))
1219		return -EINVAL;
1220
1221	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1222		return -EFAULT;
1223
1224	if (sin->sin_family != AF_INET)
1225		return -EINVAL;
1226
1227	if (optname == TCP_MD5SIG_EXT &&
1228	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1229		prefixlen = cmd.tcpm_prefixlen;
1230		if (prefixlen > 32)
1231			return -EINVAL;
1232	}
1233
1234	if (optname == TCP_MD5SIG_EXT &&
1235	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1236		struct net_device *dev;
1237
1238		rcu_read_lock();
1239		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1240		if (dev && netif_is_l3_master(dev))
1241			l3index = dev->ifindex;
1242
1243		rcu_read_unlock();
1244
1245		/* ok to reference set/not set outside of rcu;
1246		 * right now device MUST be an L3 master
1247		 */
1248		if (!dev || !l3index)
1249			return -EINVAL;
1250	}
1251
1252	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1253
1254	if (!cmd.tcpm_keylen)
1255		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1256
1257	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1258		return -EINVAL;
1259
1260	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1261			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1262}
1263
1264static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1265				   __be32 daddr, __be32 saddr,
1266				   const struct tcphdr *th, int nbytes)
1267{
1268	struct tcp4_pseudohdr *bp;
1269	struct scatterlist sg;
1270	struct tcphdr *_th;
1271
1272	bp = hp->scratch;
1273	bp->saddr = saddr;
1274	bp->daddr = daddr;
1275	bp->pad = 0;
1276	bp->protocol = IPPROTO_TCP;
1277	bp->len = cpu_to_be16(nbytes);
1278
1279	_th = (struct tcphdr *)(bp + 1);
1280	memcpy(_th, th, sizeof(*th));
1281	_th->check = 0;
1282
1283	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1284	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1285				sizeof(*bp) + sizeof(*th));
1286	return crypto_ahash_update(hp->md5_req);
1287}
1288
1289static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1290			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1291{
1292	struct tcp_md5sig_pool *hp;
1293	struct ahash_request *req;
1294
1295	hp = tcp_get_md5sig_pool();
1296	if (!hp)
1297		goto clear_hash_noput;
1298	req = hp->md5_req;
1299
1300	if (crypto_ahash_init(req))
1301		goto clear_hash;
1302	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1303		goto clear_hash;
1304	if (tcp_md5_hash_key(hp, key))
1305		goto clear_hash;
1306	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1307	if (crypto_ahash_final(req))
1308		goto clear_hash;
1309
1310	tcp_put_md5sig_pool();
1311	return 0;
1312
1313clear_hash:
1314	tcp_put_md5sig_pool();
1315clear_hash_noput:
1316	memset(md5_hash, 0, 16);
1317	return 1;
1318}
1319
1320int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1321			const struct sock *sk,
1322			const struct sk_buff *skb)
1323{
1324	struct tcp_md5sig_pool *hp;
1325	struct ahash_request *req;
1326	const struct tcphdr *th = tcp_hdr(skb);
1327	__be32 saddr, daddr;
1328
1329	if (sk) { /* valid for establish/request sockets */
1330		saddr = sk->sk_rcv_saddr;
1331		daddr = sk->sk_daddr;
1332	} else {
1333		const struct iphdr *iph = ip_hdr(skb);
1334		saddr = iph->saddr;
1335		daddr = iph->daddr;
1336	}
1337
1338	hp = tcp_get_md5sig_pool();
1339	if (!hp)
1340		goto clear_hash_noput;
1341	req = hp->md5_req;
1342
1343	if (crypto_ahash_init(req))
1344		goto clear_hash;
1345
1346	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1347		goto clear_hash;
1348	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1349		goto clear_hash;
1350	if (tcp_md5_hash_key(hp, key))
1351		goto clear_hash;
1352	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1353	if (crypto_ahash_final(req))
1354		goto clear_hash;
1355
1356	tcp_put_md5sig_pool();
1357	return 0;
1358
1359clear_hash:
1360	tcp_put_md5sig_pool();
1361clear_hash_noput:
1362	memset(md5_hash, 0, 16);
1363	return 1;
1364}
1365EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1366
1367#endif
1368
1369/* Called with rcu_read_lock() */
1370static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1371				    const struct sk_buff *skb,
1372				    int dif, int sdif)
1373{
1374#ifdef CONFIG_TCP_MD5SIG
1375	/*
1376	 * This gets called for each TCP segment that arrives
1377	 * so we want to be efficient.
1378	 * We have 3 drop cases:
1379	 * o No MD5 hash and one expected.
1380	 * o MD5 hash and we're not expecting one.
1381	 * o MD5 hash and its wrong.
1382	 */
1383	const __u8 *hash_location = NULL;
1384	struct tcp_md5sig_key *hash_expected;
1385	const struct iphdr *iph = ip_hdr(skb);
1386	const struct tcphdr *th = tcp_hdr(skb);
1387	const union tcp_md5_addr *addr;
1388	unsigned char newhash[16];
1389	int genhash, l3index;
1390
1391	/* sdif set, means packet ingressed via a device
1392	 * in an L3 domain and dif is set to the l3mdev
1393	 */
1394	l3index = sdif ? dif : 0;
1395
1396	addr = (union tcp_md5_addr *)&iph->saddr;
1397	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1398	hash_location = tcp_parse_md5sig_option(th);
1399
1400	/* We've parsed the options - do we have a hash? */
1401	if (!hash_expected && !hash_location)
1402		return false;
1403
1404	if (hash_expected && !hash_location) {
1405		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1406		return true;
1407	}
1408
1409	if (!hash_expected && hash_location) {
1410		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1411		return true;
1412	}
1413
1414	/* Okay, so this is hash_expected and hash_location -
1415	 * so we need to calculate the checksum.
1416	 */
1417	genhash = tcp_v4_md5_hash_skb(newhash,
1418				      hash_expected,
1419				      NULL, skb);
1420
1421	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1422		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1423		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1424				     &iph->saddr, ntohs(th->source),
1425				     &iph->daddr, ntohs(th->dest),
1426				     genhash ? " tcp_v4_calc_md5_hash failed"
1427				     : "", l3index);
1428		return true;
1429	}
1430	return false;
1431#endif
1432	return false;
1433}
1434
1435static void tcp_v4_init_req(struct request_sock *req,
1436			    const struct sock *sk_listener,
1437			    struct sk_buff *skb)
1438{
1439	struct inet_request_sock *ireq = inet_rsk(req);
1440	struct net *net = sock_net(sk_listener);
1441
1442	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1443	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1444	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1445}
1446
1447static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1448					  struct sk_buff *skb,
1449					  struct flowi *fl,
1450					  struct request_sock *req)
1451{
1452	tcp_v4_init_req(req, sk, skb);
1453
1454	if (security_inet_conn_request(sk, skb, req))
1455		return NULL;
1456
1457	return inet_csk_route_req(sk, &fl->u.ip4, req);
1458}
1459
1460struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1461	.family		=	PF_INET,
1462	.obj_size	=	sizeof(struct tcp_request_sock),
1463	.rtx_syn_ack	=	tcp_rtx_synack,
1464	.send_ack	=	tcp_v4_reqsk_send_ack,
1465	.destructor	=	tcp_v4_reqsk_destructor,
1466	.send_reset	=	tcp_v4_send_reset,
1467	.syn_ack_timeout =	tcp_syn_ack_timeout,
1468};
1469
1470const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1471	.mss_clamp	=	TCP_MSS_DEFAULT,
1472#ifdef CONFIG_TCP_MD5SIG
1473	.req_md5_lookup	=	tcp_v4_md5_lookup,
1474	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1475#endif
1476#ifdef CONFIG_SYN_COOKIES
1477	.cookie_init_seq =	cookie_v4_init_sequence,
1478#endif
1479	.route_req	=	tcp_v4_route_req,
1480	.init_seq	=	tcp_v4_init_seq,
1481	.init_ts_off	=	tcp_v4_init_ts_off,
1482	.send_synack	=	tcp_v4_send_synack,
1483};
1484
1485int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1486{
1487	/* Never answer to SYNs send to broadcast or multicast */
1488	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1489		goto drop;
1490
1491	return tcp_conn_request(&tcp_request_sock_ops,
1492				&tcp_request_sock_ipv4_ops, sk, skb);
1493
1494drop:
1495	tcp_listendrop(sk);
1496	return 0;
1497}
1498EXPORT_SYMBOL(tcp_v4_conn_request);
1499
1500
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * @sk is the listener, @req the request socket the child is built from.
 * @dst may be NULL, in which case a route is looked up for the child;
 * a non-NULL @dst is the syncookie case.
 *
 * On success the new child socket is returned and *own_req tells whether
 * the child was inserted into the established hash by this call.
 * NULL is returned when the accept queue is full, the child cannot be
 * created, routed or bound to the listener's port, or when a duplicate
 * socket was found in the syncookie case.  All but the last of these
 * failures are accounted with tcp_listendrop().
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	/* Copy addressing and IP level state from the request and the
	 * packet that completed the handshake into the child.
	 */
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	/* The child borrows the request's saved IP options here; which of
	 * the two keeps the pointer is settled once hashing has been
	 * attempted below.
	 */
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = prandom_u32();

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
			       key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* The child now owns the IP options; clearing the request's
		 * pointer keeps the request destructor from freeing them.
		 */
		ireq->ireq_opt = NULL;
	} else {
		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		} else {
			/* The request keeps the IP options; drop the
			 * child's borrowed pointer.
			 */
			newinet->inet_opt = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* The IP options still belong to the request, not to the child
	 * that is about to be closed.
	 */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1625
/* With CONFIG_SYN_COOKIES, a segment without the SYN flag is passed to
 * cookie_v4_check() and its result is returned; in every other case @sk
 * is returned unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1636
1637u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1638			 struct tcphdr *th, u32 *cookie)
1639{
1640	u16 mss = 0;
1641#ifdef CONFIG_SYN_COOKIES
1642	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1643				    &tcp_request_sock_ipv4_ops, sk, th);
1644	if (mss) {
1645		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1646		tcp_synq_overflow(sk);
1647	}
1648#endif
1649	return mss;
1650}
1651
1652/* The socket must have it's spinlock held when we get
1653 * here, unless it is a TCP_LISTEN socket.
1654 *
1655 * We have a potential double-lock case here, so even when
1656 * doing backlog processing we use the BH locking scheme.
1657 * This is because we cannot sleep with the original spinlock
1658 * held.
1659 */
1660int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1661{
1662	struct sock *rsk;
1663
1664	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1665		struct dst_entry *dst = sk->sk_rx_dst;
1666
1667		sock_rps_save_rxhash(sk, skb);
1668		sk_mark_napi_id(sk, skb);
1669		if (dst) {
1670			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1671			    !dst->ops->check(dst, 0)) {
1672				dst_release(dst);
1673				sk->sk_rx_dst = NULL;
1674			}
1675		}
1676		tcp_rcv_established(sk, skb);
1677		return 0;
1678	}
1679
1680	if (tcp_checksum_complete(skb))
1681		goto csum_err;
1682
1683	if (sk->sk_state == TCP_LISTEN) {
1684		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1685
1686		if (!nsk)
1687			goto discard;
1688		if (nsk != sk) {
1689			if (tcp_child_process(sk, nsk, skb)) {
1690				rsk = nsk;
1691				goto reset;
1692			}
1693			return 0;
1694		}
1695	} else
1696		sock_rps_save_rxhash(sk, skb);
1697
1698	if (tcp_rcv_state_process(sk, skb)) {
1699		rsk = sk;
1700		goto reset;
1701	}
1702	return 0;
1703
1704reset:
1705	tcp_v4_send_reset(rsk, skb);
1706discard:
1707	kfree_skb(skb);
1708	/* Be careful here. If this function gets more complicated and
1709	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1710	 * might be destroyed here. This current version compiles correctly,
1711	 * but you have been warned.
1712	 */
1713	return 0;
1714
1715csum_err:
1716	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1717	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1718	goto discard;
1719}
1720EXPORT_SYMBOL(tcp_v4_do_rcv);
1721
1722int tcp_v4_early_demux(struct sk_buff *skb)
1723{
1724	const struct iphdr *iph;
1725	const struct tcphdr *th;
1726	struct sock *sk;
1727
1728	if (skb->pkt_type != PACKET_HOST)
1729		return 0;
1730
1731	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1732		return 0;
1733
1734	iph = ip_hdr(skb);
1735	th = tcp_hdr(skb);
1736
1737	if (th->doff < sizeof(struct tcphdr) / 4)
1738		return 0;
1739
1740	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1741				       iph->saddr, th->source,
1742				       iph->daddr, ntohs(th->dest),
1743				       skb->skb_iif, inet_sdif(skb));
1744	if (sk) {
1745		skb->sk = sk;
1746		skb->destructor = sock_edemux;
1747		if (sk_fullsock(sk)) {
1748			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1749
1750			if (dst)
1751				dst = dst_check(dst, 0);
1752			if (dst &&
1753			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1754				skb_dst_set_noref(skb, dst);
1755		}
1756	}
1757	return 0;
1758}
1759
1760bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1761{
1762	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1763	struct skb_shared_info *shinfo;
1764	const struct tcphdr *th;
1765	struct tcphdr *thtail;
1766	struct sk_buff *tail;
1767	unsigned int hdrlen;
1768	bool fragstolen;
1769	u32 gso_segs;
1770	int delta;
1771
1772	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1773	 * we can fix skb->truesize to its real value to avoid future drops.
1774	 * This is valid because skb is not yet charged to the socket.
1775	 * It has been noticed pure SACK packets were sometimes dropped
1776	 * (if cooked by drivers without copybreak feature).
1777	 */
1778	skb_condense(skb);
1779
1780	skb_dst_drop(skb);
1781
1782	if (unlikely(tcp_checksum_complete(skb))) {
1783		bh_unlock_sock(sk);
1784		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1785		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1786		return true;
1787	}
1788
1789	/* Attempt coalescing to last skb in backlog, even if we are
1790	 * above the limits.
1791	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1792	 */
1793	th = (const struct tcphdr *)skb->data;
1794	hdrlen = th->doff * 4;
1795	shinfo = skb_shinfo(skb);
1796
1797	if (!shinfo->gso_size)
1798		shinfo->gso_size = skb->len - hdrlen;
1799
1800	if (!shinfo->gso_segs)
1801		shinfo->gso_segs = 1;
1802
1803	tail = sk->sk_backlog.tail;
1804	if (!tail)
1805		goto no_coalesce;
1806	thtail = (struct tcphdr *)tail->data;
1807
1808	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1809	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1810	    ((TCP_SKB_CB(tail)->tcp_flags |
1811	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1812	    !((TCP_SKB_CB(tail)->tcp_flags &
1813	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1814	    ((TCP_SKB_CB(tail)->tcp_flags ^
1815	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1816#ifdef CONFIG_TLS_DEVICE
1817	    tail->decrypted != skb->decrypted ||
1818#endif
1819	    thtail->doff != th->doff ||
1820	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1821		goto no_coalesce;
1822
1823	__skb_pull(skb, hdrlen);
1824	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1825		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1826
1827		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1828			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1829			thtail->window = th->window;
1830		}
1831
1832		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1833		 * thtail->fin, so that the fast path in tcp_rcv_established()
1834		 * is not entered if we append a packet with a FIN.
1835		 * SYN, RST, URG are not present.
1836		 * ACK is set on both packets.
1837		 * PSH : we do not really care in TCP stack,
1838		 *       at least for 'GRO' packets.
1839		 */
1840		thtail->fin |= th->fin;
1841		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1842
1843		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1844			TCP_SKB_CB(tail)->has_rxtstamp = true;
1845			tail->tstamp = skb->tstamp;
1846			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1847		}
1848
1849		/* Not as strict as GRO. We only need to carry mss max value */
1850		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1851						 skb_shinfo(tail)->gso_size);
1852
1853		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1854		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1855
1856		sk->sk_backlog.len += delta;
1857		__NET_INC_STATS(sock_net(sk),
1858				LINUX_MIB_TCPBACKLOGCOALESCE);
1859		kfree_skb_partial(skb, fragstolen);
1860		return false;
1861	}
1862	__skb_push(skb, hdrlen);
1863
1864no_coalesce:
1865	/* Only socket owner can try to collapse/prune rx queues
1866	 * to reduce memory overhead, so add a little headroom here.
1867	 * Few sockets backlog are possibly concurrently non empty.
1868	 */
1869	limit += 64*1024;
1870
1871	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1872		bh_unlock_sock(sk);
1873		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1874		return true;
1875	}
1876	return false;
1877}
1878EXPORT_SYMBOL(tcp_add_backlog);
1879
1880int tcp_filter(struct sock *sk, struct sk_buff *skb)
1881{
1882	struct tcphdr *th = (struct tcphdr *)skb->data;
1883
1884	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1885}
1886EXPORT_SYMBOL(tcp_filter);
1887
1888static void tcp_v4_restore_cb(struct sk_buff *skb)
1889{
1890	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1891		sizeof(struct inet_skb_parm));
1892}
1893
1894static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1895			   const struct tcphdr *th)
1896{
1897	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1898	 * barrier() makes sure compiler wont play fool^Waliasing games.
1899	 */
1900	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1901		sizeof(struct inet_skb_parm));
1902	barrier();
1903
1904	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1905	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1906				    skb->len - th->doff * 4);
1907	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1908	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1909	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1910	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1911	TCP_SKB_CB(skb)->sacked	 = 0;
1912	TCP_SKB_CB(skb)->has_rxtstamp =
1913			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1914}
1915
1916/*
1917 *	From tcp_input.c
1918 */
1919
1920int tcp_v4_rcv(struct sk_buff *skb)
1921{
1922	struct net *net = dev_net(skb->dev);
1923	struct sk_buff *skb_to_free;
1924	int sdif = inet_sdif(skb);
1925	int dif = inet_iif(skb);
1926	const struct iphdr *iph;
1927	const struct tcphdr *th;
1928	bool refcounted;
1929	struct sock *sk;
1930	int ret;
1931
1932	if (skb->pkt_type != PACKET_HOST)
1933		goto discard_it;
1934
1935	/* Count it even if it's bad */
1936	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1937
1938	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1939		goto discard_it;
1940
1941	th = (const struct tcphdr *)skb->data;
1942
1943	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1944		goto bad_packet;
1945	if (!pskb_may_pull(skb, th->doff * 4))
1946		goto discard_it;
1947
1948	/* An explanation is required here, I think.
1949	 * Packet length and doff are validated by header prediction,
1950	 * provided case of th->doff==0 is eliminated.
1951	 * So, we defer the checks. */
1952
1953	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1954		goto csum_error;
1955
1956	th = (const struct tcphdr *)skb->data;
1957	iph = ip_hdr(skb);
1958lookup:
1959	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1960			       th->dest, sdif, &refcounted);
1961	if (!sk)
1962		goto no_tcp_socket;
1963
1964process:
1965	if (sk->sk_state == TCP_TIME_WAIT)
1966		goto do_time_wait;
1967
1968	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1969		struct request_sock *req = inet_reqsk(sk);
1970		bool req_stolen = false;
1971		struct sock *nsk;
1972
1973		sk = req->rsk_listener;
1974		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1975			sk_drops_add(sk, skb);
1976			reqsk_put(req);
1977			goto discard_it;
1978		}
1979		if (tcp_checksum_complete(skb)) {
1980			reqsk_put(req);
1981			goto csum_error;
1982		}
1983		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1984			inet_csk_reqsk_queue_drop_and_put(sk, req);
1985			goto lookup;
1986		}
1987		/* We own a reference on the listener, increase it again
1988		 * as we might lose it too soon.
1989		 */
1990		sock_hold(sk);
1991		refcounted = true;
1992		nsk = NULL;
1993		if (!tcp_filter(sk, skb)) {
1994			th = (const struct tcphdr *)skb->data;
1995			iph = ip_hdr(skb);
1996			tcp_v4_fill_cb(skb, iph, th);
1997			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1998		}
1999		if (!nsk) {
2000			reqsk_put(req);
2001			if (req_stolen) {
2002				/* Another cpu got exclusive access to req
2003				 * and created a full blown socket.
2004				 * Try to feed this packet to this socket
2005				 * instead of discarding it.
2006				 */
2007				tcp_v4_restore_cb(skb);
2008				sock_put(sk);
2009				goto lookup;
2010			}
2011			goto discard_and_relse;
2012		}
2013		if (nsk == sk) {
2014			reqsk_put(req);
2015			tcp_v4_restore_cb(skb);
2016		} else if (tcp_child_process(sk, nsk, skb)) {
2017			tcp_v4_send_reset(nsk, skb);
2018			goto discard_and_relse;
2019		} else {
2020			sock_put(sk);
2021			return 0;
2022		}
2023	}
2024	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2025		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2026		goto discard_and_relse;
2027	}
2028
2029	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2030		goto discard_and_relse;
2031
2032	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2033		goto discard_and_relse;
2034
2035	nf_reset_ct(skb);
2036
2037	if (tcp_filter(sk, skb))
2038		goto discard_and_relse;
2039	th = (const struct tcphdr *)skb->data;
2040	iph = ip_hdr(skb);
2041	tcp_v4_fill_cb(skb, iph, th);
2042
2043	skb->dev = NULL;
2044
2045	if (sk->sk_state == TCP_LISTEN) {
2046		ret = tcp_v4_do_rcv(sk, skb);
2047		goto put_and_return;
2048	}
2049
2050	sk_incoming_cpu_update(sk);
2051
2052	bh_lock_sock_nested(sk);
2053	tcp_segs_in(tcp_sk(sk), skb);
2054	ret = 0;
2055	if (!sock_owned_by_user(sk)) {
2056		skb_to_free = sk->sk_rx_skb_cache;
2057		sk->sk_rx_skb_cache = NULL;
2058		ret = tcp_v4_do_rcv(sk, skb);
2059	} else {
2060		if (tcp_add_backlog(sk, skb))
2061			goto discard_and_relse;
2062		skb_to_free = NULL;
2063	}
2064	bh_unlock_sock(sk);
2065	if (skb_to_free)
2066		__kfree_skb(skb_to_free);
2067
2068put_and_return:
2069	if (refcounted)
2070		sock_put(sk);
2071
2072	return ret;
2073
2074no_tcp_socket:
2075	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2076		goto discard_it;
2077
2078	tcp_v4_fill_cb(skb, iph, th);
2079
2080	if (tcp_checksum_complete(skb)) {
2081csum_error:
2082		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2083bad_packet:
2084		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2085	} else {
2086		tcp_v4_send_reset(NULL, skb);
2087	}
2088
2089discard_it:
2090	/* Discard frame. */
2091	kfree_skb(skb);
2092	return 0;
2093
2094discard_and_relse:
2095	sk_drops_add(sk, skb);
2096	if (refcounted)
2097		sock_put(sk);
2098	goto discard_it;
2099
2100do_time_wait:
2101	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2102		inet_twsk_put(inet_twsk(sk));
2103		goto discard_it;
2104	}
2105
2106	tcp_v4_fill_cb(skb, iph, th);
2107
2108	if (tcp_checksum_complete(skb)) {
2109		inet_twsk_put(inet_twsk(sk));
2110		goto csum_error;
2111	}
2112	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2113	case TCP_TW_SYN: {
2114		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2115							&tcp_hashinfo, skb,
2116							__tcp_hdrlen(th),
2117							iph->saddr, th->source,
2118							iph->daddr, th->dest,
2119							inet_iif(skb),
2120							sdif);
2121		if (sk2) {
2122			inet_twsk_deschedule_put(inet_twsk(sk));
2123			sk = sk2;
2124			tcp_v4_restore_cb(skb);
2125			refcounted = false;
2126			goto process;
2127		}
2128	}
2129		/* to ACK */
2130		fallthrough;
2131	case TCP_TW_ACK:
2132		tcp_v4_timewait_ack(sk, skb);
2133		break;
2134	case TCP_TW_RST:
2135		tcp_v4_send_reset(sk, skb);
2136		inet_twsk_deschedule_put(inet_twsk(sk));
2137		goto discard_it;
2138	case TCP_TW_SUCCESS:;
2139	}
2140	goto discard_it;
2141}
2142
2143static struct timewait_sock_ops tcp_timewait_sock_ops = {
2144	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2145	.twsk_unique	= tcp_twsk_unique,
2146	.twsk_destructor= tcp_twsk_destructor,
2147};
2148
2149void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2150{
2151	struct dst_entry *dst = skb_dst(skb);
2152
2153	if (dst && dst_hold_safe(dst)) {
2154		sk->sk_rx_dst = dst;
2155		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2156	}
2157}
2158EXPORT_SYMBOL(inet_sk_rx_dst_set);
2159
2160const struct inet_connection_sock_af_ops ipv4_specific = {
2161	.queue_xmit	   = ip_queue_xmit,
2162	.send_check	   = tcp_v4_send_check,
2163	.rebuild_header	   = inet_sk_rebuild_header,
2164	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2165	.conn_request	   = tcp_v4_conn_request,
2166	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2167	.net_header_len	   = sizeof(struct iphdr),
2168	.setsockopt	   = ip_setsockopt,
2169	.getsockopt	   = ip_getsockopt,
2170	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2171	.sockaddr_len	   = sizeof(struct sockaddr_in),
2172	.mtu_reduced	   = tcp_v4_mtu_reduced,
2173};
2174EXPORT_SYMBOL(ipv4_specific);
2175
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 MD5 signature hooks; installed as af_specific by tcp_v4_init_sock(). */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_parse		= tcp_v4_parse_md5_keys,
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
};
#endif
2183
2184/* NOTE: A lot of things set to zero explicitly by call to
2185 *       sk_alloc() so need not be done here.
2186 */
2187static int tcp_v4_init_sock(struct sock *sk)
2188{
2189	struct inet_connection_sock *icsk = inet_csk(sk);
2190
2191	tcp_init_sock(sk);
2192
2193	icsk->icsk_af_ops = &ipv4_specific;
2194
2195#ifdef CONFIG_TCP_MD5SIG
2196	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2197#endif
2198
2199	return 0;
2200}
2201
/*
 * Tear down the TCP state attached to @sk.
 *
 * In order: xmit timers, congestion control, ULP, write queue, the
 * out-of-order queue, MD5 keys (CONFIG_TCP_MD5SIG only), the bind bucket
 * and the Fast Open leftovers are released, and finally the
 * allocated-sockets counter is decremented.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any.  The container itself is
	 * handed to kfree_rcu() rather than being freed immediately.
	 */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/* A Fast Open request sock must no longer be attached here. */
	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
2246
2247#ifdef CONFIG_PROC_FS
2248/* Proc filesystem TCP sock list dumping. */
2249
2250/*
2251 * Get next listener socket follow cur.  If cur is NULL, get first socket
2252 * starting from bucket given in st->bucket; when st->bucket is zero the
2253 * very first socket in the hash table is returned.
2254 */
2255static void *listening_get_next(struct seq_file *seq, void *cur)
2256{
2257	struct tcp_seq_afinfo *afinfo;
2258	struct tcp_iter_state *st = seq->private;
2259	struct net *net = seq_file_net(seq);
2260	struct inet_listen_hashbucket *ilb;
2261	struct hlist_nulls_node *node;
2262	struct sock *sk = cur;
2263
2264	if (st->bpf_seq_afinfo)
2265		afinfo = st->bpf_seq_afinfo;
2266	else
2267		afinfo = PDE_DATA(file_inode(seq->file));
2268
2269	if (!sk) {
2270get_head:
2271		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2272		spin_lock(&ilb->lock);
2273		sk = sk_nulls_head(&ilb->nulls_head);
2274		st->offset = 0;
2275		goto get_sk;
2276	}
2277	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2278	++st->num;
2279	++st->offset;
2280
2281	sk = sk_nulls_next(sk);
2282get_sk:
2283	sk_nulls_for_each_from(sk, node) {
2284		if (!net_eq(sock_net(sk), net))
2285			continue;
2286		if (afinfo->family == AF_UNSPEC ||
2287		    sk->sk_family == afinfo->family)
2288			return sk;
2289	}
2290	spin_unlock(&ilb->lock);
2291	st->offset = 0;
2292	if (++st->bucket < INET_LHTABLE_SIZE)
2293		goto get_head;
2294	return NULL;
2295}
2296
2297static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2298{
2299	struct tcp_iter_state *st = seq->private;
2300	void *rc;
2301
2302	st->bucket = 0;
2303	st->offset = 0;
2304	rc = listening_get_next(seq, NULL);
2305
2306	while (rc && *pos) {
2307		rc = listening_get_next(seq, rc);
2308		--*pos;
2309	}
2310	return rc;
2311}
2312
2313static inline bool empty_bucket(const struct tcp_iter_state *st)
2314{
2315	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2316}
2317
2318/*
2319 * Get first established socket starting from bucket given in st->bucket.
2320 * If st->bucket is zero, the very first socket in the hash is returned.
2321 */
2322static void *established_get_first(struct seq_file *seq)
2323{
2324	struct tcp_seq_afinfo *afinfo;
2325	struct tcp_iter_state *st = seq->private;
2326	struct net *net = seq_file_net(seq);
2327	void *rc = NULL;
2328
2329	if (st->bpf_seq_afinfo)
2330		afinfo = st->bpf_seq_afinfo;
2331	else
2332		afinfo = PDE_DATA(file_inode(seq->file));
2333
2334	st->offset = 0;
2335	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2336		struct sock *sk;
2337		struct hlist_nulls_node *node;
2338		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2339
2340		/* Lockless fast path for the common case of empty buckets */
2341		if (empty_bucket(st))
2342			continue;
2343
2344		spin_lock_bh(lock);
2345		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2346			if ((afinfo->family != AF_UNSPEC &&
2347			     sk->sk_family != afinfo->family) ||
2348			    !net_eq(sock_net(sk), net)) {
2349				continue;
2350			}
2351			rc = sk;
2352			goto out;
2353		}
2354		spin_unlock_bh(lock);
2355	}
2356out:
2357	return rc;
2358}
2359
2360static void *established_get_next(struct seq_file *seq, void *cur)
2361{
2362	struct tcp_seq_afinfo *afinfo;
2363	struct sock *sk = cur;
2364	struct hlist_nulls_node *node;
2365	struct tcp_iter_state *st = seq->private;
2366	struct net *net = seq_file_net(seq);
2367
2368	if (st->bpf_seq_afinfo)
2369		afinfo = st->bpf_seq_afinfo;
2370	else
2371		afinfo = PDE_DATA(file_inode(seq->file));
2372
2373	++st->num;
2374	++st->offset;
2375
2376	sk = sk_nulls_next(sk);
2377
2378	sk_nulls_for_each_from(sk, node) {
2379		if ((afinfo->family == AF_UNSPEC ||
2380		     sk->sk_family == afinfo->family) &&
2381		    net_eq(sock_net(sk), net))
2382			return sk;
2383	}
2384
2385	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2386	++st->bucket;
2387	return established_get_first(seq);
2388}
2389
2390static void *established_get_idx(struct seq_file *seq, loff_t pos)
2391{
2392	struct tcp_iter_state *st = seq->private;
2393	void *rc;
2394
2395	st->bucket = 0;
2396	rc = established_get_first(seq);
2397
2398	while (rc && pos) {
2399		rc = established_get_next(seq, rc);
2400		--pos;
2401	}
2402	return rc;
2403}
2404
2405static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2406{
2407	void *rc;
2408	struct tcp_iter_state *st = seq->private;
2409
2410	st->state = TCP_SEQ_STATE_LISTENING;
2411	rc	  = listening_get_idx(seq, &pos);
2412
2413	if (!rc) {
2414		st->state = TCP_SEQ_STATE_ESTABLISHED;
2415		rc	  = established_get_idx(seq, pos);
2416	}
2417
2418	return rc;
2419}
2420
2421static void *tcp_seek_last_pos(struct seq_file *seq)
2422{
2423	struct tcp_iter_state *st = seq->private;
2424	int offset = st->offset;
2425	int orig_num = st->num;
2426	void *rc = NULL;
2427
2428	switch (st->state) {
2429	case TCP_SEQ_STATE_LISTENING:
2430		if (st->bucket >= INET_LHTABLE_SIZE)
2431			break;
2432		st->state = TCP_SEQ_STATE_LISTENING;
2433		rc = listening_get_next(seq, NULL);
2434		while (offset-- && rc)
2435			rc = listening_get_next(seq, rc);
2436		if (rc)
2437			break;
2438		st->bucket = 0;
2439		st->state = TCP_SEQ_STATE_ESTABLISHED;
2440		fallthrough;
2441	case TCP_SEQ_STATE_ESTABLISHED:
2442		if (st->bucket > tcp_hashinfo.ehash_mask)
2443			break;
2444		rc = established_get_first(seq);
2445		while (offset-- && rc)
2446			rc = established_get_next(seq, rc);
2447	}
2448
2449	st->num = orig_num;
2450
2451	return rc;
2452}
2453
2454void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2455{
2456	struct tcp_iter_state *st = seq->private;
2457	void *rc;
2458
2459	if (*pos && *pos == st->last_pos) {
2460		rc = tcp_seek_last_pos(seq);
2461		if (rc)
2462			goto out;
2463	}
2464
2465	st->state = TCP_SEQ_STATE_LISTENING;
2466	st->num = 0;
2467	st->bucket = 0;
2468	st->offset = 0;
2469	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2470
2471out:
2472	st->last_pos = *pos;
2473	return rc;
2474}
2475EXPORT_SYMBOL(tcp_seq_start);
2476
2477void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2478{
2479	struct tcp_iter_state *st = seq->private;
2480	void *rc = NULL;
2481
2482	if (v == SEQ_START_TOKEN) {
2483		rc = tcp_get_idx(seq, 0);
2484		goto out;
2485	}
2486
2487	switch (st->state) {
2488	case TCP_SEQ_STATE_LISTENING:
2489		rc = listening_get_next(seq, v);
2490		if (!rc) {
2491			st->state = TCP_SEQ_STATE_ESTABLISHED;
2492			st->bucket = 0;
2493			st->offset = 0;
2494			rc	  = established_get_first(seq);
2495		}
2496		break;
2497	case TCP_SEQ_STATE_ESTABLISHED:
2498		rc = established_get_next(seq, v);
2499		break;
2500	}
2501out:
2502	++*pos;
2503	st->last_pos = *pos;
2504	return rc;
2505}
2506EXPORT_SYMBOL(tcp_seq_next);
2507
2508void tcp_seq_stop(struct seq_file *seq, void *v)
2509{
2510	struct tcp_iter_state *st = seq->private;
2511
2512	switch (st->state) {
2513	case TCP_SEQ_STATE_LISTENING:
2514		if (v != SEQ_START_TOKEN)
2515			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2516		break;
2517	case TCP_SEQ_STATE_ESTABLISHED:
2518		if (v)
2519			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2520		break;
2521	}
2522}
2523EXPORT_SYMBOL(tcp_seq_stop);
2524
2525static void get_openreq4(const struct request_sock *req,
2526			 struct seq_file *f, int i)
2527{
2528	const struct inet_request_sock *ireq = inet_rsk(req);
2529	long delta = req->rsk_timer.expires - jiffies;
2530
2531	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2532		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2533		i,
2534		ireq->ir_loc_addr,
2535		ireq->ir_num,
2536		ireq->ir_rmt_addr,
2537		ntohs(ireq->ir_rmt_port),
2538		TCP_SYN_RECV,
2539		0, 0, /* could print option size, but that is af dependent. */
2540		1,    /* timers active (only the expire timer) */
2541		jiffies_delta_to_clock_t(delta),
2542		req->num_timeout,
2543		from_kuid_munged(seq_user_ns(f),
2544				 sock_i_uid(req->rsk_listener)),
2545		0,  /* non standard timer */
2546		0, /* open_requests have no inode */
2547		0,
2548		req);
2549}
2550
2551static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2552{
2553	int timer_active;
2554	unsigned long timer_expires;
2555	const struct tcp_sock *tp = tcp_sk(sk);
2556	const struct inet_connection_sock *icsk = inet_csk(sk);
2557	const struct inet_sock *inet = inet_sk(sk);
2558	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2559	__be32 dest = inet->inet_daddr;
2560	__be32 src = inet->inet_rcv_saddr;
2561	__u16 destp = ntohs(inet->inet_dport);
2562	__u16 srcp = ntohs(inet->inet_sport);
2563	int rx_queue;
2564	int state;
2565
2566	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2567	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2568	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2569		timer_active	= 1;
2570		timer_expires	= icsk->icsk_timeout;
2571	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2572		timer_active	= 4;
2573		timer_expires	= icsk->icsk_timeout;
2574	} else if (timer_pending(&sk->sk_timer)) {
2575		timer_active	= 2;
2576		timer_expires	= sk->sk_timer.expires;
2577	} else {
2578		timer_active	= 0;
2579		timer_expires = jiffies;
2580	}
2581
2582	state = inet_sk_state_load(sk);
2583	if (state == TCP_LISTEN)
2584		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2585	else
2586		/* Because we don't lock the socket,
2587		 * we might find a transient negative value.
2588		 */
2589		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2590				      READ_ONCE(tp->copied_seq), 0);
2591
2592	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2593			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2594		i, src, srcp, dest, destp, state,
2595		READ_ONCE(tp->write_seq) - tp->snd_una,
2596		rx_queue,
2597		timer_active,
2598		jiffies_delta_to_clock_t(timer_expires - jiffies),
2599		icsk->icsk_retransmits,
2600		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2601		icsk->icsk_probes_out,
2602		sock_i_ino(sk),
2603		refcount_read(&sk->sk_refcnt), sk,
2604		jiffies_to_clock_t(icsk->icsk_rto),
2605		jiffies_to_clock_t(icsk->icsk_ack.ato),
2606		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2607		tp->snd_cwnd,
2608		state == TCP_LISTEN ?
2609		    fastopenq->max_qlen :
2610		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2611}
2612
2613static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2614			       struct seq_file *f, int i)
2615{
2616	long delta = tw->tw_timer.expires - jiffies;
2617	__be32 dest, src;
2618	__u16 destp, srcp;
2619
2620	dest  = tw->tw_daddr;
2621	src   = tw->tw_rcv_saddr;
2622	destp = ntohs(tw->tw_dport);
2623	srcp  = ntohs(tw->tw_sport);
2624
2625	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2626		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2627		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2628		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2629		refcount_read(&tw->tw_refcnt), tw);
2630}
2631
2632#define TMPSZ 150
2633
2634static int tcp4_seq_show(struct seq_file *seq, void *v)
2635{
2636	struct tcp_iter_state *st;
2637	struct sock *sk = v;
2638
2639	seq_setwidth(seq, TMPSZ - 1);
2640	if (v == SEQ_START_TOKEN) {
2641		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2642			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2643			   "inode");
2644		goto out;
2645	}
2646	st = seq->private;
2647
2648	if (sk->sk_state == TCP_TIME_WAIT)
2649		get_timewait4_sock(v, seq, st->num);
2650	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2651		get_openreq4(v, seq, st->num);
2652	else
2653		get_tcp4_sock(v, seq, st->num);
2654out:
2655	seq_pad(seq, '\n');
2656	return 0;
2657}
2658
2659#ifdef CONFIG_BPF_SYSCALL
/*
 * Context passed to a BPF "tcp" iterator program; filled in by
 * tcp_prog_seq_show().
 */
struct bpf_iter__tcp {
	/* iterator metadata supplied by the caller */
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	/* socket being visited; NULL when invoked from the stop callback */
	__bpf_md_ptr(struct sock_common *, sk_common);
	/* uid reported for the socket (0 for TIME_WAIT and at stop) */
	uid_t uid __aligned(8);
};
2665
2666static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2667			     struct sock_common *sk_common, uid_t uid)
2668{
2669	struct bpf_iter__tcp ctx;
2670
2671	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2672	ctx.meta = meta;
2673	ctx.sk_common = sk_common;
2674	ctx.uid = uid;
2675	return bpf_iter_run_prog(prog, &ctx);
2676}
2677
2678static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2679{
2680	struct bpf_iter_meta meta;
2681	struct bpf_prog *prog;
2682	struct sock *sk = v;
2683	uid_t uid;
2684
2685	if (v == SEQ_START_TOKEN)
2686		return 0;
2687
2688	if (sk->sk_state == TCP_TIME_WAIT) {
2689		uid = 0;
2690	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2691		const struct request_sock *req = v;
2692
2693		uid = from_kuid_munged(seq_user_ns(seq),
2694				       sock_i_uid(req->rsk_listener));
2695	} else {
2696		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2697	}
2698
2699	meta.seq = seq;
2700	prog = bpf_iter_get_info(&meta, false);
2701	return tcp_prog_seq_show(prog, &meta, v, uid);
2702}
2703
2704static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2705{
2706	struct bpf_iter_meta meta;
2707	struct bpf_prog *prog;
2708
2709	if (!v) {
2710		meta.seq = seq;
2711		prog = bpf_iter_get_info(&meta, true);
2712		if (prog)
2713			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2714	}
2715
2716	tcp_seq_stop(seq, v);
2717}
2718
2719static const struct seq_operations bpf_iter_tcp_seq_ops = {
2720	.show		= bpf_iter_tcp_seq_show,
2721	.start		= tcp_seq_start,
2722	.next		= tcp_seq_next,
2723	.stop		= bpf_iter_tcp_seq_stop,
2724};
2725#endif
2726
2727static const struct seq_operations tcp4_seq_ops = {
2728	.show		= tcp4_seq_show,
2729	.start		= tcp_seq_start,
2730	.next		= tcp_seq_next,
2731	.stop		= tcp_seq_stop,
2732};
2733
2734static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2735	.family		= AF_INET,
2736};
2737
2738static int __net_init tcp4_proc_init_net(struct net *net)
2739{
2740	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2741			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2742		return -ENOMEM;
2743	return 0;
2744}
2745
2746static void __net_exit tcp4_proc_exit_net(struct net *net)
2747{
2748	remove_proc_entry("tcp", net->proc_net);
2749}
2750
2751static struct pernet_operations tcp4_net_ops = {
2752	.init = tcp4_proc_init_net,
2753	.exit = tcp4_proc_exit_net,
2754};
2755
2756int __init tcp4_proc_init(void)
2757{
2758	return register_pernet_subsys(&tcp4_net_ops);
2759}
2760
/* Unregister the per-namespace "tcp" proc hooks (tcp4_net_ops). */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
2765#endif /* CONFIG_PROC_FS */
2766
2767/* @wake is one when sk_stream_write_space() calls us.
2768 * This sends EPOLLOUT only if notsent_bytes is half the limit.
2769 * This mimics the strategy used in sock_def_write_space().
2770 */
2771bool tcp_stream_memory_free(const struct sock *sk, int wake)
2772{
2773	const struct tcp_sock *tp = tcp_sk(sk);
2774	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
2775			    READ_ONCE(tp->snd_nxt);
2776
2777	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
2778}
2779EXPORT_SYMBOL(tcp_stream_memory_free);
2780
2781struct proto tcp_prot = {
2782	.name			= "TCP",
2783	.owner			= THIS_MODULE,
2784	.close			= tcp_close,
2785	.pre_connect		= tcp_v4_pre_connect,
2786	.connect		= tcp_v4_connect,
2787	.disconnect		= tcp_disconnect,
2788	.accept			= inet_csk_accept,
2789	.ioctl			= tcp_ioctl,
2790	.init			= tcp_v4_init_sock,
2791	.destroy		= tcp_v4_destroy_sock,
2792	.shutdown		= tcp_shutdown,
2793	.setsockopt		= tcp_setsockopt,
2794	.getsockopt		= tcp_getsockopt,
2795	.keepalive		= tcp_set_keepalive,
2796	.recvmsg		= tcp_recvmsg,
2797	.sendmsg		= tcp_sendmsg,
2798	.sendpage		= tcp_sendpage,
2799	.backlog_rcv		= tcp_v4_do_rcv,
2800	.release_cb		= tcp_release_cb,
2801	.hash			= inet_hash,
2802	.unhash			= inet_unhash,
2803	.get_port		= inet_csk_get_port,
2804	.enter_memory_pressure	= tcp_enter_memory_pressure,
2805	.leave_memory_pressure	= tcp_leave_memory_pressure,
2806	.stream_memory_free	= tcp_stream_memory_free,
2807	.sockets_allocated	= &tcp_sockets_allocated,
2808	.orphan_count		= &tcp_orphan_count,
2809	.memory_allocated	= &tcp_memory_allocated,
2810	.memory_pressure	= &tcp_memory_pressure,
2811	.sysctl_mem		= sysctl_tcp_mem,
2812	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2813	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2814	.max_header		= MAX_TCP_HEADER,
2815	.obj_size		= sizeof(struct tcp_sock),
2816	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2817	.twsk_prot		= &tcp_timewait_sock_ops,
2818	.rsk_prot		= &tcp_request_sock_ops,
2819	.h.hashinfo		= &tcp_hashinfo,
2820	.no_autobind		= true,
2821	.diag_destroy		= tcp_abort,
2822};
2823EXPORT_SYMBOL(tcp_prot);
2824
2825static void __net_exit tcp_sk_exit(struct net *net)
2826{
2827	int cpu;
2828
2829	if (net->ipv4.tcp_congestion_control)
2830		bpf_module_put(net->ipv4.tcp_congestion_control,
2831			       net->ipv4.tcp_congestion_control->owner);
2832
2833	for_each_possible_cpu(cpu)
2834		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2835	free_percpu(net->ipv4.tcp_sk);
2836}
2837
/*
 * Per-namespace TCP initialisation.
 *
 * Creates one raw IPPROTO_TCP control socket per possible CPU, fills in
 * the namespace's TCP sysctl defaults and selects the default congestion
 * control.
 *
 * Returns 0 on success, -ENOMEM if the per-cpu array cannot be allocated,
 * or the error from inet_ctl_sock_create() after undoing the partial setup.
 */
static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu, cnt;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	/* Defaults for the per-namespace TCP sysctls follow. */
	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	/* Scale the TIME_WAIT bucket limit and the SYN backlog with the
	 * size of the established hash table.
	 */
	cnt = tcp_hashinfo.ehash_mask + 1;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;

	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	/* Namespaces other than init_net start with a copy of init_net's
	 * current rmem/wmem settings.
	 */
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Reno is always built in.  A new namespace uses init_net's
	 * congestion control when a reference on it can be taken and
	 * falls back to Reno otherwise; init_net itself starts with Reno.
	 */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	/* NOTE(review): tcp_sk_exit() walks every possible CPU, including
	 * slots not populated yet - presumably inet_ctl_sock_destroy()
	 * accepts a NULL socket; confirm.
	 */
	tcp_sk_exit(net);

	return res;
}
2951
2952static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2953{
2954	struct net *net;
2955
2956	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2957
2958	list_for_each_entry(net, net_exit_list, exit_list)
2959		tcp_fastopen_ctx_destroy(net);
2960}
2961
2962static struct pernet_operations __net_initdata tcp_sk_ops = {
2963       .init	   = tcp_sk_init,
2964       .exit	   = tcp_sk_exit,
2965       .exit_batch = tcp_sk_exit_batch,
2966};
2967
2968#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/*
 * Declares the "tcp" BPF iterator; the argument list matches the members
 * of struct bpf_iter__tcp (meta, sk_common, uid).
 */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)
2971
2972static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2973{
2974	struct tcp_iter_state *st = priv_data;
2975	struct tcp_seq_afinfo *afinfo;
2976	int ret;
2977
2978	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2979	if (!afinfo)
2980		return -ENOMEM;
2981
2982	afinfo->family = AF_UNSPEC;
2983	st->bpf_seq_afinfo = afinfo;
2984	ret = bpf_iter_init_seq_net(priv_data, aux);
2985	if (ret)
2986		kfree(afinfo);
2987	return ret;
2988}
2989
2990static void bpf_iter_fini_tcp(void *priv_data)
2991{
2992	struct tcp_iter_state *st = priv_data;
2993
2994	kfree(st->bpf_seq_afinfo);
2995	bpf_iter_fini_seq_net(priv_data);
2996}
2997
2998static const struct bpf_iter_seq_info tcp_seq_info = {
2999	.seq_ops		= &bpf_iter_tcp_seq_ops,
3000	.init_seq_private	= bpf_iter_init_tcp,
3001	.fini_seq_private	= bpf_iter_fini_tcp,
3002	.seq_priv_size		= sizeof(struct tcp_iter_state),
3003};
3004
3005static struct bpf_iter_reg tcp_reg_info = {
3006	.target			= "tcp",
3007	.ctx_arg_info_size	= 1,
3008	.ctx_arg_info		= {
3009		{ offsetof(struct bpf_iter__tcp, sk_common),
3010		  PTR_TO_BTF_ID_OR_NULL },
3011	},
3012	.seq_info		= &tcp_seq_info,
3013};
3014
3015static void __init bpf_iter_register(void)
3016{
3017	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3018	if (bpf_iter_reg_target(&tcp_reg_info))
3019		pr_warn("Warning: could not register bpf iterator tcp\n");
3020}
3021
3022#endif
3023
3024void __init tcp_v4_init(void)
3025{
3026	if (register_pernet_subsys(&tcp_sk_ops))
3027		panic("Failed to create the TCP control socket.\n");
3028
3029#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3030	bpf_iter_register();
3031#endif
3032}
Configure Feed

Configure Feed