net/ipv4/tcp_ipv4.c at v5.4-rc3

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / net / ipv4 / tcp_ipv4.c
at v5.4-rc3 2758 lines 74 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   4 *		operating system.  INET is implemented using the  BSD Socket
   5 *		interface as the means of communication with the user level.
   6 *
   7 *		Implementation of the Transmission Control Protocol(TCP).
   8 *
   9 *		IPv4 specific functions
  10 *
  11 *		code split from:
  12 *		linux/ipv4/tcp.c
  13 *		linux/ipv4/tcp_input.c
  14 *		linux/ipv4/tcp_output.c
  15 *
  16 *		See tcp.c for author information
  17 */
  18
  19/*
  20 * Changes:
  21 *		David S. Miller	:	New socket lookup architecture.
  22 *					This code is dedicated to John Dyson.
  23 *		David S. Miller :	Change semantics of established hash,
  24 *					half is devoted to TIME_WAIT sockets
  25 *					and the rest go in the other half.
  26 *		Andi Kleen :		Add support for syncookies and fixed
  27 *					some bugs: ip options weren't passed to
  28 *					the TCP layer, missed a check for an
  29 *					ACK bit.
  30 *		Andi Kleen :		Implemented fast path mtu discovery.
  31 *	     				Fixed many serious bugs in the
  32 *					request_sock handling and moved
  33 *					most of it into the af independent code.
  34 *					Added tail drop and some other bugfixes.
  35 *					Added new listen semantics.
  36 *		Mike McLagan	:	Routing by source
  37 *	Juan Jose Ciarlante:		ip_dynaddr bits
  38 *		Andi Kleen:		various fixes.
  39 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
  40 *					coma.
  41 *	Andi Kleen		:	Fix new listen.
  42 *	Andi Kleen		:	Fix accept error reporting.
  43 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
  44 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
  45 *					a single port at the same time.
  46 */
  47
  48#define pr_fmt(fmt) "TCP: " fmt
  49
  50#include <linux/bottom_half.h>
  51#include <linux/types.h>
  52#include <linux/fcntl.h>
  53#include <linux/module.h>
  54#include <linux/random.h>
  55#include <linux/cache.h>
  56#include <linux/jhash.h>
  57#include <linux/init.h>
  58#include <linux/times.h>
  59#include <linux/slab.h>
  60
  61#include <net/net_namespace.h>
  62#include <net/icmp.h>
  63#include <net/inet_hashtables.h>
  64#include <net/tcp.h>
  65#include <net/transp_v6.h>
  66#include <net/ipv6.h>
  67#include <net/inet_common.h>
  68#include <net/timewait_sock.h>
  69#include <net/xfrm.h>
  70#include <net/secure_seq.h>
  71#include <net/busy_poll.h>
  72
  73#include <linux/inet.h>
  74#include <linux/ipv6.h>
  75#include <linux/stddef.h>
  76#include <linux/proc_fs.h>
  77#include <linux/seq_file.h>
  78#include <linux/inetdevice.h>
  79
  80#include <crypto/hash.h>
  81#include <linux/scatterlist.h>
  82
  83#include <trace/events/tcp.h>
  84
  85#ifdef CONFIG_TCP_MD5SIG
  86static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  87			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
  88#endif
  89
/* TCP socket hash tables. The socket lookups in this file pass
 * &tcp_hashinfo to the inet hashtable helpers. Exported to modules.
 */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
  92
  93static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  94{
  95	return secure_tcp_seq(ip_hdr(skb)->daddr,
  96			      ip_hdr(skb)->saddr,
  97			      tcp_hdr(skb)->dest,
  98			      tcp_hdr(skb)->source);
  99}
 100
 101static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 102{
 103	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 104}
 105
 106int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 107{
 108	const struct inet_timewait_sock *tw = inet_twsk(sktw);
 109	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 110	struct tcp_sock *tp = tcp_sk(sk);
 111	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
 112
 113	if (reuse == 2) {
 114		/* Still does not detect *everything* that goes through
 115		 * lo, since we require a loopback src or dst address
 116		 * or direct binding to 'lo' interface.
 117		 */
 118		bool loopback = false;
 119		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 120			loopback = true;
 121#if IS_ENABLED(CONFIG_IPV6)
 122		if (tw->tw_family == AF_INET6) {
 123			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 124			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
 125			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
 126			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 127			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
 128			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
 129				loopback = true;
 130		} else
 131#endif
 132		{
 133			if (ipv4_is_loopback(tw->tw_daddr) ||
 134			    ipv4_is_loopback(tw->tw_rcv_saddr))
 135				loopback = true;
 136		}
 137		if (!loopback)
 138			reuse = 0;
 139	}
 140
 141	/* With PAWS, it is safe from the viewpoint
 142	   of data integrity. Even without PAWS it is safe provided sequence
 143	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 144
 145	   Actually, the idea is close to VJ's one, only timestamp cache is
 146	   held not per host, but per port pair and TW bucket is used as state
 147	   holder.
 148
 149	   If TW bucket has been already destroyed we fall back to VJ's scheme
 150	   and use initial timestamp retrieved from peer table.
 151	 */
 152	if (tcptw->tw_ts_recent_stamp &&
 153	    (!twp || (reuse && time_after32(ktime_get_seconds(),
 154					    tcptw->tw_ts_recent_stamp)))) {
 155		/* In case of repair and re-using TIME-WAIT sockets we still
 156		 * want to be sure that it is safe as above but honor the
 157		 * sequence numbers and time stamps set as part of the repair
 158		 * process.
 159		 *
 160		 * Without this check re-using a TIME-WAIT socket with TCP
 161		 * repair would accumulate a -1 on the repair assigned
 162		 * sequence number. The first time it is reused the sequence
 163		 * is -1, the second time -2, etc. This fixes that issue
 164		 * without appearing to create any others.
 165		 */
 166		if (likely(!tp->repair)) {
 167			tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 168			if (tp->write_seq == 0)
 169				tp->write_seq = 1;
 170			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
 171			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 172		}
 173		sock_hold(sktw);
 174		return 1;
 175	}
 176
 177	return 0;
 178}
 179EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 180
 181static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 182			      int addr_len)
 183{
 184	/* This check is replicated from tcp_v4_connect() and intended to
 185	 * prevent BPF program called below from accessing bytes that are out
 186	 * of the bound specified by user in addr_len.
 187	 */
 188	if (addr_len < sizeof(struct sockaddr_in))
 189		return -EINVAL;
 190
 191	sock_owned_by_me(sk);
 192
 193	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 194}
 195
 196/* This will initiate an outgoing connection. */
 197int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 198{
 199	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 200	struct inet_sock *inet = inet_sk(sk);
 201	struct tcp_sock *tp = tcp_sk(sk);
 202	__be16 orig_sport, orig_dport;
 203	__be32 daddr, nexthop;
 204	struct flowi4 *fl4;
 205	struct rtable *rt;
 206	int err;
 207	struct ip_options_rcu *inet_opt;
 208	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 209
 210	if (addr_len < sizeof(struct sockaddr_in))
 211		return -EINVAL;
 212
 213	if (usin->sin_family != AF_INET)
 214		return -EAFNOSUPPORT;
 215
 216	nexthop = daddr = usin->sin_addr.s_addr;
 217	inet_opt = rcu_dereference_protected(inet->inet_opt,
 218					     lockdep_sock_is_held(sk));
 219	if (inet_opt && inet_opt->opt.srr) {
 220		if (!daddr)
 221			return -EINVAL;
 222		nexthop = inet_opt->opt.faddr;
 223	}
 224
 225	orig_sport = inet->inet_sport;
 226	orig_dport = usin->sin_port;
 227	fl4 = &inet->cork.fl.u.ip4;
 228	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 229			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 230			      IPPROTO_TCP,
 231			      orig_sport, orig_dport, sk);
 232	if (IS_ERR(rt)) {
 233		err = PTR_ERR(rt);
 234		if (err == -ENETUNREACH)
 235			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 236		return err;
 237	}
 238
 239	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 240		ip_rt_put(rt);
 241		return -ENETUNREACH;
 242	}
 243
 244	if (!inet_opt || !inet_opt->opt.srr)
 245		daddr = fl4->daddr;
 246
 247	if (!inet->inet_saddr)
 248		inet->inet_saddr = fl4->saddr;
 249	sk_rcv_saddr_set(sk, inet->inet_saddr);
 250
 251	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 252		/* Reset inherited state */
 253		tp->rx_opt.ts_recent	   = 0;
 254		tp->rx_opt.ts_recent_stamp = 0;
 255		if (likely(!tp->repair))
 256			tp->write_seq	   = 0;
 257	}
 258
 259	inet->inet_dport = usin->sin_port;
 260	sk_daddr_set(sk, daddr);
 261
 262	inet_csk(sk)->icsk_ext_hdr_len = 0;
 263	if (inet_opt)
 264		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 265
 266	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 267
 268	/* Socket identity is still unknown (sport may be zero).
 269	 * However we set state to SYN-SENT and not releasing socket
 270	 * lock select source port, enter ourselves into the hash tables and
 271	 * complete initialization after this.
 272	 */
 273	tcp_set_state(sk, TCP_SYN_SENT);
 274	err = inet_hash_connect(tcp_death_row, sk);
 275	if (err)
 276		goto failure;
 277
 278	sk_set_txhash(sk);
 279
 280	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 281			       inet->inet_sport, inet->inet_dport, sk);
 282	if (IS_ERR(rt)) {
 283		err = PTR_ERR(rt);
 284		rt = NULL;
 285		goto failure;
 286	}
 287	/* OK, now commit destination to socket.  */
 288	sk->sk_gso_type = SKB_GSO_TCPV4;
 289	sk_setup_caps(sk, &rt->dst);
 290	rt = NULL;
 291
 292	if (likely(!tp->repair)) {
 293		if (!tp->write_seq)
 294			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
 295						       inet->inet_daddr,
 296						       inet->inet_sport,
 297						       usin->sin_port);
 298		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 299						 inet->inet_saddr,
 300						 inet->inet_daddr);
 301	}
 302
 303	inet->inet_id = tp->write_seq ^ jiffies;
 304
 305	if (tcp_fastopen_defer_connect(sk, &err))
 306		return err;
 307	if (err)
 308		goto failure;
 309
 310	err = tcp_connect(sk);
 311
 312	if (err)
 313		goto failure;
 314
 315	return 0;
 316
 317failure:
 318	/*
 319	 * This unhashes the socket and releases the local port,
 320	 * if necessary.
 321	 */
 322	tcp_set_state(sk, TCP_CLOSE);
 323	ip_rt_put(rt);
 324	sk->sk_route_caps = 0;
 325	inet->inet_dport = 0;
 326	return err;
 327}
 328EXPORT_SYMBOL(tcp_v4_connect);
 329
 330/*
 331 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 332 * It can be called through tcp_release_cb() if socket was owned by user
 333 * at the time tcp_v4_err() was called to handle ICMP message.
 334 */
 335void tcp_v4_mtu_reduced(struct sock *sk)
 336{
 337	struct inet_sock *inet = inet_sk(sk);
 338	struct dst_entry *dst;
 339	u32 mtu;
 340
 341	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 342		return;
 343	mtu = tcp_sk(sk)->mtu_info;
 344	dst = inet_csk_update_pmtu(sk, mtu);
 345	if (!dst)
 346		return;
 347
 348	/* Something is about to be wrong... Remember soft error
 349	 * for the case, if this connection will not able to recover.
 350	 */
 351	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 352		sk->sk_err_soft = EMSGSIZE;
 353
 354	mtu = dst_mtu(dst);
 355
 356	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 357	    ip_sk_accept_pmtu(sk) &&
 358	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 359		tcp_sync_mss(sk, mtu);
 360
 361		/* Resend the TCP packet because it's
 362		 * clear that the old packet has been
 363		 * dropped. This is the new "fast" path mtu
 364		 * discovery.
 365		 */
 366		tcp_simple_retransmit(sk);
 367	} /* else let the usual retransmit timer handle it */
 368}
 369EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 370
 371static void do_redirect(struct sk_buff *skb, struct sock *sk)
 372{
 373	struct dst_entry *dst = __sk_dst_check(sk, 0);
 374
 375	if (dst)
 376		dst->ops->redirect(dst, sk, skb);
 377}
 378
 379
 380/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 381void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 382{
 383	struct request_sock *req = inet_reqsk(sk);
 384	struct net *net = sock_net(sk);
 385
 386	/* ICMPs are not backlogged, hence we cannot get
 387	 * an established socket here.
 388	 */
 389	if (seq != tcp_rsk(req)->snt_isn) {
 390		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 391	} else if (abort) {
 392		/*
 393		 * Still in SYN_RECV, just remove it silently.
 394		 * There is no good way to pass the error to the newly
 395		 * created socket, and POSIX does not want network
 396		 * errors returned from accept().
 397		 */
 398		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 399		tcp_listendrop(req->rsk_listener);
 400	}
 401	reqsk_put(req);
 402}
 403EXPORT_SYMBOL(tcp_req_err);
 404
 405/*
 406 * This routine is called by the ICMP module when it gets some
 407 * sort of error condition.  If err < 0 then the socket should
 408 * be closed and the error returned to the user.  If err > 0
 409 * it's just the icmp type << 8 | icmp code.  After adjustment
 410 * header points to the first 8 bytes of the tcp header.  We need
 411 * to find the appropriate port.
 412 *
 413 * The locking strategy used here is very "optimistic". When
 414 * someone else accesses the socket the ICMP is just dropped
 415 * and for some paths there is no check at all.
 416 * A more general error queue to queue errors for later handling
 417 * is probably better.
 418 *
 419 */
 420
 421int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 422{
 423	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 424	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 425	struct inet_connection_sock *icsk;
 426	struct tcp_sock *tp;
 427	struct inet_sock *inet;
 428	const int type = icmp_hdr(icmp_skb)->type;
 429	const int code = icmp_hdr(icmp_skb)->code;
 430	struct sock *sk;
 431	struct sk_buff *skb;
 432	struct request_sock *fastopen;
 433	u32 seq, snd_una;
 434	s32 remaining;
 435	u32 delta_us;
 436	int err;
 437	struct net *net = dev_net(icmp_skb->dev);
 438
 439	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 440				       th->dest, iph->saddr, ntohs(th->source),
 441				       inet_iif(icmp_skb), 0);
 442	if (!sk) {
 443		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 444		return -ENOENT;
 445	}
 446	if (sk->sk_state == TCP_TIME_WAIT) {
 447		inet_twsk_put(inet_twsk(sk));
 448		return 0;
 449	}
 450	seq = ntohl(th->seq);
 451	if (sk->sk_state == TCP_NEW_SYN_RECV) {
 452		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 453				     type == ICMP_TIME_EXCEEDED ||
 454				     (type == ICMP_DEST_UNREACH &&
 455				      (code == ICMP_NET_UNREACH ||
 456				       code == ICMP_HOST_UNREACH)));
 457		return 0;
 458	}
 459
 460	bh_lock_sock(sk);
 461	/* If too many ICMPs get dropped on busy
 462	 * servers this needs to be solved differently.
 463	 * We do take care of PMTU discovery (RFC1191) special case :
 464	 * we can receive locally generated ICMP messages while socket is held.
 465	 */
 466	if (sock_owned_by_user(sk)) {
 467		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 468			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 469	}
 470	if (sk->sk_state == TCP_CLOSE)
 471		goto out;
 472
 473	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 474		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 475		goto out;
 476	}
 477
 478	icsk = inet_csk(sk);
 479	tp = tcp_sk(sk);
 480	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 481	fastopen = tp->fastopen_rsk;
 482	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 483	if (sk->sk_state != TCP_LISTEN &&
 484	    !between(seq, snd_una, tp->snd_nxt)) {
 485		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 486		goto out;
 487	}
 488
 489	switch (type) {
 490	case ICMP_REDIRECT:
 491		if (!sock_owned_by_user(sk))
 492			do_redirect(icmp_skb, sk);
 493		goto out;
 494	case ICMP_SOURCE_QUENCH:
 495		/* Just silently ignore these. */
 496		goto out;
 497	case ICMP_PARAMETERPROB:
 498		err = EPROTO;
 499		break;
 500	case ICMP_DEST_UNREACH:
 501		if (code > NR_ICMP_UNREACH)
 502			goto out;
 503
 504		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 505			/* We are not interested in TCP_LISTEN and open_requests
 506			 * (SYN-ACKs send out by Linux are always <576bytes so
 507			 * they should go through unfragmented).
 508			 */
 509			if (sk->sk_state == TCP_LISTEN)
 510				goto out;
 511
 512			tp->mtu_info = info;
 513			if (!sock_owned_by_user(sk)) {
 514				tcp_v4_mtu_reduced(sk);
 515			} else {
 516				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 517					sock_hold(sk);
 518			}
 519			goto out;
 520		}
 521
 522		err = icmp_err_convert[code].errno;
 523		/* check if icmp_skb allows revert of backoff
 524		 * (see draft-zimmermann-tcp-lcd) */
 525		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 526			break;
 527		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 528		    !icsk->icsk_backoff || fastopen)
 529			break;
 530
 531		if (sock_owned_by_user(sk))
 532			break;
 533
 534		skb = tcp_rtx_queue_head(sk);
 535		if (WARN_ON_ONCE(!skb))
 536			break;
 537
 538		icsk->icsk_backoff--;
 539		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 540					       TCP_TIMEOUT_INIT;
 541		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 542
 543
 544		tcp_mstamp_refresh(tp);
 545		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 546		remaining = icsk->icsk_rto -
 547			    usecs_to_jiffies(delta_us);
 548
 549		if (remaining > 0) {
 550			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 551						  remaining, TCP_RTO_MAX);
 552		} else {
 553			/* RTO revert clocked out retransmission.
 554			 * Will retransmit now */
 555			tcp_retransmit_timer(sk);
 556		}
 557
 558		break;
 559	case ICMP_TIME_EXCEEDED:
 560		err = EHOSTUNREACH;
 561		break;
 562	default:
 563		goto out;
 564	}
 565
 566	switch (sk->sk_state) {
 567	case TCP_SYN_SENT:
 568	case TCP_SYN_RECV:
 569		/* Only in fast or simultaneous open. If a fast open socket is
 570		 * is already accepted it is treated as a connected one below.
 571		 */
 572		if (fastopen && !fastopen->sk)
 573			break;
 574
 575		if (!sock_owned_by_user(sk)) {
 576			sk->sk_err = err;
 577
 578			sk->sk_error_report(sk);
 579
 580			tcp_done(sk);
 581		} else {
 582			sk->sk_err_soft = err;
 583		}
 584		goto out;
 585	}
 586
 587	/* If we've already connected we will keep trying
 588	 * until we time out, or the user gives up.
 589	 *
 590	 * rfc1122 4.2.3.9 allows to consider as hard errors
 591	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 592	 * but it is obsoleted by pmtu discovery).
 593	 *
 594	 * Note, that in modern internet, where routing is unreliable
 595	 * and in each dark corner broken firewalls sit, sending random
 596	 * errors ordered by their masters even this two messages finally lose
 597	 * their original sense (even Linux sends invalid PORT_UNREACHs)
 598	 *
 599	 * Now we are in compliance with RFCs.
 600	 *							--ANK (980905)
 601	 */
 602
 603	inet = inet_sk(sk);
 604	if (!sock_owned_by_user(sk) && inet->recverr) {
 605		sk->sk_err = err;
 606		sk->sk_error_report(sk);
 607	} else	{ /* Only an error on timeout */
 608		sk->sk_err_soft = err;
 609	}
 610
 611out:
 612	bh_unlock_sock(sk);
 613	sock_put(sk);
 614	return 0;
 615}
 616
 617void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 618{
 619	struct tcphdr *th = tcp_hdr(skb);
 620
 621	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 622	skb->csum_start = skb_transport_header(skb) - skb->head;
 623	skb->csum_offset = offsetof(struct tcphdr, check);
 624}
 625
 626/* This routine computes an IPv4 TCP checksum. */
 627void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 628{
 629	const struct inet_sock *inet = inet_sk(sk);
 630
 631	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 632}
 633EXPORT_SYMBOL(tcp_v4_send_check);
 634
 635/*
 636 *	This routine will send an RST to the other tcp.
 637 *
 638 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 639 *		      for reset.
 640 *	Answer: if a packet caused RST, it is not for a socket
 641 *		existing in our system, if it is matched to a socket,
 642 *		it is just duplicate segment or bug in other side's TCP.
 643 *		So that we build reply only basing on parameters
 644 *		arrived with segment.
 645 *	Exception: precedence violation. We do not implement it in any case.
 646 */
 647
 648static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 649{
 650	const struct tcphdr *th = tcp_hdr(skb);
 651	struct {
 652		struct tcphdr th;
 653#ifdef CONFIG_TCP_MD5SIG
 654		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 655#endif
 656	} rep;
 657	struct ip_reply_arg arg;
 658#ifdef CONFIG_TCP_MD5SIG
 659	struct tcp_md5sig_key *key = NULL;
 660	const __u8 *hash_location = NULL;
 661	unsigned char newhash[16];
 662	int genhash;
 663	struct sock *sk1 = NULL;
 664#endif
 665	u64 transmit_time = 0;
 666	struct sock *ctl_sk;
 667	struct net *net;
 668
 669	/* Never send a reset in response to a reset. */
 670	if (th->rst)
 671		return;
 672
 673	/* If sk not NULL, it means we did a successful lookup and incoming
 674	 * route had to be correct. prequeue might have dropped our dst.
 675	 */
 676	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 677		return;
 678
 679	/* Swap the send and the receive. */
 680	memset(&rep, 0, sizeof(rep));
 681	rep.th.dest   = th->source;
 682	rep.th.source = th->dest;
 683	rep.th.doff   = sizeof(struct tcphdr) / 4;
 684	rep.th.rst    = 1;
 685
 686	if (th->ack) {
 687		rep.th.seq = th->ack_seq;
 688	} else {
 689		rep.th.ack = 1;
 690		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 691				       skb->len - (th->doff << 2));
 692	}
 693
 694	memset(&arg, 0, sizeof(arg));
 695	arg.iov[0].iov_base = (unsigned char *)&rep;
 696	arg.iov[0].iov_len  = sizeof(rep.th);
 697
 698	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 699#ifdef CONFIG_TCP_MD5SIG
 700	rcu_read_lock();
 701	hash_location = tcp_parse_md5sig_option(th);
 702	if (sk && sk_fullsock(sk)) {
 703		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 704					&ip_hdr(skb)->saddr, AF_INET);
 705	} else if (hash_location) {
 706		/*
 707		 * active side is lost. Try to find listening socket through
 708		 * source port, and then find md5 key through listening socket.
 709		 * we are not loose security here:
 710		 * Incoming packet is checked with md5 hash with finding key,
 711		 * no RST generated if md5 hash doesn't match.
 712		 */
 713		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 714					     ip_hdr(skb)->saddr,
 715					     th->source, ip_hdr(skb)->daddr,
 716					     ntohs(th->source), inet_iif(skb),
 717					     tcp_v4_sdif(skb));
 718		/* don't send rst if it can't find key */
 719		if (!sk1)
 720			goto out;
 721
 722		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 723					&ip_hdr(skb)->saddr, AF_INET);
 724		if (!key)
 725			goto out;
 726
 727
 728		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 729		if (genhash || memcmp(hash_location, newhash, 16) != 0)
 730			goto out;
 731
 732	}
 733
 734	if (key) {
 735		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 736				   (TCPOPT_NOP << 16) |
 737				   (TCPOPT_MD5SIG << 8) |
 738				   TCPOLEN_MD5SIG);
 739		/* Update length and the length the header thinks exists */
 740		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 741		rep.th.doff = arg.iov[0].iov_len / 4;
 742
 743		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 744				     key, ip_hdr(skb)->saddr,
 745				     ip_hdr(skb)->daddr, &rep.th);
 746	}
 747#endif
 748	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 749				      ip_hdr(skb)->saddr, /* XXX */
 750				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 751	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 752	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 753
 754	/* When socket is gone, all binding information is lost.
 755	 * routing might fail in this case. No choice here, if we choose to force
 756	 * input interface, we will misroute in case of asymmetric route.
 757	 */
 758	if (sk) {
 759		arg.bound_dev_if = sk->sk_bound_dev_if;
 760		if (sk_fullsock(sk))
 761			trace_tcp_send_reset(sk, skb);
 762	}
 763
 764	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 765		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 766
 767	arg.tos = ip_hdr(skb)->tos;
 768	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 769	local_bh_disable();
 770	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 771	if (sk) {
 772		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 773				   inet_twsk(sk)->tw_mark : sk->sk_mark;
 774		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 775				   inet_twsk(sk)->tw_priority : sk->sk_priority;
 776		transmit_time = tcp_transmit_time(sk);
 777	}
 778	ip_send_unicast_reply(ctl_sk,
 779			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 780			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 781			      &arg, arg.iov[0].iov_len,
 782			      transmit_time);
 783
 784	ctl_sk->sk_mark = 0;
 785	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 786	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 787	local_bh_enable();
 788
 789#ifdef CONFIG_TCP_MD5SIG
 790out:
 791	rcu_read_unlock();
 792#endif
 793}
 794
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside of socket context, is certainly ugly. What can I do?
 */
 798
 799static void tcp_v4_send_ack(const struct sock *sk,
 800			    struct sk_buff *skb, u32 seq, u32 ack,
 801			    u32 win, u32 tsval, u32 tsecr, int oif,
 802			    struct tcp_md5sig_key *key,
 803			    int reply_flags, u8 tos)
 804{
 805	const struct tcphdr *th = tcp_hdr(skb);
 806	struct {
 807		struct tcphdr th;
 808		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 809#ifdef CONFIG_TCP_MD5SIG
 810			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 811#endif
 812			];
 813	} rep;
 814	struct net *net = sock_net(sk);
 815	struct ip_reply_arg arg;
 816	struct sock *ctl_sk;
 817	u64 transmit_time;
 818
 819	memset(&rep.th, 0, sizeof(struct tcphdr));
 820	memset(&arg, 0, sizeof(arg));
 821
 822	arg.iov[0].iov_base = (unsigned char *)&rep;
 823	arg.iov[0].iov_len  = sizeof(rep.th);
 824	if (tsecr) {
 825		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 826				   (TCPOPT_TIMESTAMP << 8) |
 827				   TCPOLEN_TIMESTAMP);
 828		rep.opt[1] = htonl(tsval);
 829		rep.opt[2] = htonl(tsecr);
 830		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 831	}
 832
 833	/* Swap the send and the receive. */
 834	rep.th.dest    = th->source;
 835	rep.th.source  = th->dest;
 836	rep.th.doff    = arg.iov[0].iov_len / 4;
 837	rep.th.seq     = htonl(seq);
 838	rep.th.ack_seq = htonl(ack);
 839	rep.th.ack     = 1;
 840	rep.th.window  = htons(win);
 841
 842#ifdef CONFIG_TCP_MD5SIG
 843	if (key) {
 844		int offset = (tsecr) ? 3 : 0;
 845
 846		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 847					  (TCPOPT_NOP << 16) |
 848					  (TCPOPT_MD5SIG << 8) |
 849					  TCPOLEN_MD5SIG);
 850		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 851		rep.th.doff = arg.iov[0].iov_len/4;
 852
 853		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 854				    key, ip_hdr(skb)->saddr,
 855				    ip_hdr(skb)->daddr, &rep.th);
 856	}
 857#endif
 858	arg.flags = reply_flags;
 859	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 860				      ip_hdr(skb)->saddr, /* XXX */
 861				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 862	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 863	if (oif)
 864		arg.bound_dev_if = oif;
 865	arg.tos = tos;
 866	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 867	local_bh_disable();
 868	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 869	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 870			   inet_twsk(sk)->tw_mark : sk->sk_mark;
 871	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
 872			   inet_twsk(sk)->tw_priority : sk->sk_priority;
 873	transmit_time = tcp_transmit_time(sk);
 874	ip_send_unicast_reply(ctl_sk,
 875			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 876			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 877			      &arg, arg.iov[0].iov_len,
 878			      transmit_time);
 879
 880	ctl_sk->sk_mark = 0;
 881	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 882	local_bh_enable();
 883}
 884
 885static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 886{
 887	struct inet_timewait_sock *tw = inet_twsk(sk);
 888	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 889
 890	tcp_v4_send_ack(sk, skb,
 891			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 892			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 893			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 894			tcptw->tw_ts_recent,
 895			tw->tw_bound_dev_if,
 896			tcp_twsk_md5_key(tcptw),
 897			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 898			tw->tw_tos
 899			);
 900
 901	inet_twsk_put(tw);
 902}
 903
 904static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 905				  struct request_sock *req)
 906{
 907	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 908	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 909	 */
 910	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 911					     tcp_sk(sk)->snd_nxt;
 912
 913	/* RFC 7323 2.3
 914	 * The window field (SEG.WND) of every outgoing segment, with the
 915	 * exception of <SYN> segments, MUST be right-shifted by
 916	 * Rcv.Wind.Shift bits:
 917	 */
 918	tcp_v4_send_ack(sk, skb, seq,
 919			tcp_rsk(req)->rcv_nxt,
 920			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 921			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 922			req->ts_recent,
 923			0,
 924			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
 925					  AF_INET),
 926			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 927			ip_hdr(skb)->tos);
 928}
 929
 930/*
 931 *	Send a SYN-ACK after having received a SYN.
 932 *	This still operates on a request_sock only, not on a big
 933 *	socket.
 934 */
 935static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 936			      struct flowi *fl,
 937			      struct request_sock *req,
 938			      struct tcp_fastopen_cookie *foc,
 939			      enum tcp_synack_type synack_type)
 940{
 941	const struct inet_request_sock *ireq = inet_rsk(req);
 942	struct flowi4 fl4;
 943	int err = -1;
 944	struct sk_buff *skb;
 945
 946	/* First, grab a route. */
 947	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 948		return -1;
 949
 950	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 951
 952	if (skb) {
 953		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 954
 955		rcu_read_lock();
 956		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 957					    ireq->ir_rmt_addr,
 958					    rcu_dereference(ireq->ireq_opt));
 959		rcu_read_unlock();
 960		err = net_xmit_eval(err);
 961	}
 962
 963	return err;
 964}
 965
 966/*
 967 *	IPv4 request_sock destructor.
 968 */
 969static void tcp_v4_reqsk_destructor(struct request_sock *req)
 970{
 971	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 972}
 973
 974#ifdef CONFIG_TCP_MD5SIG
 975/*
 976 * RFC2385 MD5 checksumming requires a mapping of
 977 * IP address->MD5 Key.
 978 * We need to maintain these in the sk structure.
 979 */
 980
/* Static branch key, defined in the "false" state and exported to modules.
 * NOTE(review): presumably switched on once an MD5 key is configured so
 * that MD5 lookups can be skipped otherwise - confirm against its users.
 */
DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
 983
 984/* Find the Key structure for an address.  */
 985struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
 986					   const union tcp_md5_addr *addr,
 987					   int family)
 988{
 989	const struct tcp_sock *tp = tcp_sk(sk);
 990	struct tcp_md5sig_key *key;
 991	const struct tcp_md5sig_info *md5sig;
 992	__be32 mask;
 993	struct tcp_md5sig_key *best_match = NULL;
 994	bool match;
 995
 996	/* caller either holds rcu_read_lock() or socket lock */
 997	md5sig = rcu_dereference_check(tp->md5sig_info,
 998				       lockdep_sock_is_held(sk));
 999	if (!md5sig)
1000		return NULL;
1001
1002	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1003		if (key->family != family)
1004			continue;
1005
1006		if (family == AF_INET) {
1007			mask = inet_make_mask(key->prefixlen);
1008			match = (key->addr.a4.s_addr & mask) ==
1009				(addr->a4.s_addr & mask);
1010#if IS_ENABLED(CONFIG_IPV6)
1011		} else if (family == AF_INET6) {
1012			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1013						  key->prefixlen);
1014#endif
1015		} else {
1016			match = false;
1017		}
1018
1019		if (match && (!best_match ||
1020			      key->prefixlen > best_match->prefixlen))
1021			best_match = key;
1022	}
1023	return best_match;
1024}
1025EXPORT_SYMBOL(__tcp_md5_do_lookup);
1026
1027static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1028						      const union tcp_md5_addr *addr,
1029						      int family, u8 prefixlen)
1030{
1031	const struct tcp_sock *tp = tcp_sk(sk);
1032	struct tcp_md5sig_key *key;
1033	unsigned int size = sizeof(struct in_addr);
1034	const struct tcp_md5sig_info *md5sig;
1035
1036	/* caller either holds rcu_read_lock() or socket lock */
1037	md5sig = rcu_dereference_check(tp->md5sig_info,
1038				       lockdep_sock_is_held(sk));
1039	if (!md5sig)
1040		return NULL;
1041#if IS_ENABLED(CONFIG_IPV6)
1042	if (family == AF_INET6)
1043		size = sizeof(struct in6_addr);
1044#endif
1045	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1046		if (key->family != family)
1047			continue;
1048		if (!memcmp(&key->addr, addr, size) &&
1049		    key->prefixlen == prefixlen)
1050			return key;
1051	}
1052	return NULL;
1053}
1054
1055struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1056					 const struct sock *addr_sk)
1057{
1058	const union tcp_md5_addr *addr;
1059
1060	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1061	return tcp_md5_do_lookup(sk, addr, AF_INET);
1062}
1063EXPORT_SYMBOL(tcp_v4_md5_lookup);
1064
1065/* This can be called on a newly created socket, from other files */
1066int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1067		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1068		   gfp_t gfp)
1069{
1070	/* Add Key to the list */
1071	struct tcp_md5sig_key *key;
1072	struct tcp_sock *tp = tcp_sk(sk);
1073	struct tcp_md5sig_info *md5sig;
1074
1075	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1076	if (key) {
1077		/* Pre-existing entry - just update that one. */
1078		memcpy(key->key, newkey, newkeylen);
1079		key->keylen = newkeylen;
1080		return 0;
1081	}
1082
1083	md5sig = rcu_dereference_protected(tp->md5sig_info,
1084					   lockdep_sock_is_held(sk));
1085	if (!md5sig) {
1086		md5sig = kmalloc(sizeof(*md5sig), gfp);
1087		if (!md5sig)
1088			return -ENOMEM;
1089
1090		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1091		INIT_HLIST_HEAD(&md5sig->head);
1092		rcu_assign_pointer(tp->md5sig_info, md5sig);
1093	}
1094
1095	key = sock_kmalloc(sk, sizeof(*key), gfp);
1096	if (!key)
1097		return -ENOMEM;
1098	if (!tcp_alloc_md5sig_pool()) {
1099		sock_kfree_s(sk, key, sizeof(*key));
1100		return -ENOMEM;
1101	}
1102
1103	memcpy(key->key, newkey, newkeylen);
1104	key->keylen = newkeylen;
1105	key->family = family;
1106	key->prefixlen = prefixlen;
1107	memcpy(&key->addr, addr,
1108	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1109				      sizeof(struct in_addr));
1110	hlist_add_head_rcu(&key->node, &md5sig->head);
1111	return 0;
1112}
1113EXPORT_SYMBOL(tcp_md5_do_add);
1114
1115int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1116		   u8 prefixlen)
1117{
1118	struct tcp_md5sig_key *key;
1119
1120	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1121	if (!key)
1122		return -ENOENT;
1123	hlist_del_rcu(&key->node);
1124	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1125	kfree_rcu(key, rcu);
1126	return 0;
1127}
1128EXPORT_SYMBOL(tcp_md5_do_del);
1129
1130static void tcp_clear_md5_list(struct sock *sk)
1131{
1132	struct tcp_sock *tp = tcp_sk(sk);
1133	struct tcp_md5sig_key *key;
1134	struct hlist_node *n;
1135	struct tcp_md5sig_info *md5sig;
1136
1137	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1138
1139	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1140		hlist_del_rcu(&key->node);
1141		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1142		kfree_rcu(key, rcu);
1143	}
1144}
1145
1146static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1147				 char __user *optval, int optlen)
1148{
1149	struct tcp_md5sig cmd;
1150	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1151	u8 prefixlen = 32;
1152
1153	if (optlen < sizeof(cmd))
1154		return -EINVAL;
1155
1156	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1157		return -EFAULT;
1158
1159	if (sin->sin_family != AF_INET)
1160		return -EINVAL;
1161
1162	if (optname == TCP_MD5SIG_EXT &&
1163	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1164		prefixlen = cmd.tcpm_prefixlen;
1165		if (prefixlen > 32)
1166			return -EINVAL;
1167	}
1168
1169	if (!cmd.tcpm_keylen)
1170		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1171				      AF_INET, prefixlen);
1172
1173	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1174		return -EINVAL;
1175
1176	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1177			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1178			      GFP_KERNEL);
1179}
1180
1181static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1182				   __be32 daddr, __be32 saddr,
1183				   const struct tcphdr *th, int nbytes)
1184{
1185	struct tcp4_pseudohdr *bp;
1186	struct scatterlist sg;
1187	struct tcphdr *_th;
1188
1189	bp = hp->scratch;
1190	bp->saddr = saddr;
1191	bp->daddr = daddr;
1192	bp->pad = 0;
1193	bp->protocol = IPPROTO_TCP;
1194	bp->len = cpu_to_be16(nbytes);
1195
1196	_th = (struct tcphdr *)(bp + 1);
1197	memcpy(_th, th, sizeof(*th));
1198	_th->check = 0;
1199
1200	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1201	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1202				sizeof(*bp) + sizeof(*th));
1203	return crypto_ahash_update(hp->md5_req);
1204}
1205
1206static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1207			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1208{
1209	struct tcp_md5sig_pool *hp;
1210	struct ahash_request *req;
1211
1212	hp = tcp_get_md5sig_pool();
1213	if (!hp)
1214		goto clear_hash_noput;
1215	req = hp->md5_req;
1216
1217	if (crypto_ahash_init(req))
1218		goto clear_hash;
1219	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1220		goto clear_hash;
1221	if (tcp_md5_hash_key(hp, key))
1222		goto clear_hash;
1223	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1224	if (crypto_ahash_final(req))
1225		goto clear_hash;
1226
1227	tcp_put_md5sig_pool();
1228	return 0;
1229
1230clear_hash:
1231	tcp_put_md5sig_pool();
1232clear_hash_noput:
1233	memset(md5_hash, 0, 16);
1234	return 1;
1235}
1236
1237int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1238			const struct sock *sk,
1239			const struct sk_buff *skb)
1240{
1241	struct tcp_md5sig_pool *hp;
1242	struct ahash_request *req;
1243	const struct tcphdr *th = tcp_hdr(skb);
1244	__be32 saddr, daddr;
1245
1246	if (sk) { /* valid for establish/request sockets */
1247		saddr = sk->sk_rcv_saddr;
1248		daddr = sk->sk_daddr;
1249	} else {
1250		const struct iphdr *iph = ip_hdr(skb);
1251		saddr = iph->saddr;
1252		daddr = iph->daddr;
1253	}
1254
1255	hp = tcp_get_md5sig_pool();
1256	if (!hp)
1257		goto clear_hash_noput;
1258	req = hp->md5_req;
1259
1260	if (crypto_ahash_init(req))
1261		goto clear_hash;
1262
1263	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1264		goto clear_hash;
1265	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1266		goto clear_hash;
1267	if (tcp_md5_hash_key(hp, key))
1268		goto clear_hash;
1269	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1270	if (crypto_ahash_final(req))
1271		goto clear_hash;
1272
1273	tcp_put_md5sig_pool();
1274	return 0;
1275
1276clear_hash:
1277	tcp_put_md5sig_pool();
1278clear_hash_noput:
1279	memset(md5_hash, 0, 16);
1280	return 1;
1281}
1282EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1283
1284#endif
1285
1286/* Called with rcu_read_lock() */
1287static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1288				    const struct sk_buff *skb)
1289{
1290#ifdef CONFIG_TCP_MD5SIG
1291	/*
1292	 * This gets called for each TCP segment that arrives
1293	 * so we want to be efficient.
1294	 * We have 3 drop cases:
1295	 * o No MD5 hash and one expected.
1296	 * o MD5 hash and we're not expecting one.
1297	 * o MD5 hash and its wrong.
1298	 */
1299	const __u8 *hash_location = NULL;
1300	struct tcp_md5sig_key *hash_expected;
1301	const struct iphdr *iph = ip_hdr(skb);
1302	const struct tcphdr *th = tcp_hdr(skb);
1303	int genhash;
1304	unsigned char newhash[16];
1305
1306	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1307					  AF_INET);
1308	hash_location = tcp_parse_md5sig_option(th);
1309
1310	/* We've parsed the options - do we have a hash? */
1311	if (!hash_expected && !hash_location)
1312		return false;
1313
1314	if (hash_expected && !hash_location) {
1315		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1316		return true;
1317	}
1318
1319	if (!hash_expected && hash_location) {
1320		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1321		return true;
1322	}
1323
1324	/* Okay, so this is hash_expected and hash_location -
1325	 * so we need to calculate the checksum.
1326	 */
1327	genhash = tcp_v4_md5_hash_skb(newhash,
1328				      hash_expected,
1329				      NULL, skb);
1330
1331	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1332		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1333		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1334				     &iph->saddr, ntohs(th->source),
1335				     &iph->daddr, ntohs(th->dest),
1336				     genhash ? " tcp_v4_calc_md5_hash failed"
1337				     : "");
1338		return true;
1339	}
1340	return false;
1341#endif
1342	return false;
1343}
1344
1345static void tcp_v4_init_req(struct request_sock *req,
1346			    const struct sock *sk_listener,
1347			    struct sk_buff *skb)
1348{
1349	struct inet_request_sock *ireq = inet_rsk(req);
1350	struct net *net = sock_net(sk_listener);
1351
1352	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1353	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1354	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1355}
1356
1357static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1358					  struct flowi *fl,
1359					  const struct request_sock *req)
1360{
1361	return inet_csk_route_req(sk, &fl->u.ip4, req);
1362}
1363
/* Request sock operations for IPv4 TCP.  obj_size is that of a
 * struct tcp_request_sock; the destructor frees the saved IP options
 * (tcp_v4_reqsk_destructor()).  Passed to tcp_conn_request() and
 * tcp_get_syncookie_mss() below.
 */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};
1373
/* IPv4 specific callbacks used while a connection request is processed.
 * The MD5 hooks exist only with CONFIG_TCP_MD5SIG and the syncookie hook
 * only with CONFIG_SYN_COOKIES.
 */
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};
1389
1390int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1391{
1392	/* Never answer to SYNs send to broadcast or multicast */
1393	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1394		goto drop;
1395
1396	return tcp_conn_request(&tcp_request_sock_ops,
1397				&tcp_request_sock_ipv4_ops, sk, skb);
1398
1399drop:
1400	tcp_listendrop(sk);
1401	return 0;
1402}
1403EXPORT_SYMBOL(tcp_v4_conn_request);
1404
1405
/*
 * The three way handshake has completed - now create the new (child)
 * socket for the connection described by @req.
 * NOTE(review): the historical wording here was "we got a valid synack";
 * on the listening side this is presumably the peer's final ACK - confirm.
 *
 * @sk:		listening socket the request belongs to
 * @skb:	segment that triggered creation of the child
 * @req:	request sock holding the addresses and saved IP options
 * @dst:	route to use, or NULL to have one looked up here
 * @req_unhash:	request handed to inet_ehash_nolisten() when hashing the child
 * @own_req:	receives the result of inet_ehash_nolisten()
 *
 * Returns the new socket, or NULL on failure (accept queue full, child
 * creation failure, no route, port inheritance failure).  Every NULL
 * return passes through tcp_listendrop().
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	/* Copy addressing and IP-level state from the request and the
	 * triggering segment into the child.
	 */
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	/* The child points at the request's options; which of the two ends
	 * up owning them is settled after hashing, further down.
	 */
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* The child keeps the options; detach them from the request
		 * so its destructor does not free them.
		 */
		ireq->ireq_opt = NULL;
	} else {
		/* The request keeps the options; the child must not
		 * reference them any more.
		 */
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* Options still belong to the request; drop the child's pointer
	 * before the child is torn down.
	 */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1510
/* For a segment without SYN arriving on a listener, let the syncookie
 * code examine it: the result of cookie_v4_check() is returned in that
 * case.  SYN segments, and kernels built without CONFIG_SYN_COOKIES,
 * get @sk back unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1521
1522u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1523			 struct tcphdr *th, u32 *cookie)
1524{
1525	u16 mss = 0;
1526#ifdef CONFIG_SYN_COOKIES
1527	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1528				    &tcp_request_sock_ipv4_ops, sk, th);
1529	if (mss) {
1530		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1531		tcp_synq_overflow(sk);
1532	}
1533#endif
1534	return mss;
1535}
1536
/* Process one segment for socket @sk according to the socket state.
 *
 * The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Always returns 0; @skb is either passed on to the state machine or
 * freed here.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Drop the cached input route if the packet came in
			 * on another interface or the route is stale.
			 */
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			/* A different socket came back from the cookie
			 * check: feed the segment to it as a child of @sk.
			 */
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1606
1607int tcp_v4_early_demux(struct sk_buff *skb)
1608{
1609	const struct iphdr *iph;
1610	const struct tcphdr *th;
1611	struct sock *sk;
1612
1613	if (skb->pkt_type != PACKET_HOST)
1614		return 0;
1615
1616	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1617		return 0;
1618
1619	iph = ip_hdr(skb);
1620	th = tcp_hdr(skb);
1621
1622	if (th->doff < sizeof(struct tcphdr) / 4)
1623		return 0;
1624
1625	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1626				       iph->saddr, th->source,
1627				       iph->daddr, ntohs(th->dest),
1628				       skb->skb_iif, inet_sdif(skb));
1629	if (sk) {
1630		skb->sk = sk;
1631		skb->destructor = sock_edemux;
1632		if (sk_fullsock(sk)) {
1633			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1634
1635			if (dst)
1636				dst = dst_check(dst, 0);
1637			if (dst &&
1638			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1639				skb_dst_set_noref(skb, dst);
1640		}
1641	}
1642	return 0;
1643}
1644
1645bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1646{
1647	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1648	struct skb_shared_info *shinfo;
1649	const struct tcphdr *th;
1650	struct tcphdr *thtail;
1651	struct sk_buff *tail;
1652	unsigned int hdrlen;
1653	bool fragstolen;
1654	u32 gso_segs;
1655	int delta;
1656
1657	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1658	 * we can fix skb->truesize to its real value to avoid future drops.
1659	 * This is valid because skb is not yet charged to the socket.
1660	 * It has been noticed pure SACK packets were sometimes dropped
1661	 * (if cooked by drivers without copybreak feature).
1662	 */
1663	skb_condense(skb);
1664
1665	skb_dst_drop(skb);
1666
1667	if (unlikely(tcp_checksum_complete(skb))) {
1668		bh_unlock_sock(sk);
1669		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1670		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1671		return true;
1672	}
1673
1674	/* Attempt coalescing to last skb in backlog, even if we are
1675	 * above the limits.
1676	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1677	 */
1678	th = (const struct tcphdr *)skb->data;
1679	hdrlen = th->doff * 4;
1680	shinfo = skb_shinfo(skb);
1681
1682	if (!shinfo->gso_size)
1683		shinfo->gso_size = skb->len - hdrlen;
1684
1685	if (!shinfo->gso_segs)
1686		shinfo->gso_segs = 1;
1687
1688	tail = sk->sk_backlog.tail;
1689	if (!tail)
1690		goto no_coalesce;
1691	thtail = (struct tcphdr *)tail->data;
1692
1693	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1694	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1695	    ((TCP_SKB_CB(tail)->tcp_flags |
1696	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1697	    !((TCP_SKB_CB(tail)->tcp_flags &
1698	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1699	    ((TCP_SKB_CB(tail)->tcp_flags ^
1700	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1701#ifdef CONFIG_TLS_DEVICE
1702	    tail->decrypted != skb->decrypted ||
1703#endif
1704	    thtail->doff != th->doff ||
1705	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1706		goto no_coalesce;
1707
1708	__skb_pull(skb, hdrlen);
1709	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1710		thtail->window = th->window;
1711
1712		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1713
1714		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1715			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1716
1717		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1718		 * thtail->fin, so that the fast path in tcp_rcv_established()
1719		 * is not entered if we append a packet with a FIN.
1720		 * SYN, RST, URG are not present.
1721		 * ACK is set on both packets.
1722		 * PSH : we do not really care in TCP stack,
1723		 *       at least for 'GRO' packets.
1724		 */
1725		thtail->fin |= th->fin;
1726		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1727
1728		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1729			TCP_SKB_CB(tail)->has_rxtstamp = true;
1730			tail->tstamp = skb->tstamp;
1731			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1732		}
1733
1734		/* Not as strict as GRO. We only need to carry mss max value */
1735		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1736						 skb_shinfo(tail)->gso_size);
1737
1738		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1739		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1740
1741		sk->sk_backlog.len += delta;
1742		__NET_INC_STATS(sock_net(sk),
1743				LINUX_MIB_TCPBACKLOGCOALESCE);
1744		kfree_skb_partial(skb, fragstolen);
1745		return false;
1746	}
1747	__skb_push(skb, hdrlen);
1748
1749no_coalesce:
1750	/* Only socket owner can try to collapse/prune rx queues
1751	 * to reduce memory overhead, so add a little headroom here.
1752	 * Few sockets backlog are possibly concurrently non empty.
1753	 */
1754	limit += 64*1024;
1755
1756	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1757		bh_unlock_sock(sk);
1758		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1759		return true;
1760	}
1761	return false;
1762}
1763EXPORT_SYMBOL(tcp_add_backlog);
1764
1765int tcp_filter(struct sock *sk, struct sk_buff *skb)
1766{
1767	struct tcphdr *th = (struct tcphdr *)skb->data;
1768
1769	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1770}
1771EXPORT_SYMBOL(tcp_filter);
1772
1773static void tcp_v4_restore_cb(struct sk_buff *skb)
1774{
1775	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1776		sizeof(struct inet_skb_parm));
1777}
1778
/* Turn the control block of @skb from its IP layout into its TCP layout.
 *
 * The IP control block is first preserved inside TCP_SKB_CB()->header.h4,
 * then the TCP fields are filled from the TCP header @th and IP header
 * @iph.  tcp_v4_restore_cb() reverses the first step.
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* SYN and FIN each consume one sequence number, in addition to the
	 * payload (segment length minus TCP header length).
	 */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	/* True if either a software or a hardware receive timestamp exists. */
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
1800
/*
 *	From tcp_input.c
 */

/* Receive entry point for IPv4 TCP segments.
 *
 * Validates the TCP header, finds the socket the segment belongs to and
 * dispatches on the socket state: TIME_WAIT and NEW_SYN_RECV (request)
 * sockets are handled specially, listeners are processed without taking
 * the socket lock, and all other sockets are processed under the BH
 * socket lock or queued on the backlog when user context owns the socket.
 *
 * Returns the result of tcp_v4_do_rcv() for segments delivered to a
 * socket, 0 on every other path; @skb is consumed in all cases.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct sk_buff *skb_to_free;
	int sdif = inet_sdif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	/* doff counts 32-bit words and must cover the fixed header. */
	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Header pointers are loaded again after the pull above. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		/* From here on sk is the listener the request hangs off. */
		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			/* Listener went away: drop the request, look again. */
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			/* Header pointers are loaded again after the filter. */
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			/* Continue below with the listener itself; the cb is
			 * put back in IP layout because it is filled again
			 * further down.
			 */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	/* Header pointers are loaded again after the filter. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		/* Listeners are processed without the socket lock. */
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		/* Detach the cached rx skb so it can be freed after the
		 * socket lock is released.
		 */
		skb_to_free = sk->sk_rx_skb_cache;
		sk->sk_rx_skb_cache = NULL;
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		/* tcp_add_backlog() has already unlocked the socket when
		 * it returns true.
		 */
		if (tcp_add_backlog(sk, skb))
			goto discard_and_relse;
		skb_to_free = NULL;
	}
	bh_unlock_sock(sk);
	if (skb_to_free)
		__kfree_skb(skb_to_free);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	/* Note: the csum_error and bad_packet labels sit inside this if
	 * body so that earlier failures share the statistics updates.
	 * Only a segment with a good checksum and no socket gets a reset.
	 */
	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* A new SYN hit a TIME_WAIT socket: if a listener exists,
		 * retire the timewait socket and process the SYN on the
		 * listener instead.
		 */
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		/* fall through */
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
2026
/* TIME_WAIT socket operations for TCP: object size plus the uniqueness
 * check and destructor callbacks.
 */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};
2032
2033void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2034{
2035	struct dst_entry *dst = skb_dst(skb);
2036
2037	if (dst && dst_hold_safe(dst)) {
2038		sk->sk_rx_dst = dst;
2039		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2040	}
2041}
2042EXPORT_SYMBOL(inet_sk_rx_dst_set);
2043
/* Address-family specific operations for TCP over IPv4; installed on each
 * socket by tcp_v4_init_sock().  The compat socket option handlers exist
 * only with CONFIG_COMPAT.
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
2063
#ifdef CONFIG_TCP_MD5SIG
/* MD5 signature hooks for IPv4 sockets: key lookup, hash computation and
 * socket option parsing.  Installed by tcp_v4_init_sock().
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
};
#endif
2071
2072/* NOTE: A lot of things set to zero explicitly by call to
2073 *       sk_alloc() so need not be done here.
2074 */
2075static int tcp_v4_init_sock(struct sock *sk)
2076{
2077	struct inet_connection_sock *icsk = inet_csk(sk);
2078
2079	tcp_init_sock(sk);
2080
2081	icsk->icsk_af_ops = &ipv4_specific;
2082
2083#ifdef CONFIG_TCP_MD5SIG
2084	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2085#endif
2086
2087	return 0;
2088}
2089
2090void tcp_v4_destroy_sock(struct sock *sk)
2091{
2092	struct tcp_sock *tp = tcp_sk(sk);
2093
2094	trace_tcp_destroy_sock(sk);
2095
2096	tcp_clear_xmit_timers(sk);
2097
2098	tcp_cleanup_congestion_control(sk);
2099
2100	tcp_cleanup_ulp(sk);
2101
2102	/* Cleanup up the write buffer. */
2103	tcp_write_queue_purge(sk);
2104
2105	/* Check if we want to disable active TFO */
2106	tcp_fastopen_active_disable_ofo_check(sk);
2107
2108	/* Cleans up our, hopefully empty, out_of_order_queue. */
2109	skb_rbtree_purge(&tp->out_of_order_queue);
2110
2111#ifdef CONFIG_TCP_MD5SIG
2112	/* Clean up the MD5 key list, if any */
2113	if (tp->md5sig_info) {
2114		tcp_clear_md5_list(sk);
2115		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2116		tp->md5sig_info = NULL;
2117	}
2118#endif
2119
2120	/* Clean up a referenced TCP bind bucket. */
2121	if (inet_csk(sk)->icsk_bind_hash)
2122		inet_put_port(sk);
2123
2124	BUG_ON(tp->fastopen_rsk);
2125
2126	/* If socket is aborted during connect operation */
2127	tcp_free_fastopen_req(tp);
2128	tcp_fastopen_destroy_cipher(sk);
2129	tcp_saved_syn_free(tp);
2130
2131	sk_sockets_allocated_dec(sk);
2132}
2133EXPORT_SYMBOL(tcp_v4_destroy_sock);
2134
2135#ifdef CONFIG_PROC_FS
2136/* Proc filesystem TCP sock list dumping. */
2137
2138/*
2139 * Get next listener socket follow cur.  If cur is NULL, get first socket
2140 * starting from bucket given in st->bucket; when st->bucket is zero the
2141 * very first socket in the hash table is returned.
2142 */
2143static void *listening_get_next(struct seq_file *seq, void *cur)
2144{
2145	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2146	struct tcp_iter_state *st = seq->private;
2147	struct net *net = seq_file_net(seq);
2148	struct inet_listen_hashbucket *ilb;
2149	struct sock *sk = cur;
2150
2151	if (!sk) {
2152get_head:
2153		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2154		spin_lock(&ilb->lock);
2155		sk = sk_head(&ilb->head);
2156		st->offset = 0;
2157		goto get_sk;
2158	}
2159	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2160	++st->num;
2161	++st->offset;
2162
2163	sk = sk_next(sk);
2164get_sk:
2165	sk_for_each_from(sk) {
2166		if (!net_eq(sock_net(sk), net))
2167			continue;
2168		if (sk->sk_family == afinfo->family)
2169			return sk;
2170	}
2171	spin_unlock(&ilb->lock);
2172	st->offset = 0;
2173	if (++st->bucket < INET_LHTABLE_SIZE)
2174		goto get_head;
2175	return NULL;
2176}
2177
2178static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2179{
2180	struct tcp_iter_state *st = seq->private;
2181	void *rc;
2182
2183	st->bucket = 0;
2184	st->offset = 0;
2185	rc = listening_get_next(seq, NULL);
2186
2187	while (rc && *pos) {
2188		rc = listening_get_next(seq, rc);
2189		--*pos;
2190	}
2191	return rc;
2192}
2193
2194static inline bool empty_bucket(const struct tcp_iter_state *st)
2195{
2196	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2197}
2198
2199/*
2200 * Get first established socket starting from bucket given in st->bucket.
2201 * If st->bucket is zero, the very first socket in the hash is returned.
2202 */
2203static void *established_get_first(struct seq_file *seq)
2204{
2205	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2206	struct tcp_iter_state *st = seq->private;
2207	struct net *net = seq_file_net(seq);
2208	void *rc = NULL;
2209
2210	st->offset = 0;
2211	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2212		struct sock *sk;
2213		struct hlist_nulls_node *node;
2214		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2215
2216		/* Lockless fast path for the common case of empty buckets */
2217		if (empty_bucket(st))
2218			continue;
2219
2220		spin_lock_bh(lock);
2221		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2222			if (sk->sk_family != afinfo->family ||
2223			    !net_eq(sock_net(sk), net)) {
2224				continue;
2225			}
2226			rc = sk;
2227			goto out;
2228		}
2229		spin_unlock_bh(lock);
2230	}
2231out:
2232	return rc;
2233}
2234
2235static void *established_get_next(struct seq_file *seq, void *cur)
2236{
2237	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2238	struct sock *sk = cur;
2239	struct hlist_nulls_node *node;
2240	struct tcp_iter_state *st = seq->private;
2241	struct net *net = seq_file_net(seq);
2242
2243	++st->num;
2244	++st->offset;
2245
2246	sk = sk_nulls_next(sk);
2247
2248	sk_nulls_for_each_from(sk, node) {
2249		if (sk->sk_family == afinfo->family &&
2250		    net_eq(sock_net(sk), net))
2251			return sk;
2252	}
2253
2254	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2255	++st->bucket;
2256	return established_get_first(seq);
2257}
2258
2259static void *established_get_idx(struct seq_file *seq, loff_t pos)
2260{
2261	struct tcp_iter_state *st = seq->private;
2262	void *rc;
2263
2264	st->bucket = 0;
2265	rc = established_get_first(seq);
2266
2267	while (rc && pos) {
2268		rc = established_get_next(seq, rc);
2269		--pos;
2270	}
2271	return rc;
2272}
2273
2274static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2275{
2276	void *rc;
2277	struct tcp_iter_state *st = seq->private;
2278
2279	st->state = TCP_SEQ_STATE_LISTENING;
2280	rc	  = listening_get_idx(seq, &pos);
2281
2282	if (!rc) {
2283		st->state = TCP_SEQ_STATE_ESTABLISHED;
2284		rc	  = established_get_idx(seq, pos);
2285	}
2286
2287	return rc;
2288}
2289
2290static void *tcp_seek_last_pos(struct seq_file *seq)
2291{
2292	struct tcp_iter_state *st = seq->private;
2293	int offset = st->offset;
2294	int orig_num = st->num;
2295	void *rc = NULL;
2296
2297	switch (st->state) {
2298	case TCP_SEQ_STATE_LISTENING:
2299		if (st->bucket >= INET_LHTABLE_SIZE)
2300			break;
2301		st->state = TCP_SEQ_STATE_LISTENING;
2302		rc = listening_get_next(seq, NULL);
2303		while (offset-- && rc)
2304			rc = listening_get_next(seq, rc);
2305		if (rc)
2306			break;
2307		st->bucket = 0;
2308		st->state = TCP_SEQ_STATE_ESTABLISHED;
2309		/* Fallthrough */
2310	case TCP_SEQ_STATE_ESTABLISHED:
2311		if (st->bucket > tcp_hashinfo.ehash_mask)
2312			break;
2313		rc = established_get_first(seq);
2314		while (offset-- && rc)
2315			rc = established_get_next(seq, rc);
2316	}
2317
2318	st->num = orig_num;
2319
2320	return rc;
2321}
2322
2323void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2324{
2325	struct tcp_iter_state *st = seq->private;
2326	void *rc;
2327
2328	if (*pos && *pos == st->last_pos) {
2329		rc = tcp_seek_last_pos(seq);
2330		if (rc)
2331			goto out;
2332	}
2333
2334	st->state = TCP_SEQ_STATE_LISTENING;
2335	st->num = 0;
2336	st->bucket = 0;
2337	st->offset = 0;
2338	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2339
2340out:
2341	st->last_pos = *pos;
2342	return rc;
2343}
2344EXPORT_SYMBOL(tcp_seq_start);
2345
2346void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2347{
2348	struct tcp_iter_state *st = seq->private;
2349	void *rc = NULL;
2350
2351	if (v == SEQ_START_TOKEN) {
2352		rc = tcp_get_idx(seq, 0);
2353		goto out;
2354	}
2355
2356	switch (st->state) {
2357	case TCP_SEQ_STATE_LISTENING:
2358		rc = listening_get_next(seq, v);
2359		if (!rc) {
2360			st->state = TCP_SEQ_STATE_ESTABLISHED;
2361			st->bucket = 0;
2362			st->offset = 0;
2363			rc	  = established_get_first(seq);
2364		}
2365		break;
2366	case TCP_SEQ_STATE_ESTABLISHED:
2367		rc = established_get_next(seq, v);
2368		break;
2369	}
2370out:
2371	++*pos;
2372	st->last_pos = *pos;
2373	return rc;
2374}
2375EXPORT_SYMBOL(tcp_seq_next);
2376
2377void tcp_seq_stop(struct seq_file *seq, void *v)
2378{
2379	struct tcp_iter_state *st = seq->private;
2380
2381	switch (st->state) {
2382	case TCP_SEQ_STATE_LISTENING:
2383		if (v != SEQ_START_TOKEN)
2384			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2385		break;
2386	case TCP_SEQ_STATE_ESTABLISHED:
2387		if (v)
2388			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2389		break;
2390	}
2391}
2392EXPORT_SYMBOL(tcp_seq_stop);
2393
2394static void get_openreq4(const struct request_sock *req,
2395			 struct seq_file *f, int i)
2396{
2397	const struct inet_request_sock *ireq = inet_rsk(req);
2398	long delta = req->rsk_timer.expires - jiffies;
2399
2400	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2401		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2402		i,
2403		ireq->ir_loc_addr,
2404		ireq->ir_num,
2405		ireq->ir_rmt_addr,
2406		ntohs(ireq->ir_rmt_port),
2407		TCP_SYN_RECV,
2408		0, 0, /* could print option size, but that is af dependent. */
2409		1,    /* timers active (only the expire timer) */
2410		jiffies_delta_to_clock_t(delta),
2411		req->num_timeout,
2412		from_kuid_munged(seq_user_ns(f),
2413				 sock_i_uid(req->rsk_listener)),
2414		0,  /* non standard timer */
2415		0, /* open_requests have no inode */
2416		0,
2417		req);
2418}
2419
2420static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2421{
2422	int timer_active;
2423	unsigned long timer_expires;
2424	const struct tcp_sock *tp = tcp_sk(sk);
2425	const struct inet_connection_sock *icsk = inet_csk(sk);
2426	const struct inet_sock *inet = inet_sk(sk);
2427	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2428	__be32 dest = inet->inet_daddr;
2429	__be32 src = inet->inet_rcv_saddr;
2430	__u16 destp = ntohs(inet->inet_dport);
2431	__u16 srcp = ntohs(inet->inet_sport);
2432	int rx_queue;
2433	int state;
2434
2435	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2436	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2437	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2438		timer_active	= 1;
2439		timer_expires	= icsk->icsk_timeout;
2440	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2441		timer_active	= 4;
2442		timer_expires	= icsk->icsk_timeout;
2443	} else if (timer_pending(&sk->sk_timer)) {
2444		timer_active	= 2;
2445		timer_expires	= sk->sk_timer.expires;
2446	} else {
2447		timer_active	= 0;
2448		timer_expires = jiffies;
2449	}
2450
2451	state = inet_sk_state_load(sk);
2452	if (state == TCP_LISTEN)
2453		rx_queue = sk->sk_ack_backlog;
2454	else
2455		/* Because we don't lock the socket,
2456		 * we might find a transient negative value.
2457		 */
2458		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2459
2460	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2461			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2462		i, src, srcp, dest, destp, state,
2463		tp->write_seq - tp->snd_una,
2464		rx_queue,
2465		timer_active,
2466		jiffies_delta_to_clock_t(timer_expires - jiffies),
2467		icsk->icsk_retransmits,
2468		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2469		icsk->icsk_probes_out,
2470		sock_i_ino(sk),
2471		refcount_read(&sk->sk_refcnt), sk,
2472		jiffies_to_clock_t(icsk->icsk_rto),
2473		jiffies_to_clock_t(icsk->icsk_ack.ato),
2474		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2475		tp->snd_cwnd,
2476		state == TCP_LISTEN ?
2477		    fastopenq->max_qlen :
2478		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2479}
2480
2481static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2482			       struct seq_file *f, int i)
2483{
2484	long delta = tw->tw_timer.expires - jiffies;
2485	__be32 dest, src;
2486	__u16 destp, srcp;
2487
2488	dest  = tw->tw_daddr;
2489	src   = tw->tw_rcv_saddr;
2490	destp = ntohs(tw->tw_dport);
2491	srcp  = ntohs(tw->tw_sport);
2492
2493	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2494		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2495		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2496		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2497		refcount_read(&tw->tw_refcnt), tw);
2498}
2499
2500#define TMPSZ 150
2501
2502static int tcp4_seq_show(struct seq_file *seq, void *v)
2503{
2504	struct tcp_iter_state *st;
2505	struct sock *sk = v;
2506
2507	seq_setwidth(seq, TMPSZ - 1);
2508	if (v == SEQ_START_TOKEN) {
2509		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2510			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2511			   "inode");
2512		goto out;
2513	}
2514	st = seq->private;
2515
2516	if (sk->sk_state == TCP_TIME_WAIT)
2517		get_timewait4_sock(v, seq, st->num);
2518	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2519		get_openreq4(v, seq, st->num);
2520	else
2521		get_tcp4_sock(v, seq, st->num);
2522out:
2523	seq_pad(seq, '\n');
2524	return 0;
2525}
2526
2527static const struct seq_operations tcp4_seq_ops = {
2528	.show		= tcp4_seq_show,
2529	.start		= tcp_seq_start,
2530	.next		= tcp_seq_next,
2531	.stop		= tcp_seq_stop,
2532};
2533
2534static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2535	.family		= AF_INET,
2536};
2537
2538static int __net_init tcp4_proc_init_net(struct net *net)
2539{
2540	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2541			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2542		return -ENOMEM;
2543	return 0;
2544}
2545
2546static void __net_exit tcp4_proc_exit_net(struct net *net)
2547{
2548	remove_proc_entry("tcp", net->proc_net);
2549}
2550
2551static struct pernet_operations tcp4_net_ops = {
2552	.init = tcp4_proc_init_net,
2553	.exit = tcp4_proc_exit_net,
2554};
2555
2556int __init tcp4_proc_init(void)
2557{
2558	return register_pernet_subsys(&tcp4_net_ops);
2559}
2560
2561void tcp4_proc_exit(void)
2562{
2563	unregister_pernet_subsys(&tcp4_net_ops);
2564}
2565#endif /* CONFIG_PROC_FS */
2566
2567struct proto tcp_prot = {
2568	.name			= "TCP",
2569	.owner			= THIS_MODULE,
2570	.close			= tcp_close,
2571	.pre_connect		= tcp_v4_pre_connect,
2572	.connect		= tcp_v4_connect,
2573	.disconnect		= tcp_disconnect,
2574	.accept			= inet_csk_accept,
2575	.ioctl			= tcp_ioctl,
2576	.init			= tcp_v4_init_sock,
2577	.destroy		= tcp_v4_destroy_sock,
2578	.shutdown		= tcp_shutdown,
2579	.setsockopt		= tcp_setsockopt,
2580	.getsockopt		= tcp_getsockopt,
2581	.keepalive		= tcp_set_keepalive,
2582	.recvmsg		= tcp_recvmsg,
2583	.sendmsg		= tcp_sendmsg,
2584	.sendpage		= tcp_sendpage,
2585	.backlog_rcv		= tcp_v4_do_rcv,
2586	.release_cb		= tcp_release_cb,
2587	.hash			= inet_hash,
2588	.unhash			= inet_unhash,
2589	.get_port		= inet_csk_get_port,
2590	.enter_memory_pressure	= tcp_enter_memory_pressure,
2591	.leave_memory_pressure	= tcp_leave_memory_pressure,
2592	.stream_memory_free	= tcp_stream_memory_free,
2593	.sockets_allocated	= &tcp_sockets_allocated,
2594	.orphan_count		= &tcp_orphan_count,
2595	.memory_allocated	= &tcp_memory_allocated,
2596	.memory_pressure	= &tcp_memory_pressure,
2597	.sysctl_mem		= sysctl_tcp_mem,
2598	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2599	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2600	.max_header		= MAX_TCP_HEADER,
2601	.obj_size		= sizeof(struct tcp_sock),
2602	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2603	.twsk_prot		= &tcp_timewait_sock_ops,
2604	.rsk_prot		= &tcp_request_sock_ops,
2605	.h.hashinfo		= &tcp_hashinfo,
2606	.no_autobind		= true,
2607#ifdef CONFIG_COMPAT
2608	.compat_setsockopt	= compat_tcp_setsockopt,
2609	.compat_getsockopt	= compat_tcp_getsockopt,
2610#endif
2611	.diag_destroy		= tcp_abort,
2612};
2613EXPORT_SYMBOL(tcp_prot);
2614
2615static void __net_exit tcp_sk_exit(struct net *net)
2616{
2617	int cpu;
2618
2619	if (net->ipv4.tcp_congestion_control)
2620		module_put(net->ipv4.tcp_congestion_control->owner);
2621
2622	for_each_possible_cpu(cpu)
2623		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2624	free_percpu(net->ipv4.tcp_sk);
2625}
2626
2627static int __net_init tcp_sk_init(struct net *net)
2628{
2629	int res, cpu, cnt;
2630
2631	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2632	if (!net->ipv4.tcp_sk)
2633		return -ENOMEM;
2634
2635	for_each_possible_cpu(cpu) {
2636		struct sock *sk;
2637
2638		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2639					   IPPROTO_TCP, net);
2640		if (res)
2641			goto fail;
2642		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2643
2644		/* Please enforce IP_DF and IPID==0 for RST and
2645		 * ACK sent in SYN-RECV and TIME-WAIT state.
2646		 */
2647		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2648
2649		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2650	}
2651
2652	net->ipv4.sysctl_tcp_ecn = 2;
2653	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2654
2655	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2656	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2657	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2658	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2659	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2660
2661	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2662	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2663	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2664
2665	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2666	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2667	net->ipv4.sysctl_tcp_syncookies = 1;
2668	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2669	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2670	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2671	net->ipv4.sysctl_tcp_orphan_retries = 0;
2672	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2673	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2674	net->ipv4.sysctl_tcp_tw_reuse = 2;
2675
2676	cnt = tcp_hashinfo.ehash_mask + 1;
2677	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2678	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2679
2680	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2681	net->ipv4.sysctl_tcp_sack = 1;
2682	net->ipv4.sysctl_tcp_window_scaling = 1;
2683	net->ipv4.sysctl_tcp_timestamps = 1;
2684	net->ipv4.sysctl_tcp_early_retrans = 3;
2685	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2686	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2687	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2688	net->ipv4.sysctl_tcp_max_reordering = 300;
2689	net->ipv4.sysctl_tcp_dsack = 1;
2690	net->ipv4.sysctl_tcp_app_win = 31;
2691	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2692	net->ipv4.sysctl_tcp_frto = 2;
2693	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2694	/* This limits the percentage of the congestion window which we
2695	 * will allow a single TSO frame to consume.  Building TSO frames
2696	 * which are too large can cause TCP streams to be bursty.
2697	 */
2698	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2699	/* Default TSQ limit of 16 TSO segments */
2700	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2701	/* rfc5961 challenge ack rate limiting */
2702	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2703	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2704	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2705	net->ipv4.sysctl_tcp_autocorking = 1;
2706	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2707	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2708	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2709	if (net != &init_net) {
2710		memcpy(net->ipv4.sysctl_tcp_rmem,
2711		       init_net.ipv4.sysctl_tcp_rmem,
2712		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2713		memcpy(net->ipv4.sysctl_tcp_wmem,
2714		       init_net.ipv4.sysctl_tcp_wmem,
2715		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2716	}
2717	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2718	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2719	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2720	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2721	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2722	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2723
2724	/* Reno is always built in */
2725	if (!net_eq(net, &init_net) &&
2726	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2727		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2728	else
2729		net->ipv4.tcp_congestion_control = &tcp_reno;
2730
2731	return 0;
2732fail:
2733	tcp_sk_exit(net);
2734
2735	return res;
2736}
2737
2738static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2739{
2740	struct net *net;
2741
2742	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2743
2744	list_for_each_entry(net, net_exit_list, exit_list)
2745		tcp_fastopen_ctx_destroy(net);
2746}
2747
2748static struct pernet_operations __net_initdata tcp_sk_ops = {
2749       .init	   = tcp_sk_init,
2750       .exit	   = tcp_sk_exit,
2751       .exit_batch = tcp_sk_exit_batch,
2752};
2753
2754void __init tcp_v4_init(void)
2755{
2756	if (register_pernet_subsys(&tcp_sk_ops))
2757		panic("Failed to create the TCP control socket.\n");
2758}
Configure Feed

Configure Feed