net/ipv4/tcp_ipv4.c at v5.0-rc1

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / net / ipv4 / tcp_ipv4.c
at v5.0-rc1 2721 lines 72 kB view raw
wrap content
   1/*
   2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   3 *		operating system.  INET is implemented using the  BSD Socket
   4 *		interface as the means of communication with the user level.
   5 *
   6 *		Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 *		IPv4 specific functions
   9 *
  10 *
  11 *		code split from:
  12 *		linux/ipv4/tcp.c
  13 *		linux/ipv4/tcp_input.c
  14 *		linux/ipv4/tcp_output.c
  15 *
  16 *		See tcp.c for author information
  17 *
  18 *	This program is free software; you can redistribute it and/or
  19 *      modify it under the terms of the GNU General Public License
  20 *      as published by the Free Software Foundation; either version
  21 *      2 of the License, or (at your option) any later version.
  22 */
  23
  24/*
  25 * Changes:
  26 *		David S. Miller	:	New socket lookup architecture.
  27 *					This code is dedicated to John Dyson.
  28 *		David S. Miller :	Change semantics of established hash,
  29 *					half is devoted to TIME_WAIT sockets
  30 *					and the rest go in the other half.
  31 *		Andi Kleen :		Add support for syncookies and fixed
  32 *					some bugs: ip options weren't passed to
  33 *					the TCP layer, missed a check for an
  34 *					ACK bit.
  35 *		Andi Kleen :		Implemented fast path mtu discovery.
  36 *	     				Fixed many serious bugs in the
  37 *					request_sock handling and moved
  38 *					most of it into the af independent code.
  39 *					Added tail drop and some other bugfixes.
  40 *					Added new listen semantics.
  41 *		Mike McLagan	:	Routing by source
  42 *	Juan Jose Ciarlante:		ip_dynaddr bits
  43 *		Andi Kleen:		various fixes.
  44 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
  45 *					coma.
  46 *	Andi Kleen		:	Fix new listen.
  47 *	Andi Kleen		:	Fix accept error reporting.
  48 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
  49 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
  50 *					a single port at the same time.
  51 */
  52
  53#define pr_fmt(fmt) "TCP: " fmt
  54
  55#include <linux/bottom_half.h>
  56#include <linux/types.h>
  57#include <linux/fcntl.h>
  58#include <linux/module.h>
  59#include <linux/random.h>
  60#include <linux/cache.h>
  61#include <linux/jhash.h>
  62#include <linux/init.h>
  63#include <linux/times.h>
  64#include <linux/slab.h>
  65
  66#include <net/net_namespace.h>
  67#include <net/icmp.h>
  68#include <net/inet_hashtables.h>
  69#include <net/tcp.h>
  70#include <net/transp_v6.h>
  71#include <net/ipv6.h>
  72#include <net/inet_common.h>
  73#include <net/timewait_sock.h>
  74#include <net/xfrm.h>
  75#include <net/secure_seq.h>
  76#include <net/busy_poll.h>
  77
  78#include <linux/inet.h>
  79#include <linux/ipv6.h>
  80#include <linux/stddef.h>
  81#include <linux/proc_fs.h>
  82#include <linux/seq_file.h>
  83#include <linux/inetdevice.h>
  84
  85#include <crypto/hash.h>
  86#include <linux/scatterlist.h>
  87
  88#include <trace/events/tcp.h>
  89
#ifdef CONFIG_TCP_MD5SIG
/* Forward declaration: the RST and ACK reply builders further down call this
 * to write an MD5 signature into the option area of a reply header.
 * NOTE(review): the definition is expected later in this file - confirm.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif
  94
/* TCP socket lookup tables; handed to the __inet_lookup_established() and
 * __inet_lookup_listener() calls in this file.
 */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
  97
  98static u32 tcp_v4_init_seq(const struct sk_buff *skb)
  99{
 100	return secure_tcp_seq(ip_hdr(skb)->daddr,
 101			      ip_hdr(skb)->saddr,
 102			      tcp_hdr(skb)->dest,
 103			      tcp_hdr(skb)->source);
 104}
 105
 106static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 107{
 108	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
 109}
 110
 111int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 112{
 113	const struct inet_timewait_sock *tw = inet_twsk(sktw);
 114	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 115	struct tcp_sock *tp = tcp_sk(sk);
 116	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
 117
 118	if (reuse == 2) {
 119		/* Still does not detect *everything* that goes through
 120		 * lo, since we require a loopback src or dst address
 121		 * or direct binding to 'lo' interface.
 122		 */
 123		bool loopback = false;
 124		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
 125			loopback = true;
 126#if IS_ENABLED(CONFIG_IPV6)
 127		if (tw->tw_family == AF_INET6) {
 128			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
 129			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
 130			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
 131			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
 132			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
 133			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
 134				loopback = true;
 135		} else
 136#endif
 137		{
 138			if (ipv4_is_loopback(tw->tw_daddr) ||
 139			    ipv4_is_loopback(tw->tw_rcv_saddr))
 140				loopback = true;
 141		}
 142		if (!loopback)
 143			reuse = 0;
 144	}
 145
 146	/* With PAWS, it is safe from the viewpoint
 147	   of data integrity. Even without PAWS it is safe provided sequence
 148	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 149
 150	   Actually, the idea is close to VJ's one, only timestamp cache is
 151	   held not per host, but per port pair and TW bucket is used as state
 152	   holder.
 153
 154	   If TW bucket has been already destroyed we fall back to VJ's scheme
 155	   and use initial timestamp retrieved from peer table.
 156	 */
 157	if (tcptw->tw_ts_recent_stamp &&
 158	    (!twp || (reuse && time_after32(ktime_get_seconds(),
 159					    tcptw->tw_ts_recent_stamp)))) {
 160		/* In case of repair and re-using TIME-WAIT sockets we still
 161		 * want to be sure that it is safe as above but honor the
 162		 * sequence numbers and time stamps set as part of the repair
 163		 * process.
 164		 *
 165		 * Without this check re-using a TIME-WAIT socket with TCP
 166		 * repair would accumulate a -1 on the repair assigned
 167		 * sequence number. The first time it is reused the sequence
 168		 * is -1, the second time -2, etc. This fixes that issue
 169		 * without appearing to create any others.
 170		 */
 171		if (likely(!tp->repair)) {
 172			tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 173			if (tp->write_seq == 0)
 174				tp->write_seq = 1;
 175			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
 176			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 177		}
 178		sock_hold(sktw);
 179		return 1;
 180	}
 181
 182	return 0;
 183}
 184EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 185
 186static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 187			      int addr_len)
 188{
 189	/* This check is replicated from tcp_v4_connect() and intended to
 190	 * prevent BPF program called below from accessing bytes that are out
 191	 * of the bound specified by user in addr_len.
 192	 */
 193	if (addr_len < sizeof(struct sockaddr_in))
 194		return -EINVAL;
 195
 196	sock_owned_by_me(sk);
 197
 198	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
 199}
 200
 201/* This will initiate an outgoing connection. */
 202int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 203{
 204	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 205	struct inet_sock *inet = inet_sk(sk);
 206	struct tcp_sock *tp = tcp_sk(sk);
 207	__be16 orig_sport, orig_dport;
 208	__be32 daddr, nexthop;
 209	struct flowi4 *fl4;
 210	struct rtable *rt;
 211	int err;
 212	struct ip_options_rcu *inet_opt;
 213	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 214
 215	if (addr_len < sizeof(struct sockaddr_in))
 216		return -EINVAL;
 217
 218	if (usin->sin_family != AF_INET)
 219		return -EAFNOSUPPORT;
 220
 221	nexthop = daddr = usin->sin_addr.s_addr;
 222	inet_opt = rcu_dereference_protected(inet->inet_opt,
 223					     lockdep_sock_is_held(sk));
 224	if (inet_opt && inet_opt->opt.srr) {
 225		if (!daddr)
 226			return -EINVAL;
 227		nexthop = inet_opt->opt.faddr;
 228	}
 229
 230	orig_sport = inet->inet_sport;
 231	orig_dport = usin->sin_port;
 232	fl4 = &inet->cork.fl.u.ip4;
 233	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 234			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 235			      IPPROTO_TCP,
 236			      orig_sport, orig_dport, sk);
 237	if (IS_ERR(rt)) {
 238		err = PTR_ERR(rt);
 239		if (err == -ENETUNREACH)
 240			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 241		return err;
 242	}
 243
 244	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 245		ip_rt_put(rt);
 246		return -ENETUNREACH;
 247	}
 248
 249	if (!inet_opt || !inet_opt->opt.srr)
 250		daddr = fl4->daddr;
 251
 252	if (!inet->inet_saddr)
 253		inet->inet_saddr = fl4->saddr;
 254	sk_rcv_saddr_set(sk, inet->inet_saddr);
 255
 256	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 257		/* Reset inherited state */
 258		tp->rx_opt.ts_recent	   = 0;
 259		tp->rx_opt.ts_recent_stamp = 0;
 260		if (likely(!tp->repair))
 261			tp->write_seq	   = 0;
 262	}
 263
 264	inet->inet_dport = usin->sin_port;
 265	sk_daddr_set(sk, daddr);
 266
 267	inet_csk(sk)->icsk_ext_hdr_len = 0;
 268	if (inet_opt)
 269		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 270
 271	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 272
 273	/* Socket identity is still unknown (sport may be zero).
 274	 * However we set state to SYN-SENT and not releasing socket
 275	 * lock select source port, enter ourselves into the hash tables and
 276	 * complete initialization after this.
 277	 */
 278	tcp_set_state(sk, TCP_SYN_SENT);
 279	err = inet_hash_connect(tcp_death_row, sk);
 280	if (err)
 281		goto failure;
 282
 283	sk_set_txhash(sk);
 284
 285	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 286			       inet->inet_sport, inet->inet_dport, sk);
 287	if (IS_ERR(rt)) {
 288		err = PTR_ERR(rt);
 289		rt = NULL;
 290		goto failure;
 291	}
 292	/* OK, now commit destination to socket.  */
 293	sk->sk_gso_type = SKB_GSO_TCPV4;
 294	sk_setup_caps(sk, &rt->dst);
 295	rt = NULL;
 296
 297	if (likely(!tp->repair)) {
 298		if (!tp->write_seq)
 299			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
 300						       inet->inet_daddr,
 301						       inet->inet_sport,
 302						       usin->sin_port);
 303		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
 304						 inet->inet_saddr,
 305						 inet->inet_daddr);
 306	}
 307
 308	inet->inet_id = tp->write_seq ^ jiffies;
 309
 310	if (tcp_fastopen_defer_connect(sk, &err))
 311		return err;
 312	if (err)
 313		goto failure;
 314
 315	err = tcp_connect(sk);
 316
 317	if (err)
 318		goto failure;
 319
 320	return 0;
 321
 322failure:
 323	/*
 324	 * This unhashes the socket and releases the local port,
 325	 * if necessary.
 326	 */
 327	tcp_set_state(sk, TCP_CLOSE);
 328	ip_rt_put(rt);
 329	sk->sk_route_caps = 0;
 330	inet->inet_dport = 0;
 331	return err;
 332}
 333EXPORT_SYMBOL(tcp_v4_connect);
 334
 335/*
 336 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 337 * It can be called through tcp_release_cb() if socket was owned by user
 338 * at the time tcp_v4_err() was called to handle ICMP message.
 339 */
 340void tcp_v4_mtu_reduced(struct sock *sk)
 341{
 342	struct inet_sock *inet = inet_sk(sk);
 343	struct dst_entry *dst;
 344	u32 mtu;
 345
 346	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 347		return;
 348	mtu = tcp_sk(sk)->mtu_info;
 349	dst = inet_csk_update_pmtu(sk, mtu);
 350	if (!dst)
 351		return;
 352
 353	/* Something is about to be wrong... Remember soft error
 354	 * for the case, if this connection will not able to recover.
 355	 */
 356	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 357		sk->sk_err_soft = EMSGSIZE;
 358
 359	mtu = dst_mtu(dst);
 360
 361	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 362	    ip_sk_accept_pmtu(sk) &&
 363	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 364		tcp_sync_mss(sk, mtu);
 365
 366		/* Resend the TCP packet because it's
 367		 * clear that the old packet has been
 368		 * dropped. This is the new "fast" path mtu
 369		 * discovery.
 370		 */
 371		tcp_simple_retransmit(sk);
 372	} /* else let the usual retransmit timer handle it */
 373}
 374EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 375
 376static void do_redirect(struct sk_buff *skb, struct sock *sk)
 377{
 378	struct dst_entry *dst = __sk_dst_check(sk, 0);
 379
 380	if (dst)
 381		dst->ops->redirect(dst, sk, skb);
 382}
 383
 384
 385/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 386void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 387{
 388	struct request_sock *req = inet_reqsk(sk);
 389	struct net *net = sock_net(sk);
 390
 391	/* ICMPs are not backlogged, hence we cannot get
 392	 * an established socket here.
 393	 */
 394	if (seq != tcp_rsk(req)->snt_isn) {
 395		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 396	} else if (abort) {
 397		/*
 398		 * Still in SYN_RECV, just remove it silently.
 399		 * There is no good way to pass the error to the newly
 400		 * created socket, and POSIX does not want network
 401		 * errors returned from accept().
 402		 */
 403		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 404		tcp_listendrop(req->rsk_listener);
 405	}
 406	reqsk_put(req);
 407}
 408EXPORT_SYMBOL(tcp_req_err);
 409
 410/*
 411 * This routine is called by the ICMP module when it gets some
 412 * sort of error condition.  If err < 0 then the socket should
 413 * be closed and the error returned to the user.  If err > 0
 414 * it's just the icmp type << 8 | icmp code.  After adjustment
 415 * header points to the first 8 bytes of the tcp header.  We need
 416 * to find the appropriate port.
 417 *
 418 * The locking strategy used here is very "optimistic". When
 419 * someone else accesses the socket the ICMP is just dropped
 420 * and for some paths there is no check at all.
 421 * A more general error queue to queue errors for later handling
 422 * is probably better.
 423 *
 424 */
 425
 426int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 427{
 428	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 429	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 430	struct inet_connection_sock *icsk;
 431	struct tcp_sock *tp;
 432	struct inet_sock *inet;
 433	const int type = icmp_hdr(icmp_skb)->type;
 434	const int code = icmp_hdr(icmp_skb)->code;
 435	struct sock *sk;
 436	struct sk_buff *skb;
 437	struct request_sock *fastopen;
 438	u32 seq, snd_una;
 439	s32 remaining;
 440	u32 delta_us;
 441	int err;
 442	struct net *net = dev_net(icmp_skb->dev);
 443
 444	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 445				       th->dest, iph->saddr, ntohs(th->source),
 446				       inet_iif(icmp_skb), 0);
 447	if (!sk) {
 448		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 449		return -ENOENT;
 450	}
 451	if (sk->sk_state == TCP_TIME_WAIT) {
 452		inet_twsk_put(inet_twsk(sk));
 453		return 0;
 454	}
 455	seq = ntohl(th->seq);
 456	if (sk->sk_state == TCP_NEW_SYN_RECV) {
 457		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
 458				     type == ICMP_TIME_EXCEEDED ||
 459				     (type == ICMP_DEST_UNREACH &&
 460				      (code == ICMP_NET_UNREACH ||
 461				       code == ICMP_HOST_UNREACH)));
 462		return 0;
 463	}
 464
 465	bh_lock_sock(sk);
 466	/* If too many ICMPs get dropped on busy
 467	 * servers this needs to be solved differently.
 468	 * We do take care of PMTU discovery (RFC1191) special case :
 469	 * we can receive locally generated ICMP messages while socket is held.
 470	 */
 471	if (sock_owned_by_user(sk)) {
 472		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 473			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 474	}
 475	if (sk->sk_state == TCP_CLOSE)
 476		goto out;
 477
 478	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 479		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
 480		goto out;
 481	}
 482
 483	icsk = inet_csk(sk);
 484	tp = tcp_sk(sk);
 485	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 486	fastopen = tp->fastopen_rsk;
 487	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 488	if (sk->sk_state != TCP_LISTEN &&
 489	    !between(seq, snd_una, tp->snd_nxt)) {
 490		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
 491		goto out;
 492	}
 493
 494	switch (type) {
 495	case ICMP_REDIRECT:
 496		if (!sock_owned_by_user(sk))
 497			do_redirect(icmp_skb, sk);
 498		goto out;
 499	case ICMP_SOURCE_QUENCH:
 500		/* Just silently ignore these. */
 501		goto out;
 502	case ICMP_PARAMETERPROB:
 503		err = EPROTO;
 504		break;
 505	case ICMP_DEST_UNREACH:
 506		if (code > NR_ICMP_UNREACH)
 507			goto out;
 508
 509		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 510			/* We are not interested in TCP_LISTEN and open_requests
 511			 * (SYN-ACKs send out by Linux are always <576bytes so
 512			 * they should go through unfragmented).
 513			 */
 514			if (sk->sk_state == TCP_LISTEN)
 515				goto out;
 516
 517			tp->mtu_info = info;
 518			if (!sock_owned_by_user(sk)) {
 519				tcp_v4_mtu_reduced(sk);
 520			} else {
 521				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 522					sock_hold(sk);
 523			}
 524			goto out;
 525		}
 526
 527		err = icmp_err_convert[code].errno;
 528		/* check if icmp_skb allows revert of backoff
 529		 * (see draft-zimmermann-tcp-lcd) */
 530		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 531			break;
 532		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 533		    !icsk->icsk_backoff || fastopen)
 534			break;
 535
 536		if (sock_owned_by_user(sk))
 537			break;
 538
 539		icsk->icsk_backoff--;
 540		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 541					       TCP_TIMEOUT_INIT;
 542		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 543
 544		skb = tcp_rtx_queue_head(sk);
 545
 546		tcp_mstamp_refresh(tp);
 547		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 548		remaining = icsk->icsk_rto -
 549			    usecs_to_jiffies(delta_us);
 550
 551		if (remaining > 0) {
 552			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 553						  remaining, TCP_RTO_MAX);
 554		} else {
 555			/* RTO revert clocked out retransmission.
 556			 * Will retransmit now */
 557			tcp_retransmit_timer(sk);
 558		}
 559
 560		break;
 561	case ICMP_TIME_EXCEEDED:
 562		err = EHOSTUNREACH;
 563		break;
 564	default:
 565		goto out;
 566	}
 567
 568	switch (sk->sk_state) {
 569	case TCP_SYN_SENT:
 570	case TCP_SYN_RECV:
 571		/* Only in fast or simultaneous open. If a fast open socket is
 572		 * is already accepted it is treated as a connected one below.
 573		 */
 574		if (fastopen && !fastopen->sk)
 575			break;
 576
 577		if (!sock_owned_by_user(sk)) {
 578			sk->sk_err = err;
 579
 580			sk->sk_error_report(sk);
 581
 582			tcp_done(sk);
 583		} else {
 584			sk->sk_err_soft = err;
 585		}
 586		goto out;
 587	}
 588
 589	/* If we've already connected we will keep trying
 590	 * until we time out, or the user gives up.
 591	 *
 592	 * rfc1122 4.2.3.9 allows to consider as hard errors
 593	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 594	 * but it is obsoleted by pmtu discovery).
 595	 *
 596	 * Note, that in modern internet, where routing is unreliable
 597	 * and in each dark corner broken firewalls sit, sending random
 598	 * errors ordered by their masters even this two messages finally lose
 599	 * their original sense (even Linux sends invalid PORT_UNREACHs)
 600	 *
 601	 * Now we are in compliance with RFCs.
 602	 *							--ANK (980905)
 603	 */
 604
 605	inet = inet_sk(sk);
 606	if (!sock_owned_by_user(sk) && inet->recverr) {
 607		sk->sk_err = err;
 608		sk->sk_error_report(sk);
 609	} else	{ /* Only an error on timeout */
 610		sk->sk_err_soft = err;
 611	}
 612
 613out:
 614	bh_unlock_sock(sk);
 615	sock_put(sk);
 616	return 0;
 617}
 618
 619void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 620{
 621	struct tcphdr *th = tcp_hdr(skb);
 622
 623	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 624	skb->csum_start = skb_transport_header(skb) - skb->head;
 625	skb->csum_offset = offsetof(struct tcphdr, check);
 626}
 627
 628/* This routine computes an IPv4 TCP checksum. */
 629void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 630{
 631	const struct inet_sock *inet = inet_sk(sk);
 632
 633	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 634}
 635EXPORT_SYMBOL(tcp_v4_send_check);
 636
/*
 *	This routine will send an RST to the other tcp.
 *
 *	@sk:  socket the segment was matched to, or NULL when there is none
 *	@skb: the received segment that is being answered
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply header, with room for an MD5 signature option when enabled. */
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;
	struct sock *ctl_sk;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	/* If the offending segment carried an ACK, the RST takes its sequence
	 * number from that ACK.  Otherwise the RST acknowledges everything the
	 * segment occupied: payload length plus one for each of SYN and FIN.
	 */
	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	/* Held until the "out" label at the end of the function. */
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb),
					     tcp_v4_sdif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;


		/* Only answer when the signature on the incoming segment
		 * verifies against the key that was found.
		 */
		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Pseudo-header sum for the reply; addresses are swapped relative to
	 * the incoming segment.
	 */
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk) {
		arg.bound_dev_if = sk->sk_bound_dev_if;
		if (sk_fullsock(sk))
			trace_tcp_send_reset(sk, skb);
	}

	/* sk may be a timewait socket; reading sk_bound_dev_if above relies
	 * on both structures keeping the field at the same offset.
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	/* The reply goes out through this CPU's control socket of the
	 * namespace; its mark is borrowed from sk for the duration of the
	 * send and cleared again afterwards.
	 */
	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
	if (sk)
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : sk->sk_mark;
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	ctl_sk->sk_mark = 0;
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
 790
 791/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 792   outside socket context is ugly, certainly. What can I do?
 793 */
 794
 795static void tcp_v4_send_ack(const struct sock *sk,
 796			    struct sk_buff *skb, u32 seq, u32 ack,
 797			    u32 win, u32 tsval, u32 tsecr, int oif,
 798			    struct tcp_md5sig_key *key,
 799			    int reply_flags, u8 tos)
 800{
 801	const struct tcphdr *th = tcp_hdr(skb);
 802	struct {
 803		struct tcphdr th;
 804		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 805#ifdef CONFIG_TCP_MD5SIG
 806			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 807#endif
 808			];
 809	} rep;
 810	struct net *net = sock_net(sk);
 811	struct ip_reply_arg arg;
 812	struct sock *ctl_sk;
 813
 814	memset(&rep.th, 0, sizeof(struct tcphdr));
 815	memset(&arg, 0, sizeof(arg));
 816
 817	arg.iov[0].iov_base = (unsigned char *)&rep;
 818	arg.iov[0].iov_len  = sizeof(rep.th);
 819	if (tsecr) {
 820		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 821				   (TCPOPT_TIMESTAMP << 8) |
 822				   TCPOLEN_TIMESTAMP);
 823		rep.opt[1] = htonl(tsval);
 824		rep.opt[2] = htonl(tsecr);
 825		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 826	}
 827
 828	/* Swap the send and the receive. */
 829	rep.th.dest    = th->source;
 830	rep.th.source  = th->dest;
 831	rep.th.doff    = arg.iov[0].iov_len / 4;
 832	rep.th.seq     = htonl(seq);
 833	rep.th.ack_seq = htonl(ack);
 834	rep.th.ack     = 1;
 835	rep.th.window  = htons(win);
 836
 837#ifdef CONFIG_TCP_MD5SIG
 838	if (key) {
 839		int offset = (tsecr) ? 3 : 0;
 840
 841		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 842					  (TCPOPT_NOP << 16) |
 843					  (TCPOPT_MD5SIG << 8) |
 844					  TCPOLEN_MD5SIG);
 845		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 846		rep.th.doff = arg.iov[0].iov_len/4;
 847
 848		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 849				    key, ip_hdr(skb)->saddr,
 850				    ip_hdr(skb)->daddr, &rep.th);
 851	}
 852#endif
 853	arg.flags = reply_flags;
 854	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 855				      ip_hdr(skb)->saddr, /* XXX */
 856				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 857	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 858	if (oif)
 859		arg.bound_dev_if = oif;
 860	arg.tos = tos;
 861	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 862	local_bh_disable();
 863	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
 864	if (sk)
 865		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 866				   inet_twsk(sk)->tw_mark : sk->sk_mark;
 867	ip_send_unicast_reply(ctl_sk,
 868			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 869			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 870			      &arg, arg.iov[0].iov_len);
 871
 872	ctl_sk->sk_mark = 0;
 873	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 874	local_bh_enable();
 875}
 876
 877static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 878{
 879	struct inet_timewait_sock *tw = inet_twsk(sk);
 880	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 881
 882	tcp_v4_send_ack(sk, skb,
 883			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 884			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 885			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 886			tcptw->tw_ts_recent,
 887			tw->tw_bound_dev_if,
 888			tcp_twsk_md5_key(tcptw),
 889			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 890			tw->tw_tos
 891			);
 892
 893	inet_twsk_put(tw);
 894}
 895
 896static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 897				  struct request_sock *req)
 898{
 899	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 900	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 901	 */
 902	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 903					     tcp_sk(sk)->snd_nxt;
 904
 905	/* RFC 7323 2.3
 906	 * The window field (SEG.WND) of every outgoing segment, with the
 907	 * exception of <SYN> segments, MUST be right-shifted by
 908	 * Rcv.Wind.Shift bits:
 909	 */
 910	tcp_v4_send_ack(sk, skb, seq,
 911			tcp_rsk(req)->rcv_nxt,
 912			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 913			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 914			req->ts_recent,
 915			0,
 916			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
 917					  AF_INET),
 918			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 919			ip_hdr(skb)->tos);
 920}
 921
 922/*
 923 *	Send a SYN-ACK after having received a SYN.
 924 *	This still operates on a request_sock only, not on a big
 925 *	socket.
 926 */
 927static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 928			      struct flowi *fl,
 929			      struct request_sock *req,
 930			      struct tcp_fastopen_cookie *foc,
 931			      enum tcp_synack_type synack_type)
 932{
 933	const struct inet_request_sock *ireq = inet_rsk(req);
 934	struct flowi4 fl4;
 935	int err = -1;
 936	struct sk_buff *skb;
 937
 938	/* First, grab a route. */
 939	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 940		return -1;
 941
 942	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
 943
 944	if (skb) {
 945		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 946
 947		rcu_read_lock();
 948		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 949					    ireq->ir_rmt_addr,
 950					    rcu_dereference(ireq->ireq_opt));
 951		rcu_read_unlock();
 952		err = net_xmit_eval(err);
 953	}
 954
 955	return err;
 956}
 957
 958/*
 959 *	IPv4 request_sock destructor.
 960 */
 961static void tcp_v4_reqsk_destructor(struct request_sock *req)
 962{
 963	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 964}
 965
 966#ifdef CONFIG_TCP_MD5SIG
 967/*
 968 * RFC2385 MD5 checksumming requires a mapping of
 969 * IP address->MD5 Key.
 970 * We need to maintain these in the sk structure.
 971 */
 972
/* Exported static key for TCP MD5 support.
 * NOTE(review): presumably flipped on once a key is installed so that
 * lookups can be skipped otherwise; its users are outside this chunk - confirm.
 */
struct static_key tcp_md5_needed __read_mostly;
EXPORT_SYMBOL(tcp_md5_needed);
 975
 976/* Find the Key structure for an address.  */
 977struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
 978					   const union tcp_md5_addr *addr,
 979					   int family)
 980{
 981	const struct tcp_sock *tp = tcp_sk(sk);
 982	struct tcp_md5sig_key *key;
 983	const struct tcp_md5sig_info *md5sig;
 984	__be32 mask;
 985	struct tcp_md5sig_key *best_match = NULL;
 986	bool match;
 987
 988	/* caller either holds rcu_read_lock() or socket lock */
 989	md5sig = rcu_dereference_check(tp->md5sig_info,
 990				       lockdep_sock_is_held(sk));
 991	if (!md5sig)
 992		return NULL;
 993
 994	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 995		if (key->family != family)
 996			continue;
 997
 998		if (family == AF_INET) {
 999			mask = inet_make_mask(key->prefixlen);
1000			match = (key->addr.a4.s_addr & mask) ==
1001				(addr->a4.s_addr & mask);
1002#if IS_ENABLED(CONFIG_IPV6)
1003		} else if (family == AF_INET6) {
1004			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1005						  key->prefixlen);
1006#endif
1007		} else {
1008			match = false;
1009		}
1010
1011		if (match && (!best_match ||
1012			      key->prefixlen > best_match->prefixlen))
1013			best_match = key;
1014	}
1015	return best_match;
1016}
1017EXPORT_SYMBOL(__tcp_md5_do_lookup);
1018
1019static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1020						      const union tcp_md5_addr *addr,
1021						      int family, u8 prefixlen)
1022{
1023	const struct tcp_sock *tp = tcp_sk(sk);
1024	struct tcp_md5sig_key *key;
1025	unsigned int size = sizeof(struct in_addr);
1026	const struct tcp_md5sig_info *md5sig;
1027
1028	/* caller either holds rcu_read_lock() or socket lock */
1029	md5sig = rcu_dereference_check(tp->md5sig_info,
1030				       lockdep_sock_is_held(sk));
1031	if (!md5sig)
1032		return NULL;
1033#if IS_ENABLED(CONFIG_IPV6)
1034	if (family == AF_INET6)
1035		size = sizeof(struct in6_addr);
1036#endif
1037	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1038		if (key->family != family)
1039			continue;
1040		if (!memcmp(&key->addr, addr, size) &&
1041		    key->prefixlen == prefixlen)
1042			return key;
1043	}
1044	return NULL;
1045}
1046
1047struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1048					 const struct sock *addr_sk)
1049{
1050	const union tcp_md5_addr *addr;
1051
1052	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1053	return tcp_md5_do_lookup(sk, addr, AF_INET);
1054}
1055EXPORT_SYMBOL(tcp_v4_md5_lookup);
1056
1057/* This can be called on a newly created socket, from other files */
1058int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1059		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1060		   gfp_t gfp)
1061{
1062	/* Add Key to the list */
1063	struct tcp_md5sig_key *key;
1064	struct tcp_sock *tp = tcp_sk(sk);
1065	struct tcp_md5sig_info *md5sig;
1066
1067	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1068	if (key) {
1069		/* Pre-existing entry - just update that one. */
1070		memcpy(key->key, newkey, newkeylen);
1071		key->keylen = newkeylen;
1072		return 0;
1073	}
1074
1075	md5sig = rcu_dereference_protected(tp->md5sig_info,
1076					   lockdep_sock_is_held(sk));
1077	if (!md5sig) {
1078		md5sig = kmalloc(sizeof(*md5sig), gfp);
1079		if (!md5sig)
1080			return -ENOMEM;
1081
1082		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1083		INIT_HLIST_HEAD(&md5sig->head);
1084		rcu_assign_pointer(tp->md5sig_info, md5sig);
1085	}
1086
1087	key = sock_kmalloc(sk, sizeof(*key), gfp);
1088	if (!key)
1089		return -ENOMEM;
1090	if (!tcp_alloc_md5sig_pool()) {
1091		sock_kfree_s(sk, key, sizeof(*key));
1092		return -ENOMEM;
1093	}
1094
1095	memcpy(key->key, newkey, newkeylen);
1096	key->keylen = newkeylen;
1097	key->family = family;
1098	key->prefixlen = prefixlen;
1099	memcpy(&key->addr, addr,
1100	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1101				      sizeof(struct in_addr));
1102	hlist_add_head_rcu(&key->node, &md5sig->head);
1103	return 0;
1104}
1105EXPORT_SYMBOL(tcp_md5_do_add);
1106
1107int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1108		   u8 prefixlen)
1109{
1110	struct tcp_md5sig_key *key;
1111
1112	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1113	if (!key)
1114		return -ENOENT;
1115	hlist_del_rcu(&key->node);
1116	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1117	kfree_rcu(key, rcu);
1118	return 0;
1119}
1120EXPORT_SYMBOL(tcp_md5_do_del);
1121
1122static void tcp_clear_md5_list(struct sock *sk)
1123{
1124	struct tcp_sock *tp = tcp_sk(sk);
1125	struct tcp_md5sig_key *key;
1126	struct hlist_node *n;
1127	struct tcp_md5sig_info *md5sig;
1128
1129	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1130
1131	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1132		hlist_del_rcu(&key->node);
1133		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1134		kfree_rcu(key, rcu);
1135	}
1136}
1137
1138static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1139				 char __user *optval, int optlen)
1140{
1141	struct tcp_md5sig cmd;
1142	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1143	u8 prefixlen = 32;
1144
1145	if (optlen < sizeof(cmd))
1146		return -EINVAL;
1147
1148	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1149		return -EFAULT;
1150
1151	if (sin->sin_family != AF_INET)
1152		return -EINVAL;
1153
1154	if (optname == TCP_MD5SIG_EXT &&
1155	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1156		prefixlen = cmd.tcpm_prefixlen;
1157		if (prefixlen > 32)
1158			return -EINVAL;
1159	}
1160
1161	if (!cmd.tcpm_keylen)
1162		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1163				      AF_INET, prefixlen);
1164
1165	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1166		return -EINVAL;
1167
1168	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1169			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1170			      GFP_KERNEL);
1171}
1172
1173static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1174				   __be32 daddr, __be32 saddr,
1175				   const struct tcphdr *th, int nbytes)
1176{
1177	struct tcp4_pseudohdr *bp;
1178	struct scatterlist sg;
1179	struct tcphdr *_th;
1180
1181	bp = hp->scratch;
1182	bp->saddr = saddr;
1183	bp->daddr = daddr;
1184	bp->pad = 0;
1185	bp->protocol = IPPROTO_TCP;
1186	bp->len = cpu_to_be16(nbytes);
1187
1188	_th = (struct tcphdr *)(bp + 1);
1189	memcpy(_th, th, sizeof(*th));
1190	_th->check = 0;
1191
1192	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1193	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1194				sizeof(*bp) + sizeof(*th));
1195	return crypto_ahash_update(hp->md5_req);
1196}
1197
1198static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1199			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1200{
1201	struct tcp_md5sig_pool *hp;
1202	struct ahash_request *req;
1203
1204	hp = tcp_get_md5sig_pool();
1205	if (!hp)
1206		goto clear_hash_noput;
1207	req = hp->md5_req;
1208
1209	if (crypto_ahash_init(req))
1210		goto clear_hash;
1211	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1212		goto clear_hash;
1213	if (tcp_md5_hash_key(hp, key))
1214		goto clear_hash;
1215	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1216	if (crypto_ahash_final(req))
1217		goto clear_hash;
1218
1219	tcp_put_md5sig_pool();
1220	return 0;
1221
1222clear_hash:
1223	tcp_put_md5sig_pool();
1224clear_hash_noput:
1225	memset(md5_hash, 0, 16);
1226	return 1;
1227}
1228
1229int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1230			const struct sock *sk,
1231			const struct sk_buff *skb)
1232{
1233	struct tcp_md5sig_pool *hp;
1234	struct ahash_request *req;
1235	const struct tcphdr *th = tcp_hdr(skb);
1236	__be32 saddr, daddr;
1237
1238	if (sk) { /* valid for establish/request sockets */
1239		saddr = sk->sk_rcv_saddr;
1240		daddr = sk->sk_daddr;
1241	} else {
1242		const struct iphdr *iph = ip_hdr(skb);
1243		saddr = iph->saddr;
1244		daddr = iph->daddr;
1245	}
1246
1247	hp = tcp_get_md5sig_pool();
1248	if (!hp)
1249		goto clear_hash_noput;
1250	req = hp->md5_req;
1251
1252	if (crypto_ahash_init(req))
1253		goto clear_hash;
1254
1255	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1256		goto clear_hash;
1257	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1258		goto clear_hash;
1259	if (tcp_md5_hash_key(hp, key))
1260		goto clear_hash;
1261	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1262	if (crypto_ahash_final(req))
1263		goto clear_hash;
1264
1265	tcp_put_md5sig_pool();
1266	return 0;
1267
1268clear_hash:
1269	tcp_put_md5sig_pool();
1270clear_hash_noput:
1271	memset(md5_hash, 0, 16);
1272	return 1;
1273}
1274EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1275
1276#endif
1277
1278/* Called with rcu_read_lock() */
1279static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1280				    const struct sk_buff *skb)
1281{
1282#ifdef CONFIG_TCP_MD5SIG
1283	/*
1284	 * This gets called for each TCP segment that arrives
1285	 * so we want to be efficient.
1286	 * We have 3 drop cases:
1287	 * o No MD5 hash and one expected.
1288	 * o MD5 hash and we're not expecting one.
1289	 * o MD5 hash and its wrong.
1290	 */
1291	const __u8 *hash_location = NULL;
1292	struct tcp_md5sig_key *hash_expected;
1293	const struct iphdr *iph = ip_hdr(skb);
1294	const struct tcphdr *th = tcp_hdr(skb);
1295	int genhash;
1296	unsigned char newhash[16];
1297
1298	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1299					  AF_INET);
1300	hash_location = tcp_parse_md5sig_option(th);
1301
1302	/* We've parsed the options - do we have a hash? */
1303	if (!hash_expected && !hash_location)
1304		return false;
1305
1306	if (hash_expected && !hash_location) {
1307		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1308		return true;
1309	}
1310
1311	if (!hash_expected && hash_location) {
1312		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1313		return true;
1314	}
1315
1316	/* Okay, so this is hash_expected and hash_location -
1317	 * so we need to calculate the checksum.
1318	 */
1319	genhash = tcp_v4_md5_hash_skb(newhash,
1320				      hash_expected,
1321				      NULL, skb);
1322
1323	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1324		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1325		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1326				     &iph->saddr, ntohs(th->source),
1327				     &iph->daddr, ntohs(th->dest),
1328				     genhash ? " tcp_v4_calc_md5_hash failed"
1329				     : "");
1330		return true;
1331	}
1332	return false;
1333#endif
1334	return false;
1335}
1336
1337static void tcp_v4_init_req(struct request_sock *req,
1338			    const struct sock *sk_listener,
1339			    struct sk_buff *skb)
1340{
1341	struct inet_request_sock *ireq = inet_rsk(req);
1342	struct net *net = sock_net(sk_listener);
1343
1344	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1345	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1346	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1347}
1348
1349static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1350					  struct flowi *fl,
1351					  const struct request_sock *req)
1352{
1353	return inet_csk_route_req(sk, &fl->u.ip4, req);
1354}
1355
1356struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1357	.family		=	PF_INET,
1358	.obj_size	=	sizeof(struct tcp_request_sock),
1359	.rtx_syn_ack	=	tcp_rtx_synack,
1360	.send_ack	=	tcp_v4_reqsk_send_ack,
1361	.destructor	=	tcp_v4_reqsk_destructor,
1362	.send_reset	=	tcp_v4_send_reset,
1363	.syn_ack_timeout =	tcp_syn_ack_timeout,
1364};
1365
1366static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1367	.mss_clamp	=	TCP_MSS_DEFAULT,
1368#ifdef CONFIG_TCP_MD5SIG
1369	.req_md5_lookup	=	tcp_v4_md5_lookup,
1370	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1371#endif
1372	.init_req	=	tcp_v4_init_req,
1373#ifdef CONFIG_SYN_COOKIES
1374	.cookie_init_seq =	cookie_v4_init_sequence,
1375#endif
1376	.route_req	=	tcp_v4_route_req,
1377	.init_seq	=	tcp_v4_init_seq,
1378	.init_ts_off	=	tcp_v4_init_ts_off,
1379	.send_synack	=	tcp_v4_send_synack,
1380};
1381
1382int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1383{
1384	/* Never answer to SYNs send to broadcast or multicast */
1385	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1386		goto drop;
1387
1388	return tcp_conn_request(&tcp_request_sock_ops,
1389				&tcp_request_sock_ipv4_ops, sk, skb);
1390
1391drop:
1392	tcp_listendrop(sk);
1393	return 0;
1394}
1395EXPORT_SYMBOL(tcp_v4_conn_request);
1396
1397
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * @sk is the listener, @req the request sock the child is built from,
 * @dst an optional pre-computed route (NULL means "look one up here").
 * On success the child socket is returned and *@own_req holds the
 * result of inserting it into the established hash in place of
 * @req_unhash.  On failure NULL is returned, a listen drop is counted
 * and, where it applies, @dst is released.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	/* Copy addressing, interface and IP options from the request. */
	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	/* The child borrows the request's option block for now; who ends
	 * up owning it is settled once the hash insertion result is known.
	 */
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* The child keeps the options: detach them from the request. */
		ireq->ireq_opt = NULL;
	} else {
		/* The request keeps the options: detach them from the child. */
		newinet->inet_opt = NULL;
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* The option block still belongs to the request; clear the child's
	 * pointer to it before the child is torn down.
	 */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1502
/* With syncookies configured, pass every non-SYN segment through
 * cookie_v4_check() and return whatever socket it yields; otherwise
 * (or for a SYN) return @sk unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1513
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Processes one segment for @sk according to the socket state and
 * always returns 0; on the reset/discard/checksum-error paths the skb
 * is freed here.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Drop the cached input route if the segment came in
			 * on another interface or the route fails its check.
			 */
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			/* The cookie check produced a different socket: hand
			 * the segment to it and reset on failure.
			 */
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1583
1584int tcp_v4_early_demux(struct sk_buff *skb)
1585{
1586	const struct iphdr *iph;
1587	const struct tcphdr *th;
1588	struct sock *sk;
1589
1590	if (skb->pkt_type != PACKET_HOST)
1591		return 0;
1592
1593	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1594		return 0;
1595
1596	iph = ip_hdr(skb);
1597	th = tcp_hdr(skb);
1598
1599	if (th->doff < sizeof(struct tcphdr) / 4)
1600		return 0;
1601
1602	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1603				       iph->saddr, th->source,
1604				       iph->daddr, ntohs(th->dest),
1605				       skb->skb_iif, inet_sdif(skb));
1606	if (sk) {
1607		skb->sk = sk;
1608		skb->destructor = sock_edemux;
1609		if (sk_fullsock(sk)) {
1610			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1611
1612			if (dst)
1613				dst = dst_check(dst, 0);
1614			if (dst &&
1615			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1616				skb_dst_set_noref(skb, dst);
1617		}
1618	}
1619	return 0;
1620}
1621
1622bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1623{
1624	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1625	struct skb_shared_info *shinfo;
1626	const struct tcphdr *th;
1627	struct tcphdr *thtail;
1628	struct sk_buff *tail;
1629	unsigned int hdrlen;
1630	bool fragstolen;
1631	u32 gso_segs;
1632	int delta;
1633
1634	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1635	 * we can fix skb->truesize to its real value to avoid future drops.
1636	 * This is valid because skb is not yet charged to the socket.
1637	 * It has been noticed pure SACK packets were sometimes dropped
1638	 * (if cooked by drivers without copybreak feature).
1639	 */
1640	skb_condense(skb);
1641
1642	skb_dst_drop(skb);
1643
1644	if (unlikely(tcp_checksum_complete(skb))) {
1645		bh_unlock_sock(sk);
1646		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1647		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1648		return true;
1649	}
1650
1651	/* Attempt coalescing to last skb in backlog, even if we are
1652	 * above the limits.
1653	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1654	 */
1655	th = (const struct tcphdr *)skb->data;
1656	hdrlen = th->doff * 4;
1657	shinfo = skb_shinfo(skb);
1658
1659	if (!shinfo->gso_size)
1660		shinfo->gso_size = skb->len - hdrlen;
1661
1662	if (!shinfo->gso_segs)
1663		shinfo->gso_segs = 1;
1664
1665	tail = sk->sk_backlog.tail;
1666	if (!tail)
1667		goto no_coalesce;
1668	thtail = (struct tcphdr *)tail->data;
1669
1670	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1671	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1672	    ((TCP_SKB_CB(tail)->tcp_flags |
1673	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
1674	    ((TCP_SKB_CB(tail)->tcp_flags ^
1675	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1676#ifdef CONFIG_TLS_DEVICE
1677	    tail->decrypted != skb->decrypted ||
1678#endif
1679	    thtail->doff != th->doff ||
1680	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1681		goto no_coalesce;
1682
1683	__skb_pull(skb, hdrlen);
1684	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1685		thtail->window = th->window;
1686
1687		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1688
1689		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1690			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1691
1692		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1693
1694		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1695			TCP_SKB_CB(tail)->has_rxtstamp = true;
1696			tail->tstamp = skb->tstamp;
1697			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1698		}
1699
1700		/* Not as strict as GRO. We only need to carry mss max value */
1701		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1702						 skb_shinfo(tail)->gso_size);
1703
1704		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1705		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1706
1707		sk->sk_backlog.len += delta;
1708		__NET_INC_STATS(sock_net(sk),
1709				LINUX_MIB_TCPBACKLOGCOALESCE);
1710		kfree_skb_partial(skb, fragstolen);
1711		return false;
1712	}
1713	__skb_push(skb, hdrlen);
1714
1715no_coalesce:
1716	/* Only socket owner can try to collapse/prune rx queues
1717	 * to reduce memory overhead, so add a little headroom here.
1718	 * Few sockets backlog are possibly concurrently non empty.
1719	 */
1720	limit += 64*1024;
1721
1722	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1723		bh_unlock_sock(sk);
1724		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1725		return true;
1726	}
1727	return false;
1728}
1729EXPORT_SYMBOL(tcp_add_backlog);
1730
1731int tcp_filter(struct sock *sk, struct sk_buff *skb)
1732{
1733	struct tcphdr *th = (struct tcphdr *)skb->data;
1734	unsigned int eaten = skb->len;
1735	int err;
1736
1737	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1738	if (!err) {
1739		eaten -= skb->len;
1740		TCP_SKB_CB(skb)->end_seq -= eaten;
1741	}
1742	return err;
1743}
1744EXPORT_SYMBOL(tcp_filter);
1745
1746static void tcp_v4_restore_cb(struct sk_buff *skb)
1747{
1748	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1749		sizeof(struct inet_skb_parm));
1750}
1751
1752static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1753			   const struct tcphdr *th)
1754{
1755	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1756	 * barrier() makes sure compiler wont play fool^Waliasing games.
1757	 */
1758	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1759		sizeof(struct inet_skb_parm));
1760	barrier();
1761
1762	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1763	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1764				    skb->len - th->doff * 4);
1765	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1766	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1767	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1768	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1769	TCP_SKB_CB(skb)->sacked	 = 0;
1770	TCP_SKB_CB(skb)->has_rxtstamp =
1771			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1772}
1773
/*
 *	From tcp_input.c
 *
 *	Receive one IPv4 TCP segment: validate the header, find the
 *	socket it belongs to and process it according to that socket's
 *	state.  Returns the result of tcp_v4_do_rcv() where that is
 *	called on the main path, 0 otherwise.
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	int sdif = inet_sdif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	/* Whether a reference on sk is held and must be dropped on exit. */
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Header pointers are taken again after the pulls above
	 * (presumably because pulling may move the header data - confirm).
	 */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* The lookup returned a request sock: continue with its
		 * listener.
		 */
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			/* Listener is no longer listening: drop the request
			 * and look the segment up again.
			 */
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		if (nsk == sk) {
			/* tcp_check_req() returned the listener itself:
			 * continue below with the listener as sk.
			 */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	/* Listeners are processed without taking the socket lock. */
	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		/* tcp_add_backlog() has already unlocked the socket when it
		 * returns true.
		 */
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	/* No socket: count a bad checksum as an error, answer anything
	 * else with a reset.  The two labels below sit inside the if-body
	 * so that earlier error paths share the counters.
	 */
	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* If a listener exists for this segment, retire the
		 * timewait sock and process the segment on the listener.
		 */
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			goto process;
		}
	}
		/* to ACK */
		/* fall through */
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
1992
1993static struct timewait_sock_ops tcp_timewait_sock_ops = {
1994	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1995	.twsk_unique	= tcp_twsk_unique,
1996	.twsk_destructor= tcp_twsk_destructor,
1997};
1998
1999void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2000{
2001	struct dst_entry *dst = skb_dst(skb);
2002
2003	if (dst && dst_hold_safe(dst)) {
2004		sk->sk_rx_dst = dst;
2005		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2006	}
2007}
2008EXPORT_SYMBOL(inet_sk_rx_dst_set);
2009
2010const struct inet_connection_sock_af_ops ipv4_specific = {
2011	.queue_xmit	   = ip_queue_xmit,
2012	.send_check	   = tcp_v4_send_check,
2013	.rebuild_header	   = inet_sk_rebuild_header,
2014	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2015	.conn_request	   = tcp_v4_conn_request,
2016	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2017	.net_header_len	   = sizeof(struct iphdr),
2018	.setsockopt	   = ip_setsockopt,
2019	.getsockopt	   = ip_getsockopt,
2020	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2021	.sockaddr_len	   = sizeof(struct sockaddr_in),
2022#ifdef CONFIG_COMPAT
2023	.compat_setsockopt = compat_ip_setsockopt,
2024	.compat_getsockopt = compat_ip_getsockopt,
2025#endif
2026	.mtu_reduced	   = tcp_v4_mtu_reduced,
2027};
2028EXPORT_SYMBOL(ipv4_specific);
2029
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 implementations of the per-socket MD5 signature hooks. */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup = tcp_v4_md5_lookup,
	.calc_md5_hash = tcp_v4_md5_hash_skb,
	.md5_parse = tcp_v4_parse_md5_keys,
};
#endif
2037
2038/* NOTE: A lot of things set to zero explicitly by call to
2039 *       sk_alloc() so need not be done here.
2040 */
2041static int tcp_v4_init_sock(struct sock *sk)
2042{
2043	struct inet_connection_sock *icsk = inet_csk(sk);
2044
2045	tcp_init_sock(sk);
2046
2047	icsk->icsk_af_ops = &ipv4_specific;
2048
2049#ifdef CONFIG_TCP_MD5SIG
2050	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2051#endif
2052
2053	return 0;
2054}
2055
2056void tcp_v4_destroy_sock(struct sock *sk)
2057{
2058	struct tcp_sock *tp = tcp_sk(sk);
2059
2060	trace_tcp_destroy_sock(sk);
2061
2062	tcp_clear_xmit_timers(sk);
2063
2064	tcp_cleanup_congestion_control(sk);
2065
2066	tcp_cleanup_ulp(sk);
2067
2068	/* Cleanup up the write buffer. */
2069	tcp_write_queue_purge(sk);
2070
2071	/* Check if we want to disable active TFO */
2072	tcp_fastopen_active_disable_ofo_check(sk);
2073
2074	/* Cleans up our, hopefully empty, out_of_order_queue. */
2075	skb_rbtree_purge(&tp->out_of_order_queue);
2076
2077#ifdef CONFIG_TCP_MD5SIG
2078	/* Clean up the MD5 key list, if any */
2079	if (tp->md5sig_info) {
2080		tcp_clear_md5_list(sk);
2081		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2082		tp->md5sig_info = NULL;
2083	}
2084#endif
2085
2086	/* Clean up a referenced TCP bind bucket. */
2087	if (inet_csk(sk)->icsk_bind_hash)
2088		inet_put_port(sk);
2089
2090	BUG_ON(tp->fastopen_rsk);
2091
2092	/* If socket is aborted during connect operation */
2093	tcp_free_fastopen_req(tp);
2094	tcp_fastopen_destroy_cipher(sk);
2095	tcp_saved_syn_free(tp);
2096
2097	sk_sockets_allocated_dec(sk);
2098}
2099EXPORT_SYMBOL(tcp_v4_destroy_sock);
2100
2101#ifdef CONFIG_PROC_FS
2102/* Proc filesystem TCP sock list dumping. */
2103
2104/*
2105 * Get next listener socket follow cur.  If cur is NULL, get first socket
2106 * starting from bucket given in st->bucket; when st->bucket is zero the
2107 * very first socket in the hash table is returned.
2108 */
2109static void *listening_get_next(struct seq_file *seq, void *cur)
2110{
2111	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2112	struct tcp_iter_state *st = seq->private;
2113	struct net *net = seq_file_net(seq);
2114	struct inet_listen_hashbucket *ilb;
2115	struct sock *sk = cur;
2116
2117	if (!sk) {
2118get_head:
2119		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2120		spin_lock(&ilb->lock);
2121		sk = sk_head(&ilb->head);
2122		st->offset = 0;
2123		goto get_sk;
2124	}
2125	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2126	++st->num;
2127	++st->offset;
2128
2129	sk = sk_next(sk);
2130get_sk:
2131	sk_for_each_from(sk) {
2132		if (!net_eq(sock_net(sk), net))
2133			continue;
2134		if (sk->sk_family == afinfo->family)
2135			return sk;
2136	}
2137	spin_unlock(&ilb->lock);
2138	st->offset = 0;
2139	if (++st->bucket < INET_LHTABLE_SIZE)
2140		goto get_head;
2141	return NULL;
2142}
2143
2144static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2145{
2146	struct tcp_iter_state *st = seq->private;
2147	void *rc;
2148
2149	st->bucket = 0;
2150	st->offset = 0;
2151	rc = listening_get_next(seq, NULL);
2152
2153	while (rc && *pos) {
2154		rc = listening_get_next(seq, rc);
2155		--*pos;
2156	}
2157	return rc;
2158}
2159
2160static inline bool empty_bucket(const struct tcp_iter_state *st)
2161{
2162	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2163}
2164
2165/*
2166 * Get first established socket starting from bucket given in st->bucket.
2167 * If st->bucket is zero, the very first socket in the hash is returned.
2168 */
2169static void *established_get_first(struct seq_file *seq)
2170{
2171	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2172	struct tcp_iter_state *st = seq->private;
2173	struct net *net = seq_file_net(seq);
2174	void *rc = NULL;
2175
2176	st->offset = 0;
2177	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2178		struct sock *sk;
2179		struct hlist_nulls_node *node;
2180		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2181
2182		/* Lockless fast path for the common case of empty buckets */
2183		if (empty_bucket(st))
2184			continue;
2185
2186		spin_lock_bh(lock);
2187		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2188			if (sk->sk_family != afinfo->family ||
2189			    !net_eq(sock_net(sk), net)) {
2190				continue;
2191			}
2192			rc = sk;
2193			goto out;
2194		}
2195		spin_unlock_bh(lock);
2196	}
2197out:
2198	return rc;
2199}
2200
2201static void *established_get_next(struct seq_file *seq, void *cur)
2202{
2203	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2204	struct sock *sk = cur;
2205	struct hlist_nulls_node *node;
2206	struct tcp_iter_state *st = seq->private;
2207	struct net *net = seq_file_net(seq);
2208
2209	++st->num;
2210	++st->offset;
2211
2212	sk = sk_nulls_next(sk);
2213
2214	sk_nulls_for_each_from(sk, node) {
2215		if (sk->sk_family == afinfo->family &&
2216		    net_eq(sock_net(sk), net))
2217			return sk;
2218	}
2219
2220	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2221	++st->bucket;
2222	return established_get_first(seq);
2223}
2224
2225static void *established_get_idx(struct seq_file *seq, loff_t pos)
2226{
2227	struct tcp_iter_state *st = seq->private;
2228	void *rc;
2229
2230	st->bucket = 0;
2231	rc = established_get_first(seq);
2232
2233	while (rc && pos) {
2234		rc = established_get_next(seq, rc);
2235		--pos;
2236	}
2237	return rc;
2238}
2239
2240static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2241{
2242	void *rc;
2243	struct tcp_iter_state *st = seq->private;
2244
2245	st->state = TCP_SEQ_STATE_LISTENING;
2246	rc	  = listening_get_idx(seq, &pos);
2247
2248	if (!rc) {
2249		st->state = TCP_SEQ_STATE_ESTABLISHED;
2250		rc	  = established_get_idx(seq, pos);
2251	}
2252
2253	return rc;
2254}
2255
2256static void *tcp_seek_last_pos(struct seq_file *seq)
2257{
2258	struct tcp_iter_state *st = seq->private;
2259	int offset = st->offset;
2260	int orig_num = st->num;
2261	void *rc = NULL;
2262
2263	switch (st->state) {
2264	case TCP_SEQ_STATE_LISTENING:
2265		if (st->bucket >= INET_LHTABLE_SIZE)
2266			break;
2267		st->state = TCP_SEQ_STATE_LISTENING;
2268		rc = listening_get_next(seq, NULL);
2269		while (offset-- && rc)
2270			rc = listening_get_next(seq, rc);
2271		if (rc)
2272			break;
2273		st->bucket = 0;
2274		st->state = TCP_SEQ_STATE_ESTABLISHED;
2275		/* Fallthrough */
2276	case TCP_SEQ_STATE_ESTABLISHED:
2277		if (st->bucket > tcp_hashinfo.ehash_mask)
2278			break;
2279		rc = established_get_first(seq);
2280		while (offset-- && rc)
2281			rc = established_get_next(seq, rc);
2282	}
2283
2284	st->num = orig_num;
2285
2286	return rc;
2287}
2288
2289void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2290{
2291	struct tcp_iter_state *st = seq->private;
2292	void *rc;
2293
2294	if (*pos && *pos == st->last_pos) {
2295		rc = tcp_seek_last_pos(seq);
2296		if (rc)
2297			goto out;
2298	}
2299
2300	st->state = TCP_SEQ_STATE_LISTENING;
2301	st->num = 0;
2302	st->bucket = 0;
2303	st->offset = 0;
2304	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2305
2306out:
2307	st->last_pos = *pos;
2308	return rc;
2309}
2310EXPORT_SYMBOL(tcp_seq_start);
2311
2312void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2313{
2314	struct tcp_iter_state *st = seq->private;
2315	void *rc = NULL;
2316
2317	if (v == SEQ_START_TOKEN) {
2318		rc = tcp_get_idx(seq, 0);
2319		goto out;
2320	}
2321
2322	switch (st->state) {
2323	case TCP_SEQ_STATE_LISTENING:
2324		rc = listening_get_next(seq, v);
2325		if (!rc) {
2326			st->state = TCP_SEQ_STATE_ESTABLISHED;
2327			st->bucket = 0;
2328			st->offset = 0;
2329			rc	  = established_get_first(seq);
2330		}
2331		break;
2332	case TCP_SEQ_STATE_ESTABLISHED:
2333		rc = established_get_next(seq, v);
2334		break;
2335	}
2336out:
2337	++*pos;
2338	st->last_pos = *pos;
2339	return rc;
2340}
2341EXPORT_SYMBOL(tcp_seq_next);
2342
2343void tcp_seq_stop(struct seq_file *seq, void *v)
2344{
2345	struct tcp_iter_state *st = seq->private;
2346
2347	switch (st->state) {
2348	case TCP_SEQ_STATE_LISTENING:
2349		if (v != SEQ_START_TOKEN)
2350			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2351		break;
2352	case TCP_SEQ_STATE_ESTABLISHED:
2353		if (v)
2354			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2355		break;
2356	}
2357}
2358EXPORT_SYMBOL(tcp_seq_stop);
2359
2360static void get_openreq4(const struct request_sock *req,
2361			 struct seq_file *f, int i)
2362{
2363	const struct inet_request_sock *ireq = inet_rsk(req);
2364	long delta = req->rsk_timer.expires - jiffies;
2365
2366	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2367		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2368		i,
2369		ireq->ir_loc_addr,
2370		ireq->ir_num,
2371		ireq->ir_rmt_addr,
2372		ntohs(ireq->ir_rmt_port),
2373		TCP_SYN_RECV,
2374		0, 0, /* could print option size, but that is af dependent. */
2375		1,    /* timers active (only the expire timer) */
2376		jiffies_delta_to_clock_t(delta),
2377		req->num_timeout,
2378		from_kuid_munged(seq_user_ns(f),
2379				 sock_i_uid(req->rsk_listener)),
2380		0,  /* non standard timer */
2381		0, /* open_requests have no inode */
2382		0,
2383		req);
2384}
2385
2386static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2387{
2388	int timer_active;
2389	unsigned long timer_expires;
2390	const struct tcp_sock *tp = tcp_sk(sk);
2391	const struct inet_connection_sock *icsk = inet_csk(sk);
2392	const struct inet_sock *inet = inet_sk(sk);
2393	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2394	__be32 dest = inet->inet_daddr;
2395	__be32 src = inet->inet_rcv_saddr;
2396	__u16 destp = ntohs(inet->inet_dport);
2397	__u16 srcp = ntohs(inet->inet_sport);
2398	int rx_queue;
2399	int state;
2400
2401	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2402	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2403	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2404		timer_active	= 1;
2405		timer_expires	= icsk->icsk_timeout;
2406	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2407		timer_active	= 4;
2408		timer_expires	= icsk->icsk_timeout;
2409	} else if (timer_pending(&sk->sk_timer)) {
2410		timer_active	= 2;
2411		timer_expires	= sk->sk_timer.expires;
2412	} else {
2413		timer_active	= 0;
2414		timer_expires = jiffies;
2415	}
2416
2417	state = inet_sk_state_load(sk);
2418	if (state == TCP_LISTEN)
2419		rx_queue = sk->sk_ack_backlog;
2420	else
2421		/* Because we don't lock the socket,
2422		 * we might find a transient negative value.
2423		 */
2424		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2425
2426	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2427			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2428		i, src, srcp, dest, destp, state,
2429		tp->write_seq - tp->snd_una,
2430		rx_queue,
2431		timer_active,
2432		jiffies_delta_to_clock_t(timer_expires - jiffies),
2433		icsk->icsk_retransmits,
2434		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2435		icsk->icsk_probes_out,
2436		sock_i_ino(sk),
2437		refcount_read(&sk->sk_refcnt), sk,
2438		jiffies_to_clock_t(icsk->icsk_rto),
2439		jiffies_to_clock_t(icsk->icsk_ack.ato),
2440		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2441		tp->snd_cwnd,
2442		state == TCP_LISTEN ?
2443		    fastopenq->max_qlen :
2444		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2445}
2446
2447static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2448			       struct seq_file *f, int i)
2449{
2450	long delta = tw->tw_timer.expires - jiffies;
2451	__be32 dest, src;
2452	__u16 destp, srcp;
2453
2454	dest  = tw->tw_daddr;
2455	src   = tw->tw_rcv_saddr;
2456	destp = ntohs(tw->tw_dport);
2457	srcp  = ntohs(tw->tw_sport);
2458
2459	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2460		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2461		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2462		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2463		refcount_read(&tw->tw_refcnt), tw);
2464}
2465
2466#define TMPSZ 150
2467
2468static int tcp4_seq_show(struct seq_file *seq, void *v)
2469{
2470	struct tcp_iter_state *st;
2471	struct sock *sk = v;
2472
2473	seq_setwidth(seq, TMPSZ - 1);
2474	if (v == SEQ_START_TOKEN) {
2475		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2476			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2477			   "inode");
2478		goto out;
2479	}
2480	st = seq->private;
2481
2482	if (sk->sk_state == TCP_TIME_WAIT)
2483		get_timewait4_sock(v, seq, st->num);
2484	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2485		get_openreq4(v, seq, st->num);
2486	else
2487		get_tcp4_sock(v, seq, st->num);
2488out:
2489	seq_pad(seq, '\n');
2490	return 0;
2491}
2492
2493static const struct seq_operations tcp4_seq_ops = {
2494	.show		= tcp4_seq_show,
2495	.start		= tcp_seq_start,
2496	.next		= tcp_seq_next,
2497	.stop		= tcp_seq_stop,
2498};
2499
2500static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2501	.family		= AF_INET,
2502};
2503
2504static int __net_init tcp4_proc_init_net(struct net *net)
2505{
2506	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2507			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2508		return -ENOMEM;
2509	return 0;
2510}
2511
2512static void __net_exit tcp4_proc_exit_net(struct net *net)
2513{
2514	remove_proc_entry("tcp", net->proc_net);
2515}
2516
2517static struct pernet_operations tcp4_net_ops = {
2518	.init = tcp4_proc_init_net,
2519	.exit = tcp4_proc_exit_net,
2520};
2521
2522int __init tcp4_proc_init(void)
2523{
2524	return register_pernet_subsys(&tcp4_net_ops);
2525}
2526
2527void tcp4_proc_exit(void)
2528{
2529	unregister_pernet_subsys(&tcp4_net_ops);
2530}
2531#endif /* CONFIG_PROC_FS */
2532
2533struct proto tcp_prot = {
2534	.name			= "TCP",
2535	.owner			= THIS_MODULE,
2536	.close			= tcp_close,
2537	.pre_connect		= tcp_v4_pre_connect,
2538	.connect		= tcp_v4_connect,
2539	.disconnect		= tcp_disconnect,
2540	.accept			= inet_csk_accept,
2541	.ioctl			= tcp_ioctl,
2542	.init			= tcp_v4_init_sock,
2543	.destroy		= tcp_v4_destroy_sock,
2544	.shutdown		= tcp_shutdown,
2545	.setsockopt		= tcp_setsockopt,
2546	.getsockopt		= tcp_getsockopt,
2547	.keepalive		= tcp_set_keepalive,
2548	.recvmsg		= tcp_recvmsg,
2549	.sendmsg		= tcp_sendmsg,
2550	.sendpage		= tcp_sendpage,
2551	.backlog_rcv		= tcp_v4_do_rcv,
2552	.release_cb		= tcp_release_cb,
2553	.hash			= inet_hash,
2554	.unhash			= inet_unhash,
2555	.get_port		= inet_csk_get_port,
2556	.enter_memory_pressure	= tcp_enter_memory_pressure,
2557	.leave_memory_pressure	= tcp_leave_memory_pressure,
2558	.stream_memory_free	= tcp_stream_memory_free,
2559	.sockets_allocated	= &tcp_sockets_allocated,
2560	.orphan_count		= &tcp_orphan_count,
2561	.memory_allocated	= &tcp_memory_allocated,
2562	.memory_pressure	= &tcp_memory_pressure,
2563	.sysctl_mem		= sysctl_tcp_mem,
2564	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2565	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2566	.max_header		= MAX_TCP_HEADER,
2567	.obj_size		= sizeof(struct tcp_sock),
2568	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2569	.twsk_prot		= &tcp_timewait_sock_ops,
2570	.rsk_prot		= &tcp_request_sock_ops,
2571	.h.hashinfo		= &tcp_hashinfo,
2572	.no_autobind		= true,
2573#ifdef CONFIG_COMPAT
2574	.compat_setsockopt	= compat_tcp_setsockopt,
2575	.compat_getsockopt	= compat_tcp_getsockopt,
2576#endif
2577	.diag_destroy		= tcp_abort,
2578};
2579EXPORT_SYMBOL(tcp_prot);
2580
2581static void __net_exit tcp_sk_exit(struct net *net)
2582{
2583	int cpu;
2584
2585	module_put(net->ipv4.tcp_congestion_control->owner);
2586
2587	for_each_possible_cpu(cpu)
2588		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2589	free_percpu(net->ipv4.tcp_sk);
2590}
2591
2592static int __net_init tcp_sk_init(struct net *net)
2593{
2594	int res, cpu, cnt;
2595
2596	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2597	if (!net->ipv4.tcp_sk)
2598		return -ENOMEM;
2599
2600	for_each_possible_cpu(cpu) {
2601		struct sock *sk;
2602
2603		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2604					   IPPROTO_TCP, net);
2605		if (res)
2606			goto fail;
2607		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2608
2609		/* Please enforce IP_DF and IPID==0 for RST and
2610		 * ACK sent in SYN-RECV and TIME-WAIT state.
2611		 */
2612		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2613
2614		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2615	}
2616
2617	net->ipv4.sysctl_tcp_ecn = 2;
2618	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2619
2620	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2621	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2622	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2623
2624	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2625	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2626	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2627
2628	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2629	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2630	net->ipv4.sysctl_tcp_syncookies = 1;
2631	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2632	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2633	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2634	net->ipv4.sysctl_tcp_orphan_retries = 0;
2635	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2636	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2637	net->ipv4.sysctl_tcp_tw_reuse = 2;
2638
2639	cnt = tcp_hashinfo.ehash_mask + 1;
2640	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2641	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2642
2643	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2644	net->ipv4.sysctl_tcp_sack = 1;
2645	net->ipv4.sysctl_tcp_window_scaling = 1;
2646	net->ipv4.sysctl_tcp_timestamps = 1;
2647	net->ipv4.sysctl_tcp_early_retrans = 3;
2648	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2649	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2650	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2651	net->ipv4.sysctl_tcp_max_reordering = 300;
2652	net->ipv4.sysctl_tcp_dsack = 1;
2653	net->ipv4.sysctl_tcp_app_win = 31;
2654	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2655	net->ipv4.sysctl_tcp_frto = 2;
2656	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2657	/* This limits the percentage of the congestion window which we
2658	 * will allow a single TSO frame to consume.  Building TSO frames
2659	 * which are too large can cause TCP streams to be bursty.
2660	 */
2661	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2662	/* Default TSQ limit of 16 TSO segments */
2663	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2664	/* rfc5961 challenge ack rate limiting */
2665	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2666	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2667	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2668	net->ipv4.sysctl_tcp_autocorking = 1;
2669	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2670	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2671	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2672	if (net != &init_net) {
2673		memcpy(net->ipv4.sysctl_tcp_rmem,
2674		       init_net.ipv4.sysctl_tcp_rmem,
2675		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2676		memcpy(net->ipv4.sysctl_tcp_wmem,
2677		       init_net.ipv4.sysctl_tcp_wmem,
2678		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2679	}
2680	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2681	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2682	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2683	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2684	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2685	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2686
2687	/* Reno is always built in */
2688	if (!net_eq(net, &init_net) &&
2689	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2690		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2691	else
2692		net->ipv4.tcp_congestion_control = &tcp_reno;
2693
2694	return 0;
2695fail:
2696	tcp_sk_exit(net);
2697
2698	return res;
2699}
2700
2701static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2702{
2703	struct net *net;
2704
2705	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2706
2707	list_for_each_entry(net, net_exit_list, exit_list)
2708		tcp_fastopen_ctx_destroy(net);
2709}
2710
2711static struct pernet_operations __net_initdata tcp_sk_ops = {
2712       .init	   = tcp_sk_init,
2713       .exit	   = tcp_sk_exit,
2714       .exit_batch = tcp_sk_exit_batch,
2715};
2716
2717void __init tcp_v4_init(void)
2718{
2719	if (register_pernet_subsys(&tcp_sk_ops))
2720		panic("Failed to create the TCP control socket.\n");
2721}
Configure Feed

Configure Feed