net/ipv4/tcp_ipv4.c at v4.5-rc4

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / net / ipv4 / tcp_ipv4.c
at v4.5-rc4 2419 lines 63 kB view raw
wrap content
   1/*
   2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   3 *		operating system.  INET is implemented using the  BSD Socket
   4 *		interface as the means of communication with the user level.
   5 *
   6 *		Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 *		IPv4 specific functions
   9 *
  10 *
  11 *		code split from:
  12 *		linux/ipv4/tcp.c
  13 *		linux/ipv4/tcp_input.c
  14 *		linux/ipv4/tcp_output.c
  15 *
  16 *		See tcp.c for author information
  17 *
  18 *	This program is free software; you can redistribute it and/or
  19 *      modify it under the terms of the GNU General Public License
  20 *      as published by the Free Software Foundation; either version
  21 *      2 of the License, or (at your option) any later version.
  22 */
  23
  24/*
  25 * Changes:
  26 *		David S. Miller	:	New socket lookup architecture.
  27 *					This code is dedicated to John Dyson.
  28 *		David S. Miller :	Change semantics of established hash,
  29 *					half is devoted to TIME_WAIT sockets
  30 *					and the rest go in the other half.
  31 *		Andi Kleen :		Add support for syncookies and fixed
  32 *					some bugs: ip options weren't passed to
  33 *					the TCP layer, missed a check for an
  34 *					ACK bit.
  35 *		Andi Kleen :		Implemented fast path mtu discovery.
  36 *	     				Fixed many serious bugs in the
  37 *					request_sock handling and moved
  38 *					most of it into the af independent code.
  39 *					Added tail drop and some other bugfixes.
  40 *					Added new listen semantics.
  41 *		Mike McLagan	:	Routing by source
  42 *	Juan Jose Ciarlante:		ip_dynaddr bits
  43 *		Andi Kleen:		various fixes.
  44 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
  45 *					coma.
  46 *	Andi Kleen		:	Fix new listen.
  47 *	Andi Kleen		:	Fix accept error reporting.
  48 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
  49 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
  50 *					a single port at the same time.
  51 */
  52
  53#define pr_fmt(fmt) "TCP: " fmt
  54
  55#include <linux/bottom_half.h>
  56#include <linux/types.h>
  57#include <linux/fcntl.h>
  58#include <linux/module.h>
  59#include <linux/random.h>
  60#include <linux/cache.h>
  61#include <linux/jhash.h>
  62#include <linux/init.h>
  63#include <linux/times.h>
  64#include <linux/slab.h>
  65
  66#include <net/net_namespace.h>
  67#include <net/icmp.h>
  68#include <net/inet_hashtables.h>
  69#include <net/tcp.h>
  70#include <net/transp_v6.h>
  71#include <net/ipv6.h>
  72#include <net/inet_common.h>
  73#include <net/timewait_sock.h>
  74#include <net/xfrm.h>
  75#include <net/secure_seq.h>
  76#include <net/busy_poll.h>
  77
  78#include <linux/inet.h>
  79#include <linux/ipv6.h>
  80#include <linux/stddef.h>
  81#include <linux/proc_fs.h>
  82#include <linux/seq_file.h>
  83
  84#include <linux/crypto.h>
  85#include <linux/scatterlist.h>
  86
  87int sysctl_tcp_tw_reuse __read_mostly;
  88int sysctl_tcp_low_latency __read_mostly;
  89EXPORT_SYMBOL(sysctl_tcp_low_latency);
  90
  91#ifdef CONFIG_TCP_MD5SIG
  92static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
  93			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
  94#endif
  95
  96struct inet_hashinfo tcp_hashinfo;
  97EXPORT_SYMBOL(tcp_hashinfo);
  98
  99static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 100{
 101	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 102					  ip_hdr(skb)->saddr,
 103					  tcp_hdr(skb)->dest,
 104					  tcp_hdr(skb)->source);
 105}
 106
 107int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 108{
 109	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 110	struct tcp_sock *tp = tcp_sk(sk);
 111
 112	/* With PAWS, it is safe from the viewpoint
 113	   of data integrity. Even without PAWS it is safe provided sequence
 114	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 115
 116	   Actually, the idea is close to VJ's one, only timestamp cache is
 117	   held not per host, but per port pair and TW bucket is used as state
 118	   holder.
 119
 120	   If TW bucket has been already destroyed we fall back to VJ's scheme
 121	   and use initial timestamp retrieved from peer table.
 122	 */
 123	if (tcptw->tw_ts_recent_stamp &&
 124	    (!twp || (sysctl_tcp_tw_reuse &&
 125			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 126		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 127		if (tp->write_seq == 0)
 128			tp->write_seq = 1;
 129		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
 130		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 131		sock_hold(sktw);
 132		return 1;
 133	}
 134
 135	return 0;
 136}
 137EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 138
 139/* This will initiate an outgoing connection. */
 140int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 141{
 142	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 143	struct inet_sock *inet = inet_sk(sk);
 144	struct tcp_sock *tp = tcp_sk(sk);
 145	__be16 orig_sport, orig_dport;
 146	__be32 daddr, nexthop;
 147	struct flowi4 *fl4;
 148	struct rtable *rt;
 149	int err;
 150	struct ip_options_rcu *inet_opt;
 151
 152	if (addr_len < sizeof(struct sockaddr_in))
 153		return -EINVAL;
 154
 155	if (usin->sin_family != AF_INET)
 156		return -EAFNOSUPPORT;
 157
 158	nexthop = daddr = usin->sin_addr.s_addr;
 159	inet_opt = rcu_dereference_protected(inet->inet_opt,
 160					     sock_owned_by_user(sk));
 161	if (inet_opt && inet_opt->opt.srr) {
 162		if (!daddr)
 163			return -EINVAL;
 164		nexthop = inet_opt->opt.faddr;
 165	}
 166
 167	orig_sport = inet->inet_sport;
 168	orig_dport = usin->sin_port;
 169	fl4 = &inet->cork.fl.u.ip4;
 170	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
 171			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 172			      IPPROTO_TCP,
 173			      orig_sport, orig_dport, sk);
 174	if (IS_ERR(rt)) {
 175		err = PTR_ERR(rt);
 176		if (err == -ENETUNREACH)
 177			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 178		return err;
 179	}
 180
 181	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 182		ip_rt_put(rt);
 183		return -ENETUNREACH;
 184	}
 185
 186	if (!inet_opt || !inet_opt->opt.srr)
 187		daddr = fl4->daddr;
 188
 189	if (!inet->inet_saddr)
 190		inet->inet_saddr = fl4->saddr;
 191	sk_rcv_saddr_set(sk, inet->inet_saddr);
 192
 193	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 194		/* Reset inherited state */
 195		tp->rx_opt.ts_recent	   = 0;
 196		tp->rx_opt.ts_recent_stamp = 0;
 197		if (likely(!tp->repair))
 198			tp->write_seq	   = 0;
 199	}
 200
 201	if (tcp_death_row.sysctl_tw_recycle &&
 202	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
 203		tcp_fetch_timewait_stamp(sk, &rt->dst);
 204
 205	inet->inet_dport = usin->sin_port;
 206	sk_daddr_set(sk, daddr);
 207
 208	inet_csk(sk)->icsk_ext_hdr_len = 0;
 209	if (inet_opt)
 210		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 211
 212	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 213
 214	/* Socket identity is still unknown (sport may be zero).
 215	 * However we set state to SYN-SENT and not releasing socket
 216	 * lock select source port, enter ourselves into the hash tables and
 217	 * complete initialization after this.
 218	 */
 219	tcp_set_state(sk, TCP_SYN_SENT);
 220	err = inet_hash_connect(&tcp_death_row, sk);
 221	if (err)
 222		goto failure;
 223
 224	sk_set_txhash(sk);
 225
 226	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 227			       inet->inet_sport, inet->inet_dport, sk);
 228	if (IS_ERR(rt)) {
 229		err = PTR_ERR(rt);
 230		rt = NULL;
 231		goto failure;
 232	}
 233	/* OK, now commit destination to socket.  */
 234	sk->sk_gso_type = SKB_GSO_TCPV4;
 235	sk_setup_caps(sk, &rt->dst);
 236
 237	if (!tp->write_seq && likely(!tp->repair))
 238		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 239							   inet->inet_daddr,
 240							   inet->inet_sport,
 241							   usin->sin_port);
 242
 243	inet->inet_id = tp->write_seq ^ jiffies;
 244
 245	err = tcp_connect(sk);
 246
 247	rt = NULL;
 248	if (err)
 249		goto failure;
 250
 251	return 0;
 252
 253failure:
 254	/*
 255	 * This unhashes the socket and releases the local port,
 256	 * if necessary.
 257	 */
 258	tcp_set_state(sk, TCP_CLOSE);
 259	ip_rt_put(rt);
 260	sk->sk_route_caps = 0;
 261	inet->inet_dport = 0;
 262	return err;
 263}
 264EXPORT_SYMBOL(tcp_v4_connect);
 265
 266/*
 267 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 268 * It can be called through tcp_release_cb() if socket was owned by user
 269 * at the time tcp_v4_err() was called to handle ICMP message.
 270 */
 271void tcp_v4_mtu_reduced(struct sock *sk)
 272{
 273	struct dst_entry *dst;
 274	struct inet_sock *inet = inet_sk(sk);
 275	u32 mtu = tcp_sk(sk)->mtu_info;
 276
 277	dst = inet_csk_update_pmtu(sk, mtu);
 278	if (!dst)
 279		return;
 280
 281	/* Something is about to be wrong... Remember soft error
 282	 * for the case, if this connection will not able to recover.
 283	 */
 284	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 285		sk->sk_err_soft = EMSGSIZE;
 286
 287	mtu = dst_mtu(dst);
 288
 289	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 290	    ip_sk_accept_pmtu(sk) &&
 291	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 292		tcp_sync_mss(sk, mtu);
 293
 294		/* Resend the TCP packet because it's
 295		 * clear that the old packet has been
 296		 * dropped. This is the new "fast" path mtu
 297		 * discovery.
 298		 */
 299		tcp_simple_retransmit(sk);
 300	} /* else let the usual retransmit timer handle it */
 301}
 302EXPORT_SYMBOL(tcp_v4_mtu_reduced);
 303
 304static void do_redirect(struct sk_buff *skb, struct sock *sk)
 305{
 306	struct dst_entry *dst = __sk_dst_check(sk, 0);
 307
 308	if (dst)
 309		dst->ops->redirect(dst, sk, skb);
 310}
 311
 312
 313/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
 314void tcp_req_err(struct sock *sk, u32 seq, bool abort)
 315{
 316	struct request_sock *req = inet_reqsk(sk);
 317	struct net *net = sock_net(sk);
 318
 319	/* ICMPs are not backlogged, hence we cannot get
 320	 * an established socket here.
 321	 */
 322	WARN_ON(req->sk);
 323
 324	if (seq != tcp_rsk(req)->snt_isn) {
 325		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 326	} else if (abort) {
 327		/*
 328		 * Still in SYN_RECV, just remove it silently.
 329		 * There is no good way to pass the error to the newly
 330		 * created socket, and POSIX does not want network
 331		 * errors returned from accept().
 332		 */
 333		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
 334		NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
 335	}
 336	reqsk_put(req);
 337}
 338EXPORT_SYMBOL(tcp_req_err);
 339
 340/*
 341 * This routine is called by the ICMP module when it gets some
 342 * sort of error condition.  If err < 0 then the socket should
 343 * be closed and the error returned to the user.  If err > 0
 344 * it's just the icmp type << 8 | icmp code.  After adjustment
 345 * header points to the first 8 bytes of the tcp header.  We need
 346 * to find the appropriate port.
 347 *
 348 * The locking strategy used here is very "optimistic". When
 349 * someone else accesses the socket the ICMP is just dropped
 350 * and for some paths there is no check at all.
 351 * A more general error queue to queue errors for later handling
 352 * is probably better.
 353 *
 354 */
 355
 356void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 357{
 358	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 359	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 360	struct inet_connection_sock *icsk;
 361	struct tcp_sock *tp;
 362	struct inet_sock *inet;
 363	const int type = icmp_hdr(icmp_skb)->type;
 364	const int code = icmp_hdr(icmp_skb)->code;
 365	struct sock *sk;
 366	struct sk_buff *skb;
 367	struct request_sock *fastopen;
 368	__u32 seq, snd_una;
 369	__u32 remaining;
 370	int err;
 371	struct net *net = dev_net(icmp_skb->dev);
 372
 373	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
 374				       th->dest, iph->saddr, ntohs(th->source),
 375				       inet_iif(icmp_skb));
 376	if (!sk) {
 377		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 378		return;
 379	}
 380	if (sk->sk_state == TCP_TIME_WAIT) {
 381		inet_twsk_put(inet_twsk(sk));
 382		return;
 383	}
 384	seq = ntohl(th->seq);
 385	if (sk->sk_state == TCP_NEW_SYN_RECV)
 386		return tcp_req_err(sk, seq,
 387				  type == ICMP_PARAMETERPROB ||
 388				  type == ICMP_TIME_EXCEEDED ||
 389				  (type == ICMP_DEST_UNREACH &&
 390				   (code == ICMP_NET_UNREACH ||
 391				    code == ICMP_HOST_UNREACH)));
 392
 393	bh_lock_sock(sk);
 394	/* If too many ICMPs get dropped on busy
 395	 * servers this needs to be solved differently.
 396	 * We do take care of PMTU discovery (RFC1191) special case :
 397	 * we can receive locally generated ICMP messages while socket is held.
 398	 */
 399	if (sock_owned_by_user(sk)) {
 400		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
 401			NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 402	}
 403	if (sk->sk_state == TCP_CLOSE)
 404		goto out;
 405
 406	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
 407		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
 408		goto out;
 409	}
 410
 411	icsk = inet_csk(sk);
 412	tp = tcp_sk(sk);
 413	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
 414	fastopen = tp->fastopen_rsk;
 415	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
 416	if (sk->sk_state != TCP_LISTEN &&
 417	    !between(seq, snd_una, tp->snd_nxt)) {
 418		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 419		goto out;
 420	}
 421
 422	switch (type) {
 423	case ICMP_REDIRECT:
 424		do_redirect(icmp_skb, sk);
 425		goto out;
 426	case ICMP_SOURCE_QUENCH:
 427		/* Just silently ignore these. */
 428		goto out;
 429	case ICMP_PARAMETERPROB:
 430		err = EPROTO;
 431		break;
 432	case ICMP_DEST_UNREACH:
 433		if (code > NR_ICMP_UNREACH)
 434			goto out;
 435
 436		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 437			/* We are not interested in TCP_LISTEN and open_requests
 438			 * (SYN-ACKs send out by Linux are always <576bytes so
 439			 * they should go through unfragmented).
 440			 */
 441			if (sk->sk_state == TCP_LISTEN)
 442				goto out;
 443
 444			tp->mtu_info = info;
 445			if (!sock_owned_by_user(sk)) {
 446				tcp_v4_mtu_reduced(sk);
 447			} else {
 448				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
 449					sock_hold(sk);
 450			}
 451			goto out;
 452		}
 453
 454		err = icmp_err_convert[code].errno;
 455		/* check if icmp_skb allows revert of backoff
 456		 * (see draft-zimmermann-tcp-lcd) */
 457		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 458			break;
 459		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 460		    !icsk->icsk_backoff || fastopen)
 461			break;
 462
 463		if (sock_owned_by_user(sk))
 464			break;
 465
 466		icsk->icsk_backoff--;
 467		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
 468					       TCP_TIMEOUT_INIT;
 469		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 470
 471		skb = tcp_write_queue_head(sk);
 472		BUG_ON(!skb);
 473
 474		remaining = icsk->icsk_rto -
 475			    min(icsk->icsk_rto,
 476				tcp_time_stamp - tcp_skb_timestamp(skb));
 477
 478		if (remaining) {
 479			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 480						  remaining, TCP_RTO_MAX);
 481		} else {
 482			/* RTO revert clocked out retransmission.
 483			 * Will retransmit now */
 484			tcp_retransmit_timer(sk);
 485		}
 486
 487		break;
 488	case ICMP_TIME_EXCEEDED:
 489		err = EHOSTUNREACH;
 490		break;
 491	default:
 492		goto out;
 493	}
 494
 495	switch (sk->sk_state) {
 496	case TCP_SYN_SENT:
 497	case TCP_SYN_RECV:
 498		/* Only in fast or simultaneous open. If a fast open socket is
 499		 * is already accepted it is treated as a connected one below.
 500		 */
 501		if (fastopen && !fastopen->sk)
 502			break;
 503
 504		if (!sock_owned_by_user(sk)) {
 505			sk->sk_err = err;
 506
 507			sk->sk_error_report(sk);
 508
 509			tcp_done(sk);
 510		} else {
 511			sk->sk_err_soft = err;
 512		}
 513		goto out;
 514	}
 515
 516	/* If we've already connected we will keep trying
 517	 * until we time out, or the user gives up.
 518	 *
 519	 * rfc1122 4.2.3.9 allows to consider as hard errors
 520	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 521	 * but it is obsoleted by pmtu discovery).
 522	 *
 523	 * Note, that in modern internet, where routing is unreliable
 524	 * and in each dark corner broken firewalls sit, sending random
 525	 * errors ordered by their masters even this two messages finally lose
 526	 * their original sense (even Linux sends invalid PORT_UNREACHs)
 527	 *
 528	 * Now we are in compliance with RFCs.
 529	 *							--ANK (980905)
 530	 */
 531
 532	inet = inet_sk(sk);
 533	if (!sock_owned_by_user(sk) && inet->recverr) {
 534		sk->sk_err = err;
 535		sk->sk_error_report(sk);
 536	} else	{ /* Only an error on timeout */
 537		sk->sk_err_soft = err;
 538	}
 539
 540out:
 541	bh_unlock_sock(sk);
 542	sock_put(sk);
 543}
 544
 545void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 546{
 547	struct tcphdr *th = tcp_hdr(skb);
 548
 549	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 550		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
 551		skb->csum_start = skb_transport_header(skb) - skb->head;
 552		skb->csum_offset = offsetof(struct tcphdr, check);
 553	} else {
 554		th->check = tcp_v4_check(skb->len, saddr, daddr,
 555					 csum_partial(th,
 556						      th->doff << 2,
 557						      skb->csum));
 558	}
 559}
 560
 561/* This routine computes an IPv4 TCP checksum. */
 562void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 563{
 564	const struct inet_sock *inet = inet_sk(sk);
 565
 566	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 567}
 568EXPORT_SYMBOL(tcp_v4_send_check);
 569
 570/*
 571 *	This routine will send an RST to the other tcp.
 572 *
 573 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 574 *		      for reset.
 575 *	Answer: if a packet caused RST, it is not for a socket
 576 *		existing in our system, if it is matched to a socket,
 577 *		it is just duplicate segment or bug in other side's TCP.
 578 *		So that we build reply only basing on parameters
 579 *		arrived with segment.
 580 *	Exception: precedence violation. We do not implement it in any case.
 581 */
 582
 583static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 584{
 585	const struct tcphdr *th = tcp_hdr(skb);
 586	struct {
 587		struct tcphdr th;
 588#ifdef CONFIG_TCP_MD5SIG
 589		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 590#endif
 591	} rep;
 592	struct ip_reply_arg arg;
 593#ifdef CONFIG_TCP_MD5SIG
 594	struct tcp_md5sig_key *key = NULL;
 595	const __u8 *hash_location = NULL;
 596	unsigned char newhash[16];
 597	int genhash;
 598	struct sock *sk1 = NULL;
 599#endif
 600	struct net *net;
 601
 602	/* Never send a reset in response to a reset. */
 603	if (th->rst)
 604		return;
 605
 606	/* If sk not NULL, it means we did a successful lookup and incoming
 607	 * route had to be correct. prequeue might have dropped our dst.
 608	 */
 609	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 610		return;
 611
 612	/* Swap the send and the receive. */
 613	memset(&rep, 0, sizeof(rep));
 614	rep.th.dest   = th->source;
 615	rep.th.source = th->dest;
 616	rep.th.doff   = sizeof(struct tcphdr) / 4;
 617	rep.th.rst    = 1;
 618
 619	if (th->ack) {
 620		rep.th.seq = th->ack_seq;
 621	} else {
 622		rep.th.ack = 1;
 623		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 624				       skb->len - (th->doff << 2));
 625	}
 626
 627	memset(&arg, 0, sizeof(arg));
 628	arg.iov[0].iov_base = (unsigned char *)&rep;
 629	arg.iov[0].iov_len  = sizeof(rep.th);
 630
 631	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 632#ifdef CONFIG_TCP_MD5SIG
 633	hash_location = tcp_parse_md5sig_option(th);
 634	if (sk && sk_fullsock(sk)) {
 635		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
 636					&ip_hdr(skb)->saddr, AF_INET);
 637	} else if (hash_location) {
 638		/*
 639		 * active side is lost. Try to find listening socket through
 640		 * source port, and then find md5 key through listening socket.
 641		 * we are not loose security here:
 642		 * Incoming packet is checked with md5 hash with finding key,
 643		 * no RST generated if md5 hash doesn't match.
 644		 */
 645		sk1 = __inet_lookup_listener(net,
 646					     &tcp_hashinfo, ip_hdr(skb)->saddr,
 647					     th->source, ip_hdr(skb)->daddr,
 648					     ntohs(th->source), inet_iif(skb));
 649		/* don't send rst if it can't find key */
 650		if (!sk1)
 651			return;
 652		rcu_read_lock();
 653		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
 654					&ip_hdr(skb)->saddr, AF_INET);
 655		if (!key)
 656			goto release_sk1;
 657
 658		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
 659		if (genhash || memcmp(hash_location, newhash, 16) != 0)
 660			goto release_sk1;
 661	}
 662
 663	if (key) {
 664		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 665				   (TCPOPT_NOP << 16) |
 666				   (TCPOPT_MD5SIG << 8) |
 667				   TCPOLEN_MD5SIG);
 668		/* Update length and the length the header thinks exists */
 669		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 670		rep.th.doff = arg.iov[0].iov_len / 4;
 671
 672		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 673				     key, ip_hdr(skb)->saddr,
 674				     ip_hdr(skb)->daddr, &rep.th);
 675	}
 676#endif
 677	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 678				      ip_hdr(skb)->saddr, /* XXX */
 679				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 680	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 681	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 682
 683	/* When socket is gone, all binding information is lost.
 684	 * routing might fail in this case. No choice here, if we choose to force
 685	 * input interface, we will misroute in case of asymmetric route.
 686	 */
 687	if (sk)
 688		arg.bound_dev_if = sk->sk_bound_dev_if;
 689
 690	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 691		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 692
 693	arg.tos = ip_hdr(skb)->tos;
 694	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 695			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 696			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 697			      &arg, arg.iov[0].iov_len);
 698
 699	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 700	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 701
 702#ifdef CONFIG_TCP_MD5SIG
 703release_sk1:
 704	if (sk1) {
 705		rcu_read_unlock();
 706		sock_put(sk1);
 707	}
 708#endif
 709}
 710
 711/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 712   outside socket context is ugly, certainly. What can I do?
 713 */
 714
 715static void tcp_v4_send_ack(struct net *net,
 716			    struct sk_buff *skb, u32 seq, u32 ack,
 717			    u32 win, u32 tsval, u32 tsecr, int oif,
 718			    struct tcp_md5sig_key *key,
 719			    int reply_flags, u8 tos)
 720{
 721	const struct tcphdr *th = tcp_hdr(skb);
 722	struct {
 723		struct tcphdr th;
 724		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 725#ifdef CONFIG_TCP_MD5SIG
 726			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 727#endif
 728			];
 729	} rep;
 730	struct ip_reply_arg arg;
 731
 732	memset(&rep.th, 0, sizeof(struct tcphdr));
 733	memset(&arg, 0, sizeof(arg));
 734
 735	arg.iov[0].iov_base = (unsigned char *)&rep;
 736	arg.iov[0].iov_len  = sizeof(rep.th);
 737	if (tsecr) {
 738		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 739				   (TCPOPT_TIMESTAMP << 8) |
 740				   TCPOLEN_TIMESTAMP);
 741		rep.opt[1] = htonl(tsval);
 742		rep.opt[2] = htonl(tsecr);
 743		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 744	}
 745
 746	/* Swap the send and the receive. */
 747	rep.th.dest    = th->source;
 748	rep.th.source  = th->dest;
 749	rep.th.doff    = arg.iov[0].iov_len / 4;
 750	rep.th.seq     = htonl(seq);
 751	rep.th.ack_seq = htonl(ack);
 752	rep.th.ack     = 1;
 753	rep.th.window  = htons(win);
 754
 755#ifdef CONFIG_TCP_MD5SIG
 756	if (key) {
 757		int offset = (tsecr) ? 3 : 0;
 758
 759		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 760					  (TCPOPT_NOP << 16) |
 761					  (TCPOPT_MD5SIG << 8) |
 762					  TCPOLEN_MD5SIG);
 763		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 764		rep.th.doff = arg.iov[0].iov_len/4;
 765
 766		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 767				    key, ip_hdr(skb)->saddr,
 768				    ip_hdr(skb)->daddr, &rep.th);
 769	}
 770#endif
 771	arg.flags = reply_flags;
 772	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 773				      ip_hdr(skb)->saddr, /* XXX */
 774				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
 775	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 776	if (oif)
 777		arg.bound_dev_if = oif;
 778	arg.tos = tos;
 779	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
 780			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 781			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 782			      &arg, arg.iov[0].iov_len);
 783
 784	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 785}
 786
 787static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 788{
 789	struct inet_timewait_sock *tw = inet_twsk(sk);
 790	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 791
 792	tcp_v4_send_ack(sock_net(sk), skb,
 793			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 794			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 795			tcp_time_stamp + tcptw->tw_ts_offset,
 796			tcptw->tw_ts_recent,
 797			tw->tw_bound_dev_if,
 798			tcp_twsk_md5_key(tcptw),
 799			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
 800			tw->tw_tos
 801			);
 802
 803	inet_twsk_put(tw);
 804}
 805
 806static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 807				  struct request_sock *req)
 808{
 809	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 810	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 811	 */
 812	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
 813					     tcp_sk(sk)->snd_nxt;
 814
 815	tcp_v4_send_ack(sock_net(sk), skb, seq,
 816			tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
 817			tcp_time_stamp,
 818			req->ts_recent,
 819			0,
 820			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
 821					  AF_INET),
 822			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 823			ip_hdr(skb)->tos);
 824}
 825
 826/*
 827 *	Send a SYN-ACK after having received a SYN.
 828 *	This still operates on a request_sock only, not on a big
 829 *	socket.
 830 */
 831static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 832			      struct flowi *fl,
 833			      struct request_sock *req,
 834			      struct tcp_fastopen_cookie *foc,
 835				  bool attach_req)
 836{
 837	const struct inet_request_sock *ireq = inet_rsk(req);
 838	struct flowi4 fl4;
 839	int err = -1;
 840	struct sk_buff *skb;
 841
 842	/* First, grab a route. */
 843	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 844		return -1;
 845
 846	skb = tcp_make_synack(sk, dst, req, foc, attach_req);
 847
 848	if (skb) {
 849		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 850
 851		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 852					    ireq->ir_rmt_addr,
 853					    ireq->opt);
 854		err = net_xmit_eval(err);
 855	}
 856
 857	return err;
 858}
 859
 860/*
 861 *	IPv4 request_sock destructor.
 862 */
 863static void tcp_v4_reqsk_destructor(struct request_sock *req)
 864{
 865	kfree(inet_rsk(req)->opt);
 866}
 867
 868
 869#ifdef CONFIG_TCP_MD5SIG
 870/*
 871 * RFC2385 MD5 checksumming requires a mapping of
 872 * IP address->MD5 Key.
 873 * We need to maintain these in the sk structure.
 874 */
 875
 876/* Find the Key structure for an address.  */
 877struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 878					 const union tcp_md5_addr *addr,
 879					 int family)
 880{
 881	const struct tcp_sock *tp = tcp_sk(sk);
 882	struct tcp_md5sig_key *key;
 883	unsigned int size = sizeof(struct in_addr);
 884	const struct tcp_md5sig_info *md5sig;
 885
 886	/* caller either holds rcu_read_lock() or socket lock */
 887	md5sig = rcu_dereference_check(tp->md5sig_info,
 888				       sock_owned_by_user(sk) ||
 889				       lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
 890	if (!md5sig)
 891		return NULL;
 892#if IS_ENABLED(CONFIG_IPV6)
 893	if (family == AF_INET6)
 894		size = sizeof(struct in6_addr);
 895#endif
 896	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 897		if (key->family != family)
 898			continue;
 899		if (!memcmp(&key->addr, addr, size))
 900			return key;
 901	}
 902	return NULL;
 903}
 904EXPORT_SYMBOL(tcp_md5_do_lookup);
 905
 906struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
 907					 const struct sock *addr_sk)
 908{
 909	const union tcp_md5_addr *addr;
 910
 911	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
 912	return tcp_md5_do_lookup(sk, addr, AF_INET);
 913}
 914EXPORT_SYMBOL(tcp_v4_md5_lookup);
 915
 916/* This can be called on a newly created socket, from other files */
 917int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 918		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
 919{
 920	/* Add Key to the list */
 921	struct tcp_md5sig_key *key;
 922	struct tcp_sock *tp = tcp_sk(sk);
 923	struct tcp_md5sig_info *md5sig;
 924
 925	key = tcp_md5_do_lookup(sk, addr, family);
 926	if (key) {
 927		/* Pre-existing entry - just update that one. */
 928		memcpy(key->key, newkey, newkeylen);
 929		key->keylen = newkeylen;
 930		return 0;
 931	}
 932
 933	md5sig = rcu_dereference_protected(tp->md5sig_info,
 934					   sock_owned_by_user(sk) ||
 935					   lockdep_is_held(&sk->sk_lock.slock));
 936	if (!md5sig) {
 937		md5sig = kmalloc(sizeof(*md5sig), gfp);
 938		if (!md5sig)
 939			return -ENOMEM;
 940
 941		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 942		INIT_HLIST_HEAD(&md5sig->head);
 943		rcu_assign_pointer(tp->md5sig_info, md5sig);
 944	}
 945
 946	key = sock_kmalloc(sk, sizeof(*key), gfp);
 947	if (!key)
 948		return -ENOMEM;
 949	if (!tcp_alloc_md5sig_pool()) {
 950		sock_kfree_s(sk, key, sizeof(*key));
 951		return -ENOMEM;
 952	}
 953
 954	memcpy(key->key, newkey, newkeylen);
 955	key->keylen = newkeylen;
 956	key->family = family;
 957	memcpy(&key->addr, addr,
 958	       (family == AF_INET6) ? sizeof(struct in6_addr) :
 959				      sizeof(struct in_addr));
 960	hlist_add_head_rcu(&key->node, &md5sig->head);
 961	return 0;
 962}
 963EXPORT_SYMBOL(tcp_md5_do_add);
 964
 965int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
 966{
 967	struct tcp_md5sig_key *key;
 968
 969	key = tcp_md5_do_lookup(sk, addr, family);
 970	if (!key)
 971		return -ENOENT;
 972	hlist_del_rcu(&key->node);
 973	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
 974	kfree_rcu(key, rcu);
 975	return 0;
 976}
 977EXPORT_SYMBOL(tcp_md5_do_del);
 978
 979static void tcp_clear_md5_list(struct sock *sk)
 980{
 981	struct tcp_sock *tp = tcp_sk(sk);
 982	struct tcp_md5sig_key *key;
 983	struct hlist_node *n;
 984	struct tcp_md5sig_info *md5sig;
 985
 986	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
 987
 988	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
 989		hlist_del_rcu(&key->node);
 990		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
 991		kfree_rcu(key, rcu);
 992	}
 993}
 994
 995static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 996				 int optlen)
 997{
 998	struct tcp_md5sig cmd;
 999	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1000
1001	if (optlen < sizeof(cmd))
1002		return -EINVAL;
1003
1004	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1005		return -EFAULT;
1006
1007	if (sin->sin_family != AF_INET)
1008		return -EINVAL;
1009
1010	if (!cmd.tcpm_keylen)
1011		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1012				      AF_INET);
1013
1014	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1015		return -EINVAL;
1016
1017	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1018			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1019			      GFP_KERNEL);
1020}
1021
1022static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1023					__be32 daddr, __be32 saddr, int nbytes)
1024{
1025	struct tcp4_pseudohdr *bp;
1026	struct scatterlist sg;
1027
1028	bp = &hp->md5_blk.ip4;
1029
1030	/*
1031	 * 1. the TCP pseudo-header (in the order: source IP address,
1032	 * destination IP address, zero-padded protocol number, and
1033	 * segment length)
1034	 */
1035	bp->saddr = saddr;
1036	bp->daddr = daddr;
1037	bp->pad = 0;
1038	bp->protocol = IPPROTO_TCP;
1039	bp->len = cpu_to_be16(nbytes);
1040
1041	sg_init_one(&sg, bp, sizeof(*bp));
1042	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1043}
1044
1045static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1046			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1047{
1048	struct tcp_md5sig_pool *hp;
1049	struct hash_desc *desc;
1050
1051	hp = tcp_get_md5sig_pool();
1052	if (!hp)
1053		goto clear_hash_noput;
1054	desc = &hp->md5_desc;
1055
1056	if (crypto_hash_init(desc))
1057		goto clear_hash;
1058	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1059		goto clear_hash;
1060	if (tcp_md5_hash_header(hp, th))
1061		goto clear_hash;
1062	if (tcp_md5_hash_key(hp, key))
1063		goto clear_hash;
1064	if (crypto_hash_final(desc, md5_hash))
1065		goto clear_hash;
1066
1067	tcp_put_md5sig_pool();
1068	return 0;
1069
1070clear_hash:
1071	tcp_put_md5sig_pool();
1072clear_hash_noput:
1073	memset(md5_hash, 0, 16);
1074	return 1;
1075}
1076
1077int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1078			const struct sock *sk,
1079			const struct sk_buff *skb)
1080{
1081	struct tcp_md5sig_pool *hp;
1082	struct hash_desc *desc;
1083	const struct tcphdr *th = tcp_hdr(skb);
1084	__be32 saddr, daddr;
1085
1086	if (sk) { /* valid for establish/request sockets */
1087		saddr = sk->sk_rcv_saddr;
1088		daddr = sk->sk_daddr;
1089	} else {
1090		const struct iphdr *iph = ip_hdr(skb);
1091		saddr = iph->saddr;
1092		daddr = iph->daddr;
1093	}
1094
1095	hp = tcp_get_md5sig_pool();
1096	if (!hp)
1097		goto clear_hash_noput;
1098	desc = &hp->md5_desc;
1099
1100	if (crypto_hash_init(desc))
1101		goto clear_hash;
1102
1103	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1104		goto clear_hash;
1105	if (tcp_md5_hash_header(hp, th))
1106		goto clear_hash;
1107	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1108		goto clear_hash;
1109	if (tcp_md5_hash_key(hp, key))
1110		goto clear_hash;
1111	if (crypto_hash_final(desc, md5_hash))
1112		goto clear_hash;
1113
1114	tcp_put_md5sig_pool();
1115	return 0;
1116
1117clear_hash:
1118	tcp_put_md5sig_pool();
1119clear_hash_noput:
1120	memset(md5_hash, 0, 16);
1121	return 1;
1122}
1123EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1124
1125#endif
1126
1127/* Called with rcu_read_lock() */
1128static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1129				    const struct sk_buff *skb)
1130{
1131#ifdef CONFIG_TCP_MD5SIG
1132	/*
1133	 * This gets called for each TCP segment that arrives
1134	 * so we want to be efficient.
1135	 * We have 3 drop cases:
1136	 * o No MD5 hash and one expected.
1137	 * o MD5 hash and we're not expecting one.
1138	 * o MD5 hash and its wrong.
1139	 */
1140	const __u8 *hash_location = NULL;
1141	struct tcp_md5sig_key *hash_expected;
1142	const struct iphdr *iph = ip_hdr(skb);
1143	const struct tcphdr *th = tcp_hdr(skb);
1144	int genhash;
1145	unsigned char newhash[16];
1146
1147	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1148					  AF_INET);
1149	hash_location = tcp_parse_md5sig_option(th);
1150
1151	/* We've parsed the options - do we have a hash? */
1152	if (!hash_expected && !hash_location)
1153		return false;
1154
1155	if (hash_expected && !hash_location) {
1156		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1157		return true;
1158	}
1159
1160	if (!hash_expected && hash_location) {
1161		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1162		return true;
1163	}
1164
1165	/* Okay, so this is hash_expected and hash_location -
1166	 * so we need to calculate the checksum.
1167	 */
1168	genhash = tcp_v4_md5_hash_skb(newhash,
1169				      hash_expected,
1170				      NULL, skb);
1171
1172	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1173		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1174				     &iph->saddr, ntohs(th->source),
1175				     &iph->daddr, ntohs(th->dest),
1176				     genhash ? " tcp_v4_calc_md5_hash failed"
1177				     : "");
1178		return true;
1179	}
1180	return false;
1181#endif
1182	return false;
1183}
1184
1185static void tcp_v4_init_req(struct request_sock *req,
1186			    const struct sock *sk_listener,
1187			    struct sk_buff *skb)
1188{
1189	struct inet_request_sock *ireq = inet_rsk(req);
1190
1191	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1192	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1193	ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1194	ireq->opt = tcp_v4_save_options(skb);
1195}
1196
1197static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1198					  struct flowi *fl,
1199					  const struct request_sock *req,
1200					  bool *strict)
1201{
1202	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1203
1204	if (strict) {
1205		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1206			*strict = true;
1207		else
1208			*strict = false;
1209	}
1210
1211	return dst;
1212}
1213
1214struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1215	.family		=	PF_INET,
1216	.obj_size	=	sizeof(struct tcp_request_sock),
1217	.rtx_syn_ack	=	tcp_rtx_synack,
1218	.send_ack	=	tcp_v4_reqsk_send_ack,
1219	.destructor	=	tcp_v4_reqsk_destructor,
1220	.send_reset	=	tcp_v4_send_reset,
1221	.syn_ack_timeout =	tcp_syn_ack_timeout,
1222};
1223
1224static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1225	.mss_clamp	=	TCP_MSS_DEFAULT,
1226#ifdef CONFIG_TCP_MD5SIG
1227	.req_md5_lookup	=	tcp_v4_md5_lookup,
1228	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1229#endif
1230	.init_req	=	tcp_v4_init_req,
1231#ifdef CONFIG_SYN_COOKIES
1232	.cookie_init_seq =	cookie_v4_init_sequence,
1233#endif
1234	.route_req	=	tcp_v4_route_req,
1235	.init_seq	=	tcp_v4_init_sequence,
1236	.send_synack	=	tcp_v4_send_synack,
1237};
1238
1239int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1240{
1241	/* Never answer to SYNs send to broadcast or multicast */
1242	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1243		goto drop;
1244
1245	return tcp_conn_request(&tcp_request_sock_ops,
1246				&tcp_request_sock_ipv4_ops, sk, skb);
1247
1248drop:
1249	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1250	return 0;
1251}
1252EXPORT_SYMBOL(tcp_v4_conn_request);
1253
1254
1255/*
1256 * The three way handshake has completed - we got a valid synack -
1257 * now create the new socket.
1258 */
1259struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1260				  struct request_sock *req,
1261				  struct dst_entry *dst,
1262				  struct request_sock *req_unhash,
1263				  bool *own_req)
1264{
1265	struct inet_request_sock *ireq;
1266	struct inet_sock *newinet;
1267	struct tcp_sock *newtp;
1268	struct sock *newsk;
1269#ifdef CONFIG_TCP_MD5SIG
1270	struct tcp_md5sig_key *key;
1271#endif
1272	struct ip_options_rcu *inet_opt;
1273
1274	if (sk_acceptq_is_full(sk))
1275		goto exit_overflow;
1276
1277	newsk = tcp_create_openreq_child(sk, req, skb);
1278	if (!newsk)
1279		goto exit_nonewsk;
1280
1281	newsk->sk_gso_type = SKB_GSO_TCPV4;
1282	inet_sk_rx_dst_set(newsk, skb);
1283
1284	newtp		      = tcp_sk(newsk);
1285	newinet		      = inet_sk(newsk);
1286	ireq		      = inet_rsk(req);
1287	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1288	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1289	newsk->sk_bound_dev_if = ireq->ir_iif;
1290	newinet->inet_saddr	      = ireq->ir_loc_addr;
1291	inet_opt	      = ireq->opt;
1292	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1293	ireq->opt	      = NULL;
1294	newinet->mc_index     = inet_iif(skb);
1295	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1296	newinet->rcv_tos      = ip_hdr(skb)->tos;
1297	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1298	if (inet_opt)
1299		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1300	newinet->inet_id = newtp->write_seq ^ jiffies;
1301
1302	if (!dst) {
1303		dst = inet_csk_route_child_sock(sk, newsk, req);
1304		if (!dst)
1305			goto put_and_exit;
1306	} else {
1307		/* syncookie case : see end of cookie_v4_check() */
1308	}
1309	sk_setup_caps(newsk, dst);
1310
1311	tcp_ca_openreq_child(newsk, dst);
1312
1313	tcp_sync_mss(newsk, dst_mtu(dst));
1314	newtp->advmss = dst_metric_advmss(dst);
1315	if (tcp_sk(sk)->rx_opt.user_mss &&
1316	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1317		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1318
1319	tcp_initialize_rcv_mss(newsk);
1320
1321#ifdef CONFIG_TCP_MD5SIG
1322	/* Copy over the MD5 key from the original socket */
1323	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1324				AF_INET);
1325	if (key) {
1326		/*
1327		 * We're using one, so create a matching key
1328		 * on the newsk structure. If we fail to get
1329		 * memory, then we end up not copying the key
1330		 * across. Shucks.
1331		 */
1332		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1333			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1334		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1335	}
1336#endif
1337
1338	if (__inet_inherit_port(sk, newsk) < 0)
1339		goto put_and_exit;
1340	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1341	if (*own_req)
1342		tcp_move_syn(newtp, req);
1343
1344	return newsk;
1345
1346exit_overflow:
1347	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1348exit_nonewsk:
1349	dst_release(dst);
1350exit:
1351	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1352	return NULL;
1353put_and_exit:
1354	inet_csk_prepare_forced_close(newsk);
1355	tcp_done(newsk);
1356	goto exit;
1357}
1358EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1359
1360static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1361{
1362#ifdef CONFIG_SYN_COOKIES
1363	const struct tcphdr *th = tcp_hdr(skb);
1364
1365	if (!th->syn)
1366		sk = cookie_v4_check(sk, skb);
1367#endif
1368	return sk;
1369}
1370
1371/* The socket must have it's spinlock held when we get
1372 * here, unless it is a TCP_LISTEN socket.
1373 *
1374 * We have a potential double-lock case here, so even when
1375 * doing backlog processing we use the BH locking scheme.
1376 * This is because we cannot sleep with the original spinlock
1377 * held.
1378 */
1379int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1380{
1381	struct sock *rsk;
1382
1383	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1384		struct dst_entry *dst = sk->sk_rx_dst;
1385
1386		sock_rps_save_rxhash(sk, skb);
1387		sk_mark_napi_id(sk, skb);
1388		if (dst) {
1389			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1390			    !dst->ops->check(dst, 0)) {
1391				dst_release(dst);
1392				sk->sk_rx_dst = NULL;
1393			}
1394		}
1395		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1396		return 0;
1397	}
1398
1399	if (tcp_checksum_complete(skb))
1400		goto csum_err;
1401
1402	if (sk->sk_state == TCP_LISTEN) {
1403		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1404
1405		if (!nsk)
1406			goto discard;
1407		if (nsk != sk) {
1408			sock_rps_save_rxhash(nsk, skb);
1409			sk_mark_napi_id(nsk, skb);
1410			if (tcp_child_process(sk, nsk, skb)) {
1411				rsk = nsk;
1412				goto reset;
1413			}
1414			return 0;
1415		}
1416	} else
1417		sock_rps_save_rxhash(sk, skb);
1418
1419	if (tcp_rcv_state_process(sk, skb)) {
1420		rsk = sk;
1421		goto reset;
1422	}
1423	return 0;
1424
1425reset:
1426	tcp_v4_send_reset(rsk, skb);
1427discard:
1428	kfree_skb(skb);
1429	/* Be careful here. If this function gets more complicated and
1430	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1431	 * might be destroyed here. This current version compiles correctly,
1432	 * but you have been warned.
1433	 */
1434	return 0;
1435
1436csum_err:
1437	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1438	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1439	goto discard;
1440}
1441EXPORT_SYMBOL(tcp_v4_do_rcv);
1442
1443void tcp_v4_early_demux(struct sk_buff *skb)
1444{
1445	const struct iphdr *iph;
1446	const struct tcphdr *th;
1447	struct sock *sk;
1448
1449	if (skb->pkt_type != PACKET_HOST)
1450		return;
1451
1452	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1453		return;
1454
1455	iph = ip_hdr(skb);
1456	th = tcp_hdr(skb);
1457
1458	if (th->doff < sizeof(struct tcphdr) / 4)
1459		return;
1460
1461	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1462				       iph->saddr, th->source,
1463				       iph->daddr, ntohs(th->dest),
1464				       skb->skb_iif);
1465	if (sk) {
1466		skb->sk = sk;
1467		skb->destructor = sock_edemux;
1468		if (sk_fullsock(sk)) {
1469			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1470
1471			if (dst)
1472				dst = dst_check(dst, 0);
1473			if (dst &&
1474			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1475				skb_dst_set_noref(skb, dst);
1476		}
1477	}
1478}
1479
1480/* Packet is added to VJ-style prequeue for processing in process
1481 * context, if a reader task is waiting. Apparently, this exciting
1482 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1483 * failed somewhere. Latency? Burstiness? Well, at least now we will
1484 * see, why it failed. 8)8)				  --ANK
1485 *
1486 */
1487bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1488{
1489	struct tcp_sock *tp = tcp_sk(sk);
1490
1491	if (sysctl_tcp_low_latency || !tp->ucopy.task)
1492		return false;
1493
1494	if (skb->len <= tcp_hdrlen(skb) &&
1495	    skb_queue_len(&tp->ucopy.prequeue) == 0)
1496		return false;
1497
1498	/* Before escaping RCU protected region, we need to take care of skb
1499	 * dst. Prequeue is only enabled for established sockets.
1500	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
1501	 * Instead of doing full sk_rx_dst validity here, let's perform
1502	 * an optimistic check.
1503	 */
1504	if (likely(sk->sk_rx_dst))
1505		skb_dst_drop(skb);
1506	else
1507		skb_dst_force_safe(skb);
1508
1509	__skb_queue_tail(&tp->ucopy.prequeue, skb);
1510	tp->ucopy.memory += skb->truesize;
1511	if (tp->ucopy.memory > sk->sk_rcvbuf) {
1512		struct sk_buff *skb1;
1513
1514		BUG_ON(sock_owned_by_user(sk));
1515
1516		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1517			sk_backlog_rcv(sk, skb1);
1518			NET_INC_STATS_BH(sock_net(sk),
1519					 LINUX_MIB_TCPPREQUEUEDROPPED);
1520		}
1521
1522		tp->ucopy.memory = 0;
1523	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1524		wake_up_interruptible_sync_poll(sk_sleep(sk),
1525					   POLLIN | POLLRDNORM | POLLRDBAND);
1526		if (!inet_csk_ack_scheduled(sk))
1527			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1528						  (3 * tcp_rto_min(sk)) / 4,
1529						  TCP_RTO_MAX);
1530	}
1531	return true;
1532}
1533EXPORT_SYMBOL(tcp_prequeue);
1534
1535/*
1536 *	From tcp_input.c
1537 */
1538
1539int tcp_v4_rcv(struct sk_buff *skb)
1540{
1541	const struct iphdr *iph;
1542	const struct tcphdr *th;
1543	struct sock *sk;
1544	int ret;
1545	struct net *net = dev_net(skb->dev);
1546
1547	if (skb->pkt_type != PACKET_HOST)
1548		goto discard_it;
1549
1550	/* Count it even if it's bad */
1551	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1552
1553	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1554		goto discard_it;
1555
1556	th = tcp_hdr(skb);
1557
1558	if (th->doff < sizeof(struct tcphdr) / 4)
1559		goto bad_packet;
1560	if (!pskb_may_pull(skb, th->doff * 4))
1561		goto discard_it;
1562
1563	/* An explanation is required here, I think.
1564	 * Packet length and doff are validated by header prediction,
1565	 * provided case of th->doff==0 is eliminated.
1566	 * So, we defer the checks. */
1567
1568	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1569		goto csum_error;
1570
1571	th = tcp_hdr(skb);
1572	iph = ip_hdr(skb);
1573	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1574	 * barrier() makes sure compiler wont play fool^Waliasing games.
1575	 */
1576	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1577		sizeof(struct inet_skb_parm));
1578	barrier();
1579
1580	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1581	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1582				    skb->len - th->doff * 4);
1583	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1584	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1585	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1586	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1587	TCP_SKB_CB(skb)->sacked	 = 0;
1588
1589lookup:
1590	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1591	if (!sk)
1592		goto no_tcp_socket;
1593
1594process:
1595	if (sk->sk_state == TCP_TIME_WAIT)
1596		goto do_time_wait;
1597
1598	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1599		struct request_sock *req = inet_reqsk(sk);
1600		struct sock *nsk = NULL;
1601
1602		sk = req->rsk_listener;
1603		if (tcp_v4_inbound_md5_hash(sk, skb))
1604			goto discard_and_relse;
1605		if (likely(sk->sk_state == TCP_LISTEN)) {
1606			nsk = tcp_check_req(sk, skb, req, false);
1607		} else {
1608			inet_csk_reqsk_queue_drop_and_put(sk, req);
1609			goto lookup;
1610		}
1611		if (!nsk) {
1612			reqsk_put(req);
1613			goto discard_it;
1614		}
1615		if (nsk == sk) {
1616			sock_hold(sk);
1617			reqsk_put(req);
1618		} else if (tcp_child_process(sk, nsk, skb)) {
1619			tcp_v4_send_reset(nsk, skb);
1620			goto discard_it;
1621		} else {
1622			return 0;
1623		}
1624	}
1625	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1626		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1627		goto discard_and_relse;
1628	}
1629
1630	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1631		goto discard_and_relse;
1632
1633	if (tcp_v4_inbound_md5_hash(sk, skb))
1634		goto discard_and_relse;
1635
1636	nf_reset(skb);
1637
1638	if (sk_filter(sk, skb))
1639		goto discard_and_relse;
1640
1641	skb->dev = NULL;
1642
1643	if (sk->sk_state == TCP_LISTEN) {
1644		ret = tcp_v4_do_rcv(sk, skb);
1645		goto put_and_return;
1646	}
1647
1648	sk_incoming_cpu_update(sk);
1649
1650	bh_lock_sock_nested(sk);
1651	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1652	ret = 0;
1653	if (!sock_owned_by_user(sk)) {
1654		if (!tcp_prequeue(sk, skb))
1655			ret = tcp_v4_do_rcv(sk, skb);
1656	} else if (unlikely(sk_add_backlog(sk, skb,
1657					   sk->sk_rcvbuf + sk->sk_sndbuf))) {
1658		bh_unlock_sock(sk);
1659		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1660		goto discard_and_relse;
1661	}
1662	bh_unlock_sock(sk);
1663
1664put_and_return:
1665	sock_put(sk);
1666
1667	return ret;
1668
1669no_tcp_socket:
1670	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1671		goto discard_it;
1672
1673	if (tcp_checksum_complete(skb)) {
1674csum_error:
1675		TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1676bad_packet:
1677		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1678	} else {
1679		tcp_v4_send_reset(NULL, skb);
1680	}
1681
1682discard_it:
1683	/* Discard frame. */
1684	kfree_skb(skb);
1685	return 0;
1686
1687discard_and_relse:
1688	sock_put(sk);
1689	goto discard_it;
1690
1691do_time_wait:
1692	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1693		inet_twsk_put(inet_twsk(sk));
1694		goto discard_it;
1695	}
1696
1697	if (tcp_checksum_complete(skb)) {
1698		inet_twsk_put(inet_twsk(sk));
1699		goto csum_error;
1700	}
1701	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1702	case TCP_TW_SYN: {
1703		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1704							&tcp_hashinfo,
1705							iph->saddr, th->source,
1706							iph->daddr, th->dest,
1707							inet_iif(skb));
1708		if (sk2) {
1709			inet_twsk_deschedule_put(inet_twsk(sk));
1710			sk = sk2;
1711			goto process;
1712		}
1713		/* Fall through to ACK */
1714	}
1715	case TCP_TW_ACK:
1716		tcp_v4_timewait_ack(sk, skb);
1717		break;
1718	case TCP_TW_RST:
1719		tcp_v4_send_reset(sk, skb);
1720		inet_twsk_deschedule_put(inet_twsk(sk));
1721		goto discard_it;
1722	case TCP_TW_SUCCESS:;
1723	}
1724	goto discard_it;
1725}
1726
1727static struct timewait_sock_ops tcp_timewait_sock_ops = {
1728	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1729	.twsk_unique	= tcp_twsk_unique,
1730	.twsk_destructor= tcp_twsk_destructor,
1731};
1732
1733void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1734{
1735	struct dst_entry *dst = skb_dst(skb);
1736
1737	if (dst && dst_hold_safe(dst)) {
1738		sk->sk_rx_dst = dst;
1739		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1740	}
1741}
1742EXPORT_SYMBOL(inet_sk_rx_dst_set);
1743
1744const struct inet_connection_sock_af_ops ipv4_specific = {
1745	.queue_xmit	   = ip_queue_xmit,
1746	.send_check	   = tcp_v4_send_check,
1747	.rebuild_header	   = inet_sk_rebuild_header,
1748	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1749	.conn_request	   = tcp_v4_conn_request,
1750	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1751	.net_header_len	   = sizeof(struct iphdr),
1752	.setsockopt	   = ip_setsockopt,
1753	.getsockopt	   = ip_getsockopt,
1754	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1755	.sockaddr_len	   = sizeof(struct sockaddr_in),
1756	.bind_conflict	   = inet_csk_bind_conflict,
1757#ifdef CONFIG_COMPAT
1758	.compat_setsockopt = compat_ip_setsockopt,
1759	.compat_getsockopt = compat_ip_getsockopt,
1760#endif
1761	.mtu_reduced	   = tcp_v4_mtu_reduced,
1762};
1763EXPORT_SYMBOL(ipv4_specific);
1764
1765#ifdef CONFIG_TCP_MD5SIG
1766static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1767	.md5_lookup		= tcp_v4_md5_lookup,
1768	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1769	.md5_parse		= tcp_v4_parse_md5_keys,
1770};
1771#endif
1772
1773/* NOTE: A lot of things set to zero explicitly by call to
1774 *       sk_alloc() so need not be done here.
1775 */
1776static int tcp_v4_init_sock(struct sock *sk)
1777{
1778	struct inet_connection_sock *icsk = inet_csk(sk);
1779
1780	tcp_init_sock(sk);
1781
1782	icsk->icsk_af_ops = &ipv4_specific;
1783
1784#ifdef CONFIG_TCP_MD5SIG
1785	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1786#endif
1787
1788	return 0;
1789}
1790
1791void tcp_v4_destroy_sock(struct sock *sk)
1792{
1793	struct tcp_sock *tp = tcp_sk(sk);
1794
1795	tcp_clear_xmit_timers(sk);
1796
1797	tcp_cleanup_congestion_control(sk);
1798
1799	/* Cleanup up the write buffer. */
1800	tcp_write_queue_purge(sk);
1801
1802	/* Cleans up our, hopefully empty, out_of_order_queue. */
1803	__skb_queue_purge(&tp->out_of_order_queue);
1804
1805#ifdef CONFIG_TCP_MD5SIG
1806	/* Clean up the MD5 key list, if any */
1807	if (tp->md5sig_info) {
1808		tcp_clear_md5_list(sk);
1809		kfree_rcu(tp->md5sig_info, rcu);
1810		tp->md5sig_info = NULL;
1811	}
1812#endif
1813
1814	/* Clean prequeue, it must be empty really */
1815	__skb_queue_purge(&tp->ucopy.prequeue);
1816
1817	/* Clean up a referenced TCP bind bucket. */
1818	if (inet_csk(sk)->icsk_bind_hash)
1819		inet_put_port(sk);
1820
1821	BUG_ON(tp->fastopen_rsk);
1822
1823	/* If socket is aborted during connect operation */
1824	tcp_free_fastopen_req(tp);
1825	tcp_saved_syn_free(tp);
1826
1827	sk_sockets_allocated_dec(sk);
1828
1829	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1830		sock_release_memcg(sk);
1831}
1832EXPORT_SYMBOL(tcp_v4_destroy_sock);
1833
1834#ifdef CONFIG_PROC_FS
1835/* Proc filesystem TCP sock list dumping. */
1836
1837/*
1838 * Get next listener socket follow cur.  If cur is NULL, get first socket
1839 * starting from bucket given in st->bucket; when st->bucket is zero the
1840 * very first socket in the hash table is returned.
1841 */
1842static void *listening_get_next(struct seq_file *seq, void *cur)
1843{
1844	struct inet_connection_sock *icsk;
1845	struct hlist_nulls_node *node;
1846	struct sock *sk = cur;
1847	struct inet_listen_hashbucket *ilb;
1848	struct tcp_iter_state *st = seq->private;
1849	struct net *net = seq_file_net(seq);
1850
1851	if (!sk) {
1852		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1853		spin_lock_bh(&ilb->lock);
1854		sk = sk_nulls_head(&ilb->head);
1855		st->offset = 0;
1856		goto get_sk;
1857	}
1858	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1859	++st->num;
1860	++st->offset;
1861
1862	sk = sk_nulls_next(sk);
1863get_sk:
1864	sk_nulls_for_each_from(sk, node) {
1865		if (!net_eq(sock_net(sk), net))
1866			continue;
1867		if (sk->sk_family == st->family) {
1868			cur = sk;
1869			goto out;
1870		}
1871		icsk = inet_csk(sk);
1872	}
1873	spin_unlock_bh(&ilb->lock);
1874	st->offset = 0;
1875	if (++st->bucket < INET_LHTABLE_SIZE) {
1876		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1877		spin_lock_bh(&ilb->lock);
1878		sk = sk_nulls_head(&ilb->head);
1879		goto get_sk;
1880	}
1881	cur = NULL;
1882out:
1883	return cur;
1884}
1885
1886static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1887{
1888	struct tcp_iter_state *st = seq->private;
1889	void *rc;
1890
1891	st->bucket = 0;
1892	st->offset = 0;
1893	rc = listening_get_next(seq, NULL);
1894
1895	while (rc && *pos) {
1896		rc = listening_get_next(seq, rc);
1897		--*pos;
1898	}
1899	return rc;
1900}
1901
1902static inline bool empty_bucket(const struct tcp_iter_state *st)
1903{
1904	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1905}
1906
1907/*
1908 * Get first established socket starting from bucket given in st->bucket.
1909 * If st->bucket is zero, the very first socket in the hash is returned.
1910 */
1911static void *established_get_first(struct seq_file *seq)
1912{
1913	struct tcp_iter_state *st = seq->private;
1914	struct net *net = seq_file_net(seq);
1915	void *rc = NULL;
1916
1917	st->offset = 0;
1918	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1919		struct sock *sk;
1920		struct hlist_nulls_node *node;
1921		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1922
1923		/* Lockless fast path for the common case of empty buckets */
1924		if (empty_bucket(st))
1925			continue;
1926
1927		spin_lock_bh(lock);
1928		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1929			if (sk->sk_family != st->family ||
1930			    !net_eq(sock_net(sk), net)) {
1931				continue;
1932			}
1933			rc = sk;
1934			goto out;
1935		}
1936		spin_unlock_bh(lock);
1937	}
1938out:
1939	return rc;
1940}
1941
1942static void *established_get_next(struct seq_file *seq, void *cur)
1943{
1944	struct sock *sk = cur;
1945	struct hlist_nulls_node *node;
1946	struct tcp_iter_state *st = seq->private;
1947	struct net *net = seq_file_net(seq);
1948
1949	++st->num;
1950	++st->offset;
1951
1952	sk = sk_nulls_next(sk);
1953
1954	sk_nulls_for_each_from(sk, node) {
1955		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1956			return sk;
1957	}
1958
1959	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1960	++st->bucket;
1961	return established_get_first(seq);
1962}
1963
1964static void *established_get_idx(struct seq_file *seq, loff_t pos)
1965{
1966	struct tcp_iter_state *st = seq->private;
1967	void *rc;
1968
1969	st->bucket = 0;
1970	rc = established_get_first(seq);
1971
1972	while (rc && pos) {
1973		rc = established_get_next(seq, rc);
1974		--pos;
1975	}
1976	return rc;
1977}
1978
1979static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1980{
1981	void *rc;
1982	struct tcp_iter_state *st = seq->private;
1983
1984	st->state = TCP_SEQ_STATE_LISTENING;
1985	rc	  = listening_get_idx(seq, &pos);
1986
1987	if (!rc) {
1988		st->state = TCP_SEQ_STATE_ESTABLISHED;
1989		rc	  = established_get_idx(seq, pos);
1990	}
1991
1992	return rc;
1993}
1994
1995static void *tcp_seek_last_pos(struct seq_file *seq)
1996{
1997	struct tcp_iter_state *st = seq->private;
1998	int offset = st->offset;
1999	int orig_num = st->num;
2000	void *rc = NULL;
2001
2002	switch (st->state) {
2003	case TCP_SEQ_STATE_LISTENING:
2004		if (st->bucket >= INET_LHTABLE_SIZE)
2005			break;
2006		st->state = TCP_SEQ_STATE_LISTENING;
2007		rc = listening_get_next(seq, NULL);
2008		while (offset-- && rc)
2009			rc = listening_get_next(seq, rc);
2010		if (rc)
2011			break;
2012		st->bucket = 0;
2013		st->state = TCP_SEQ_STATE_ESTABLISHED;
2014		/* Fallthrough */
2015	case TCP_SEQ_STATE_ESTABLISHED:
2016		if (st->bucket > tcp_hashinfo.ehash_mask)
2017			break;
2018		rc = established_get_first(seq);
2019		while (offset-- && rc)
2020			rc = established_get_next(seq, rc);
2021	}
2022
2023	st->num = orig_num;
2024
2025	return rc;
2026}
2027
2028static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2029{
2030	struct tcp_iter_state *st = seq->private;
2031	void *rc;
2032
2033	if (*pos && *pos == st->last_pos) {
2034		rc = tcp_seek_last_pos(seq);
2035		if (rc)
2036			goto out;
2037	}
2038
2039	st->state = TCP_SEQ_STATE_LISTENING;
2040	st->num = 0;
2041	st->bucket = 0;
2042	st->offset = 0;
2043	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2044
2045out:
2046	st->last_pos = *pos;
2047	return rc;
2048}
2049
2050static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2051{
2052	struct tcp_iter_state *st = seq->private;
2053	void *rc = NULL;
2054
2055	if (v == SEQ_START_TOKEN) {
2056		rc = tcp_get_idx(seq, 0);
2057		goto out;
2058	}
2059
2060	switch (st->state) {
2061	case TCP_SEQ_STATE_LISTENING:
2062		rc = listening_get_next(seq, v);
2063		if (!rc) {
2064			st->state = TCP_SEQ_STATE_ESTABLISHED;
2065			st->bucket = 0;
2066			st->offset = 0;
2067			rc	  = established_get_first(seq);
2068		}
2069		break;
2070	case TCP_SEQ_STATE_ESTABLISHED:
2071		rc = established_get_next(seq, v);
2072		break;
2073	}
2074out:
2075	++*pos;
2076	st->last_pos = *pos;
2077	return rc;
2078}
2079
2080static void tcp_seq_stop(struct seq_file *seq, void *v)
2081{
2082	struct tcp_iter_state *st = seq->private;
2083
2084	switch (st->state) {
2085	case TCP_SEQ_STATE_LISTENING:
2086		if (v != SEQ_START_TOKEN)
2087			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2088		break;
2089	case TCP_SEQ_STATE_ESTABLISHED:
2090		if (v)
2091			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2092		break;
2093	}
2094}
2095
2096int tcp_seq_open(struct inode *inode, struct file *file)
2097{
2098	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2099	struct tcp_iter_state *s;
2100	int err;
2101
2102	err = seq_open_net(inode, file, &afinfo->seq_ops,
2103			  sizeof(struct tcp_iter_state));
2104	if (err < 0)
2105		return err;
2106
2107	s = ((struct seq_file *)file->private_data)->private;
2108	s->family		= afinfo->family;
2109	s->last_pos		= 0;
2110	return 0;
2111}
2112EXPORT_SYMBOL(tcp_seq_open);
2113
2114int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2115{
2116	int rc = 0;
2117	struct proc_dir_entry *p;
2118
2119	afinfo->seq_ops.start		= tcp_seq_start;
2120	afinfo->seq_ops.next		= tcp_seq_next;
2121	afinfo->seq_ops.stop		= tcp_seq_stop;
2122
2123	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2124			     afinfo->seq_fops, afinfo);
2125	if (!p)
2126		rc = -ENOMEM;
2127	return rc;
2128}
2129EXPORT_SYMBOL(tcp_proc_register);
2130
2131void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2132{
2133	remove_proc_entry(afinfo->name, net->proc_net);
2134}
2135EXPORT_SYMBOL(tcp_proc_unregister);
2136
2137static void get_openreq4(const struct request_sock *req,
2138			 struct seq_file *f, int i)
2139{
2140	const struct inet_request_sock *ireq = inet_rsk(req);
2141	long delta = req->rsk_timer.expires - jiffies;
2142
2143	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2144		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2145		i,
2146		ireq->ir_loc_addr,
2147		ireq->ir_num,
2148		ireq->ir_rmt_addr,
2149		ntohs(ireq->ir_rmt_port),
2150		TCP_SYN_RECV,
2151		0, 0, /* could print option size, but that is af dependent. */
2152		1,    /* timers active (only the expire timer) */
2153		jiffies_delta_to_clock_t(delta),
2154		req->num_timeout,
2155		from_kuid_munged(seq_user_ns(f),
2156				 sock_i_uid(req->rsk_listener)),
2157		0,  /* non standard timer */
2158		0, /* open_requests have no inode */
2159		0,
2160		req);
2161}
2162
2163static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2164{
2165	int timer_active;
2166	unsigned long timer_expires;
2167	const struct tcp_sock *tp = tcp_sk(sk);
2168	const struct inet_connection_sock *icsk = inet_csk(sk);
2169	const struct inet_sock *inet = inet_sk(sk);
2170	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2171	__be32 dest = inet->inet_daddr;
2172	__be32 src = inet->inet_rcv_saddr;
2173	__u16 destp = ntohs(inet->inet_dport);
2174	__u16 srcp = ntohs(inet->inet_sport);
2175	int rx_queue;
2176	int state;
2177
2178	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2179	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2180	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2181		timer_active	= 1;
2182		timer_expires	= icsk->icsk_timeout;
2183	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2184		timer_active	= 4;
2185		timer_expires	= icsk->icsk_timeout;
2186	} else if (timer_pending(&sk->sk_timer)) {
2187		timer_active	= 2;
2188		timer_expires	= sk->sk_timer.expires;
2189	} else {
2190		timer_active	= 0;
2191		timer_expires = jiffies;
2192	}
2193
2194	state = sk_state_load(sk);
2195	if (state == TCP_LISTEN)
2196		rx_queue = sk->sk_ack_backlog;
2197	else
2198		/* Because we don't lock the socket,
2199		 * we might find a transient negative value.
2200		 */
2201		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2202
2203	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2204			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2205		i, src, srcp, dest, destp, state,
2206		tp->write_seq - tp->snd_una,
2207		rx_queue,
2208		timer_active,
2209		jiffies_delta_to_clock_t(timer_expires - jiffies),
2210		icsk->icsk_retransmits,
2211		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2212		icsk->icsk_probes_out,
2213		sock_i_ino(sk),
2214		atomic_read(&sk->sk_refcnt), sk,
2215		jiffies_to_clock_t(icsk->icsk_rto),
2216		jiffies_to_clock_t(icsk->icsk_ack.ato),
2217		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2218		tp->snd_cwnd,
2219		state == TCP_LISTEN ?
2220		    fastopenq->max_qlen :
2221		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2222}
2223
2224static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2225			       struct seq_file *f, int i)
2226{
2227	long delta = tw->tw_timer.expires - jiffies;
2228	__be32 dest, src;
2229	__u16 destp, srcp;
2230
2231	dest  = tw->tw_daddr;
2232	src   = tw->tw_rcv_saddr;
2233	destp = ntohs(tw->tw_dport);
2234	srcp  = ntohs(tw->tw_sport);
2235
2236	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2237		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2238		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2239		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2240		atomic_read(&tw->tw_refcnt), tw);
2241}
2242
2243#define TMPSZ 150
2244
2245static int tcp4_seq_show(struct seq_file *seq, void *v)
2246{
2247	struct tcp_iter_state *st;
2248	struct sock *sk = v;
2249
2250	seq_setwidth(seq, TMPSZ - 1);
2251	if (v == SEQ_START_TOKEN) {
2252		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2253			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2254			   "inode");
2255		goto out;
2256	}
2257	st = seq->private;
2258
2259	if (sk->sk_state == TCP_TIME_WAIT)
2260		get_timewait4_sock(v, seq, st->num);
2261	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2262		get_openreq4(v, seq, st->num);
2263	else
2264		get_tcp4_sock(v, seq, st->num);
2265out:
2266	seq_pad(seq, '\n');
2267	return 0;
2268}
2269
2270static const struct file_operations tcp_afinfo_seq_fops = {
2271	.owner   = THIS_MODULE,
2272	.open    = tcp_seq_open,
2273	.read    = seq_read,
2274	.llseek  = seq_lseek,
2275	.release = seq_release_net
2276};
2277
2278static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2279	.name		= "tcp",
2280	.family		= AF_INET,
2281	.seq_fops	= &tcp_afinfo_seq_fops,
2282	.seq_ops	= {
2283		.show		= tcp4_seq_show,
2284	},
2285};
2286
2287static int __net_init tcp4_proc_init_net(struct net *net)
2288{
2289	return tcp_proc_register(net, &tcp4_seq_afinfo);
2290}
2291
2292static void __net_exit tcp4_proc_exit_net(struct net *net)
2293{
2294	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2295}
2296
2297static struct pernet_operations tcp4_net_ops = {
2298	.init = tcp4_proc_init_net,
2299	.exit = tcp4_proc_exit_net,
2300};
2301
2302int __init tcp4_proc_init(void)
2303{
2304	return register_pernet_subsys(&tcp4_net_ops);
2305}
2306
2307void tcp4_proc_exit(void)
2308{
2309	unregister_pernet_subsys(&tcp4_net_ops);
2310}
2311#endif /* CONFIG_PROC_FS */
2312
2313struct proto tcp_prot = {
2314	.name			= "TCP",
2315	.owner			= THIS_MODULE,
2316	.close			= tcp_close,
2317	.connect		= tcp_v4_connect,
2318	.disconnect		= tcp_disconnect,
2319	.accept			= inet_csk_accept,
2320	.ioctl			= tcp_ioctl,
2321	.init			= tcp_v4_init_sock,
2322	.destroy		= tcp_v4_destroy_sock,
2323	.shutdown		= tcp_shutdown,
2324	.setsockopt		= tcp_setsockopt,
2325	.getsockopt		= tcp_getsockopt,
2326	.recvmsg		= tcp_recvmsg,
2327	.sendmsg		= tcp_sendmsg,
2328	.sendpage		= tcp_sendpage,
2329	.backlog_rcv		= tcp_v4_do_rcv,
2330	.release_cb		= tcp_release_cb,
2331	.hash			= inet_hash,
2332	.unhash			= inet_unhash,
2333	.get_port		= inet_csk_get_port,
2334	.enter_memory_pressure	= tcp_enter_memory_pressure,
2335	.stream_memory_free	= tcp_stream_memory_free,
2336	.sockets_allocated	= &tcp_sockets_allocated,
2337	.orphan_count		= &tcp_orphan_count,
2338	.memory_allocated	= &tcp_memory_allocated,
2339	.memory_pressure	= &tcp_memory_pressure,
2340	.sysctl_mem		= sysctl_tcp_mem,
2341	.sysctl_wmem		= sysctl_tcp_wmem,
2342	.sysctl_rmem		= sysctl_tcp_rmem,
2343	.max_header		= MAX_TCP_HEADER,
2344	.obj_size		= sizeof(struct tcp_sock),
2345	.slab_flags		= SLAB_DESTROY_BY_RCU,
2346	.twsk_prot		= &tcp_timewait_sock_ops,
2347	.rsk_prot		= &tcp_request_sock_ops,
2348	.h.hashinfo		= &tcp_hashinfo,
2349	.no_autobind		= true,
2350#ifdef CONFIG_COMPAT
2351	.compat_setsockopt	= compat_tcp_setsockopt,
2352	.compat_getsockopt	= compat_tcp_getsockopt,
2353#endif
2354	.diag_destroy		= tcp_abort,
2355};
2356EXPORT_SYMBOL(tcp_prot);
2357
2358static void __net_exit tcp_sk_exit(struct net *net)
2359{
2360	int cpu;
2361
2362	for_each_possible_cpu(cpu)
2363		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2364	free_percpu(net->ipv4.tcp_sk);
2365}
2366
2367static int __net_init tcp_sk_init(struct net *net)
2368{
2369	int res, cpu;
2370
2371	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2372	if (!net->ipv4.tcp_sk)
2373		return -ENOMEM;
2374
2375	for_each_possible_cpu(cpu) {
2376		struct sock *sk;
2377
2378		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2379					   IPPROTO_TCP, net);
2380		if (res)
2381			goto fail;
2382		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2383	}
2384
2385	net->ipv4.sysctl_tcp_ecn = 2;
2386	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2387
2388	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2389	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2390	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2391
2392	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2393	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2394	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2395
2396	return 0;
2397fail:
2398	tcp_sk_exit(net);
2399
2400	return res;
2401}
2402
2403static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2404{
2405	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2406}
2407
2408static struct pernet_operations __net_initdata tcp_sk_ops = {
2409       .init	   = tcp_sk_init,
2410       .exit	   = tcp_sk_exit,
2411       .exit_batch = tcp_sk_exit_batch,
2412};
2413
2414void __init tcp_v4_init(void)
2415{
2416	inet_hashinfo_init(&tcp_hashinfo);
2417	if (register_pernet_subsys(&tcp_sk_ops))
2418		panic("Failed to create the TCP control socket.\n");
2419}
Configure Feed

Configure Feed