net/ipv4/tcp_ipv4.c at v2.6.25

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / net / ipv4 / tcp_ipv4.c
at v2.6.25 2468 lines 62 kB view raw
wrap content
   1/*
   2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   3 *		operating system.  INET is implemented using the  BSD Socket
   4 *		interface as the means of communication with the user level.
   5 *
   6 *		Implementation of the Transmission Control Protocol(TCP).
   7 *
   8 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
   9 *
  10 *		IPv4 specific functions
  11 *
  12 *
  13 *		code split from:
  14 *		linux/ipv4/tcp.c
  15 *		linux/ipv4/tcp_input.c
  16 *		linux/ipv4/tcp_output.c
  17 *
  18 *		See tcp.c for author information
  19 *
  20 *	This program is free software; you can redistribute it and/or
  21 *      modify it under the terms of the GNU General Public License
  22 *      as published by the Free Software Foundation; either version
  23 *      2 of the License, or (at your option) any later version.
  24 */
  25
  26/*
  27 * Changes:
  28 *		David S. Miller	:	New socket lookup architecture.
  29 *					This code is dedicated to John Dyson.
  30 *		David S. Miller :	Change semantics of established hash,
  31 *					half is devoted to TIME_WAIT sockets
  32 *					and the rest go in the other half.
  33 *		Andi Kleen :		Add support for syncookies and fixed
  34 *					some bugs: ip options weren't passed to
  35 *					the TCP layer, missed a check for an
  36 *					ACK bit.
  37 *		Andi Kleen :		Implemented fast path mtu discovery.
  38 *	     				Fixed many serious bugs in the
  39 *					request_sock handling and moved
  40 *					most of it into the af independent code.
  41 *					Added tail drop and some other bugfixes.
  42 *					Added new listen semantics.
  43 *		Mike McLagan	:	Routing by source
  44 *	Juan Jose Ciarlante:		ip_dynaddr bits
  45 *		Andi Kleen:		various fixes.
  46 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
  47 *					coma.
  48 *	Andi Kleen		:	Fix new listen.
  49 *	Andi Kleen		:	Fix accept error reporting.
  50 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
  51 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
  52 *					a single port at the same time.
  53 */
  54
  55
  56#include <linux/types.h>
  57#include <linux/fcntl.h>
  58#include <linux/module.h>
  59#include <linux/random.h>
  60#include <linux/cache.h>
  61#include <linux/jhash.h>
  62#include <linux/init.h>
  63#include <linux/times.h>
  64
  65#include <net/net_namespace.h>
  66#include <net/icmp.h>
  67#include <net/inet_hashtables.h>
  68#include <net/tcp.h>
  69#include <net/transp_v6.h>
  70#include <net/ipv6.h>
  71#include <net/inet_common.h>
  72#include <net/timewait_sock.h>
  73#include <net/xfrm.h>
  74#include <net/netdma.h>
  75
  76#include <linux/inet.h>
  77#include <linux/ipv6.h>
  78#include <linux/stddef.h>
  79#include <linux/proc_fs.h>
  80#include <linux/seq_file.h>
  81
  82#include <linux/crypto.h>
  83#include <linux/scatterlist.h>
  84
/* Sysctl knob: when non-zero, tcp_twsk_unique() may hand a TIME-WAIT
 * bucket's identity to a new outgoing connection (see the check there).
 */
int sysctl_tcp_tw_reuse __read_mostly;
/* Sysctl knob; not referenced by the code visible in this part of the file. */
int sysctl_tcp_low_latency __read_mostly;

/* Check TCP sequence numbers in ICMP packets.
 * NOTE(review): tcp_v4_err() open-codes the same value (8) in its length
 * check rather than using this macro.
 */
#define ICMP_MIN_LENGTH 8

/* Socket whose ->sk is handed to ip_send_reply() when sending RSTs
 * (tcp_v4_send_reset) and ACKs outside socket context (tcp_v4_send_ack).
 */
static struct socket *tcp_socket __read_mostly;

/* Forward declaration; defined further down in this file. */
void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);

#ifdef CONFIG_TCP_MD5SIG
/* Forward declarations for the RFC2385 (TCP MD5 signature) helpers that
 * tcp_v4_send_reset() and tcp_v4_send_ack() use before their definitions.
 */
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
						   __be32 addr);
static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
				   __be32 saddr, __be32 daddr,
				   struct tcphdr *th, int protocol,
				   unsigned int tcplen);
#endif

/* The TCP socket hash tables.  Only the listening-hash lock, user count
 * and wait queue are statically initialised here.
 */
struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
	.lhash_users = ATOMIC_INIT(0),
	.lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
};
 110
 111static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 112{
 113	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 114					  ip_hdr(skb)->saddr,
 115					  tcp_hdr(skb)->dest,
 116					  tcp_hdr(skb)->source);
 117}
 118
 119int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 120{
 121	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 122	struct tcp_sock *tp = tcp_sk(sk);
 123
 124	/* With PAWS, it is safe from the viewpoint
 125	   of data integrity. Even without PAWS it is safe provided sequence
 126	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 127
 128	   Actually, the idea is close to VJ's one, only timestamp cache is
 129	   held not per host, but per port pair and TW bucket is used as state
 130	   holder.
 131
 132	   If TW bucket has been already destroyed we fall back to VJ's scheme
 133	   and use initial timestamp retrieved from peer table.
 134	 */
 135	if (tcptw->tw_ts_recent_stamp &&
 136	    (twp == NULL || (sysctl_tcp_tw_reuse &&
 137			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 138		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 139		if (tp->write_seq == 0)
 140			tp->write_seq = 1;
 141		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
 142		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 143		sock_hold(sktw);
 144		return 1;
 145	}
 146
 147	return 0;
 148}
 149
 150EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 151
/*
 * Initiate an outgoing IPv4 TCP connection on @sk.
 *
 * @sk:       socket to connect
 * @uaddr:    destination; must be a struct sockaddr_in with AF_INET family
 * @addr_len: length of @uaddr
 *
 * Resolves a route, fills in the socket's addresses and ports, moves the
 * socket to SYN-SENT, hashes it, picks an initial sequence number and
 * finally calls tcp_connect().
 *
 * Returns 0 on success or a negative errno.  On failure after the state
 * change the socket is put back into TCP_CLOSE and its destination port
 * is cleared.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	__be32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	/* With a source-route IP option set, route towards the option's
	 * first hop (faddr) instead of the final destination.
	 */
	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk, 1);
	if (tmp < 0) {
		if (tmp == -ENETUNREACH)
			IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return tmp;
	}

	/* TCP cannot connect to a multicast or broadcast destination. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	/* Without source routing, use the destination the route resolved to. */
	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	/* Unbound socket: take the source address chosen by the route. */
	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);
		/*
		 * VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state
		 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
		 * when trying new connection.
		 */
		if (peer != NULL &&
		    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	/* Account for the length of any IP options carried on every packet. */
	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet->opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

	/* NOTE(review): 536 looks like the classic default MSS (576 - 40);
	 * presumably updated once the peer's MSS option is seen - confirm.
	 */
	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	/* The source port may only be known now; refresh the route for it. */
	err = ip_route_newports(&rt, IPPROTO_TCP,
				inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->u.dst);

	/* write_seq may already have been seeded by tcp_twsk_unique(). */
	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	/* From here on the failure path must not drop the route again.
	 * NOTE(review): presumably the reference now belongs to the socket
	 * via sk_setup_caps() - confirm.
	 */
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}
 275
 276/*
 277 * This routine does path mtu discovery as defined in RFC1191.
 278 */
 279static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
 280{
 281	struct dst_entry *dst;
 282	struct inet_sock *inet = inet_sk(sk);
 283
 284	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 285	 * send out by Linux are always <576bytes so they should go through
 286	 * unfragmented).
 287	 */
 288	if (sk->sk_state == TCP_LISTEN)
 289		return;
 290
 291	/* We don't check in the destentry if pmtu discovery is forbidden
 292	 * on this route. We just assume that no packet_to_big packets
 293	 * are send back when pmtu discovery is not active.
 294	 * There is a small race when the user changes this flag in the
 295	 * route, but I think that's acceptable.
 296	 */
 297	if ((dst = __sk_dst_check(sk, 0)) == NULL)
 298		return;
 299
 300	dst->ops->update_pmtu(dst, mtu);
 301
 302	/* Something is about to be wrong... Remember soft error
 303	 * for the case, if this connection will not able to recover.
 304	 */
 305	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 306		sk->sk_err_soft = EMSGSIZE;
 307
 308	mtu = dst_mtu(dst);
 309
 310	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 311	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 312		tcp_sync_mss(sk, mtu);
 313
 314		/* Resend the TCP packet because it's
 315		 * clear that the old packet has been
 316		 * dropped. This is the new "fast" path mtu
 317		 * discovery.
 318		 */
 319		tcp_simple_retransmit(sk);
 320	} /* else let the usual retransmit timer handle it */
 321}
 322
/*
 * ICMP error handler for TCP over IPv4.
 *
 * @skb:  the ICMP packet; skb->data points at the IP header of the
 *        offending datagram quoted in the ICMP payload, followed by (at
 *        least) the first 8 bytes of its TCP header
 * @info: ICMP-type specific value; for ICMP_FRAG_NEEDED it is passed to
 *        do_pmtu_discovery() as the new MTU
 *
 * The quoted addresses and ports are used to look up the socket the
 * original segment belonged to, and the error is then delivered according
 * to the socket state.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	__u32 seq;
	int err;

	/* Need the quoted IP header plus 8 bytes of TCP header (ports and
	 * sequence number) before th may be dereferenced.
	 */
	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	/* The quoted packet was sent by us, so its destination is the
	 * remote end and its source is the local end of the connection.
	 */
	sk = inet_lookup(skb->dev->nd_net, &tcp_hashinfo, iph->daddr, th->dest,
			iph->saddr, th->source, inet_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	/* Nothing to report to a TIME-WAIT socket; just drop the reference. */
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	/* Only counted here; the individual paths below decide what to skip
	 * while the socket is owned by user context.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	/* The quoted sequence number must fall within data we have sent and
	 * that is not yet acknowledged (listeners are checked per request
	 * further down).
	 */
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	/* Translate the ICMP type/code into an errno value. */
	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		/* Find the pending connection request the error refers to. */
		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		/* Must quote exactly the sequence number of our SYN-ACK. */
		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can f.e. if SYNs crossed.
			     */
		/* Connection not yet established: the error is fatal unless
		 * the socket is currently owned by user context, in which
		 * case it is only remembered as a soft error.
		 */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	/* Report immediately only if the application asked for it via the
	 * recverr flag; otherwise keep it as a soft error.
	 */
	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
 485
 486/* This routine computes an IPv4 TCP checksum. */
 487void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
 488{
 489	struct inet_sock *inet = inet_sk(sk);
 490	struct tcphdr *th = tcp_hdr(skb);
 491
 492	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 493		th->check = ~tcp_v4_check(len, inet->saddr,
 494					  inet->daddr, 0);
 495		skb->csum_start = skb_transport_header(skb) - skb->head;
 496		skb->csum_offset = offsetof(struct tcphdr, check);
 497	} else {
 498		th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
 499					 csum_partial((char *)th,
 500						      th->doff << 2,
 501						      skb->csum));
 502	}
 503}
 504
 505int tcp_v4_gso_send_check(struct sk_buff *skb)
 506{
 507	const struct iphdr *iph;
 508	struct tcphdr *th;
 509
 510	if (!pskb_may_pull(skb, sizeof(*th)))
 511		return -EINVAL;
 512
 513	iph = ip_hdr(skb);
 514	th = tcp_hdr(skb);
 515
 516	th->check = 0;
 517	th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
 518	skb->csum_start = skb_transport_header(skb) - skb->head;
 519	skb->csum_offset = offsetof(struct tcphdr, check);
 520	skb->ip_summed = CHECKSUM_PARTIAL;
 521	return 0;
 522}
 523
 524/*
 525 *	This routine will send an RST to the other tcp.
 526 *
 527 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 528 *		      for reset.
 529 *	Answer: if a packet caused RST, it is not for a socket
 530 *		existing in our system, if it is matched to a socket,
 531 *		it is just duplicate segment or bug in other side's TCP.
 532 *		So that we build reply only basing on parameters
 533 *		arrived with segment.
 534 *	Exception: precedence violation. We do not implement it in any case.
 535 */
 536
 537static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 538{
 539	struct tcphdr *th = tcp_hdr(skb);
 540	struct {
 541		struct tcphdr th;
 542#ifdef CONFIG_TCP_MD5SIG
 543		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 544#endif
 545	} rep;
 546	struct ip_reply_arg arg;
 547#ifdef CONFIG_TCP_MD5SIG
 548	struct tcp_md5sig_key *key;
 549#endif
 550
 551	/* Never send a reset in response to a reset. */
 552	if (th->rst)
 553		return;
 554
 555	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
 556		return;
 557
 558	/* Swap the send and the receive. */
 559	memset(&rep, 0, sizeof(rep));
 560	rep.th.dest   = th->source;
 561	rep.th.source = th->dest;
 562	rep.th.doff   = sizeof(struct tcphdr) / 4;
 563	rep.th.rst    = 1;
 564
 565	if (th->ack) {
 566		rep.th.seq = th->ack_seq;
 567	} else {
 568		rep.th.ack = 1;
 569		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 570				       skb->len - (th->doff << 2));
 571	}
 572
 573	memset(&arg, 0, sizeof(arg));
 574	arg.iov[0].iov_base = (unsigned char *)&rep;
 575	arg.iov[0].iov_len  = sizeof(rep.th);
 576
 577#ifdef CONFIG_TCP_MD5SIG
 578	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 579	if (key) {
 580		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 581				   (TCPOPT_NOP << 16) |
 582				   (TCPOPT_MD5SIG << 8) |
 583				   TCPOLEN_MD5SIG);
 584		/* Update length and the length the header thinks exists */
 585		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 586		rep.th.doff = arg.iov[0].iov_len / 4;
 587
 588		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[1],
 589					key,
 590					ip_hdr(skb)->daddr,
 591					ip_hdr(skb)->saddr,
 592					&rep.th, IPPROTO_TCP,
 593					arg.iov[0].iov_len);
 594	}
 595#endif
 596	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 597				      ip_hdr(skb)->saddr, /* XXX */
 598				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
 599	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 600
 601	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
 602
 603	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
 604	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
 605}
 606
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */
 610
/*
 * Build and send a bare ACK in reply to @skb without a full socket.
 *
 * @twsk: TIME-WAIT socket the segment matched, or NULL (request-sock case)
 * @skb:  the incoming segment being answered
 * @seq:  sequence number to place in the ACK (host order)
 * @ack:  acknowledgment number to place in the ACK (host order)
 * @win:  window value to advertise (host order, stored with htons())
 * @ts:   peer timestamp to echo; 0 means no timestamp option is added
 */
static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = tcp_hdr(skb);
	/* Reply buffer: TCP header plus room for an aligned timestamp option
	 * and, with MD5 support, an aligned MD5 signature option.
	 */
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	struct tcp_md5sig_key tw_key;
#endif

	/* Only the header is cleared; option words are written explicitly
	 * below whenever they are included in the reply length.
	 */
	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		/* NOP, NOP, TIMESTAMP: our clock followed by the echoed value. */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	/*
	 * The SKB holds an incoming packet, but may not have a valid ->sk
	 * pointer. This is especially the case when we're dealing with a
	 * TIME_WAIT ack, because the sk structure is long gone, and only
	 * the tcp_timewait_sock remains. So the md5 key is stashed in that
	 * structure, and we use it in preference.  I believe that (twsk ||
	 * skb->sk) holds true, but we program defensively.
	 */
	/* NOTE(review): the lookup below uses the packet's destination
	 * (local) address, while the other lookups in this file key on the
	 * peer address - confirm which address is intended here.
	 */
	if (!twsk && skb->sk) {
		key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr);
	} else if (twsk && twsk->tw_md5_keylen) {
		tw_key.key = twsk->tw_md5_key;
		tw_key.keylen = twsk->tw_md5_keylen;
		key = &tw_key;
	} else
		key = NULL;

	if (key) {
		/* The MD5 option goes after the 3-word timestamp option, if any. */
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_do_calc_md5_hash((__u8 *)&rep.opt[offset],
					key,
					ip_hdr(skb)->daddr,
					ip_hdr(skb)->saddr,
					&rep.th, IPPROTO_TCP,
					arg.iov[0].iov_len);
	}
#endif
	/* Seed the checksum with the pseudo-header for the full reply length. */
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	/* Keep the reply on the device the TIME-WAIT socket was bound to. */
	if (twsk)
		arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}
 700
 701static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 702{
 703	struct inet_timewait_sock *tw = inet_twsk(sk);
 704	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 705
 706	tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 707			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 708			tcptw->tw_ts_recent);
 709
 710	inet_twsk_put(tw);
 711}
 712
 713static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
 714				  struct request_sock *req)
 715{
 716	tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1,
 717			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 718			req->ts_recent);
 719}
 720
 721/*
 722 *	Send a SYN-ACK after having received a SYN.
 723 *	This still operates on a request_sock only, not on a big
 724 *	socket.
 725 */
 726static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
 727			      struct dst_entry *dst)
 728{
 729	const struct inet_request_sock *ireq = inet_rsk(req);
 730	int err = -1;
 731	struct sk_buff * skb;
 732
 733	/* First, grab a route. */
 734	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 735		goto out;
 736
 737	skb = tcp_make_synack(sk, dst, req);
 738
 739	if (skb) {
 740		struct tcphdr *th = tcp_hdr(skb);
 741
 742		th->check = tcp_v4_check(skb->len,
 743					 ireq->loc_addr,
 744					 ireq->rmt_addr,
 745					 csum_partial((char *)th, skb->len,
 746						      skb->csum));
 747
 748		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 749					    ireq->rmt_addr,
 750					    ireq->opt);
 751		err = net_xmit_eval(err);
 752	}
 753
 754out:
 755	dst_release(dst);
 756	return err;
 757}
 758
 759/*
 760 *	IPv4 request_sock destructor.
 761 */
 762static void tcp_v4_reqsk_destructor(struct request_sock *req)
 763{
 764	kfree(inet_rsk(req)->opt);
 765}
 766
 767#ifdef CONFIG_SYN_COOKIES
 768static void syn_flood_warning(struct sk_buff *skb)
 769{
 770	static unsigned long warntime;
 771
 772	if (time_after(jiffies, (warntime + HZ * 60))) {
 773		warntime = jiffies;
 774		printk(KERN_INFO
 775		       "possible SYN flooding on port %d. Sending cookies.\n",
 776		       ntohs(tcp_hdr(skb)->dest));
 777	}
 778}
 779#endif
 780
 781/*
 782 * Save and compile IPv4 options into the request_sock if needed.
 783 */
 784static struct ip_options *tcp_v4_save_options(struct sock *sk,
 785					      struct sk_buff *skb)
 786{
 787	struct ip_options *opt = &(IPCB(skb)->opt);
 788	struct ip_options *dopt = NULL;
 789
 790	if (opt && opt->optlen) {
 791		int opt_size = optlength(opt);
 792		dopt = kmalloc(opt_size, GFP_ATOMIC);
 793		if (dopt) {
 794			if (ip_options_echo(dopt, skb)) {
 795				kfree(dopt);
 796				dopt = NULL;
 797			}
 798		}
 799	}
 800	return dopt;
 801}
 802
 803#ifdef CONFIG_TCP_MD5SIG
 804/*
 805 * RFC2385 MD5 checksumming requires a mapping of
 806 * IP address->MD5 Key.
 807 * We need to maintain these in the sk structure.
 808 */
 809
 810/* Find the Key structure for an address.  */
 811static struct tcp_md5sig_key *
 812			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 813{
 814	struct tcp_sock *tp = tcp_sk(sk);
 815	int i;
 816
 817	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 818		return NULL;
 819	for (i = 0; i < tp->md5sig_info->entries4; i++) {
 820		if (tp->md5sig_info->keys4[i].addr == addr)
 821			return &tp->md5sig_info->keys4[i].base;
 822	}
 823	return NULL;
 824}
 825
 826struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 827					 struct sock *addr_sk)
 828{
 829	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
 830}
 831
 832EXPORT_SYMBOL(tcp_v4_md5_lookup);
 833
 834static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 835						      struct request_sock *req)
 836{
 837	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 838}
 839
 840/* This can be called on a newly created socket, from other files */
 841int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 842		      u8 *newkey, u8 newkeylen)
 843{
 844	/* Add Key to the list */
 845	struct tcp_md5sig_key *key;
 846	struct tcp_sock *tp = tcp_sk(sk);
 847	struct tcp4_md5sig_key *keys;
 848
 849	key = tcp_v4_md5_do_lookup(sk, addr);
 850	if (key) {
 851		/* Pre-existing entry - just update that one. */
 852		kfree(key->key);
 853		key->key = newkey;
 854		key->keylen = newkeylen;
 855	} else {
 856		struct tcp_md5sig_info *md5sig;
 857
 858		if (!tp->md5sig_info) {
 859			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 860						  GFP_ATOMIC);
 861			if (!tp->md5sig_info) {
 862				kfree(newkey);
 863				return -ENOMEM;
 864			}
 865			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 866		}
 867		if (tcp_alloc_md5sig_pool() == NULL) {
 868			kfree(newkey);
 869			return -ENOMEM;
 870		}
 871		md5sig = tp->md5sig_info;
 872
 873		if (md5sig->alloced4 == md5sig->entries4) {
 874			keys = kmalloc((sizeof(*keys) *
 875					(md5sig->entries4 + 1)), GFP_ATOMIC);
 876			if (!keys) {
 877				kfree(newkey);
 878				tcp_free_md5sig_pool();
 879				return -ENOMEM;
 880			}
 881
 882			if (md5sig->entries4)
 883				memcpy(keys, md5sig->keys4,
 884				       sizeof(*keys) * md5sig->entries4);
 885
 886			/* Free old key list, and reference new one */
 887			kfree(md5sig->keys4);
 888			md5sig->keys4 = keys;
 889			md5sig->alloced4++;
 890		}
 891		md5sig->entries4++;
 892		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 893		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 894		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 895	}
 896	return 0;
 897}
 898
 899EXPORT_SYMBOL(tcp_v4_md5_do_add);
 900
 901static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 902			       u8 *newkey, u8 newkeylen)
 903{
 904	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
 905				 newkey, newkeylen);
 906}
 907
 908int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 909{
 910	struct tcp_sock *tp = tcp_sk(sk);
 911	int i;
 912
 913	for (i = 0; i < tp->md5sig_info->entries4; i++) {
 914		if (tp->md5sig_info->keys4[i].addr == addr) {
 915			/* Free the key */
 916			kfree(tp->md5sig_info->keys4[i].base.key);
 917			tp->md5sig_info->entries4--;
 918
 919			if (tp->md5sig_info->entries4 == 0) {
 920				kfree(tp->md5sig_info->keys4);
 921				tp->md5sig_info->keys4 = NULL;
 922				tp->md5sig_info->alloced4 = 0;
 923			} else if (tp->md5sig_info->entries4 != i) {
 924				/* Need to do some manipulation */
 925				memmove(&tp->md5sig_info->keys4[i],
 926					&tp->md5sig_info->keys4[i+1],
 927					(tp->md5sig_info->entries4 - i) *
 928					 sizeof(struct tcp4_md5sig_key));
 929			}
 930			tcp_free_md5sig_pool();
 931			return 0;
 932		}
 933	}
 934	return -ENOENT;
 935}
 936
 937EXPORT_SYMBOL(tcp_v4_md5_do_del);
 938
 939static void tcp_v4_clear_md5_list(struct sock *sk)
 940{
 941	struct tcp_sock *tp = tcp_sk(sk);
 942
 943	/* Free each key, then the set of key keys,
 944	 * the crypto element, and then decrement our
 945	 * hold on the last resort crypto.
 946	 */
 947	if (tp->md5sig_info->entries4) {
 948		int i;
 949		for (i = 0; i < tp->md5sig_info->entries4; i++)
 950			kfree(tp->md5sig_info->keys4[i].base.key);
 951		tp->md5sig_info->entries4 = 0;
 952		tcp_free_md5sig_pool();
 953	}
 954	if (tp->md5sig_info->keys4) {
 955		kfree(tp->md5sig_info->keys4);
 956		tp->md5sig_info->keys4 = NULL;
 957		tp->md5sig_info->alloced4  = 0;
 958	}
 959}
 960
 961static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 962				 int optlen)
 963{
 964	struct tcp_md5sig cmd;
 965	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
 966	u8 *newkey;
 967
 968	if (optlen < sizeof(cmd))
 969		return -EINVAL;
 970
 971	if (copy_from_user(&cmd, optval, sizeof(cmd)))
 972		return -EFAULT;
 973
 974	if (sin->sin_family != AF_INET)
 975		return -EINVAL;
 976
 977	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
 978		if (!tcp_sk(sk)->md5sig_info)
 979			return -ENOENT;
 980		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
 981	}
 982
 983	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
 984		return -EINVAL;
 985
 986	if (!tcp_sk(sk)->md5sig_info) {
 987		struct tcp_sock *tp = tcp_sk(sk);
 988		struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
 989
 990		if (!p)
 991			return -EINVAL;
 992
 993		tp->md5sig_info = p;
 994		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 995	}
 996
 997	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
 998	if (!newkey)
 999		return -ENOMEM;
1000	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1001				 newkey, cmd.tcpm_keylen);
1002}
1003
1004static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1005				   __be32 saddr, __be32 daddr,
1006				   struct tcphdr *th, int protocol,
1007				   unsigned int tcplen)
1008{
1009	struct scatterlist sg[4];
1010	__u16 data_len;
1011	int block = 0;
1012	__sum16 old_checksum;
1013	struct tcp_md5sig_pool *hp;
1014	struct tcp4_pseudohdr *bp;
1015	struct hash_desc *desc;
1016	int err;
1017	unsigned int nbytes = 0;
1018
1019	/*
1020	 * Okay, so RFC2385 is turned on for this connection,
1021	 * so we need to generate the MD5 hash for the packet now.
1022	 */
1023
1024	hp = tcp_get_md5sig_pool();
1025	if (!hp)
1026		goto clear_hash_noput;
1027
1028	bp = &hp->md5_blk.ip4;
1029	desc = &hp->md5_desc;
1030
1031	/*
1032	 * 1. the TCP pseudo-header (in the order: source IP address,
1033	 * destination IP address, zero-padded protocol number, and
1034	 * segment length)
1035	 */
1036	bp->saddr = saddr;
1037	bp->daddr = daddr;
1038	bp->pad = 0;
1039	bp->protocol = protocol;
1040	bp->len = htons(tcplen);
1041
1042	sg_init_table(sg, 4);
1043
1044	sg_set_buf(&sg[block++], bp, sizeof(*bp));
1045	nbytes += sizeof(*bp);
1046
1047	/* 2. the TCP header, excluding options, and assuming a
1048	 * checksum of zero/
1049	 */
1050	old_checksum = th->check;
1051	th->check = 0;
1052	sg_set_buf(&sg[block++], th, sizeof(struct tcphdr));
1053	nbytes += sizeof(struct tcphdr);
1054
1055	/* 3. the TCP segment data (if any) */
1056	data_len = tcplen - (th->doff << 2);
1057	if (data_len > 0) {
1058		unsigned char *data = (unsigned char *)th + (th->doff << 2);
1059		sg_set_buf(&sg[block++], data, data_len);
1060		nbytes += data_len;
1061	}
1062
1063	/* 4. an independently-specified key or password, known to both
1064	 * TCPs and presumably connection-specific
1065	 */
1066	sg_set_buf(&sg[block++], key->key, key->keylen);
1067	nbytes += key->keylen;
1068
1069	sg_mark_end(&sg[block - 1]);
1070
1071	/* Now store the Hash into the packet */
1072	err = crypto_hash_init(desc);
1073	if (err)
1074		goto clear_hash;
1075	err = crypto_hash_update(desc, sg, nbytes);
1076	if (err)
1077		goto clear_hash;
1078	err = crypto_hash_final(desc, md5_hash);
1079	if (err)
1080		goto clear_hash;
1081
1082	/* Reset header, and free up the crypto */
1083	tcp_put_md5sig_pool();
1084	th->check = old_checksum;
1085
1086out:
1087	return 0;
1088clear_hash:
1089	tcp_put_md5sig_pool();
1090clear_hash_noput:
1091	memset(md5_hash, 0, 16);
1092	goto out;
1093}
1094
1095int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key,
1096			 struct sock *sk,
1097			 struct dst_entry *dst,
1098			 struct request_sock *req,
1099			 struct tcphdr *th, int protocol,
1100			 unsigned int tcplen)
1101{
1102	__be32 saddr, daddr;
1103
1104	if (sk) {
1105		saddr = inet_sk(sk)->saddr;
1106		daddr = inet_sk(sk)->daddr;
1107	} else {
1108		struct rtable *rt = (struct rtable *)dst;
1109		BUG_ON(!rt);
1110		saddr = rt->rt_src;
1111		daddr = rt->rt_dst;
1112	}
1113	return tcp_v4_do_calc_md5_hash(md5_hash, key,
1114				       saddr, daddr,
1115				       th, protocol, tcplen);
1116}
1117
1118EXPORT_SYMBOL(tcp_v4_calc_md5_hash);
1119
1120static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1121{
1122	/*
1123	 * This gets called for each TCP segment that arrives
1124	 * so we want to be efficient.
1125	 * We have 3 drop cases:
1126	 * o No MD5 hash and one expected.
1127	 * o MD5 hash and we're not expecting one.
1128	 * o MD5 hash and its wrong.
1129	 */
1130	__u8 *hash_location = NULL;
1131	struct tcp_md5sig_key *hash_expected;
1132	const struct iphdr *iph = ip_hdr(skb);
1133	struct tcphdr *th = tcp_hdr(skb);
1134	int length = (th->doff << 2) - sizeof(struct tcphdr);
1135	int genhash;
1136	unsigned char *ptr;
1137	unsigned char newhash[16];
1138
1139	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1140
1141	/*
1142	 * If the TCP option length is less than the TCP_MD5SIG
1143	 * option length, then we can shortcut
1144	 */
1145	if (length < TCPOLEN_MD5SIG) {
1146		if (hash_expected)
1147			return 1;
1148		else
1149			return 0;
1150	}
1151
1152	/* Okay, we can't shortcut - we have to grub through the options */
1153	ptr = (unsigned char *)(th + 1);
1154	while (length > 0) {
1155		int opcode = *ptr++;
1156		int opsize;
1157
1158		switch (opcode) {
1159		case TCPOPT_EOL:
1160			goto done_opts;
1161		case TCPOPT_NOP:
1162			length--;
1163			continue;
1164		default:
1165			opsize = *ptr++;
1166			if (opsize < 2)
1167				goto done_opts;
1168			if (opsize > length)
1169				goto done_opts;
1170
1171			if (opcode == TCPOPT_MD5SIG) {
1172				hash_location = ptr;
1173				goto done_opts;
1174			}
1175		}
1176		ptr += opsize-2;
1177		length -= opsize;
1178	}
1179done_opts:
1180	/* We've parsed the options - do we have a hash? */
1181	if (!hash_expected && !hash_location)
1182		return 0;
1183
1184	if (hash_expected && !hash_location) {
1185		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1186			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1187			       NIPQUAD(iph->saddr), ntohs(th->source),
1188			       NIPQUAD(iph->daddr), ntohs(th->dest));
1189		return 1;
1190	}
1191
1192	if (!hash_expected && hash_location) {
1193		LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1194			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1195			       NIPQUAD(iph->saddr), ntohs(th->source),
1196			       NIPQUAD(iph->daddr), ntohs(th->dest));
1197		return 1;
1198	}
1199
1200	/* Okay, so this is hash_expected and hash_location -
1201	 * so we need to calculate the checksum.
1202	 */
1203	genhash = tcp_v4_do_calc_md5_hash(newhash,
1204					  hash_expected,
1205					  iph->saddr, iph->daddr,
1206					  th, sk->sk_protocol,
1207					  skb->len);
1208
1209	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1210		if (net_ratelimit()) {
1211			printk(KERN_INFO "MD5 Hash failed for "
1212			       "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1213			       NIPQUAD(iph->saddr), ntohs(th->source),
1214			       NIPQUAD(iph->daddr), ntohs(th->dest),
1215			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1216		}
1217		return 1;
1218	}
1219	return 0;
1220}
1221
1222#endif
1223
1224struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1225	.family		=	PF_INET,
1226	.obj_size	=	sizeof(struct tcp_request_sock),
1227	.rtx_syn_ack	=	tcp_v4_send_synack,
1228	.send_ack	=	tcp_v4_reqsk_send_ack,
1229	.destructor	=	tcp_v4_reqsk_destructor,
1230	.send_reset	=	tcp_v4_send_reset,
1231};
1232
#ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 MD5 hook for request socks; tcp_v4_conn_request() stores a pointer to
 * this table in tcp_rsk(req)->af_specific.
 */
static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup = tcp_v4_reqsk_md5_lookup,
};
#endif
1238
1239static struct timewait_sock_ops tcp_timewait_sock_ops = {
1240	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1241	.twsk_unique	= tcp_twsk_unique,
1242	.twsk_destructor= tcp_twsk_destructor,
1243};
1244
/*
 * tcp_v4_conn_request - handle a connection request (SYN) on a listener
 * @sk:  the listening socket
 * @skb: the received segment
 *
 * Allocates a request_sock, fills it from the segment's options and
 * addresses, picks an initial sequence number and sends the SYN-ACK via
 * tcp_v4_send_synack().  On success the request is hashed into the
 * listener's request queue, except in syncookie mode where it is freed
 * again right after the SYN-ACK has been sent.
 *
 * Always returns 0: both the success path and the drop paths return 0, so
 * a dropped request is simply not answered.
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct inet_request_sock *ireq;
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	/* Non-zero when the request hit a live timewait bucket (see below). */
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer SYNs sent to broadcast or multicast */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		/* Without syncookies a full request queue means drop. */
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	/* Parse the SYN's options into a scratch structure. */
	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;
	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

	tcp_parse_options(skb, &tmp_opt, 0);

	/* In cookie mode the parsed options are discarded again. */
	if (want_cookie) {
		tcp_clear_options(&tmp_opt);
		tmp_opt.saw_tstamp = 0;
	}

	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web server, which
		 * contains information interesting only for windows'
		 * users) do not send their stamp in SYN. It is easy case.
		 * We simply do not advertise TS support.
		 */
		tmp_opt.saw_tstamp = 0;
		tmp_opt.tstamp_ok  = 0;
	}
	/* tstamp_ok always ends up mirroring saw_tstamp. */
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	tcp_openreq_init(req, &tmp_opt, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->opt = tcp_v4_save_options(sk, skb);
	if (!want_cookie)
		TCP_ECN_create_request(req, tcp_hdr(skb));

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
				       "request from %u.%u.%u.%u/%u\n",
				       NIPQUAD(saddr),
				       ntohs(tcp_hdr(skb)->source));
			/* dst may still be NULL on this path. */
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	/* dst is NULL unless the tw_recycle check above looked up a route. */
	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		/* Cookie mode keeps no state: the request is not queued. */
		reqsk_free(req);
	} else {
		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	}
	return 0;

drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}
1398
1399
1400/*
1401 * The three way handshake has completed - we got a valid synack -
1402 * now create the new socket.
1403 */
1404struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1405				  struct request_sock *req,
1406				  struct dst_entry *dst)
1407{
1408	struct inet_request_sock *ireq;
1409	struct inet_sock *newinet;
1410	struct tcp_sock *newtp;
1411	struct sock *newsk;
1412#ifdef CONFIG_TCP_MD5SIG
1413	struct tcp_md5sig_key *key;
1414#endif
1415
1416	if (sk_acceptq_is_full(sk))
1417		goto exit_overflow;
1418
1419	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1420		goto exit;
1421
1422	newsk = tcp_create_openreq_child(sk, req, skb);
1423	if (!newsk)
1424		goto exit;
1425
1426	newsk->sk_gso_type = SKB_GSO_TCPV4;
1427	sk_setup_caps(newsk, dst);
1428
1429	newtp		      = tcp_sk(newsk);
1430	newinet		      = inet_sk(newsk);
1431	ireq		      = inet_rsk(req);
1432	newinet->daddr	      = ireq->rmt_addr;
1433	newinet->rcv_saddr    = ireq->loc_addr;
1434	newinet->saddr	      = ireq->loc_addr;
1435	newinet->opt	      = ireq->opt;
1436	ireq->opt	      = NULL;
1437	newinet->mc_index     = inet_iif(skb);
1438	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1439	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1440	if (newinet->opt)
1441		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1442	newinet->id = newtp->write_seq ^ jiffies;
1443
1444	tcp_mtup_init(newsk);
1445	tcp_sync_mss(newsk, dst_mtu(dst));
1446	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1447	tcp_initialize_rcv_mss(newsk);
1448
1449#ifdef CONFIG_TCP_MD5SIG
1450	/* Copy over the MD5 key from the original socket */
1451	if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1452		/*
1453		 * We're using one, so create a matching key
1454		 * on the newsk structure. If we fail to get
1455		 * memory, then we end up not copying the key
1456		 * across. Shucks.
1457		 */
1458		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1459		if (newkey != NULL)
1460			tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1461					  newkey, key->keylen);
1462	}
1463#endif
1464
1465	__inet_hash_nolisten(newsk);
1466	__inet_inherit_port(sk, newsk);
1467
1468	return newsk;
1469
1470exit_overflow:
1471	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1472exit:
1473	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1474	dst_release(dst);
1475	return NULL;
1476}
1477
1478static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1479{
1480	struct tcphdr *th = tcp_hdr(skb);
1481	const struct iphdr *iph = ip_hdr(skb);
1482	struct sock *nsk;
1483	struct request_sock **prev;
1484	/* Find possible connection requests. */
1485	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1486						       iph->saddr, iph->daddr);
1487	if (req)
1488		return tcp_check_req(sk, skb, req, prev);
1489
1490	nsk = inet_lookup_established(sk->sk_net, &tcp_hashinfo, iph->saddr,
1491			th->source, iph->daddr, th->dest, inet_iif(skb));
1492
1493	if (nsk) {
1494		if (nsk->sk_state != TCP_TIME_WAIT) {
1495			bh_lock_sock(nsk);
1496			return nsk;
1497		}
1498		inet_twsk_put(inet_twsk(nsk));
1499		return NULL;
1500	}
1501
1502#ifdef CONFIG_SYN_COOKIES
1503	if (!th->rst && !th->syn && th->ack)
1504		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1505#endif
1506	return sk;
1507}
1508
1509static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1510{
1511	const struct iphdr *iph = ip_hdr(skb);
1512
1513	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1514		if (!tcp_v4_check(skb->len, iph->saddr,
1515				  iph->daddr, skb->csum)) {
1516			skb->ip_summed = CHECKSUM_UNNECESSARY;
1517			return 0;
1518		}
1519	}
1520
1521	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1522				       skb->len, IPPROTO_TCP, 0);
1523
1524	if (skb->len <= 76) {
1525		return __skb_checksum_complete(skb);
1526	}
1527	return 0;
1528}
1529
1530
1531/* The socket must have it's spinlock held when we get
1532 * here.
1533 *
1534 * We have a potential double-lock case here, so even when
1535 * doing backlog processing we use the BH locking scheme.
1536 * This is because we cannot sleep with the original spinlock
1537 * held.
1538 */
1539int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1540{
1541	struct sock *rsk;
1542#ifdef CONFIG_TCP_MD5SIG
1543	/*
1544	 * We really want to reject the packet as early as possible
1545	 * if:
1546	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1547	 *  o There is an MD5 option and we're not expecting one
1548	 */
1549	if (tcp_v4_inbound_md5_hash(sk, skb))
1550		goto discard;
1551#endif
1552
1553	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1554		TCP_CHECK_TIMER(sk);
1555		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1556			rsk = sk;
1557			goto reset;
1558		}
1559		TCP_CHECK_TIMER(sk);
1560		return 0;
1561	}
1562
1563	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1564		goto csum_err;
1565
1566	if (sk->sk_state == TCP_LISTEN) {
1567		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1568		if (!nsk)
1569			goto discard;
1570
1571		if (nsk != sk) {
1572			if (tcp_child_process(sk, nsk, skb)) {
1573				rsk = nsk;
1574				goto reset;
1575			}
1576			return 0;
1577		}
1578	}
1579
1580	TCP_CHECK_TIMER(sk);
1581	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1582		rsk = sk;
1583		goto reset;
1584	}
1585	TCP_CHECK_TIMER(sk);
1586	return 0;
1587
1588reset:
1589	tcp_v4_send_reset(rsk, skb);
1590discard:
1591	kfree_skb(skb);
1592	/* Be careful here. If this function gets more complicated and
1593	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1594	 * might be destroyed here. This current version compiles correctly,
1595	 * but you have been warned.
1596	 */
1597	return 0;
1598
1599csum_err:
1600	TCP_INC_STATS_BH(TCP_MIB_INERRS);
1601	goto discard;
1602}
1603
/*
 *	From tcp_input.c
 *
 * tcp_v4_rcv - entry point for a received IPv4 TCP segment
 *
 * Validates the header, fills in the TCP control block of the skb, looks
 * up the owning socket and then either processes the segment directly,
 * queues it (prequeue or backlog), or handles the no-socket and TIME_WAIT
 * cases.
 *
 * Returns the result of tcp_v4_do_rcv() when the segment was processed
 * directly, 0 otherwise.  On the discard paths the skb is freed here.
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	/* doff counts 32-bit words; reject headers shorter than the minimum */
	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto bad_packet;

	/* Header pointers are fetched again after the pulls above. */
	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when	 = 0;
	TCP_SKB_CB(skb)->flags	 = iph->tos;
	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup(skb->dev->nd_net, &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));
	if (!sk)
		goto no_tcp_socket;

	/* Also re-entered from do_time_wait with sk replaced by a listener. */
process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = get_softnet_dma();
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			/* The assignment below is the body of this if, despite
			 * its indentation: process directly only when
			 * tcp_prequeue() returned zero.
			 */
			if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
		}
	} else
		/* Socket is owned by a user context: defer to the backlog. */
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		/* bad_packet is also jumped to from the header checks above */
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	/* Every exit from here that drops sk does so via inet_twsk_put(). */
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		/* If a listener exists for the destination, retire the
		 * timewait entry and process the segment against the
		 * listener instead.
		 */
		struct sock *sk2 = inet_lookup_listener(skb->dev->nd_net,
							&tcp_hashinfo,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
1743
1744/* VJ's idea. Save last timestamp seen from this destination
1745 * and hold it at least for normal timewait interval to use for duplicate
1746 * segment detection in subsequent connections, before they enter synchronized
1747 * state.
1748 */
1749
1750int tcp_v4_remember_stamp(struct sock *sk)
1751{
1752	struct inet_sock *inet = inet_sk(sk);
1753	struct tcp_sock *tp = tcp_sk(sk);
1754	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1755	struct inet_peer *peer = NULL;
1756	int release_it = 0;
1757
1758	if (!rt || rt->rt_dst != inet->daddr) {
1759		peer = inet_getpeer(inet->daddr, 1);
1760		release_it = 1;
1761	} else {
1762		if (!rt->peer)
1763			rt_bind_peer(rt, 1);
1764		peer = rt->peer;
1765	}
1766
1767	if (peer) {
1768		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1769		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1770		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1771			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1772			peer->tcp_ts = tp->rx_opt.ts_recent;
1773		}
1774		if (release_it)
1775			inet_putpeer(peer);
1776		return 1;
1777	}
1778
1779	return 0;
1780}
1781
1782int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1783{
1784	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1785
1786	if (peer) {
1787		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1788
1789		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1790		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1791		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1792			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1793			peer->tcp_ts	   = tcptw->tw_ts_recent;
1794		}
1795		inet_putpeer(peer);
1796		return 1;
1797	}
1798
1799	return 0;
1800}
1801
1802struct inet_connection_sock_af_ops ipv4_specific = {
1803	.queue_xmit	   = ip_queue_xmit,
1804	.send_check	   = tcp_v4_send_check,
1805	.rebuild_header	   = inet_sk_rebuild_header,
1806	.conn_request	   = tcp_v4_conn_request,
1807	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1808	.remember_stamp	   = tcp_v4_remember_stamp,
1809	.net_header_len	   = sizeof(struct iphdr),
1810	.setsockopt	   = ip_setsockopt,
1811	.getsockopt	   = ip_getsockopt,
1812	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1813	.sockaddr_len	   = sizeof(struct sockaddr_in),
1814	.bind_conflict	   = inet_csk_bind_conflict,
1815#ifdef CONFIG_COMPAT
1816	.compat_setsockopt = compat_ip_setsockopt,
1817	.compat_getsockopt = compat_ip_getsockopt,
1818#endif
1819};
1820
#ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 MD5 signature operations; tcp_v4_init_sock() installs this table as
 * tp->af_specific.
 */
static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup = tcp_v4_md5_lookup,
	.md5_add = tcp_v4_md5_add_func,
	.md5_parse = tcp_v4_parse_md5_keys,
	.calc_md5_hash = tcp_v4_calc_md5_hash,
};
#endif
1829
1830/* NOTE: A lot of things set to zero explicitly by call to
1831 *       sk_alloc() so need not be done here.
1832 */
1833static int tcp_v4_init_sock(struct sock *sk)
1834{
1835	struct inet_connection_sock *icsk = inet_csk(sk);
1836	struct tcp_sock *tp = tcp_sk(sk);
1837
1838	skb_queue_head_init(&tp->out_of_order_queue);
1839	tcp_init_xmit_timers(sk);
1840	tcp_prequeue_init(tp);
1841
1842	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1843	tp->mdev = TCP_TIMEOUT_INIT;
1844
1845	/* So many TCP implementations out there (incorrectly) count the
1846	 * initial SYN frame in their delayed-ACK and congestion control
1847	 * algorithms that we must have the following bandaid to talk
1848	 * efficiently to them.  -DaveM
1849	 */
1850	tp->snd_cwnd = 2;
1851
1852	/* See draft-stevens-tcpca-spec-01 for discussion of the
1853	 * initialization of these values.
1854	 */
1855	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
1856	tp->snd_cwnd_clamp = ~0;
1857	tp->mss_cache = 536;
1858
1859	tp->reordering = sysctl_tcp_reordering;
1860	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1861
1862	sk->sk_state = TCP_CLOSE;
1863
1864	sk->sk_write_space = sk_stream_write_space;
1865	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1866
1867	icsk->icsk_af_ops = &ipv4_specific;
1868	icsk->icsk_sync_mss = tcp_sync_mss;
1869#ifdef CONFIG_TCP_MD5SIG
1870	tp->af_specific = &tcp_sock_ipv4_specific;
1871#endif
1872
1873	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1874	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1875
1876	atomic_inc(&tcp_sockets_allocated);
1877
1878	return 0;
1879}
1880
1881int tcp_v4_destroy_sock(struct sock *sk)
1882{
1883	struct tcp_sock *tp = tcp_sk(sk);
1884
1885	tcp_clear_xmit_timers(sk);
1886
1887	tcp_cleanup_congestion_control(sk);
1888
1889	/* Cleanup up the write buffer. */
1890	tcp_write_queue_purge(sk);
1891
1892	/* Cleans up our, hopefully empty, out_of_order_queue. */
1893	__skb_queue_purge(&tp->out_of_order_queue);
1894
1895#ifdef CONFIG_TCP_MD5SIG
1896	/* Clean up the MD5 key list, if any */
1897	if (tp->md5sig_info) {
1898		tcp_v4_clear_md5_list(sk);
1899		kfree(tp->md5sig_info);
1900		tp->md5sig_info = NULL;
1901	}
1902#endif
1903
1904#ifdef CONFIG_NET_DMA
1905	/* Cleans up our sk_async_wait_queue */
1906	__skb_queue_purge(&sk->sk_async_wait_queue);
1907#endif
1908
1909	/* Clean prequeue, it must be empty really */
1910	__skb_queue_purge(&tp->ucopy.prequeue);
1911
1912	/* Clean up a referenced TCP bind bucket. */
1913	if (inet_csk(sk)->icsk_bind_hash)
1914		inet_put_port(sk);
1915
1916	/*
1917	 * If sendmsg cached page exists, toss it.
1918	 */
1919	if (sk->sk_sndmsg_page) {
1920		__free_page(sk->sk_sndmsg_page);
1921		sk->sk_sndmsg_page = NULL;
1922	}
1923
1924	atomic_dec(&tcp_sockets_allocated);
1925
1926	return 0;
1927}
1928
1929EXPORT_SYMBOL(tcp_v4_destroy_sock);
1930
1931#ifdef CONFIG_PROC_FS
1932/* Proc filesystem TCP sock list dumping. */
1933
1934static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1935{
1936	return hlist_empty(head) ? NULL :
1937		list_entry(head->first, struct inet_timewait_sock, tw_node);
1938}
1939
1940static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1941{
1942	return tw->tw_node.next ?
1943		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1944}
1945
/*
 * listening_get_next - advance the /proc iterator over the listening table
 * @seq: seq_file whose ->private is the struct tcp_iter_state
 * @cur: entry returned by the previous call, or NULL to start at bucket 0
 *
 * Entries are produced in two flavours, tracked in st->state:
 *  - TCP_SEQ_STATE_LISTENING: @cur / the result is a struct sock from
 *    tcp_hashinfo.listening_hash[st->bucket];
 *  - TCP_SEQ_STATE_OPENREQ: @cur / the result is a struct request_sock
 *    from the syn_table of st->syn_wait_sk.
 *
 * Only entries whose family equals st->family are returned.
 *
 * Locking: whenever a request_sock is returned, the syn_wait_lock of
 * st->syn_wait_sk is left read-locked; it is released here once that
 * socket's table has been exhausted (or by tcp_seq_stop()).
 *
 * Returns the next entry, or NULL when the listening table is exhausted.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state* st = seq->private;

	/* Fresh walk: start with the first listening bucket. */
	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		/* @cur is a request; continue in the same socket's syn_table
		 * (its syn_wait_lock is still held from the previous call).
		 */
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
			/* Entered from start_req below with sbucket == 0 and
			 * the lock held.
			 */
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		/* Table exhausted: move on to the next listening socket. */
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		/* @cur is a listening socket that has already been returned;
		 * visit its pending requests before the next socket.
		 */
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		/* Socket of another family: it is not returned itself, but
		 * its request queue is still scanned for matching requests.
		 */
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	/* Chain finished: try the next listening bucket. */
	if (++st->bucket < INET_LHTABLE_SIZE) {
		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
2016
2017static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2018{
2019	void *rc = listening_get_next(seq, NULL);
2020
2021	while (rc && *pos) {
2022		rc = listening_get_next(seq, rc);
2023		--*pos;
2024	}
2025	return rc;
2026}
2027
2028static void *established_get_first(struct seq_file *seq)
2029{
2030	struct tcp_iter_state* st = seq->private;
2031	void *rc = NULL;
2032
2033	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
2034		struct sock *sk;
2035		struct hlist_node *node;
2036		struct inet_timewait_sock *tw;
2037		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2038
2039		read_lock_bh(lock);
2040		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2041			if (sk->sk_family != st->family) {
2042				continue;
2043			}
2044			rc = sk;
2045			goto out;
2046		}
2047		st->state = TCP_SEQ_STATE_TIME_WAIT;
2048		inet_twsk_for_each(tw, node,
2049				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2050			if (tw->tw_family != st->family) {
2051				continue;
2052			}
2053			rc = tw;
2054			goto out;
2055		}
2056		read_unlock_bh(lock);
2057		st->state = TCP_SEQ_STATE_ESTABLISHED;
2058	}
2059out:
2060	return rc;
2061}
2062
/*
 * established_get_next - advance the /proc iterator over the established table
 * @seq: seq_file whose ->private is the struct tcp_iter_state
 * @cur: entry returned by the previous call; a struct inet_timewait_sock
 *       when st->state is TCP_SEQ_STATE_TIME_WAIT, otherwise a struct sock
 *
 * Within a bucket the regular chain is walked first, then the timewait
 * chain.  Only entries whose family equals st->family are returned.
 *
 * Locking: entered with the lock of st->bucket read-held.  That lock is
 * dropped when the bucket's timewait chain is exhausted and the next
 * bucket's lock is taken in its place, so a non-NULL result is always
 * returned with the lock of st->bucket held.  NULL is returned with no
 * ehash lock held.
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_node *node;
	struct tcp_iter_state* st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
		/* Also entered from below with the head of the timewait chain
		 * once the regular chain of the bucket is exhausted.
		 */
get_tw:
		while (tw && tw->tw_family != st->family) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		/* Bucket finished: release it and move to the next one. */
		read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		if (++st->bucket < tcp_hashinfo.ehash_size) {
			read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family)
			goto found;
	}

	/* Regular chain exhausted: continue with this bucket's timewait chain */
	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}
2109
2110static void *established_get_idx(struct seq_file *seq, loff_t pos)
2111{
2112	void *rc = established_get_first(seq);
2113
2114	while (rc && pos) {
2115		rc = established_get_next(seq, rc);
2116		--pos;
2117	}
2118	return rc;
2119}
2120
2121static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2122{
2123	void *rc;
2124	struct tcp_iter_state* st = seq->private;
2125
2126	inet_listen_lock(&tcp_hashinfo);
2127	st->state = TCP_SEQ_STATE_LISTENING;
2128	rc	  = listening_get_idx(seq, &pos);
2129
2130	if (!rc) {
2131		inet_listen_unlock(&tcp_hashinfo);
2132		st->state = TCP_SEQ_STATE_ESTABLISHED;
2133		rc	  = established_get_idx(seq, pos);
2134	}
2135
2136	return rc;
2137}
2138
2139static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2140{
2141	struct tcp_iter_state* st = seq->private;
2142	st->state = TCP_SEQ_STATE_LISTENING;
2143	st->num = 0;
2144	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2145}
2146
2147static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2148{
2149	void *rc = NULL;
2150	struct tcp_iter_state* st;
2151
2152	if (v == SEQ_START_TOKEN) {
2153		rc = tcp_get_idx(seq, 0);
2154		goto out;
2155	}
2156	st = seq->private;
2157
2158	switch (st->state) {
2159	case TCP_SEQ_STATE_OPENREQ:
2160	case TCP_SEQ_STATE_LISTENING:
2161		rc = listening_get_next(seq, v);
2162		if (!rc) {
2163			inet_listen_unlock(&tcp_hashinfo);
2164			st->state = TCP_SEQ_STATE_ESTABLISHED;
2165			rc	  = established_get_first(seq);
2166		}
2167		break;
2168	case TCP_SEQ_STATE_ESTABLISHED:
2169	case TCP_SEQ_STATE_TIME_WAIT:
2170		rc = established_get_next(seq, v);
2171		break;
2172	}
2173out:
2174	++*pos;
2175	return rc;
2176}
2177
2178static void tcp_seq_stop(struct seq_file *seq, void *v)
2179{
2180	struct tcp_iter_state* st = seq->private;
2181
2182	switch (st->state) {
2183	case TCP_SEQ_STATE_OPENREQ:
2184		if (v) {
2185			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2186			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2187		}
2188	case TCP_SEQ_STATE_LISTENING:
2189		if (v != SEQ_START_TOKEN)
2190			inet_listen_unlock(&tcp_hashinfo);
2191		break;
2192	case TCP_SEQ_STATE_TIME_WAIT:
2193	case TCP_SEQ_STATE_ESTABLISHED:
2194		if (v)
2195			read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2196		break;
2197	}
2198}
2199
2200static int tcp_seq_open(struct inode *inode, struct file *file)
2201{
2202	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2203	struct seq_file *seq;
2204	struct tcp_iter_state *s;
2205	int rc;
2206
2207	if (unlikely(afinfo == NULL))
2208		return -EINVAL;
2209
2210	s = kzalloc(sizeof(*s), GFP_KERNEL);
2211	if (!s)
2212		return -ENOMEM;
2213	s->family		= afinfo->family;
2214	s->seq_ops.start	= tcp_seq_start;
2215	s->seq_ops.next		= tcp_seq_next;
2216	s->seq_ops.show		= afinfo->seq_show;
2217	s->seq_ops.stop		= tcp_seq_stop;
2218
2219	rc = seq_open(file, &s->seq_ops);
2220	if (rc)
2221		goto out_kfree;
2222	seq	     = file->private_data;
2223	seq->private = s;
2224out:
2225	return rc;
2226out_kfree:
2227	kfree(s);
2228	goto out;
2229}
2230
2231int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2232{
2233	int rc = 0;
2234	struct proc_dir_entry *p;
2235
2236	if (!afinfo)
2237		return -EINVAL;
2238	afinfo->seq_fops->owner		= afinfo->owner;
2239	afinfo->seq_fops->open		= tcp_seq_open;
2240	afinfo->seq_fops->read		= seq_read;
2241	afinfo->seq_fops->llseek	= seq_lseek;
2242	afinfo->seq_fops->release	= seq_release_private;
2243
2244	p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops);
2245	if (p)
2246		p->data = afinfo;
2247	else
2248		rc = -ENOMEM;
2249	return rc;
2250}
2251
2252void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2253{
2254	if (!afinfo)
2255		return;
2256	proc_net_remove(&init_net, afinfo->name);
2257	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2258}
2259
2260static void get_openreq4(struct sock *sk, struct request_sock *req,
2261			 char *tmpbuf, int i, int uid)
2262{
2263	const struct inet_request_sock *ireq = inet_rsk(req);
2264	int ttd = req->expires - jiffies;
2265
2266	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2267		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2268		i,
2269		ireq->loc_addr,
2270		ntohs(inet_sk(sk)->sport),
2271		ireq->rmt_addr,
2272		ntohs(ireq->rmt_port),
2273		TCP_SYN_RECV,
2274		0, 0, /* could print option size, but that is af dependent. */
2275		1,    /* timers active (only the expire timer) */
2276		jiffies_to_clock_t(ttd),
2277		req->retrans,
2278		uid,
2279		0,  /* non standard timer */
2280		0, /* open_requests have no inode */
2281		atomic_read(&sk->sk_refcnt),
2282		req);
2283}
2284
2285static void get_tcp4_sock(struct sock *sk, char *tmpbuf, int i)
2286{
2287	int timer_active;
2288	unsigned long timer_expires;
2289	struct tcp_sock *tp = tcp_sk(sk);
2290	const struct inet_connection_sock *icsk = inet_csk(sk);
2291	struct inet_sock *inet = inet_sk(sk);
2292	__be32 dest = inet->daddr;
2293	__be32 src = inet->rcv_saddr;
2294	__u16 destp = ntohs(inet->dport);
2295	__u16 srcp = ntohs(inet->sport);
2296
2297	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2298		timer_active	= 1;
2299		timer_expires	= icsk->icsk_timeout;
2300	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2301		timer_active	= 4;
2302		timer_expires	= icsk->icsk_timeout;
2303	} else if (timer_pending(&sk->sk_timer)) {
2304		timer_active	= 2;
2305		timer_expires	= sk->sk_timer.expires;
2306	} else {
2307		timer_active	= 0;
2308		timer_expires = jiffies;
2309	}
2310
2311	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2312			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
2313		i, src, srcp, dest, destp, sk->sk_state,
2314		tp->write_seq - tp->snd_una,
2315		sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2316					     (tp->rcv_nxt - tp->copied_seq),
2317		timer_active,
2318		jiffies_to_clock_t(timer_expires - jiffies),
2319		icsk->icsk_retransmits,
2320		sock_i_uid(sk),
2321		icsk->icsk_probes_out,
2322		sock_i_ino(sk),
2323		atomic_read(&sk->sk_refcnt), sk,
2324		icsk->icsk_rto,
2325		icsk->icsk_ack.ato,
2326		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2327		tp->snd_cwnd,
2328		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2329}
2330
2331static void get_timewait4_sock(struct inet_timewait_sock *tw,
2332			       char *tmpbuf, int i)
2333{
2334	__be32 dest, src;
2335	__u16 destp, srcp;
2336	int ttd = tw->tw_ttd - jiffies;
2337
2338	if (ttd < 0)
2339		ttd = 0;
2340
2341	dest  = tw->tw_daddr;
2342	src   = tw->tw_rcv_saddr;
2343	destp = ntohs(tw->tw_dport);
2344	srcp  = ntohs(tw->tw_sport);
2345
2346	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2347		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2348		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2349		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2350		atomic_read(&tw->tw_refcnt), tw);
2351}
2352
2353#define TMPSZ 150
2354
2355static int tcp4_seq_show(struct seq_file *seq, void *v)
2356{
2357	struct tcp_iter_state* st;
2358	char tmpbuf[TMPSZ + 1];
2359
2360	if (v == SEQ_START_TOKEN) {
2361		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2362			   "  sl  local_address rem_address   st tx_queue "
2363			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2364			   "inode");
2365		goto out;
2366	}
2367	st = seq->private;
2368
2369	switch (st->state) {
2370	case TCP_SEQ_STATE_LISTENING:
2371	case TCP_SEQ_STATE_ESTABLISHED:
2372		get_tcp4_sock(v, tmpbuf, st->num);
2373		break;
2374	case TCP_SEQ_STATE_OPENREQ:
2375		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2376		break;
2377	case TCP_SEQ_STATE_TIME_WAIT:
2378		get_timewait4_sock(v, tmpbuf, st->num);
2379		break;
2380	}
2381	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2382out:
2383	return 0;
2384}
2385
/*
 * Description of the IPv4 TCP proc file ("tcp", AF_INET) handed to
 * tcp_proc_register().  tcp4_seq_fops is left zero-initialised here;
 * tcp_proc_register() fills in its handlers.
 */
static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};
2394
/*
 * Register the IPv4 TCP proc file.  Returns the result of
 * tcp_proc_register(): 0 on success or a negative errno.
 */
int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}
2399
/* Remove the IPv4 TCP proc file registered by tcp4_proc_init(). */
void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
2404#endif /* CONFIG_PROC_FS */
2405
/*
 * NOTE(review): presumably declares the per-protocol "in use" accounting
 * that REF_PROTO_INUSE(tcp) hooks into tcp_prot below -- confirm against
 * the macro definitions.
 */
DEFINE_PROTO_INUSE(tcp)

/*
 * Protocol operations table for TCP over IPv4: wires the generic
 * struct proto hooks to the TCP implementations, and points at the TCP
 * accounting variables, sysctl limits, hash tables and the timewait and
 * request sock operations.
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.hashinfo		= &tcp_hashinfo,
#ifdef CONFIG_COMPAT
	/* compat set/getsockopt entry points, built only with CONFIG_COMPAT */
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	REF_PROTO_INUSE(tcp)
};
2445
2446void __init tcp_v4_init(struct net_proto_family *ops)
2447{
2448	if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW,
2449				     IPPROTO_TCP) < 0)
2450		panic("Failed to create the TCP control socket.\n");
2451}
2452
2453EXPORT_SYMBOL(ipv4_specific);
2454EXPORT_SYMBOL(tcp_hashinfo);
2455EXPORT_SYMBOL(tcp_prot);
2456EXPORT_SYMBOL(tcp_v4_conn_request);
2457EXPORT_SYMBOL(tcp_v4_connect);
2458EXPORT_SYMBOL(tcp_v4_do_rcv);
2459EXPORT_SYMBOL(tcp_v4_remember_stamp);
2460EXPORT_SYMBOL(tcp_v4_send_check);
2461EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2462
2463#ifdef CONFIG_PROC_FS
2464EXPORT_SYMBOL(tcp_proc_register);
2465EXPORT_SYMBOL(tcp_proc_unregister);
2466#endif
2467EXPORT_SYMBOL(sysctl_tcp_low_latency);
2468
Configure Feed

Configure Feed