net/ipv4/tcp.c at v6.5-rc5 · tjh.dev/kernel

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / net / ipv4 / tcp.c
at v6.5-rc5 4735 lines 128 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   4 *		operating system.  INET is implemented using the  BSD Socket
   5 *		interface as the means of communication with the user level.
   6 *
   7 *		Implementation of the Transmission Control Protocol(TCP).
   8 *
   9 * Authors:	Ross Biro
  10 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  11 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
  12 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
  13 *		Florian La Roche, <flla@stud.uni-sb.de>
  14 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  15 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
  16 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  17 *		Matthew Dillon, <dillon@apollo.west.oic.com>
  18 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  19 *		Jorge Cwik, <jorge@laser.satlink.net>
  20 *
  21 * Fixes:
  22 *		Alan Cox	:	Numerous verify_area() calls
  23 *		Alan Cox	:	Set the ACK bit on a reset
  24 *		Alan Cox	:	Stopped it crashing if it closed while
  25 *					sk->inuse=1 and was trying to connect
  26 *					(tcp_err()).
  27 *		Alan Cox	:	All icmp error handling was broken
  28 *					pointers passed where wrong and the
  29 *					socket was looked up backwards. Nobody
  30 *					tested any icmp error code obviously.
  31 *		Alan Cox	:	tcp_err() now handled properly. It
  32 *					wakes people on errors. poll
  33 *					behaves and the icmp error race
  34 *					has gone by moving it into sock.c
  35 *		Alan Cox	:	tcp_send_reset() fixed to work for
  36 *					everything not just packets for
  37 *					unknown sockets.
  38 *		Alan Cox	:	tcp option processing.
  39 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
  40 *					syn rule wrong]
  41 *		Herp Rosmanith  :	More reset fixes
  42 *		Alan Cox	:	No longer acks invalid rst frames.
  43 *					Acking any kind of RST is right out.
  44 *		Alan Cox	:	Sets an ignore me flag on an rst
  45 *					receive otherwise odd bits of prattle
  46 *					escape still
  47 *		Alan Cox	:	Fixed another acking RST frame bug.
  48 *					Should stop LAN workplace lockups.
  49 *		Alan Cox	: 	Some tidyups using the new skb list
  50 *					facilities
  51 *		Alan Cox	:	sk->keepopen now seems to work
  52 *		Alan Cox	:	Pulls options out correctly on accepts
  53 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
  54 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
  55 *					bit to skb ops.
  56 *		Alan Cox	:	Tidied tcp_data to avoid a potential
  57 *					nasty.
  58 *		Alan Cox	:	Added some better commenting, as the
  59 *					tcp is hard to follow
  60 *		Alan Cox	:	Removed incorrect check for 20 * psh
  61 *	Michael O'Reilly	:	ack < copied bug fix.
  62 *	Johannes Stille		:	Misc tcp fixes (not all in yet).
  63 *		Alan Cox	:	FIN with no memory -> CRASH
  64 *		Alan Cox	:	Added socket option proto entries.
  65 *					Also added awareness of them to accept.
  66 *		Alan Cox	:	Added TCP options (SOL_TCP)
  67 *		Alan Cox	:	Switched wakeup calls to callbacks,
  68 *					so the kernel can layer network
  69 *					sockets.
  70 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
  71 *		Alan Cox	:	Handle FIN (more) properly (we hope).
  72 *		Alan Cox	:	RST frames sent on unsynchronised
  73 *					state ack error.
  74 *		Alan Cox	:	Put in missing check for SYN bit.
  75 *		Alan Cox	:	Added tcp_select_window() aka NET2E
  76 *					window non shrink trick.
  77 *		Alan Cox	:	Added a couple of small NET2E timer
  78 *					fixes
  79 *		Charles Hedrick :	TCP fixes
  80 *		Toomas Tamm	:	TCP window fixes
  81 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
  82 *		Charles Hedrick	:	Rewrote most of it to actually work
  83 *		Linus		:	Rewrote tcp_read() and URG handling
  84 *					completely
  85 *		Gerhard Koerting:	Fixed some missing timer handling
  86 *		Matthew Dillon  :	Reworked TCP machine states as per RFC
  87 *		Gerhard Koerting:	PC/TCP workarounds
  88 *		Adam Caldwell	:	Assorted timer/timing errors
  89 *		Matthew Dillon	:	Fixed another RST bug
  90 *		Alan Cox	:	Move to kernel side addressing changes.
  91 *		Alan Cox	:	Beginning work on TCP fastpathing
  92 *					(not yet usable)
  93 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
  94 *		Alan Cox	:	TCP fast path debugging
  95 *		Alan Cox	:	Window clamping
  96 *		Michael Riepe	:	Bug in tcp_check()
  97 *		Matt Dillon	:	More TCP improvements and RST bug fixes
  98 *		Matt Dillon	:	Yet more small nasties remove from the
  99 *					TCP code (Be very nice to this man if
 100 *					tcp finally works 100%) 8)
 101 *		Alan Cox	:	BSD accept semantics.
 102 *		Alan Cox	:	Reset on closedown bug.
 103 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
 104 *		Michael Pall	:	Handle poll() after URG properly in
 105 *					all cases.
 106 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
 107 *					(multi URG PUSH broke rlogin).
 108 *		Michael Pall	:	Fix the multi URG PUSH problem in
 109 *					tcp_readable(), poll() after URG
 110 *					works now.
 111 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
 112 *					BSD api.
 113 *		Alan Cox	:	Changed the semantics of sk->socket to
 114 *					fix a race and a signal problem with
 115 *					accept() and async I/O.
 116 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
 117 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
 118 *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
 119 *					clients/servers which listen in on
 120 *					fixed ports.
 121 *		Alan Cox	:	Cleaned the above up and shrank it to
 122 *					a sensible code size.
 123 *		Alan Cox	:	Self connect lockup fix.
 124 *		Alan Cox	:	No connect to multicast.
 125 *		Ross Biro	:	Close unaccepted children on master
 126 *					socket close.
 127 *		Alan Cox	:	Reset tracing code.
 128 *		Alan Cox	:	Spurious resets on shutdown.
 129 *		Alan Cox	:	Giant 15 minute/60 second timer error
 130 *		Alan Cox	:	Small whoops in polling before an
 131 *					accept.
 132 *		Alan Cox	:	Kept the state trace facility since
 133 *					it's handy for debugging.
 134 *		Alan Cox	:	More reset handler fixes.
 135 *		Alan Cox	:	Started rewriting the code based on
 136 *					the RFC's for other useful protocol
 137 *					references see: Comer, KA9Q NOS, and
 138 *					for a reference on the difference
 139 *					between specifications and how BSD
 140 *					works see the 4.4lite source.
 141 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
 142 *					close.
 143 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
 144 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
 145 *		Alan Cox	:	Reimplemented timers as per the RFC
 146 *					and using multiple timers for sanity.
 147 *		Alan Cox	:	Small bug fixes, and a lot of new
 148 *					comments.
 149 *		Alan Cox	:	Fixed dual reader crash by locking
 150 *					the buffers (much like datagram.c)
 151 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
 152 *					now gets fed up of retrying without
 153 *					(even a no space) answer.
 154 *		Alan Cox	:	Extracted closing code better
 155 *		Alan Cox	:	Fixed the closing state machine to
 156 *					resemble the RFC.
 157 *		Alan Cox	:	More 'per spec' fixes.
 158 *		Jorge Cwik	:	Even faster checksumming.
 159 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
 160 *					only frames. At least one pc tcp stack
 161 *					generates them.
 162 *		Alan Cox	:	Cache last socket.
 163 *		Alan Cox	:	Per route irtt.
 164 *		Matt Day	:	poll()->select() match BSD precisely on error
 165 *		Alan Cox	:	New buffers
 166 *		Marc Tamsky	:	Various sk->prot->retransmits and
 167 *					sk->retransmits misupdating fixed.
 168 *					Fixed tcp_write_timeout: stuck close,
 169 *					and TCP syn retries gets used now.
 170 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
 171 *					ack if state is TCP_CLOSED.
 172 *		Alan Cox	:	Look up device on a retransmit - routes may
 173 *					change. Doesn't yet cope with MSS shrink right
 174 *					but it's a start!
 175 *		Marc Tamsky	:	Closing in closing fixes.
 176 *		Mike Shaver	:	RFC1122 verifications.
 177 *		Alan Cox	:	rcv_saddr errors.
 178 *		Alan Cox	:	Block double connect().
 179 *		Alan Cox	:	Small hooks for enSKIP.
 180 *		Alexey Kuznetsov:	Path MTU discovery.
 181 *		Alan Cox	:	Support soft errors.
 182 *		Alan Cox	:	Fix MTU discovery pathological case
 183 *					when the remote claims no mtu!
 184 *		Marc Tamsky	:	TCP_CLOSE fix.
 185 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
 186 *					window but wrong (fixes NT lpd problems)
 187 *		Pedro Roque	:	Better TCP window handling, delayed ack.
 188 *		Joerg Reuter	:	No modification of locked buffers in
 189 *					tcp_do_retransmit()
 190 *		Eric Schenk	:	Changed receiver side silly window
 191 *					avoidance algorithm to BSD style
 192 *					algorithm. This doubles throughput
 193 *					against machines running Solaris,
 194 *					and seems to result in general
 195 *					improvement.
 196 *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
 197 *	Willy Konynenberg	:	Transparent proxying support.
 198 *	Mike McLagan		:	Routing by source
 199 *		Keith Owens	:	Do proper merging with partial SKB's in
 200 *					tcp_do_sendmsg to avoid burstiness.
 201 *		Eric Schenk	:	Fix fast close down bug with
 202 *					shutdown() followed by close().
 203 *		Andi Kleen 	:	Make poll agree with SIGIO
 204 *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
 205 *					lingertime == 0 (RFC 793 ABORT Call)
 206 *	Hirokazu Takahashi	:	Use copy_from_user() instead of
 207 *					csum_and_copy_from_user() if possible.
 208 *
 209 * Description of States:
 210 *
 211 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 212 *
 213 *	TCP_SYN_RECV		received a connection request, sent ack,
 214 *				waiting for final ack in three-way handshake.
 215 *
 216 *	TCP_ESTABLISHED		connection established
 217 *
 218 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 219 *				transmission of remaining buffered data
 220 *
 221 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 222 *				to shutdown
 223 *
 224 *	TCP_CLOSING		both sides have shutdown but we still have
 225 *				data we have to finish sending
 226 *
 227 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 228 *				closed, can only be entered from FIN_WAIT2
 229 *				or CLOSING.  Required because the other end
 230 *				may not have gotten our last ACK causing it
 231 *				to retransmit the data packet (which we ignore)
 232 *
 233 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 234 *				us to finish writing our data and to shutdown
 235 *				(we have to close() to move on to LAST_ACK)
 236 *
 237 *	TCP_LAST_ACK		our side has shutdown after remote has
 238 *				shutdown.  There may still be data in our
 239 *				buffer that we have to finish sending
 240 *
 241 *	TCP_CLOSE		socket is finished
 242 */
 243
 244#define pr_fmt(fmt) "TCP: " fmt
 245
 246#include <crypto/hash.h>
 247#include <linux/kernel.h>
 248#include <linux/module.h>
 249#include <linux/types.h>
 250#include <linux/fcntl.h>
 251#include <linux/poll.h>
 252#include <linux/inet_diag.h>
 253#include <linux/init.h>
 254#include <linux/fs.h>
 255#include <linux/skbuff.h>
 256#include <linux/scatterlist.h>
 257#include <linux/splice.h>
 258#include <linux/net.h>
 259#include <linux/socket.h>
 260#include <linux/random.h>
 261#include <linux/memblock.h>
 262#include <linux/highmem.h>
 263#include <linux/cache.h>
 264#include <linux/err.h>
 265#include <linux/time.h>
 266#include <linux/slab.h>
 267#include <linux/errqueue.h>
 268#include <linux/static_key.h>
 269#include <linux/btf.h>
 270
 271#include <net/icmp.h>
 272#include <net/inet_common.h>
 273#include <net/tcp.h>
 274#include <net/mptcp.h>
 275#include <net/xfrm.h>
 276#include <net/ip.h>
 277#include <net/sock.h>
 278
 279#include <linux/uaccess.h>
 280#include <asm/ioctls.h>
 281#include <net/busy_poll.h>
 282
 283/* Track pending CMSGs. */
 284enum {
 285	TCP_CMSG_INQ = 1,
 286	TCP_CMSG_TS = 2
 287};
 288
 289DEFINE_PER_CPU(unsigned int, tcp_orphan_count);	/* per-CPU counter; NOTE(review): presumably orphaned sockets - confirm */
 290EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
 291
 292long sysctl_tcp_mem[3] __read_mostly;	/* NOTE(review): presumably the three tcp_mem sysctl thresholds - confirm */
 293EXPORT_SYMBOL(sysctl_tcp_mem);
 294
 295atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;	/* Current allocated memory. */
 296EXPORT_SYMBOL(tcp_memory_allocated);
 297DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);	/* per-CPU companion of tcp_memory_allocated; NOTE(review): exact role not visible here */
 298EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);
 299
 300#if IS_ENABLED(CONFIG_SMC)
 301DEFINE_STATIC_KEY_FALSE(tcp_have_smc);	/* static key, initially false; only defined when CONFIG_SMC is enabled */
 302EXPORT_SYMBOL(tcp_have_smc);
 303#endif
 304
 305/*
 306 * Current number of TCP sockets.
 307 */
 308struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
 309EXPORT_SYMBOL(tcp_sockets_allocated);
 310
 311/*
 312 * TCP splice context (set up by tcp_splice_read(), read by tcp_splice_data_recv())
 313 */
 314struct tcp_splice_state {
 315	struct pipe_inode_info *pipe;	/* pipe the socket data is spliced into */
 316	size_t len;			/* bytes still to splice; decremented as data moves */
 317	unsigned int flags;		/* splice modifier flags passed to skb_splice_bits() */
 318};
 319
 320/*
 321 * Pressure flag: try to collapse.
 322 * Technical note: it is used by multiple contexts non atomically.
 323 * All the __sk_mem_schedule() is of this nature: accounting
 324 * is strict, actions are advisory and have some latency.
 325 */
 326unsigned long tcp_memory_pressure __read_mostly;	/* 0 = no pressure, otherwise the jiffies stamp stored when pressure began */
 327EXPORT_SYMBOL_GPL(tcp_memory_pressure);
 328
 329void tcp_enter_memory_pressure(struct sock *sk)
 330{
 331	unsigned long val;
 332
 333	if (READ_ONCE(tcp_memory_pressure))
 334		return;
 335	val = jiffies;
 336
 337	if (!val)
 338		val--;
 339	if (!cmpxchg(&tcp_memory_pressure, 0, val))
 340		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
 341}
 342EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);
 343
 344void tcp_leave_memory_pressure(struct sock *sk)
 345{
 346	unsigned long val;
 347
 348	if (!READ_ONCE(tcp_memory_pressure))
 349		return;
 350	val = xchg(&tcp_memory_pressure, 0);
 351	if (val)
 352		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
 353			      jiffies_to_msecs(jiffies - val));
 354}
 355EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
 356
 357/* Convert seconds to retransmits based on initial and max timeout */
 358static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
 359{
 360	u8 res = 0;
 361
 362	if (seconds > 0) {
 363		int period = timeout;
 364
 365		res = 1;
 366		while (seconds > period && res < 255) {
 367			res++;
 368			timeout <<= 1;
 369			if (timeout > rto_max)
 370				timeout = rto_max;
 371			period += timeout;
 372		}
 373	}
 374	return res;
 375}
 376
 377/* Convert retransmits to seconds based on initial and max timeout */
 378static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
 379{
 380	int period = 0;
 381
 382	if (retrans > 0) {
 383		period = timeout;
 384		while (--retrans) {
 385			timeout <<= 1;
 386			if (timeout > rto_max)
 387				timeout = rto_max;
 388			period += timeout;
 389		}
 390	}
 391	return period;
 392}
 393
 394static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
 395{
 396	u32 rate = READ_ONCE(tp->rate_delivered);
 397	u32 intv = READ_ONCE(tp->rate_interval_us);
 398	u64 rate64 = 0;
 399
 400	if (rate && intv) {
 401		rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
 402		do_div(rate64, intv);
 403	}
 404	return rate64;
 405}
 406
 407/* Address-family independent initialization for a tcp_sock.
 408 *
 409 * NOTE: A lot of things set to zero explicitly by call to
 410 *       sk_alloc() so need not be done here.
 411 */
 412void tcp_init_sock(struct sock *sk)
 413{
 414	struct inet_connection_sock *icsk = inet_csk(sk);
 415	struct tcp_sock *tp = tcp_sk(sk);
 416
 417	tp->out_of_order_queue = RB_ROOT;
 418	sk->tcp_rtx_queue = RB_ROOT;
 419	tcp_init_xmit_timers(sk);
 420	INIT_LIST_HEAD(&tp->tsq_node);
 421	INIT_LIST_HEAD(&tp->tsorted_sent_queue);
 422
 423	icsk->icsk_rto = TCP_TIMEOUT_INIT;
 424	icsk->icsk_rto_min = TCP_RTO_MIN;
 425	icsk->icsk_delack_max = TCP_DELACK_MAX;
 426	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
 427	minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
 428
 429	/* So many TCP implementations out there (incorrectly) count the
 430	 * initial SYN frame in their delayed-ACK and congestion control
 431	 * algorithms that we must have the following bandaid to talk
 432	 * efficiently to them.  -DaveM
 433	 */
 434	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
 435
 436	/* There's a bubble in the pipe until at least the first ACK. */
 437	tp->app_limited = ~0U;
 438	tp->rate_app_limited = 1;
 439
 440	/* See draft-stevens-tcpca-spec-01 for discussion of the
 441	 * initialization of these values.
 442	 */
 443	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 444	tp->snd_cwnd_clamp = ~0;
 445	tp->mss_cache = TCP_MSS_DEFAULT;
 446
 447	tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
 448	tcp_assign_congestion_control(sk);
 449
 450	tp->tsoffset = 0;
 451	tp->rack.reo_wnd_steps = 1;
 452
 453	sk->sk_write_space = sk_stream_write_space;
 454	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 455
 456	icsk->icsk_sync_mss = tcp_sync_mss;
 457
 458	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
 459	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
 460
 461	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
 462	sk_sockets_allocated_inc(sk);
 463}
 464EXPORT_SYMBOL(tcp_init_sock);
 465
 466static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
 467{
 468	struct sk_buff *skb = tcp_write_queue_tail(sk);
 469
 470	if (tsflags && skb) {
 471		struct skb_shared_info *shinfo = skb_shinfo(skb);
 472		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 473
 474		sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
 475		if (tsflags & SOF_TIMESTAMPING_TX_ACK)
 476			tcb->txstamp_ack = 1;
 477		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
 478			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
 479	}
 480}
 481
 482static bool tcp_stream_is_readable(struct sock *sk, int target)
 483{
 484	if (tcp_epollin_ready(sk, target))
 485		return true;
 486	return sk_is_readable(sk);
 487}
 488
 489/*
 490 *	Wait for a TCP event.
 491 *
 492 *	Note that we don't need to lock the socket, as the upper poll layers
 493 *	take care of normal races (between the test and the event) and we don't
 494 *	go look at any of the socket buffers directly.
 495 */
 496__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 497{
 498	__poll_t mask;
 499	struct sock *sk = sock->sk;
 500	const struct tcp_sock *tp = tcp_sk(sk);
 501	u8 shutdown;
 502	int state;
 503
 504	sock_poll_wait(file, sock, wait);
 505
 506	state = inet_sk_state_load(sk);
 507	if (state == TCP_LISTEN)
 508		return inet_csk_listen_poll(sk);
 509
 510	/* Socket is not locked. We are protected from async events
 511	 * by poll logic and correct handling of state changes
 512	 * made by other threads is impossible in any case.
 513	 */
 514
 515	mask = 0;
 516
 517	/*
 518	 * EPOLLHUP is certainly not done right. But poll() doesn't
 519	 * have a notion of HUP in just one direction, and for a
 520	 * socket the read side is more interesting.
 521	 *
 522	 * Some poll() documentation says that EPOLLHUP is incompatible
 523	 * with the EPOLLOUT/POLLWR flags, so somebody should check this
 524	 * all. But careful, it tends to be safer to return too many
 525	 * bits than too few, and you can easily break real applications
 526	 * if you don't tell them that something has hung up!
 527	 *
 528	 * Check-me.
 529	 *
 530	 * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and
 531	 * our fs/select.c). It means that after we received EOF,
 532	 * poll always returns immediately, making impossible poll() on write()
 533	 * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP
 534	 * if and only if shutdown has been made in both directions.
 535	 * Actually, it is interesting to look how Solaris and DUX
 536	 * solve this dilemma. I would prefer, if EPOLLHUP were maskable,
 537	 * then we could set it on SND_SHUTDOWN. BTW examples given
 538	 * in Stevens' books assume exactly this behaviour, it explains
 539	 * why EPOLLHUP is incompatible with EPOLLOUT.	--ANK
 540	 *
 541	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
 542	 * blocking on fresh not-connected or disconnected socket. --ANK
 543	 */
 544	shutdown = READ_ONCE(sk->sk_shutdown);
 545	if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
 546		mask |= EPOLLHUP;
 547	if (shutdown & RCV_SHUTDOWN)
 548		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
 549
 550	/* Connected or passive Fast Open socket? */
 551	if (state != TCP_SYN_SENT &&
 552	    (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
 553		int target = sock_rcvlowat(sk, 0, INT_MAX);
 554		u16 urg_data = READ_ONCE(tp->urg_data);
 555
 556		if (unlikely(urg_data) &&
 557		    READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
 558		    !sock_flag(sk, SOCK_URGINLINE))
 559			target++;
 560
 561		if (tcp_stream_is_readable(sk, target))
 562			mask |= EPOLLIN | EPOLLRDNORM;
 563
 564		if (!(shutdown & SEND_SHUTDOWN)) {
 565			if (__sk_stream_is_writeable(sk, 1)) {
 566				mask |= EPOLLOUT | EPOLLWRNORM;
 567			} else {  /* send SIGIO later */
 568				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 569				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 570
 571				/* Race breaker. If space is freed after
 572				 * wspace test but before the flags are set,
 573				 * IO signal will be lost. Memory barrier
 574				 * pairs with the input side.
 575				 */
 576				smp_mb__after_atomic();
 577				if (__sk_stream_is_writeable(sk, 1))
 578					mask |= EPOLLOUT | EPOLLWRNORM;
 579			}
 580		} else
 581			mask |= EPOLLOUT | EPOLLWRNORM;
 582
 583		if (urg_data & TCP_URG_VALID)
 584			mask |= EPOLLPRI;
 585	} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
 586		/* Active TCP fastopen socket with defer_connect
 587		 * Return EPOLLOUT so application can call write()
 588		 * in order for kernel to generate SYN+data
 589		 */
 590		mask |= EPOLLOUT | EPOLLWRNORM;
 591	}
 592	/* This barrier is coupled with smp_wmb() in tcp_reset() */
 593	smp_rmb();
 594	if (READ_ONCE(sk->sk_err) ||
 595	    !skb_queue_empty_lockless(&sk->sk_error_queue))
 596		mask |= EPOLLERR;
 597
 598	return mask;
 599}
 600EXPORT_SYMBOL(tcp_poll);
 601
 602int tcp_ioctl(struct sock *sk, int cmd, int *karg)
 603{
 604	struct tcp_sock *tp = tcp_sk(sk);
 605	int answ;
 606	bool slow;
 607
 608	switch (cmd) {
 609	case SIOCINQ:
 610		if (sk->sk_state == TCP_LISTEN)
 611			return -EINVAL;
 612
 613		slow = lock_sock_fast(sk);
 614		answ = tcp_inq(sk);
 615		unlock_sock_fast(sk, slow);
 616		break;
 617	case SIOCATMARK:
 618		answ = READ_ONCE(tp->urg_data) &&
 619		       READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
 620		break;
 621	case SIOCOUTQ:
 622		if (sk->sk_state == TCP_LISTEN)
 623			return -EINVAL;
 624
 625		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 626			answ = 0;
 627		else
 628			answ = READ_ONCE(tp->write_seq) - tp->snd_una;
 629		break;
 630	case SIOCOUTQNSD:
 631		if (sk->sk_state == TCP_LISTEN)
 632			return -EINVAL;
 633
 634		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 635			answ = 0;
 636		else
 637			answ = READ_ONCE(tp->write_seq) -
 638			       READ_ONCE(tp->snd_nxt);
 639		break;
 640	default:
 641		return -ENOIOCTLCMD;
 642	}
 643
 644	*karg = answ;
 645	return 0;
 646}
 647EXPORT_SYMBOL(tcp_ioctl);
 648
 649void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
 650{
 651	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
 652	tp->pushed_seq = tp->write_seq;
 653}
 654
 655static inline bool forced_push(const struct tcp_sock *tp)
 656{
 657	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
 658}
 659
 660void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
 661{
 662	struct tcp_sock *tp = tcp_sk(sk);
 663	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 664
 665	tcb->seq     = tcb->end_seq = tp->write_seq;
 666	tcb->tcp_flags = TCPHDR_ACK;
 667	__skb_header_release(skb);
 668	tcp_add_write_queue_tail(sk, skb);
 669	sk_wmem_queued_add(sk, skb->truesize);
 670	sk_mem_charge(sk, skb->truesize);
 671	if (tp->nonagle & TCP_NAGLE_PUSH)
 672		tp->nonagle &= ~TCP_NAGLE_PUSH;
 673
 674	tcp_slow_start_after_idle_check(sk);
 675}
 676
 677static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
 678{
 679	if (flags & MSG_OOB)
 680		tp->snd_up = tp->write_seq;
 681}
 682
 683/* If a not yet filled skb is pushed, do not send it if
 684 * we have data packets in Qdisc or NIC queues :
 685 * Because TX completion will happen shortly, it gives a chance
 686 * to coalesce future sendmsg() payload into this skb, without
 687 * need for a timer, and with no latency trade off.
 688 * As packets containing data payload have a bigger truesize
 689 * than pure acks (dataless) packets, the last checks prevent
 690 * autocorking if we only have an ACK in Qdisc/NIC queues,
 691 * or if TX completion was delayed after we processed ACK packet.
 692 */
 693static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
 694				int size_goal)
 695{
 696	return skb->len < size_goal &&
 697	       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
 698	       !tcp_rtx_queue_empty(sk) &&
 699	       refcount_read(&sk->sk_wmem_alloc) > skb->truesize &&
 700	       tcp_skb_can_collapse_to(skb);
 701}
 702
 703void tcp_push(struct sock *sk, int flags, int mss_now,
 704	      int nonagle, int size_goal)
 705{
 706	struct tcp_sock *tp = tcp_sk(sk);
 707	struct sk_buff *skb;
 708
 709	skb = tcp_write_queue_tail(sk);
 710	if (!skb)
 711		return;
 712	if (!(flags & MSG_MORE) || forced_push(tp))
 713		tcp_mark_push(tp, skb);
 714
 715	tcp_mark_urg(tp, flags);
 716
 717	if (tcp_should_autocork(sk, skb, size_goal)) {
 718
 719		/* avoid atomic op if TSQ_THROTTLED bit is already set */
 720		if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
 721			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
 722			set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
 723		}
 724		/* It is possible TX completion already happened
 725		 * before we set TSQ_THROTTLED.
 726		 */
 727		if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
 728			return;
 729	}
 730
 731	if (flags & MSG_MORE)
 732		nonagle = TCP_NAGLE_CORK;
 733
 734	__tcp_push_pending_frames(sk, mss_now, nonagle);
 735}
 736
 737static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
 738				unsigned int offset, size_t len)
 739{
 740	struct tcp_splice_state *tss = rd_desc->arg.data;
 741	int ret;
 742
 743	ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
 744			      min(rd_desc->count, len), tss->flags);
 745	if (ret > 0)
 746		rd_desc->count -= ret;
 747	return ret;
 748}
 749
 750static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
 751{
 752	/* Store TCP splice context information in read_descriptor_t. */
 753	read_descriptor_t rd_desc = {
 754		.arg.data = tss,
 755		.count	  = tss->len,
 756	};
 757
 758	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
 759}
 760
 761/**
 762 *  tcp_splice_read - splice data from TCP socket to a pipe
 763 * @sock:	socket to splice from
 764 * @ppos:	position (not valid)
 765 * @pipe:	pipe to splice to
 766 * @len:	number of bytes to splice
 767 * @flags:	splice modifier flags
 768 *
 769 * Description:
 770 *    Will read pages from given socket and fill them into a pipe.
 771 *
 772 **/
 773ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 774			struct pipe_inode_info *pipe, size_t len,
 775			unsigned int flags)
 776{
 777	struct sock *sk = sock->sk;
 778	struct tcp_splice_state tss = {
 779		.pipe = pipe,
 780		.len = len,
 781		.flags = flags,
 782	};
 783	long timeo;
 784	ssize_t spliced;
 785	int ret;
 786
 787	sock_rps_record_flow(sk);
 788	/*
 789	 * We can't seek on a socket input
 790	 */
 791	if (unlikely(*ppos))
 792		return -ESPIPE;
 793
 794	ret = spliced = 0;
 795
 796	lock_sock(sk);
 797
 798	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
 799	while (tss.len) {
 800		ret = __tcp_splice_read(sk, &tss);
 801		if (ret < 0)
 802			break;
 803		else if (!ret) {
 804			if (spliced)
 805				break;
 806			if (sock_flag(sk, SOCK_DONE))
 807				break;
 808			if (sk->sk_err) {
 809				ret = sock_error(sk);
 810				break;
 811			}
 812			if (sk->sk_shutdown & RCV_SHUTDOWN)
 813				break;
 814			if (sk->sk_state == TCP_CLOSE) {
 815				/*
 816				 * This occurs when user tries to read
 817				 * from never connected socket.
 818				 */
 819				ret = -ENOTCONN;
 820				break;
 821			}
 822			if (!timeo) {
 823				ret = -EAGAIN;
 824				break;
 825			}
 826			/* if __tcp_splice_read() got nothing while we have
 827			 * an skb in receive queue, we do not want to loop.
 828			 * This might happen with URG data.
 829			 */
 830			if (!skb_queue_empty(&sk->sk_receive_queue))
 831				break;
 832			sk_wait_data(sk, &timeo, NULL);
 833			if (signal_pending(current)) {
 834				ret = sock_intr_errno(timeo);
 835				break;
 836			}
 837			continue;
 838		}
 839		tss.len -= ret;
 840		spliced += ret;
 841
 842		if (!tss.len || !timeo)
 843			break;
 844		release_sock(sk);
 845		lock_sock(sk);
 846
 847		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
 848		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
 849		    signal_pending(current))
 850			break;
 851	}
 852
 853	release_sock(sk);
 854
 855	if (spliced)
 856		return spliced;
 857
 858	return ret;
 859}
 860EXPORT_SYMBOL(tcp_splice_read);
 861
 862struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
 863				     bool force_schedule)
 864{
 865	struct sk_buff *skb;
 866
 867	skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
 868	if (likely(skb)) {
 869		bool mem_scheduled;
 870
 871		skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 872		if (force_schedule) {
 873			mem_scheduled = true;
 874			sk_forced_mem_schedule(sk, skb->truesize);
 875		} else {
 876			mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
 877		}
 878		if (likely(mem_scheduled)) {
 879			skb_reserve(skb, MAX_TCP_HEADER);
 880			skb->ip_summed = CHECKSUM_PARTIAL;
 881			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
 882			return skb;
 883		}
 884		__kfree_skb(skb);
 885	} else {
 886		sk->sk_prot->enter_memory_pressure(sk);
 887		sk_stream_moderate_sndbuf(sk);
 888	}
 889	return NULL;
 890}
 891
 892static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 893				       int large_allowed)
 894{
 895	struct tcp_sock *tp = tcp_sk(sk);
 896	u32 new_size_goal, size_goal;
 897
 898	if (!large_allowed)
 899		return mss_now;
 900
 901	/* Note : tcp_tso_autosize() will eventually split this later */
 902	new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size);
 903
 904	/* We try hard to avoid divides here */
 905	size_goal = tp->gso_segs * mss_now;
 906	if (unlikely(new_size_goal < size_goal ||
 907		     new_size_goal >= size_goal + mss_now)) {
 908		tp->gso_segs = min_t(u16, new_size_goal / mss_now,
 909				     sk->sk_gso_max_segs);
 910		size_goal = tp->gso_segs * mss_now;
 911	}
 912
 913	return max(size_goal, mss_now);
 914}
 915
 916int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
 917{
 918	int mss_now;
 919
 920	mss_now = tcp_current_mss(sk);
 921	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
 922
 923	return mss_now;
 924}
 925
 926/* In some cases, both sendmsg() could have added an skb to the write queue,
 927 * but failed adding payload on it.  We need to remove it to consume less
 928 * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger
 929 * epoll() users.
 930 */
 931void tcp_remove_empty_skb(struct sock *sk)
 932{
 933	struct sk_buff *skb = tcp_write_queue_tail(sk);
 934
 935	if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
 936		tcp_unlink_write_queue(skb, sk);
 937		if (tcp_write_queue_empty(sk))
 938			tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
 939		tcp_wmem_free_skb(sk, skb);
 940	}
 941}
 942
 943/* skb changing from pure zc to mixed, must charge zc */
 944static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
 945{
 946	if (unlikely(skb_zcopy_pure(skb))) {
 947		u32 extra = skb->truesize -
 948			    SKB_TRUESIZE(skb_end_offset(skb));
 949
 950		if (!sk_wmem_schedule(sk, extra))
 951			return -ENOMEM;
 952
 953		sk_mem_charge(sk, extra);
 954		skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
 955	}
 956	return 0;
 957}
 958
 959
 960int tcp_wmem_schedule(struct sock *sk, int copy)
 961{
 962	int left;
 963
 964	if (likely(sk_wmem_schedule(sk, copy)))
 965		return copy;
 966
 967	/* We could be in trouble if we have nothing queued.
 968	 * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0]
 969	 * to guarantee some progress.
 970	 */
 971	left = sock_net(sk)->ipv4.sysctl_tcp_wmem[0] - sk->sk_wmem_queued;
 972	if (left > 0)
 973		sk_forced_mem_schedule(sk, min(left, copy));
 974	return min(copy, sk->sk_forward_alloc);
 975}
 976
 977void tcp_free_fastopen_req(struct tcp_sock *tp)
 978{
 979	if (tp->fastopen_req) {
 980		kfree(tp->fastopen_req);
 981		tp->fastopen_req = NULL;
 982	}
 983}
 984
 985int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
 986			 size_t size, struct ubuf_info *uarg)
 987{
 988	struct tcp_sock *tp = tcp_sk(sk);
 989	struct inet_sock *inet = inet_sk(sk);
 990	struct sockaddr *uaddr = msg->msg_name;
 991	int err, flags;
 992
 993	if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
 994	      TFO_CLIENT_ENABLE) ||
 995	    (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
 996	     uaddr->sa_family == AF_UNSPEC))
 997		return -EOPNOTSUPP;
 998	if (tp->fastopen_req)
 999		return -EALREADY; /* Another Fast Open is in progress */
1000
1001	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1002				   sk->sk_allocation);
1003	if (unlikely(!tp->fastopen_req))
1004		return -ENOBUFS;
1005	tp->fastopen_req->data = msg;
1006	tp->fastopen_req->size = size;
1007	tp->fastopen_req->uarg = uarg;
1008
1009	if (inet->defer_connect) {
1010		err = tcp_connect(sk);
1011		/* Same failure procedure as in tcp_v4/6_connect */
1012		if (err) {
1013			tcp_set_state(sk, TCP_CLOSE);
1014			inet->inet_dport = 0;
1015			sk->sk_route_caps = 0;
1016		}
1017	}
1018	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1019	err = __inet_stream_connect(sk->sk_socket, uaddr,
1020				    msg->msg_namelen, flags, 1);
1021	/* fastopen_req could already be freed in __inet_stream_connect
1022	 * if the connection times out or gets rst
1023	 */
1024	if (tp->fastopen_req) {
1025		*copied = tp->fastopen_req->copied;
1026		tcp_free_fastopen_req(tp);
1027		inet->defer_connect = 0;
1028	}
1029	return err;
1030}
1031
/*
 * tcp_sendmsg_locked - append the payload of @msg to @sk's write queue.
 * @sk:   sending socket; tcp_sendmsg() calls this between lock_sock() and
 *        release_sock()
 * @msg:  message header; msg_flags, msg_iter, msg_ubuf and control data are
 *        all consulted
 * @size: number of bytes the caller wants to send
 *
 * Data is appended to the tail skb of the write queue (or to freshly
 * allocated skbs) in one of three modes selected up front: plain copy
 * (zc == 0), MSG_ZEROCOPY, or MSG_SPLICE_PAGES.
 *
 * Returns copied + copied_syn when at least one byte was queued, otherwise
 * the negative error produced by sk_stream_error().
 */
 1032int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 1033{
 1034	struct tcp_sock *tp = tcp_sk(sk);
 1035	struct ubuf_info *uarg = NULL;
 1036	struct sk_buff *skb;
 1037	struct sockcm_cookie sockc;
 1038	int flags, err, copied = 0;
 1039	int mss_now = 0, size_goal, copied_syn = 0;
 1040	int process_backlog = 0;
 1041	int zc = 0;
 1042	long timeo;
 1043
 1044	flags = msg->msg_flags;
 1045
	/* Select the payload mode.  zc is only set to a non-copy mode when
	 * the route advertises NETIF_F_SG; otherwise it stays 0 (copy).
	 */
 1046	if ((flags & MSG_ZEROCOPY) && size) {
 1047		if (msg->msg_ubuf) {
 1048			uarg = msg->msg_ubuf;
 1049			if (sk->sk_route_caps & NETIF_F_SG)
 1050				zc = MSG_ZEROCOPY;
 1051		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
 1052			skb = tcp_write_queue_tail(sk);
 1053			uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
 1054			if (!uarg) {
 1055				err = -ENOBUFS;
 1056				goto out_err;
 1057			}
 1058			if (sk->sk_route_caps & NETIF_F_SG)
 1059				zc = MSG_ZEROCOPY;
 1060			else
 1061				uarg_to_msgzc(uarg)->zerocopy = 0;
 1062		}
 1063	} else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
 1064		if (sk->sk_route_caps & NETIF_F_SG)
 1065			zc = MSG_SPLICE_PAGES;
 1066	}
 1067
	/* Fast Open / deferred connect (skipped in repair mode).
	 * copied_syn receives the byte count reported by
	 * tcp_sendmsg_fastopen().  On the -EINPROGRESS shortcut "copied" is
	 * still 0, so the push at "out" is skipped.
	 */
 1068	if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
 1069	    !tp->repair) {
 1070		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
 1071		if (err == -EINPROGRESS && copied_syn > 0)
 1072			goto out;
 1073		else if (err)
 1074			goto out_err;
 1075	}
 1076
 1077	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 1078
 1079	tcp_rate_check_app_limited(sk);  /* is sending application-limited? */
 1080
 1081	/* Wait for a connection to finish. One exception is TCP Fast Open
 1082	 * (passive side) where data is allowed to be sent before a connection
 1083	 * is fully established.
 1084	 */
 1085	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
 1086	    !tcp_passive_fastopen(sk)) {
 1087		err = sk_stream_wait_connect(sk, &timeo);
 1088		if (err != 0)
 1089			goto do_error;
 1090	}
 1091
	/* Repair mode: TCP_RECV_QUEUE is handled entirely by tcp_send_rcvq(),
	 * TCP_NO_QUEUE is rejected, anything else falls through to the
	 * normal send path below.
	 */
 1092	if (unlikely(tp->repair)) {
 1093		if (tp->repair_queue == TCP_RECV_QUEUE) {
 1094			copied = tcp_send_rcvq(sk, msg, size);
 1095			goto out_nopush;
 1096		}
 1097
 1098		err = -EINVAL;
 1099		if (tp->repair_queue == TCP_NO_QUEUE)
 1100			goto out_err;
 1101
 1102		/* 'common' sending to sendq */
 1103	}
 1104
 1105	sockcm_init(&sockc, sk);
 1106	if (msg->msg_controllen) {
 1107		err = sock_cmsg_send(sk, msg, &sockc);
		/* Any control-message failure is reported as -EINVAL. */
 1108		if (unlikely(err)) {
 1109			err = -EINVAL;
 1110			goto out_err;
 1111		}
 1112	}
 1113
 1114	/* This should be in poll */
 1115	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 1116
 1117	/* Ok commence sending. */
 1118	copied = 0;
 1119
	/* Re-entered when sk_flush_backlog() reports that it did work, so
	 * that mss_now/size_goal and the error/shutdown state are refreshed.
	 */
 1120restart:
 1121	mss_now = tcp_send_mss(sk, &size_goal, flags);
 1122
 1123	err = -EPIPE;
 1124	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 1125		goto do_error;
 1126
 1127	while (msg_data_left(msg)) {
 1128		ssize_t copy = 0;
 1129
		/* copy = room left in the tail skb relative to size_goal;
		 * stays 0 when the write queue is empty.
		 */
 1130		skb = tcp_write_queue_tail(sk);
 1131		if (skb)
 1132			copy = size_goal - skb->len;
 1133
 1134		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
 1135			bool first_skb;
 1136
			/* Also jumped to from the per-mode code below when
			 * the current skb cannot take more data.
			 */
 1137new_segment:
 1138			if (!sk_stream_memory_free(sk))
 1139				goto wait_for_space;
 1140
			/* Every 16 newly allocated skbs, give the socket
			 * backlog a chance to be processed.
			 */
 1141			if (unlikely(process_backlog >= 16)) {
 1142				process_backlog = 0;
 1143				if (sk_flush_backlog(sk))
 1144					goto restart;
 1145			}
 1146			first_skb = tcp_rtx_and_write_queues_empty(sk);
 1147			skb = tcp_stream_alloc_skb(sk, sk->sk_allocation,
 1148						   first_skb);
 1149			if (!skb)
 1150				goto wait_for_space;
 1151
 1152			process_backlog++;
 1153
 1154			tcp_skb_entail(sk, skb);
 1155			copy = size_goal;
 1156
 1157			/* All packets are restored as if they have
 1158			 * already been sent. skb_mstamp_ns isn't set to
 1159			 * avoid wrong rtt estimation.
 1160			 */
 1161			if (tp->repair)
 1162				TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
 1163		}
 1164
 1165		/* Try to append data to the end of skb. */
 1166		if (copy > msg_data_left(msg))
 1167			copy = msg_data_left(msg);
 1168
 1169		if (zc == 0) {
			/* Copy mode: copy user data into the socket's page
			 * frag and reference it from the skb, either by
			 * growing the last frag (merge) or adding a new one.
			 */
 1170			bool merge = true;
 1171			int i = skb_shinfo(skb)->nr_frags;
 1172			struct page_frag *pfrag = sk_page_frag(sk);
 1173
 1174			if (!sk_page_frag_refill(sk, pfrag))
 1175				goto wait_for_space;
 1176
 1177			if (!skb_can_coalesce(skb, i, pfrag->page,
 1178					      pfrag->offset)) {
 1179				if (i >= READ_ONCE(sysctl_max_skb_frags)) {
 1180					tcp_mark_push(tp, skb);
 1181					goto new_segment;
 1182				}
 1183				merge = false;
 1184			}
 1185
 1186			copy = min_t(int, copy, pfrag->size - pfrag->offset);
 1187
			/* Copied bytes are about to be mixed into a
			 * zerocopy skb: charge it and drop its managed
			 * state first.
			 */
 1188			if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) {
 1189				if (tcp_downgrade_zcopy_pure(sk, skb))
 1190					goto wait_for_space;
 1191				skb_zcopy_downgrade_managed(skb);
 1192			}
 1193
			/* May shrink copy to what memory accounting allows. */
 1194			copy = tcp_wmem_schedule(sk, copy);
 1195			if (!copy)
 1196				goto wait_for_space;
 1197
 1198			err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
 1199						       pfrag->page,
 1200						       pfrag->offset,
 1201						       copy);
 1202			if (err)
 1203				goto do_error;
 1204
 1205			/* Update the skb. */
 1206			if (merge) {
 1207				skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
 1208			} else {
 1209				skb_fill_page_desc(skb, i, pfrag->page,
 1210						   pfrag->offset, copy);
 1211				page_ref_inc(pfrag->page);
 1212			}
 1213			pfrag->offset += copy;
 1214		} else if (zc == MSG_ZEROCOPY)  {
 1215			/* First append to a fragless skb builds initial
 1216			 * pure zerocopy skb
 1217			 */
 1218			if (!skb->len)
 1219				skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;
 1220
			/* Memory is only scheduled here when the skb is not
			 * pure zerocopy.
			 */
 1221			if (!skb_zcopy_pure(skb)) {
 1222				copy = tcp_wmem_schedule(sk, copy);
 1223				if (!copy)
 1224					goto wait_for_space;
 1225			}
 1226
 1227			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
			/* These two errors mean "start a new skb", not failure. */
 1228			if (err == -EMSGSIZE || err == -EEXIST) {
 1229				tcp_mark_push(tp, skb);
 1230				goto new_segment;
 1231			}
 1232			if (err < 0)
 1233				goto do_error;
			/* A non-negative return is the number of bytes added. */
 1234			copy = err;
 1235		} else if (zc == MSG_SPLICE_PAGES) {
 1236			/* Splice in data if we can; copy if we can't. */
 1237			if (tcp_downgrade_zcopy_pure(sk, skb))
 1238				goto wait_for_space;
 1239			copy = tcp_wmem_schedule(sk, copy);
 1240			if (!copy)
 1241				goto wait_for_space;
 1242
 1243			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
 1244						   sk->sk_allocation);
 1245			if (err < 0) {
 1246				if (err == -EMSGSIZE) {
 1247					tcp_mark_push(tp, skb);
 1248					goto new_segment;
 1249				}
 1250				goto do_error;
 1251			}
 1252			copy = err;
 1253
 1254			if (!(flags & MSG_NO_SHARED_FRAGS))
 1255				skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
 1256
			/* Unlike the other two modes, the accounting for the
			 * spliced bytes is done explicitly here.
			 */
 1257			sk_wmem_queued_add(sk, copy);
 1258			sk_mem_charge(sk, copy);
 1259		}
 1260
		/* Common bookkeeping for all three modes: "copy" bytes were
		 * just appended to skb.
		 */
 1261		if (!copied)
 1262			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
 1263
 1264		WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
 1265		TCP_SKB_CB(skb)->end_seq += copy;
 1266		tcp_skb_pcount_set(skb, 0);
 1267
 1268		copied += copy;
 1269		if (!msg_data_left(msg)) {
 1270			if (unlikely(flags & MSG_EOR))
 1271				TCP_SKB_CB(skb)->eor = 1;
 1272			goto out;
 1273		}
 1274
		/* skb not full yet (or OOB / repair): keep filling it. */
 1275		if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
 1276			continue;
 1277
 1278		if (forced_push(tp)) {
 1279			tcp_mark_push(tp, skb);
 1280			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
 1281		} else if (skb == tcp_send_head(sk))
 1282			tcp_push_one(sk, mss_now);
 1283		continue;
 1284
		/* Only reached via goto: push what has been queued so far,
		 * wait for send memory, then refresh mss_now/size_goal and
		 * go round the loop again.
		 */
 1285wait_for_space:
 1286		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 1287		if (copied)
 1288			tcp_push(sk, flags & ~MSG_MORE, mss_now,
 1289				 TCP_NAGLE_PUSH, size_goal);
 1290
 1291		err = sk_stream_wait_memory(sk, &timeo);
 1292		if (err != 0)
 1293			goto do_error;
 1294
 1295		mss_now = tcp_send_mss(sk, &size_goal, flags);
 1296	}
 1297
	/* Success / partial success exit. */
 1298out:
 1299	if (copied) {
 1300		tcp_tx_timestamp(sk, sockc.tsflags);
 1301		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
 1302	}
 1303out_nopush:
 1304	/* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
 1305	if (uarg && !msg->msg_ubuf)
 1306		net_zcopy_put(uarg);
 1307	return copied + copied_syn;
 1308
	/* Error after the send loop may have started: drop an empty tail
	 * skb, and if anything was queued report that byte count through
	 * "out" instead of the error.
	 */
 1309do_error:
 1310	tcp_remove_empty_skb(sk);
 1311
 1312	if (copied + copied_syn)
 1313		goto out;
 1314out_err:
 1315	/* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
 1316	if (uarg && !msg->msg_ubuf)
 1317		net_zcopy_put_abort(uarg, true);
 1318	err = sk_stream_error(sk, flags, err);
 1319	/* make sure we wake any epoll edge trigger waiter */
 1320	if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
 1321		sk->sk_write_space(sk);
 1322		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
 1323	}
 1324	return err;
 1325}
 1326EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
1327
1328int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1329{
1330	int ret;
1331
1332	lock_sock(sk);
1333	ret = tcp_sendmsg_locked(sk, msg, size);
1334	release_sock(sk);
1335
1336	return ret;
1337}
1338EXPORT_SYMBOL(tcp_sendmsg);
1339
1340void tcp_splice_eof(struct socket *sock)
1341{
1342	struct sock *sk = sock->sk;
1343	struct tcp_sock *tp = tcp_sk(sk);
1344	int mss_now, size_goal;
1345
1346	if (!tcp_write_queue_tail(sk))
1347		return;
1348
1349	lock_sock(sk);
1350	mss_now = tcp_send_mss(sk, &size_goal, 0);
1351	tcp_push(sk, 0, mss_now, tp->nonagle, size_goal);
1352	release_sock(sk);
1353}
1354EXPORT_SYMBOL_GPL(tcp_splice_eof);
1355
1356/*
1357 *	Handle reading urgent data. BSD has very simple semantics for
1358 *	this, no blocking and very strange errors 8)
1359 */
1360
1361static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1362{
1363	struct tcp_sock *tp = tcp_sk(sk);
1364
1365	/* No URG data to read. */
1366	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1367	    tp->urg_data == TCP_URG_READ)
1368		return -EINVAL;	/* Yes this is right ! */
1369
1370	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1371		return -ENOTCONN;
1372
1373	if (tp->urg_data & TCP_URG_VALID) {
1374		int err = 0;
1375		char c = tp->urg_data;
1376
1377		if (!(flags & MSG_PEEK))
1378			WRITE_ONCE(tp->urg_data, TCP_URG_READ);
1379
1380		/* Read urgent data. */
1381		msg->msg_flags |= MSG_OOB;
1382
1383		if (len > 0) {
1384			if (!(flags & MSG_TRUNC))
1385				err = memcpy_to_msg(msg, &c, 1);
1386			len = 1;
1387		} else
1388			msg->msg_flags |= MSG_TRUNC;
1389
1390		return err ? -EFAULT : len;
1391	}
1392
1393	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1394		return 0;
1395
1396	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1397	 * the available implementations agree in this case:
1398	 * this call should never block, independent of the
1399	 * blocking state of the socket.
1400	 * Mike <pall@rz.uni-karlsruhe.de>
1401	 */
1402	return -EAGAIN;
1403}
1404
1405static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1406{
1407	struct sk_buff *skb;
1408	int copied = 0, err = 0;
1409
1410	/* XXX -- need to support SO_PEEK_OFF */
1411
1412	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
1413		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1414		if (err)
1415			return err;
1416		copied += skb->len;
1417	}
1418
1419	skb_queue_walk(&sk->sk_write_queue, skb) {
1420		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1421		if (err)
1422			break;
1423
1424		copied += skb->len;
1425	}
1426
1427	return err ?: copied;
1428}
1429
1430/* Clean up the receive buffer for full frames taken by the user,
1431 * then send an ACK if necessary.  COPIED is the number of bytes
1432 * tcp_recvmsg has given to the user so far, it speeds up the
1433 * calculation of whether or not we must ACK for the sake of
1434 * a window update.
1435 */
1436void __tcp_cleanup_rbuf(struct sock *sk, int copied)
1437{
1438	struct tcp_sock *tp = tcp_sk(sk);
1439	bool time_to_ack = false;
1440
1441	if (inet_csk_ack_scheduled(sk)) {
1442		const struct inet_connection_sock *icsk = inet_csk(sk);
1443
1444		if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
1445		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1446		    /*
1447		     * If this read emptied read buffer, we send ACK, if
1448		     * connection is not bidirectional, user drained
1449		     * receive buffer and there was a small segment
1450		     * in queue.
1451		     */
1452		    (copied > 0 &&
1453		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1454		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1455		       !inet_csk_in_pingpong_mode(sk))) &&
1456		      !atomic_read(&sk->sk_rmem_alloc)))
1457			time_to_ack = true;
1458	}
1459
1460	/* We send an ACK if we can now advertise a non-zero window
1461	 * which has been raised "significantly".
1462	 *
1463	 * Even if window raised up to infinity, do not send window open ACK
1464	 * in states, where we will not receive more. It is useless.
1465	 */
1466	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1467		__u32 rcv_window_now = tcp_receive_window(tp);
1468
1469		/* Optimize, __tcp_select_window() is not cheap. */
1470		if (2*rcv_window_now <= tp->window_clamp) {
1471			__u32 new_window = __tcp_select_window(sk);
1472
1473			/* Send ACK now, if this read freed lots of space
1474			 * in our buffer. Certainly, new_window is new window.
1475			 * We can advertise it now, if it is not less than current one.
1476			 * "Lots" means "at least twice" here.
1477			 */
1478			if (new_window && new_window >= 2 * rcv_window_now)
1479				time_to_ack = true;
1480		}
1481	}
1482	if (time_to_ack)
1483		tcp_send_ack(sk);
1484}
1485
1486void tcp_cleanup_rbuf(struct sock *sk, int copied)
1487{
1488	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1489	struct tcp_sock *tp = tcp_sk(sk);
1490
1491	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1492	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1493	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1494	__tcp_cleanup_rbuf(sk, copied);
1495}
1496
1497static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
1498{
1499	__skb_unlink(skb, &sk->sk_receive_queue);
1500	if (likely(skb->destructor == sock_rfree)) {
1501		sock_rfree(skb);
1502		skb->destructor = NULL;
1503		skb->sk = NULL;
1504		return skb_attempt_defer_free(skb);
1505	}
1506	__kfree_skb(skb);
1507}
1508
1509struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1510{
1511	struct sk_buff *skb;
1512	u32 offset;
1513
1514	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1515		offset = seq - TCP_SKB_CB(skb)->seq;
1516		if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1517			pr_err_once("%s: found a SYN, please report !\n", __func__);
1518			offset--;
1519		}
1520		if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
1521			*off = offset;
1522			return skb;
1523		}
1524		/* This looks weird, but this can happen if TCP collapsing
1525		 * splitted a fat GRO packet, while we released socket lock
1526		 * in skb_splice_bits()
1527		 */
1528		tcp_eat_recv_skb(sk, skb);
1529	}
1530	return NULL;
1531}
1532EXPORT_SYMBOL(tcp_recv_skb);
1533
1534/*
1535 * This routine provides an alternative to tcp_recvmsg() for routines
1536 * that would like to handle copying from skbuffs directly in 'sendfile'
1537 * fashion.
1538 * Note:
1539 *	- It is assumed that the socket was locked by the caller.
1540 *	- The routine does not block.
1541 *	- At present, there is no support for reading OOB data
1542 *	  or for 'peeking' the socket using this routine
1543 *	  (although both would be easy to implement).
1544 */
1545int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1546		  sk_read_actor_t recv_actor)
1547{
1548	struct sk_buff *skb;
1549	struct tcp_sock *tp = tcp_sk(sk);
1550	u32 seq = tp->copied_seq;
1551	u32 offset;
1552	int copied = 0;
1553
1554	if (sk->sk_state == TCP_LISTEN)
1555		return -ENOTCONN;
1556	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1557		if (offset < skb->len) {
1558			int used;
1559			size_t len;
1560
1561			len = skb->len - offset;
1562			/* Stop reading if we hit a patch of urgent data */
1563			if (unlikely(tp->urg_data)) {
1564				u32 urg_offset = tp->urg_seq - seq;
1565				if (urg_offset < len)
1566					len = urg_offset;
1567				if (!len)
1568					break;
1569			}
1570			used = recv_actor(desc, skb, offset, len);
1571			if (used <= 0) {
1572				if (!copied)
1573					copied = used;
1574				break;
1575			}
1576			if (WARN_ON_ONCE(used > len))
1577				used = len;
1578			seq += used;
1579			copied += used;
1580			offset += used;
1581
1582			/* If recv_actor drops the lock (e.g. TCP splice
1583			 * receive) the skb pointer might be invalid when
1584			 * getting here: tcp_collapse might have deleted it
1585			 * while aggregating skbs from the socket queue.
1586			 */
1587			skb = tcp_recv_skb(sk, seq - 1, &offset);
1588			if (!skb)
1589				break;
1590			/* TCP coalescing might have appended data to the skb.
1591			 * Try to splice more frags
1592			 */
1593			if (offset + 1 != skb->len)
1594				continue;
1595		}
1596		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1597			tcp_eat_recv_skb(sk, skb);
1598			++seq;
1599			break;
1600		}
1601		tcp_eat_recv_skb(sk, skb);
1602		if (!desc->count)
1603			break;
1604		WRITE_ONCE(tp->copied_seq, seq);
1605	}
1606	WRITE_ONCE(tp->copied_seq, seq);
1607
1608	tcp_rcv_space_adjust(sk);
1609
1610	/* Clean up data we have read: This will do ACK frames. */
1611	if (copied > 0) {
1612		tcp_recv_skb(sk, seq, &offset);
1613		tcp_cleanup_rbuf(sk, copied);
1614	}
1615	return copied;
1616}
1617EXPORT_SYMBOL(tcp_read_sock);
1618
1619int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
1620{
1621	struct tcp_sock *tp = tcp_sk(sk);
1622	u32 seq = tp->copied_seq;
1623	struct sk_buff *skb;
1624	int copied = 0;
1625	u32 offset;
1626
1627	if (sk->sk_state == TCP_LISTEN)
1628		return -ENOTCONN;
1629
1630	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1631		u8 tcp_flags;
1632		int used;
1633
1634		__skb_unlink(skb, &sk->sk_receive_queue);
1635		WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
1636		tcp_flags = TCP_SKB_CB(skb)->tcp_flags;
1637		used = recv_actor(sk, skb);
1638		if (used < 0) {
1639			if (!copied)
1640				copied = used;
1641			break;
1642		}
1643		seq += used;
1644		copied += used;
1645
1646		if (tcp_flags & TCPHDR_FIN) {
1647			++seq;
1648			break;
1649		}
1650	}
1651	return copied;
1652}
1653EXPORT_SYMBOL(tcp_read_skb);
1654
1655void tcp_read_done(struct sock *sk, size_t len)
1656{
1657	struct tcp_sock *tp = tcp_sk(sk);
1658	u32 seq = tp->copied_seq;
1659	struct sk_buff *skb;
1660	size_t left;
1661	u32 offset;
1662
1663	if (sk->sk_state == TCP_LISTEN)
1664		return;
1665
1666	left = len;
1667	while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1668		int used;
1669
1670		used = min_t(size_t, skb->len - offset, left);
1671		seq += used;
1672		left -= used;
1673
1674		if (skb->len > offset + used)
1675			break;
1676
1677		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1678			tcp_eat_recv_skb(sk, skb);
1679			++seq;
1680			break;
1681		}
1682		tcp_eat_recv_skb(sk, skb);
1683	}
1684	WRITE_ONCE(tp->copied_seq, seq);
1685
1686	tcp_rcv_space_adjust(sk);
1687
1688	/* Clean up data we have read: This will do ACK frames. */
1689	if (left != len)
1690		tcp_cleanup_rbuf(sk, len - left);
1691}
1692EXPORT_SYMBOL(tcp_read_done);
1693
1694int tcp_peek_len(struct socket *sock)
1695{
1696	return tcp_inq(sock->sk);
1697}
1698EXPORT_SYMBOL(tcp_peek_len);
1699
1700/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
1701int tcp_set_rcvlowat(struct sock *sk, int val)
1702{
1703	int cap;
1704
1705	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1706		cap = sk->sk_rcvbuf >> 1;
1707	else
1708		cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
1709	val = min(val, cap);
1710	WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1711
1712	/* Check if we need to signal EPOLLIN right now */
1713	tcp_data_ready(sk);
1714
1715	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1716		return 0;
1717
1718	val <<= 1;
1719	if (val > sk->sk_rcvbuf) {
1720		WRITE_ONCE(sk->sk_rcvbuf, val);
1721		tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
1722	}
1723	return 0;
1724}
1725EXPORT_SYMBOL(tcp_set_rcvlowat);
1726
1727void tcp_update_recv_tstamps(struct sk_buff *skb,
1728			     struct scm_timestamping_internal *tss)
1729{
1730	if (skb->tstamp)
1731		tss->ts[0] = ktime_to_timespec64(skb->tstamp);
1732	else
1733		tss->ts[0] = (struct timespec64) {0};
1734
1735	if (skb_hwtstamps(skb)->hwtstamp)
1736		tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
1737	else
1738		tss->ts[2] = (struct timespec64) {0};
1739}
1740
1741#ifdef CONFIG_MMU
1742const struct vm_operations_struct tcp_vm_ops = {
1743};
1744
1745int tcp_mmap(struct file *file, struct socket *sock,
1746	     struct vm_area_struct *vma)
1747{
1748	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
1749		return -EPERM;
1750	vm_flags_clear(vma, VM_MAYWRITE | VM_MAYEXEC);
1751
1752	/* Instruct vm_insert_page() to not mmap_read_lock(mm) */
1753	vm_flags_set(vma, VM_MIXEDMAP);
1754
1755	vma->vm_ops = &tcp_vm_ops;
1756	return 0;
1757}
1758EXPORT_SYMBOL(tcp_mmap);
1759
1760static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
1761				       u32 *offset_frag)
1762{
1763	skb_frag_t *frag;
1764
1765	if (unlikely(offset_skb >= skb->len))
1766		return NULL;
1767
1768	offset_skb -= skb_headlen(skb);
1769	if ((int)offset_skb < 0 || skb_has_frag_list(skb))
1770		return NULL;
1771
1772	frag = skb_shinfo(skb)->frags;
1773	while (offset_skb) {
1774		if (skb_frag_size(frag) > offset_skb) {
1775			*offset_frag = offset_skb;
1776			return frag;
1777		}
1778		offset_skb -= skb_frag_size(frag);
1779		++frag;
1780	}
1781	*offset_frag = 0;
1782	return frag;
1783}
1784
1785static bool can_map_frag(const skb_frag_t *frag)
1786{
1787	return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag);
1788}
1789
1790static int find_next_mappable_frag(const skb_frag_t *frag,
1791				   int remaining_in_skb)
1792{
1793	int offset = 0;
1794
1795	if (likely(can_map_frag(frag)))
1796		return 0;
1797
1798	while (offset < remaining_in_skb && !can_map_frag(frag)) {
1799		offset += skb_frag_size(frag);
1800		++frag;
1801	}
1802	return offset;
1803}
1804
1805static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
1806					  struct tcp_zerocopy_receive *zc,
1807					  struct sk_buff *skb, u32 offset)
1808{
1809	u32 frag_offset, partial_frag_remainder = 0;
1810	int mappable_offset;
1811	skb_frag_t *frag;
1812
1813	/* worst case: skip to next skb. try to improve on this case below */
1814	zc->recv_skip_hint = skb->len - offset;
1815
1816	/* Find the frag containing this offset (and how far into that frag) */
1817	frag = skb_advance_to_frag(skb, offset, &frag_offset);
1818	if (!frag)
1819		return;
1820
1821	if (frag_offset) {
1822		struct skb_shared_info *info = skb_shinfo(skb);
1823
1824		/* We read part of the last frag, must recvmsg() rest of skb. */
1825		if (frag == &info->frags[info->nr_frags - 1])
1826			return;
1827
1828		/* Else, we must at least read the remainder in this frag. */
1829		partial_frag_remainder = skb_frag_size(frag) - frag_offset;
1830		zc->recv_skip_hint -= partial_frag_remainder;
1831		++frag;
1832	}
1833
1834	/* partial_frag_remainder: If part way through a frag, must read rest.
1835	 * mappable_offset: Bytes till next mappable frag, *not* counting bytes
1836	 * in partial_frag_remainder.
1837	 */
1838	mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint);
1839	zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
1840}
1841
1842static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
1843			      int flags, struct scm_timestamping_internal *tss,
1844			      int *cmsg_flags);
1845static int receive_fallback_to_copy(struct sock *sk,
1846				    struct tcp_zerocopy_receive *zc, int inq,
1847				    struct scm_timestamping_internal *tss)
1848{
1849	unsigned long copy_address = (unsigned long)zc->copybuf_address;
1850	struct msghdr msg = {};
1851	struct iovec iov;
1852	int err;
1853
1854	zc->length = 0;
1855	zc->recv_skip_hint = 0;
1856
1857	if (copy_address != zc->copybuf_address)
1858		return -EINVAL;
1859
1860	err = import_single_range(ITER_DEST, (void __user *)copy_address,
1861				  inq, &iov, &msg.msg_iter);
1862	if (err)
1863		return err;
1864
1865	err = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT,
1866				 tss, &zc->msg_flags);
1867	if (err < 0)
1868		return err;
1869
1870	zc->copybuf_len = err;
1871	if (likely(zc->copybuf_len)) {
1872		struct sk_buff *skb;
1873		u32 offset;
1874
1875		skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
1876		if (skb)
1877			tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
1878	}
1879	return 0;
1880}
1881
1882static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
1883				   struct sk_buff *skb, u32 copylen,
1884				   u32 *offset, u32 *seq)
1885{
1886	unsigned long copy_address = (unsigned long)zc->copybuf_address;
1887	struct msghdr msg = {};
1888	struct iovec iov;
1889	int err;
1890
1891	if (copy_address != zc->copybuf_address)
1892		return -EINVAL;
1893
1894	err = import_single_range(ITER_DEST, (void __user *)copy_address,
1895				  copylen, &iov, &msg.msg_iter);
1896	if (err)
1897		return err;
1898	err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
1899	if (err)
1900		return err;
1901	zc->recv_skip_hint -= copylen;
1902	*offset += copylen;
1903	*seq += copylen;
1904	return (__s32)copylen;
1905}
1906
1907static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
1908				  struct sock *sk,
1909				  struct sk_buff *skb,
1910				  u32 *seq,
1911				  s32 copybuf_len,
1912				  struct scm_timestamping_internal *tss)
1913{
1914	u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
1915
1916	if (!copylen)
1917		return 0;
1918	/* skb is null if inq < PAGE_SIZE. */
1919	if (skb) {
1920		offset = *seq - TCP_SKB_CB(skb)->seq;
1921	} else {
1922		skb = tcp_recv_skb(sk, *seq, &offset);
1923		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1924			tcp_update_recv_tstamps(skb, tss);
1925			zc->msg_flags |= TCP_CMSG_TS;
1926		}
1927	}
1928
1929	zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
1930						  seq);
1931	return zc->copybuf_len < 0 ? 0 : copylen;
1932}
1933
1934static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
1935					      struct page **pending_pages,
1936					      unsigned long pages_remaining,
1937					      unsigned long *address,
1938					      u32 *length,
1939					      u32 *seq,
1940					      struct tcp_zerocopy_receive *zc,
1941					      u32 total_bytes_to_map,
1942					      int err)
1943{
1944	/* At least one page did not map. Try zapping if we skipped earlier. */
1945	if (err == -EBUSY &&
1946	    zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
1947		u32 maybe_zap_len;
1948
1949		maybe_zap_len = total_bytes_to_map -  /* All bytes to map */
1950				*length + /* Mapped or pending */
1951				(pages_remaining * PAGE_SIZE); /* Failed map. */
1952		zap_page_range_single(vma, *address, maybe_zap_len, NULL);
1953		err = 0;
1954	}
1955
1956	if (!err) {
1957		unsigned long leftover_pages = pages_remaining;
1958		int bytes_mapped;
1959
1960		/* We called zap_page_range_single, try to reinsert. */
1961		err = vm_insert_pages(vma, *address,
1962				      pending_pages,
1963				      &pages_remaining);
1964		bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
1965		*seq += bytes_mapped;
1966		*address += bytes_mapped;
1967	}
1968	if (err) {
1969		/* Either we were unable to zap, OR we zapped, retried an
1970		 * insert, and still had an issue. Either ways, pages_remaining
1971		 * is the number of pages we were unable to map, and we unroll
1972		 * some state we speculatively touched before.
1973		 */
1974		const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
1975
1976		*length -= bytes_not_mapped;
1977		zc->recv_skip_hint += bytes_not_mapped;
1978	}
1979	return err;
1980}
1981
1982static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
1983					struct page **pages,
1984					unsigned int pages_to_map,
1985					unsigned long *address,
1986					u32 *length,
1987					u32 *seq,
1988					struct tcp_zerocopy_receive *zc,
1989					u32 total_bytes_to_map)
1990{
1991	unsigned long pages_remaining = pages_to_map;
1992	unsigned int pages_mapped;
1993	unsigned int bytes_mapped;
1994	int err;
1995
1996	err = vm_insert_pages(vma, *address, pages, &pages_remaining);
1997	pages_mapped = pages_to_map - (unsigned int)pages_remaining;
1998	bytes_mapped = PAGE_SIZE * pages_mapped;
1999	/* Even if vm_insert_pages fails, it may have partially succeeded in
2000	 * mapping (some but not all of the pages).
2001	 */
2002	*seq += bytes_mapped;
2003	*address += bytes_mapped;
2004
2005	if (likely(!err))
2006		return 0;
2007
2008	/* Error: maybe zap and retry + rollback state for failed inserts. */
2009	return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
2010		pages_remaining, address, length, seq, zc, total_bytes_to_map,
2011		err);
2012}
2013
2014#define TCP_VALID_ZC_MSG_FLAGS   (TCP_CMSG_TS)
2015static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
2016				      struct tcp_zerocopy_receive *zc,
2017				      struct scm_timestamping_internal *tss)
2018{
2019	unsigned long msg_control_addr;
2020	struct msghdr cmsg_dummy;
2021
2022	msg_control_addr = (unsigned long)zc->msg_control;
2023	cmsg_dummy.msg_control_user = (void __user *)msg_control_addr;
2024	cmsg_dummy.msg_controllen =
2025		(__kernel_size_t)zc->msg_controllen;
2026	cmsg_dummy.msg_flags = in_compat_syscall()
2027		? MSG_CMSG_COMPAT : 0;
2028	cmsg_dummy.msg_control_is_user = true;
2029	zc->msg_flags = 0;
2030	if (zc->msg_control == msg_control_addr &&
2031	    zc->msg_controllen == cmsg_dummy.msg_controllen) {
2032		tcp_recv_timestamp(&cmsg_dummy, sk, tss);
2033		zc->msg_control = (__u64)
2034			((uintptr_t)cmsg_dummy.msg_control_user);
2035		zc->msg_controllen =
2036			(__u64)cmsg_dummy.msg_controllen;
2037		zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
2038	}
2039}
2040
2041static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm,
2042					   unsigned long address,
2043					   bool *mmap_locked)
2044{
2045	struct vm_area_struct *vma = NULL;
2046
2047#ifdef CONFIG_PER_VMA_LOCK
2048	vma = lock_vma_under_rcu(mm, address);
2049#endif
2050	if (vma) {
2051		if (!vma_is_tcp(vma)) {
2052			vma_end_read(vma);
2053			return NULL;
2054		}
2055		*mmap_locked = false;
2056		return vma;
2057	}
2058
2059	mmap_read_lock(mm);
2060	vma = vma_lookup(mm, address);
2061	if (!vma || !vma_is_tcp(vma)) {
2062		mmap_read_unlock(mm);
2063		return NULL;
2064	}
2065	*mmap_locked = true;
2066	return vma;
2067}
2068
/* Capacity of the on-stack pages[] array used by tcp_zerocopy_receive();
 * a full array is flushed through tcp_zerocopy_vm_insert_batch().
 */
2069#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
/* Map page-sized pieces of queued receive data into the caller's TCP vma.
 *
 * @sk:  socket to read from
 * @zc:  request/result block.  zc->address, zc->length, zc->copybuf_len and
 *       zc->flags are read; zc->copybuf_len and zc->msg_flags are cleared on
 *       entry.  On return zc->length holds the number of bytes mapped and
 *       zc->recv_skip_hint the number of bytes at the current position that
 *       were not mapped (NOTE(review): presumably meant to be fetched with
 *       a regular copying receive - confirm against the UAPI description).
 *       zc->msg_flags gets TCP_CMSG_TS when a visited skb had a timestamp.
 * @tss: receive timestamps, updated from the skbs that carry one
 *
 * Returns 0 when anything was consumed (mapped or copied) or there was
 * simply nothing mappable; -EINVAL for a misaligned address, an address
 * that does not fit in unsigned long, or when find_tcp_vma() finds no TCP
 * vma; -ENOTCONN on a listening socket; -EIO when nothing was consumed, no
 * skip hint is pending and SOCK_DONE is set; otherwise the page-insertion
 * error if nothing at all was consumed.  When everything queued fits in
 * the copy buffer the result of receive_fallback_to_copy() is returned
 * instead.
 */
2070static int tcp_zerocopy_receive(struct sock *sk,
2071				struct tcp_zerocopy_receive *zc,
2072				struct scm_timestamping_internal *tss)
2073{
2074	u32 length = 0, offset, vma_len, avail_len, copylen = 0;
2075	unsigned long address = (unsigned long)zc->address;
2076	struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
2077	s32 copybuf_len = zc->copybuf_len;
2078	struct tcp_sock *tp = tcp_sk(sk);
2079	const skb_frag_t *frags = NULL;
2080	unsigned int pages_to_map = 0;
2081	struct vm_area_struct *vma;
2082	struct sk_buff *skb = NULL;
2083	u32 seq = tp->copied_seq;
2084	u32 total_bytes_to_map;
2085	int inq = tcp_inq(sk);
2086	bool mmap_locked;
2087	int ret;
2088
2089	zc->copybuf_len = 0;
2090	zc->msg_flags = 0;
2091
	/* The target must be page aligned, and the user value must not have
	 * lost bits when it was narrowed to unsigned long above.
	 */
2092	if (address & (PAGE_SIZE - 1) || address != zc->address)
2093		return -EINVAL;
2094
2095	if (sk->sk_state == TCP_LISTEN)
2096		return -ENOTCONN;
2097
2098	sock_rps_record_flow(sk);
2099
	/* Everything queued fits in the user's copy buffer: copy it. */
2100	if (inq && inq <= copybuf_len)
2101		return receive_fallback_to_copy(sk, zc, inq, tss);
2102
	/* Less than one page queued: nothing can be mapped.  Report the
	 * queued byte count through recv_skip_hint.
	 */
2103	if (inq < PAGE_SIZE) {
2104		zc->length = 0;
2105		zc->recv_skip_hint = inq;
2106		if (!inq && sock_flag(sk, SOCK_DONE))
2107			return -EIO;
2108		return 0;
2109	}
2110
	/* On success the vma is returned locked; mmap_locked records which
	 * lock has to be dropped at the out label.
	 */
2111	vma = find_tcp_vma(current->mm, address, &mmap_locked);
2112	if (!vma)
2113		return -EINVAL;
2114
	/* Clamp the request to the end of the vma and to the queued bytes,
	 * then round down to whole pages.
	 */
2115	vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
2116	avail_len = min_t(u32, vma_len, inq);
2117	total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
2118	if (total_bytes_to_map) {
		/* Zap the target range up front unless the caller set the
		 * TLB_CLEAN_HINT flag.
		 */
2119		if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
2120			zap_page_range_single(vma, address, total_bytes_to_map,
2121					      NULL);
2122		zc->length = total_bytes_to_map;
2123		zc->recv_skip_hint = 0;
2124	} else {
2125		zc->length = avail_len;
2126		zc->recv_skip_hint = avail_len;
2127	}
2128	ret = 0;
	/* Gather one page per iteration.  Inside the loop recv_skip_hint is
	 * reused as the number of bytes left in the current skb.
	 */
2129	while (length + PAGE_SIZE <= zc->length) {
2130		int mappable_offset;
2131		struct page *page;
2132
		/* Less than a page left in this skb: move to the next one
		 * (or look up the first one).
		 */
2133		if (zc->recv_skip_hint < PAGE_SIZE) {
2134			u32 offset_frag;
2135
2136			if (skb) {
				/* A sub-page remainder cannot be mapped;
				 * stop and leave it in recv_skip_hint.
				 */
2137				if (zc->recv_skip_hint > 0)
2138					break;
2139				skb = skb->next;
2140				offset = seq - TCP_SKB_CB(skb)->seq;
2141			} else {
2142				skb = tcp_recv_skb(sk, seq, &offset);
2143			}
2144
2145			if (TCP_SKB_CB(skb)->has_rxtstamp) {
2146				tcp_update_recv_tstamps(skb, tss);
2147				zc->msg_flags |= TCP_CMSG_TS;
2148			}
2149			zc->recv_skip_hint = skb->len - offset;
2150			frags = skb_advance_to_frag(skb, offset, &offset_frag);
			/* Stop if no frag was found or the position is not
			 * at the start of a frag.
			 */
2151			if (!frags || offset_frag)
2152				break;
2153		}
2154
		/* A non-zero result means the current frag is not mappable;
		 * the value is reported as the skip hint.
		 */
2155		mappable_offset = find_next_mappable_frag(frags,
2156							  zc->recv_skip_hint);
2157		if (mappable_offset) {
2158			zc->recv_skip_hint = mappable_offset;
2159			break;
2160		}
2161		page = skb_frag_page(frags);
2162		prefetchw(page);
2163		pages[pages_to_map++] = page;
2164		length += PAGE_SIZE;
2165		zc->recv_skip_hint -= PAGE_SIZE;
2166		frags++;
2167		if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
2168		    zc->recv_skip_hint < PAGE_SIZE) {
2169			/* Either full batch, or we're about to go to next skb
2170			 * (and we cannot unroll failed ops across skbs).
2171			 */
2172			ret = tcp_zerocopy_vm_insert_batch(vma, pages,
2173							   pages_to_map,
2174							   &address, &length,
2175							   &seq, zc,
2176							   total_bytes_to_map);
2177			if (ret)
2178				goto out;
2179			pages_to_map = 0;
2180		}
2181	}
	/* Flush a partially filled batch left over when the loop ended. */
2182	if (pages_to_map) {
2183		ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
2184						   &address, &length, &seq,
2185						   zc, total_bytes_to_map);
2186	}
2187out:
	/* Drop whichever lock find_tcp_vma() took. */
2188	if (mmap_locked)
2189		mmap_read_unlock(current->mm);
2190	else
2191		vma_end_read(vma);
2192	/* Try to copy straggler data. */
2193	if (!ret)
2194		copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);
2195
	/* Anything consumed (mapped or copied) makes the call a success,
	 * even if a later batch failed.
	 */
2196	if (length + copylen) {
2197		WRITE_ONCE(tp->copied_seq, seq);
2198		tcp_rcv_space_adjust(sk);
2199
2200		/* Clean up data we have read: This will do ACK frames. */
2201		tcp_recv_skb(sk, seq, &offset);
2202		tcp_cleanup_rbuf(sk, length + copylen);
2203		ret = 0;
2204		if (length == zc->length)
2205			zc->recv_skip_hint = 0;
2206	} else {
2207		if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
2208			ret = -EIO;
2209	}
	/* Report the number of bytes actually mapped. */
2210	zc->length = length;
2211	return ret;
2212}
2213#endif
2214
2215/* Similar to __sock_recv_timestamp, but does not require an skb */
2216void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
2217			struct scm_timestamping_internal *tss)
2218{
2219	int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
2220	bool has_timestamping = false;
2221
2222	if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
2223		if (sock_flag(sk, SOCK_RCVTSTAMP)) {
2224			if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
2225				if (new_tstamp) {
2226					struct __kernel_timespec kts = {
2227						.tv_sec = tss->ts[0].tv_sec,
2228						.tv_nsec = tss->ts[0].tv_nsec,
2229					};
2230					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
2231						 sizeof(kts), &kts);
2232				} else {
2233					struct __kernel_old_timespec ts_old = {
2234						.tv_sec = tss->ts[0].tv_sec,
2235						.tv_nsec = tss->ts[0].tv_nsec,
2236					};
2237					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
2238						 sizeof(ts_old), &ts_old);
2239				}
2240			} else {
2241				if (new_tstamp) {
2242					struct __kernel_sock_timeval stv = {
2243						.tv_sec = tss->ts[0].tv_sec,
2244						.tv_usec = tss->ts[0].tv_nsec / 1000,
2245					};
2246					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
2247						 sizeof(stv), &stv);
2248				} else {
2249					struct __kernel_old_timeval tv = {
2250						.tv_sec = tss->ts[0].tv_sec,
2251						.tv_usec = tss->ts[0].tv_nsec / 1000,
2252					};
2253					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
2254						 sizeof(tv), &tv);
2255				}
2256			}
2257		}
2258
2259		if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
2260			has_timestamping = true;
2261		else
2262			tss->ts[0] = (struct timespec64) {0};
2263	}
2264
2265	if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
2266		if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
2267			has_timestamping = true;
2268		else
2269			tss->ts[2] = (struct timespec64) {0};
2270	}
2271
2272	if (has_timestamping) {
2273		tss->ts[1] = (struct timespec64) {0};
2274		if (sock_flag(sk, SOCK_TSTAMP_NEW))
2275			put_cmsg_scm_timestamping64(msg, tss);
2276		else
2277			put_cmsg_scm_timestamping(msg, tss);
2278	}
2279}
2280
2281static int tcp_inq_hint(struct sock *sk)
2282{
2283	const struct tcp_sock *tp = tcp_sk(sk);
2284	u32 copied_seq = READ_ONCE(tp->copied_seq);
2285	u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
2286	int inq;
2287
2288	inq = rcv_nxt - copied_seq;
2289	if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
2290		lock_sock(sk);
2291		inq = tp->rcv_nxt - tp->copied_seq;
2292		release_sock(sk);
2293	}
2294	/* After receiving a FIN, tell the user-space to continue reading
2295	 * by returning a non-zero inq.
2296	 */
2297	if (inq == 0 && sock_flag(sk, SOCK_DONE))
2298		inq = 1;
2299	return inq;
2300}
2301
2302/*
2303 *	This routine copies from a sock struct into the user buffer.
2304 *
2305 *	Technical note: in 2.3 we work on _locked_ socket, so that
2306 *	tricks with *seq access order and skb->users are not required.
2307 *	Probably, code can be easily improved even more.
 *
 *	@len is the number of bytes requested.  @tss collects receive
 *	timestamps from the skbs that were read, and *@cmsg_flags gets
 *	TCP_CMSG_INQ (when tp->recvmsg_inq is set) and TCP_CMSG_TS (when
 *	a timestamp was collected) so the caller knows which control
 *	messages to emit.
 *
 *	Returns the number of bytes consumed (with MSG_TRUNC nothing is
 *	actually copied), or a negative errno.  MSG_OOB requests return
 *	the result of tcp_recv_urg(); MSG_PEEK on a socket in repair mode
 *	with the send queue selected returns the result of
 *	tcp_peek_sndq().
2308 */
2309
2310static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
2311			      int flags, struct scm_timestamping_internal *tss,
2312			      int *cmsg_flags)
2313{
2314	struct tcp_sock *tp = tcp_sk(sk);
2315	int copied = 0;
2316	u32 peek_seq;
2317	u32 *seq;
2318	unsigned long used;
2319	int err;
2320	int target;		/* Read at least this many bytes */
2321	long timeo;
2322	struct sk_buff *skb, *last;
2323	u32 urg_hole = 0;
2324
2325	err = -ENOTCONN;
2326	if (sk->sk_state == TCP_LISTEN)
2327		goto out;
2328
2329	if (tp->recvmsg_inq) {
2330		*cmsg_flags = TCP_CMSG_INQ;
2331		msg->msg_get_inq = 1;
2332	}
2333	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2334
2335	/* Urgent data needs to be handled specially. */
2336	if (flags & MSG_OOB)
2337		goto recv_urg;
2338
	/* In repair mode only MSG_PEEK is allowed, on the queue selected
	 * by tp->repair_queue.
	 */
2339	if (unlikely(tp->repair)) {
2340		err = -EPERM;
2341		if (!(flags & MSG_PEEK))
2342			goto out;
2343
2344		if (tp->repair_queue == TCP_SEND_QUEUE)
2345			goto recv_sndq;
2346
2347		err = -EINVAL;
2348		if (tp->repair_queue == TCP_NO_QUEUE)
2349			goto out;
2350
2351		/* 'common' recv queue MSG_PEEK-ing */
2352	}
2353
	/* A normal read advances tp->copied_seq directly; MSG_PEEK works
	 * on a private copy so the socket's read position is untouched.
	 */
2354	seq = &tp->copied_seq;
2355	if (flags & MSG_PEEK) {
2356		peek_seq = tp->copied_seq;
2357		seq = &peek_seq;
2358	}
2359
2360	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
2361
2362	do {
2363		u32 offset;
2364
2365		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
2366		if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
2367			if (copied)
2368				break;
2369			if (signal_pending(current)) {
2370				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
2371				break;
2372			}
2373		}
2374
2375		/* Next get a buffer. */
2376
		/* "last" ends up as the last skb visited (or the queue tail
		 * when the queue is empty); it is passed to sk_wait_data()
		 * below.
		 */
2377		last = skb_peek_tail(&sk->sk_receive_queue);
2378		skb_queue_walk(&sk->sk_receive_queue, skb) {
2379			last = skb;
2380			/* Now that we have two receive queues this
2381			 * shouldn't happen.
2382			 */
2383			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
2384				 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
2385				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
2386				 flags))
2387				break;
2388
			/* Position of the read pointer inside this skb; a
			 * SYN takes up one sequence number but no payload.
			 */
2389			offset = *seq - TCP_SKB_CB(skb)->seq;
2390			if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
2391				pr_err_once("%s: found a SYN, please report !\n", __func__);
2392				offset--;
2393			}
2394			if (offset < skb->len)
2395				goto found_ok_skb;
2396			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2397				goto found_fin_ok;
2398			WARN(!(flags & MSG_PEEK),
2399			     "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
2400			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
2401		}
2402
2403		/* Well, if we have backlog, try to process it now yet. */
2404
2405		if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
2406			break;
2407
		/* No readable skb was found.  Decide whether to stop; the
		 * rules differ depending on whether something was already
		 * copied (return it) or not (report an error or EOF).
		 */
2408		if (copied) {
2409			if (!timeo ||
2410			    sk->sk_err ||
2411			    sk->sk_state == TCP_CLOSE ||
2412			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2413			    signal_pending(current))
2414				break;
2415		} else {
2416			if (sock_flag(sk, SOCK_DONE))
2417				break;
2418
2419			if (sk->sk_err) {
2420				copied = sock_error(sk);
2421				break;
2422			}
2423
2424			if (sk->sk_shutdown & RCV_SHUTDOWN)
2425				break;
2426
2427			if (sk->sk_state == TCP_CLOSE) {
2428				/* This occurs when user tries to read
2429				 * from never connected socket.
2430				 */
2431				copied = -ENOTCONN;
2432				break;
2433			}
2434
2435			if (!timeo) {
2436				copied = -EAGAIN;
2437				break;
2438			}
2439
2440			if (signal_pending(current)) {
2441				copied = sock_intr_errno(timeo);
2442				break;
2443			}
2444		}
2445
2446		if (copied >= target) {
2447			/* Do not sleep, just process backlog. */
2448			__sk_flush_backlog(sk);
2449		} else {
2450			tcp_cleanup_rbuf(sk, copied);
2451			sk_wait_data(sk, &timeo, last);
2452		}
2453
		/* When peeking, tp->copied_seq must still match where this
		 * peek started; if it moved, log it and restart the peek
		 * from the new position.
		 */
2454		if ((flags & MSG_PEEK) &&
2455		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
2456			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
2457					    current->comm,
2458					    task_pid_nr(current));
2459			peek_seq = tp->copied_seq;
2460		}
2461		continue;
2462
2463found_ok_skb:
2464		/* Ok so how much can we use? */
2465		used = skb->len - offset;
2466		if (len < used)
2467			used = len;
2468
2469		/* Do we have urgent data here? */
2470		if (unlikely(tp->urg_data)) {
2471			u32 urg_offset = tp->urg_seq - *seq;
2472			if (urg_offset < used) {
2473				if (!urg_offset) {
					/* The urgent byte is next: unless
					 * it is delivered inline, step over
					 * it and count the hole.
					 */
2474					if (!sock_flag(sk, SOCK_URGINLINE)) {
2475						WRITE_ONCE(*seq, *seq + 1);
2476						urg_hole++;
2477						offset++;
2478						used--;
2479						if (!used)
2480							goto skip_copy;
2481					}
2482				} else
					/* Stop the copy just before the
					 * urgent byte.
					 */
2483					used = urg_offset;
2484			}
2485		}
2486
		/* With MSG_TRUNC the data is consumed without copying. */
2487		if (!(flags & MSG_TRUNC)) {
2488			err = skb_copy_datagram_msg(skb, offset, msg, used);
2489			if (err) {
2490				/* Exception. Bailout! */
2491				if (!copied)
2492					copied = -EFAULT;
2493				break;
2494			}
2495		}
2496
2497		WRITE_ONCE(*seq, *seq + used);
2498		copied += used;
2499		len -= used;
2500
2501		tcp_rcv_space_adjust(sk);
2502
2503skip_copy:
		/* The read position moved past the urgent byte: clear the
		 * urgent state.
		 */
2504		if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
2505			WRITE_ONCE(tp->urg_data, 0);
2506			tcp_fast_path_check(sk);
2507		}
2508
2509		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2510			tcp_update_recv_tstamps(skb, tss);
2511			*cmsg_flags |= TCP_CMSG_TS;
2512		}
2513
		/* The skb still holds unread data: keep it queued. */
2514		if (used + offset < skb->len)
2515			continue;
2516
2517		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2518			goto found_fin_ok;
2519		if (!(flags & MSG_PEEK))
2520			tcp_eat_recv_skb(sk, skb);
2521		continue;
2522
2523found_fin_ok:
2524		/* Process the FIN. */
2525		WRITE_ONCE(*seq, *seq + 1);
2526		if (!(flags & MSG_PEEK))
2527			tcp_eat_recv_skb(sk, skb);
2528		break;
2529	} while (len > 0);
2530
2531	/* According to UNIX98, msg_name/msg_namelen are ignored
2532	 * on connected socket. I was just happy when found this 8) --ANK
2533	 */
2534
2535	/* Clean up data we have read: This will do ACK frames. */
2536	tcp_cleanup_rbuf(sk, copied);
2537	return copied;
2538
2539out:
2540	return err;
2541
2542recv_urg:
2543	err = tcp_recv_urg(sk, msg, len, flags);
2544	goto out;
2545
2546recv_sndq:
2547	err = tcp_peek_sndq(sk, msg, len);
2548	goto out;
2549}
2550
2551int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
2552		int *addr_len)
2553{
2554	int cmsg_flags = 0, ret;
2555	struct scm_timestamping_internal tss;
2556
2557	if (unlikely(flags & MSG_ERRQUEUE))
2558		return inet_recv_error(sk, msg, len, addr_len);
2559
2560	if (sk_can_busy_loop(sk) &&
2561	    skb_queue_empty_lockless(&sk->sk_receive_queue) &&
2562	    sk->sk_state == TCP_ESTABLISHED)
2563		sk_busy_loop(sk, flags & MSG_DONTWAIT);
2564
2565	lock_sock(sk);
2566	ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags);
2567	release_sock(sk);
2568
2569	if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) {
2570		if (cmsg_flags & TCP_CMSG_TS)
2571			tcp_recv_timestamp(msg, sk, &tss);
2572		if (msg->msg_get_inq) {
2573			msg->msg_inq = tcp_inq_hint(sk);
2574			if (cmsg_flags & TCP_CMSG_INQ)
2575				put_cmsg(msg, SOL_TCP, TCP_CM_INQ,
2576					 sizeof(msg->msg_inq), &msg->msg_inq);
2577		}
2578	}
2579	return ret;
2580}
2581EXPORT_SYMBOL(tcp_recvmsg);
2582
2583void tcp_set_state(struct sock *sk, int state)
2584{
2585	int oldstate = sk->sk_state;
2586
2587	/* We defined a new enum for TCP states that are exported in BPF
2588	 * so as not force the internal TCP states to be frozen. The
2589	 * following checks will detect if an internal state value ever
2590	 * differs from the BPF value. If this ever happens, then we will
2591	 * need to remap the internal value to the BPF value before calling
2592	 * tcp_call_bpf_2arg.
2593	 */
2594	BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
2595	BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
2596	BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
2597	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
2598	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
2599	BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
2600	BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
2601	BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
2602	BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
2603	BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
2604	BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
2605	BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
2606	BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
2607
2608	/* bpf uapi header bpf.h defines an anonymous enum with values
2609	 * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux
2610	 * is able to emit this enum in DWARF due to the above BUILD_BUG_ON.
2611	 * But clang built vmlinux does not have this enum in DWARF
2612	 * since clang removes the above code before generating IR/debuginfo.
2613	 * Let us explicitly emit the type debuginfo to ensure the
2614	 * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF
2615	 * regardless of which compiler is used.
2616	 */
2617	BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED);
2618
2619	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
2620		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
2621
2622	switch (state) {
2623	case TCP_ESTABLISHED:
2624		if (oldstate != TCP_ESTABLISHED)
2625			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2626		break;
2627
2628	case TCP_CLOSE:
2629		if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
2630			TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
2631
2632		sk->sk_prot->unhash(sk);
2633		if (inet_csk(sk)->icsk_bind_hash &&
2634		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
2635			inet_put_port(sk);
2636		fallthrough;
2637	default:
2638		if (oldstate == TCP_ESTABLISHED)
2639			TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2640	}
2641
2642	/* Change state AFTER socket is unhashed to avoid closed
2643	 * socket sitting in hash tables.
2644	 */
2645	inet_sk_state_store(sk, state);
2646}
2647EXPORT_SYMBOL_GPL(tcp_set_state);
2648
2649/*
2650 *	State processing on a close. This implements the state shift for
2651 *	sending our FIN frame. Note that we only send a FIN for some
2652 *	states. A shutdown() may have already sent the FIN, or we may be
2653 *	closed.
2654 */
2655
2656static const unsigned char new_state[16] = {
2657  /* current state:        new state:      action:	*/
2658  [0 /* (Invalid) */]	= TCP_CLOSE,
2659  [TCP_ESTABLISHED]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2660  [TCP_SYN_SENT]	= TCP_CLOSE,
2661  [TCP_SYN_RECV]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2662  [TCP_FIN_WAIT1]	= TCP_FIN_WAIT1,
2663  [TCP_FIN_WAIT2]	= TCP_FIN_WAIT2,
2664  [TCP_TIME_WAIT]	= TCP_CLOSE,
2665  [TCP_CLOSE]		= TCP_CLOSE,
2666  [TCP_CLOSE_WAIT]	= TCP_LAST_ACK  | TCP_ACTION_FIN,
2667  [TCP_LAST_ACK]	= TCP_LAST_ACK,
2668  [TCP_LISTEN]		= TCP_CLOSE,
2669  [TCP_CLOSING]		= TCP_CLOSING,
2670  [TCP_NEW_SYN_RECV]	= TCP_CLOSE,	/* should not happen ! */
2671};
2672
2673static int tcp_close_state(struct sock *sk)
2674{
2675	int next = (int)new_state[sk->sk_state];
2676	int ns = next & TCP_STATE_MASK;
2677
2678	tcp_set_state(sk, ns);
2679
2680	return next & TCP_ACTION_FIN;
2681}
2682
2683/*
2684 *	Shutdown the sending side of a connection. Much like close except
2685 *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
2686 */
2687
2688void tcp_shutdown(struct sock *sk, int how)
2689{
2690	/*	We need to grab some memory, and put together a FIN,
2691	 *	and then put it into the queue to be sent.
2692	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2693	 */
2694	if (!(how & SEND_SHUTDOWN))
2695		return;
2696
2697	/* If we've already sent a FIN, or it's a closed state, skip this. */
2698	if ((1 << sk->sk_state) &
2699	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2700	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2701		/* Clear out any half completed packets.  FIN if needed. */
2702		if (tcp_close_state(sk))
2703			tcp_send_fin(sk);
2704	}
2705}
2706EXPORT_SYMBOL(tcp_shutdown);
2707
2708int tcp_orphan_count_sum(void)
2709{
2710	int i, total = 0;
2711
2712	for_each_possible_cpu(i)
2713		total += per_cpu(tcp_orphan_count, i);
2714
2715	return max(total, 0);
2716}
2717
2718static int tcp_orphan_cache;
2719static struct timer_list tcp_orphan_timer;
2720#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)
2721
2722static void tcp_orphan_update(struct timer_list *unused)
2723{
2724	WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
2725	mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
2726}
2727
2728static bool tcp_too_many_orphans(int shift)
2729{
2730	return READ_ONCE(tcp_orphan_cache) << shift >
2731		READ_ONCE(sysctl_tcp_max_orphans);
2732}
2733
2734bool tcp_check_oom(struct sock *sk, int shift)
2735{
2736	bool too_many_orphans, out_of_socket_memory;
2737
2738	too_many_orphans = tcp_too_many_orphans(shift);
2739	out_of_socket_memory = tcp_out_of_memory(sk);
2740
2741	if (too_many_orphans)
2742		net_info_ratelimited("too many orphaned sockets\n");
2743	if (out_of_socket_memory)
2744		net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2745	return too_many_orphans || out_of_socket_memory;
2746}
2747
/* Close a TCP socket; tcp_close() calls this with the socket lock held.
 *
 * @sk:      socket to close
 * @timeout: passed on to sk_stream_wait_close()
 *
 * Marks both directions as shut down, discards unread receive data and
 * then either aborts the connection (repair mode, unread data, zero
 * linger) or starts a normal close through tcp_close_state().  After
 * that the socket is orphaned and, if it ended up in TCP_CLOSE,
 * destroyed.
 *
 * A reference is taken with sock_hold() once the adjudge_to_death label
 * is reached and is not dropped here; tcp_close() calls sock_put() after
 * this function returns.
 */
2748void __tcp_close(struct sock *sk, long timeout)
2749{
2750	struct sk_buff *skb;
2751	int data_was_unread = 0;
2752	int state;
2753
2754	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
2755
2756	if (sk->sk_state == TCP_LISTEN) {
2757		tcp_set_state(sk, TCP_CLOSE);
2758
2759		/* Special case. */
2760		inet_csk_listen_stop(sk);
2761
2762		goto adjudge_to_death;
2763	}
2764
2765	/*  We need to flush the recv. buffs.  We do this only on the
2766	 *  descriptor close, not protocol-sourced closes, because the
2767	 *  reader process may not have drained the data yet!
2768	 */
2769	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2770		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
2771
		/* A FIN takes one sequence number but is not data. */
2772		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2773			len--;
2774		data_was_unread += len;
2775		__kfree_skb(skb);
2776	}
2777
2778	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
2779	if (sk->sk_state == TCP_CLOSE)
2780		goto adjudge_to_death;
2781
2782	/* As outlined in RFC 2525, section 2.17, we send a RST here because
2783	 * data was lost. To witness the awful effects of the old behavior of
2784	 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
2785	 * GET in an FTP client, suspend the process, wait for the client to
2786	 * advertise a zero window, then kill -9 the FTP client, wheee...
2787	 * Note: timeout is always zero in such a case.
2788	 */
2789	if (unlikely(tcp_sk(sk)->repair)) {
2790		sk->sk_prot->disconnect(sk, 0);
2791	} else if (data_was_unread) {
2792		/* Unread data was tossed, zap the connection. */
2793		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2794		tcp_set_state(sk, TCP_CLOSE);
2795		tcp_send_active_reset(sk, sk->sk_allocation);
2796	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2797		/* Check zero linger _after_ checking for unread data. */
2798		sk->sk_prot->disconnect(sk, 0);
2799		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2800	} else if (tcp_close_state(sk)) {
2801		/* We FIN if the application ate all the data before
2802		 * zapping the connection.
2803		 */
2804
2805		/* RED-PEN. Formally speaking, we have broken TCP state
2806		 * machine. State transitions:
2807		 *
2808		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
2809		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
2810		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
2811		 *
2812		 * are legal only when FIN has been sent (i.e. in window),
2813		 * rather than queued out of window. Purists blame.
2814		 *
2815		 * F.e. "RFC state" is ESTABLISHED,
2816		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
2817		 *
2818		 * The visible declinations are that sometimes
2819		 * we enter time-wait state, when it is not required really
2820		 * (harmless), do not send active resets, when they are
2821		 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
2822		 * they look as CLOSING or LAST_ACK for Linux)
2823		 * Probably, I missed some more holelets.
2824		 * 						--ANK
2825		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
2826		 * in a single packet! (May consider it later but will
2827		 * probably need API support or TCP_CORK SYN-ACK until
2828		 * data is written and socket is closed.)
2829		 */
2830		tcp_send_fin(sk);
2831	}
2832
2833	sk_stream_wait_close(sk, timeout);
2834
2835adjudge_to_death:
	/* Remember the state before the backlog is processed, so that a
	 * transition to TCP_CLOSE made by the backlog can be detected
	 * below.
	 */
2836	state = sk->sk_state;
2837	sock_hold(sk);
2838	sock_orphan(sk);
2839
2840	local_bh_disable();
2841	bh_lock_sock(sk);
2842	/* remove backlog if any, without releasing ownership. */
2843	__release_sock(sk);
2844
2845	this_cpu_inc(tcp_orphan_count);
2846
2847	/* Have we already been destroyed by a softirq or backlog? */
2848	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2849		goto out;
2850
2851	/*	This is a (useful) BSD violating of the RFC. There is a
2852	 *	problem with TCP as specified in that the other end could
2853	 *	keep a socket open forever with no application left this end.
2854	 *	We use a 1 minute timeout (about the same as BSD) then kill
2855	 *	our end. If they send after that then tough - BUT: long enough
2856	 *	that we won't make the old 4*rto = almost no time - whoops
2857	 *	reset mistake.
2858	 *
2859	 *	Nope, it was not mistake. It is really desired behaviour
2860	 *	f.e. on http servers, when such sockets are useless, but
2861	 *	consume significant resources. Let's do it with special
2862	 *	linger2	option.					--ANK
2863	 */
2864
2865	if (sk->sk_state == TCP_FIN_WAIT2) {
2866		struct tcp_sock *tp = tcp_sk(sk);
		/* Negative linger2: abort at once with a reset. */
2867		if (tp->linger2 < 0) {
2868			tcp_set_state(sk, TCP_CLOSE);
2869			tcp_send_active_reset(sk, GFP_ATOMIC);
2870			__NET_INC_STATS(sock_net(sk),
2871					LINUX_MIB_TCPABORTONLINGER);
2872		} else {
2873			const int tmo = tcp_fin_time(sk);
2874
			/* A FIN timeout longer than TCP_TIMEWAIT_LEN arms
			 * the keepalive timer for the excess; otherwise the
			 * socket goes to time-wait processing right away.
			 */
2875			if (tmo > TCP_TIMEWAIT_LEN) {
2876				inet_csk_reset_keepalive_timer(sk,
2877						tmo - TCP_TIMEWAIT_LEN);
2878			} else {
2879				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2880				goto out;
2881			}
2882		}
2883	}
	/* Still not closed: abort with a reset when tcp_check_oom() trips,
	 * or close silently when check_net() fails.
	 */
2884	if (sk->sk_state != TCP_CLOSE) {
2885		if (tcp_check_oom(sk, 0)) {
2886			tcp_set_state(sk, TCP_CLOSE);
2887			tcp_send_active_reset(sk, GFP_ATOMIC);
2888			__NET_INC_STATS(sock_net(sk),
2889					LINUX_MIB_TCPABORTONMEMORY);
2890		} else if (!check_net(sock_net(sk))) {
2891			/* Not possible to send reset; just close */
2892			tcp_set_state(sk, TCP_CLOSE);
2893		}
2894	}
2895
2896	if (sk->sk_state == TCP_CLOSE) {
2897		struct request_sock *req;
2898
2899		req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
2900						lockdep_sock_is_held(sk));
2901		/* We could get here with a non-NULL req if the socket is
2902		 * aborted (e.g., closed with unread data) before 3WHS
2903		 * finishes.
2904		 */
2905		if (req)
2906			reqsk_fastopen_remove(sk, req, false);
2907		inet_csk_destroy_sock(sk);
2908	}
2909	/* Otherwise, socket is reprieved until protocol close. */
2910
2911out:
2912	bh_unlock_sock(sk);
2913	local_bh_enable();
2914}
2915
/* Close entry point for TCP sockets.
 *
 * Runs __tcp_close() under the socket lock and then drops one socket
 * reference with sock_put(); __tcp_close() takes a reference with
 * sock_hold() before it orphans the socket.
 *
 * @timeout is handed through to __tcp_close() unchanged.
 */
2916void tcp_close(struct sock *sk, long timeout)
2917{
2918	lock_sock(sk);
2919	__tcp_close(sk, timeout);
2920	release_sock(sk);
2921	sock_put(sk);
2922}
2923EXPORT_SYMBOL(tcp_close);
2924
2925/* These states need RST on ABORT according to RFC793 */
2926
2927static inline bool tcp_need_reset(int state)
2928{
2929	return (1 << state) &
2930	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2931		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2932}
2933
2934static void tcp_rtx_queue_purge(struct sock *sk)
2935{
2936	struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
2937
2938	tcp_sk(sk)->highest_sack = NULL;
2939	while (p) {
2940		struct sk_buff *skb = rb_to_skb(p);
2941
2942		p = rb_next(p);
2943		/* Since we are deleting whole queue, no need to
2944		 * list_del(&skb->tcp_tsorted_anchor)
2945		 */
2946		tcp_rtx_queue_unlink(skb, sk);
2947		tcp_wmem_free_skb(sk, skb);
2948	}
2949}
2950
2951void tcp_write_queue_purge(struct sock *sk)
2952{
2953	struct sk_buff *skb;
2954
2955	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
2956	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
2957		tcp_skb_tsorted_anchor_cleanup(skb);
2958		tcp_wmem_free_skb(sk, skb);
2959	}
2960	tcp_rtx_queue_purge(sk);
2961	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
2962	tcp_clear_all_retrans_hints(tcp_sk(sk));
2963	tcp_sk(sk)->packets_out = 0;
2964	inet_csk(sk)->icsk_backoff = 0;
2965}
2966
/* Abort the connection on @sk and reset the socket to a pristine
 * TCP_CLOSE state.
 *
 * @sk:    socket to disconnect
 * @flags: not used in this function
 *
 * Depending on the state the socket was in, the listener is stopped or a
 * reset is sent, and sk->sk_err is set.  All queues are then purged and
 * the per-connection state (timers, sequence and congestion control
 * state, statistics, fastopen data, cached routes, the page fragment) is
 * reinitialised.
 *
 * Returns 0, or -EBUSY without touching the socket when
 * sk->sk_wait_pending is set.
 */
2967int tcp_disconnect(struct sock *sk, int flags)
2968{
2969	struct inet_sock *inet = inet_sk(sk);
2970	struct inet_connection_sock *icsk = inet_csk(sk);
2971	struct tcp_sock *tp = tcp_sk(sk);
2972	int old_state = sk->sk_state;
2973	u32 seq;
2974
2975	/* Deny disconnect if other threads are blocked in sk_wait_event()
2976	 * or inet_wait_for_connect().
2977	 */
2978	if (sk->sk_wait_pending)
2979		return -EBUSY;
2980
2981	if (old_state != TCP_CLOSE)
2982		tcp_set_state(sk, TCP_CLOSE);
2983
2984	/* ABORT function of RFC793 */
2985	if (old_state == TCP_LISTEN) {
2986		inet_csk_listen_stop(sk);
2987	} else if (unlikely(tp->repair)) {
2988		WRITE_ONCE(sk->sk_err, ECONNABORTED);
2989	} else if (tcp_need_reset(old_state) ||
2990		   (tp->snd_nxt != tp->write_seq &&
2991		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2992		/* The last check adjusts for discrepancy of Linux wrt. RFC
2993		 * states
2994		 */
2995		tcp_send_active_reset(sk, gfp_any());
2996		WRITE_ONCE(sk->sk_err, ECONNRESET);
2997	} else if (old_state == TCP_SYN_SENT)
2998		WRITE_ONCE(sk->sk_err, ECONNRESET);
2999
	/* Stop the timers and drop all queued data in both directions. */
3000	tcp_clear_xmit_timers(sk);
3001	__skb_queue_purge(&sk->sk_receive_queue);
3002	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
3003	WRITE_ONCE(tp->urg_data, 0);
3004	tcp_write_queue_purge(sk);
3005	tcp_fastopen_active_disable_ofo_check(sk);
3006	skb_rbtree_purge(&tp->out_of_order_queue);
3007
3008	inet->inet_dport = 0;
3009
3010	inet_bhash2_reset_saddr(sk);
3011
3012	WRITE_ONCE(sk->sk_shutdown, 0);
3013	sock_reset_flag(sk, SOCK_DONE);
3014	tp->srtt_us = 0;
3015	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
3016	tp->rcv_rtt_last_tsecr = 0;
3017
	/* Advance write_seq by max_window + 2, never leaving it at 0. */
3018	seq = tp->write_seq + tp->max_window + 2;
3019	if (!seq)
3020		seq = 1;
3021	WRITE_ONCE(tp->write_seq, seq);
3022
	/* Reset the retransmission and congestion control state. */
3023	icsk->icsk_backoff = 0;
3024	icsk->icsk_probes_out = 0;
3025	icsk->icsk_probes_tstamp = 0;
3026	icsk->icsk_rto = TCP_TIMEOUT_INIT;
3027	icsk->icsk_rto_min = TCP_RTO_MIN;
3028	icsk->icsk_delack_max = TCP_DELACK_MAX;
3029	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
3030	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
3031	tp->snd_cwnd_cnt = 0;
3032	tp->is_cwnd_limited = 0;
3033	tp->max_packets_out = 0;
3034	tp->window_clamp = 0;
3035	tp->delivered = 0;
3036	tp->delivered_ce = 0;
	/* Give the congestion control module a chance to release its
	 * state before the private area is wiped.
	 */
3037	if (icsk->icsk_ca_ops->release)
3038		icsk->icsk_ca_ops->release(sk);
3039	memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
3040	icsk->icsk_ca_initialized = 0;
3041	tcp_set_ca_state(sk, TCP_CA_Open);
3042	tp->is_sack_reneg = 0;
3043	tcp_clear_retrans(tp);
3044	tp->total_retrans = 0;
3045	inet_csk_delack_init(sk);
3046	/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
3047	 * issue in __tcp_select_window()
3048	 */
3049	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
3050	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
	/* Drop the cached output and input routes. */
3051	__sk_dst_reset(sk);
3052	dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL));
3053	tcp_saved_syn_free(tp);
	/* Clear the per-connection counters and loss detection state. */
3054	tp->compressed_ack = 0;
3055	tp->segs_in = 0;
3056	tp->segs_out = 0;
3057	tp->bytes_sent = 0;
3058	tp->bytes_acked = 0;
3059	tp->bytes_received = 0;
3060	tp->bytes_retrans = 0;
3061	tp->data_segs_in = 0;
3062	tp->data_segs_out = 0;
3063	tp->duplicate_sack[0].start_seq = 0;
3064	tp->duplicate_sack[0].end_seq = 0;
3065	tp->dsack_dups = 0;
3066	tp->reord_seen = 0;
3067	tp->retrans_out = 0;
3068	tp->sacked_out = 0;
3069	tp->tlp_high_seq = 0;
3070	tp->last_oow_ack_time = 0;
3071	tp->plb_rehash = 0;
3072	/* There's a bubble in the pipe until at least the first ACK. */
3073	tp->app_limited = ~0U;
3074	tp->rate_app_limited = 1;
3075	tp->rack.mstamp = 0;
3076	tp->rack.advanced = 0;
3077	tp->rack.reo_wnd_steps = 1;
3078	tp->rack.last_delivered = 0;
3079	tp->rack.reo_wnd_persist = 0;
3080	tp->rack.dsack_seen = 0;
3081	tp->syn_data_acked = 0;
3082	tp->rx_opt.saw_tstamp = 0;
3083	tp->rx_opt.dsack = 0;
3084	tp->rx_opt.num_sacks = 0;
3085	tp->rcv_ooopack = 0;
3086
3087
3088	/* Clean up fastopen related fields */
3089	tcp_free_fastopen_req(tp);
3090	inet->defer_connect = 0;
3091	tp->fastopen_client_fail = 0;
3092
	/* A socket that still has a local port must still be bound. */
3093	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
3094
	/* Release the cached page fragment, if any. */
3095	if (sk->sk_frag.page) {
3096		put_page(sk->sk_frag.page);
3097		sk->sk_frag.page = NULL;
3098		sk->sk_frag.offset = 0;
3099	}
3100	sk_error_report(sk);
3101	return 0;
3102}
3103EXPORT_SYMBOL(tcp_disconnect);
3104
3105static inline bool tcp_can_repair_sock(const struct sock *sk)
3106{
3107	return sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
3108		(sk->sk_state != TCP_LISTEN);
3109}
3110
3111static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
3112{
3113	struct tcp_repair_window opt;
3114
3115	if (!tp->repair)
3116		return -EPERM;
3117
3118	if (len != sizeof(opt))
3119		return -EINVAL;
3120
3121	if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
3122		return -EFAULT;
3123
3124	if (opt.max_window < opt.snd_wnd)
3125		return -EINVAL;
3126
3127	if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
3128		return -EINVAL;
3129
3130	if (after(opt.rcv_wup, tp->rcv_nxt))
3131		return -EINVAL;
3132
3133	tp->snd_wl1	= opt.snd_wl1;
3134	tp->snd_wnd	= opt.snd_wnd;
3135	tp->max_window	= opt.max_window;
3136
3137	tp->rcv_wnd	= opt.rcv_wnd;
3138	tp->rcv_wup	= opt.rcv_wup;
3139
3140	return 0;
3141}
3142
3143static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
3144		unsigned int len)
3145{
3146	struct tcp_sock *tp = tcp_sk(sk);
3147	struct tcp_repair_opt opt;
3148	size_t offset = 0;
3149
3150	while (len >= sizeof(opt)) {
3151		if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt)))
3152			return -EFAULT;
3153
3154		offset += sizeof(opt);
3155		len -= sizeof(opt);
3156
3157		switch (opt.opt_code) {
3158		case TCPOPT_MSS:
3159			tp->rx_opt.mss_clamp = opt.opt_val;
3160			tcp_mtup_init(sk);
3161			break;
3162		case TCPOPT_WINDOW:
3163			{
3164				u16 snd_wscale = opt.opt_val & 0xFFFF;
3165				u16 rcv_wscale = opt.opt_val >> 16;
3166
3167				if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
3168					return -EFBIG;
3169
3170				tp->rx_opt.snd_wscale = snd_wscale;
3171				tp->rx_opt.rcv_wscale = rcv_wscale;
3172				tp->rx_opt.wscale_ok = 1;
3173			}
3174			break;
3175		case TCPOPT_SACK_PERM:
3176			if (opt.opt_val != 0)
3177				return -EINVAL;
3178
3179			tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
3180			break;
3181		case TCPOPT_TIMESTAMP:
3182			if (opt.opt_val != 0)
3183				return -EINVAL;
3184
3185			tp->rx_opt.tstamp_ok = 1;
3186			break;
3187		}
3188	}
3189
3190	return 0;
3191}
3192
3193DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
3194EXPORT_SYMBOL(tcp_tx_delay_enabled);
3195
3196static void tcp_enable_tx_delay(void)
3197{
3198	if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
3199		static int __tcp_tx_delay_enabled = 0;
3200
3201		if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
3202			static_branch_enable(&tcp_tx_delay_enabled);
3203			pr_info("TCP_TX_DELAY enabled\n");
3204		}
3205	}
3206}
3207
3208/* When set indicates to always queue non-full frames.  Later the user clears
3209 * this option and we transmit any pending partial frames in the queue.  This is
3210 * meant to be used alongside sendfile() to get properly filled frames when the
3211 * user (for example) must write out headers with a write() call first and then
3212 * use sendfile to send out the data parts.
3213 *
3214 * TCP_CORK can be set together with TCP_NODELAY and it is stronger than
3215 * TCP_NODELAY.
3216 */
3217void __tcp_sock_set_cork(struct sock *sk, bool on)
3218{
3219	struct tcp_sock *tp = tcp_sk(sk);
3220
3221	if (on) {
3222		tp->nonagle |= TCP_NAGLE_CORK;
3223	} else {
3224		tp->nonagle &= ~TCP_NAGLE_CORK;
3225		if (tp->nonagle & TCP_NAGLE_OFF)
3226			tp->nonagle |= TCP_NAGLE_PUSH;
3227		tcp_push_pending_frames(sk);
3228	}
3229}
3230
3231void tcp_sock_set_cork(struct sock *sk, bool on)
3232{
3233	lock_sock(sk);
3234	__tcp_sock_set_cork(sk, on);
3235	release_sock(sk);
3236}
3237EXPORT_SYMBOL(tcp_sock_set_cork);
3238
3239/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is
3240 * remembered, but it is not activated until cork is cleared.
3241 *
3242 * However, when TCP_NODELAY is set we make an explicit push, which overrides
3243 * even TCP_CORK for currently queued segments.
3244 */
3245void __tcp_sock_set_nodelay(struct sock *sk, bool on)
3246{
3247	if (on) {
3248		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
3249		tcp_push_pending_frames(sk);
3250	} else {
3251		tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
3252	}
3253}
3254
3255void tcp_sock_set_nodelay(struct sock *sk)
3256{
3257	lock_sock(sk);
3258	__tcp_sock_set_nodelay(sk, true);
3259	release_sock(sk);
3260}
3261EXPORT_SYMBOL(tcp_sock_set_nodelay);
3262
3263static void __tcp_sock_set_quickack(struct sock *sk, int val)
3264{
3265	if (!val) {
3266		inet_csk_enter_pingpong_mode(sk);
3267		return;
3268	}
3269
3270	inet_csk_exit_pingpong_mode(sk);
3271	if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3272	    inet_csk_ack_scheduled(sk)) {
3273		inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
3274		tcp_cleanup_rbuf(sk, 1);
3275		if (!(val & 1))
3276			inet_csk_enter_pingpong_mode(sk);
3277	}
3278}
3279
3280void tcp_sock_set_quickack(struct sock *sk, int val)
3281{
3282	lock_sock(sk);
3283	__tcp_sock_set_quickack(sk, val);
3284	release_sock(sk);
3285}
3286EXPORT_SYMBOL(tcp_sock_set_quickack);
3287
3288int tcp_sock_set_syncnt(struct sock *sk, int val)
3289{
3290	if (val < 1 || val > MAX_TCP_SYNCNT)
3291		return -EINVAL;
3292
3293	lock_sock(sk);
3294	WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val);
3295	release_sock(sk);
3296	return 0;
3297}
3298EXPORT_SYMBOL(tcp_sock_set_syncnt);
3299
3300void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
3301{
3302	lock_sock(sk);
3303	WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val);
3304	release_sock(sk);
3305}
3306EXPORT_SYMBOL(tcp_sock_set_user_timeout);
3307
3308int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
3309{
3310	struct tcp_sock *tp = tcp_sk(sk);
3311
3312	if (val < 1 || val > MAX_TCP_KEEPIDLE)
3313		return -EINVAL;
3314
3315	/* Paired with WRITE_ONCE() in keepalive_time_when() */
3316	WRITE_ONCE(tp->keepalive_time, val * HZ);
3317	if (sock_flag(sk, SOCK_KEEPOPEN) &&
3318	    !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
3319		u32 elapsed = keepalive_time_elapsed(tp);
3320
3321		if (tp->keepalive_time > elapsed)
3322			elapsed = tp->keepalive_time - elapsed;
3323		else
3324			elapsed = 0;
3325		inet_csk_reset_keepalive_timer(sk, elapsed);
3326	}
3327
3328	return 0;
3329}
3330
3331int tcp_sock_set_keepidle(struct sock *sk, int val)
3332{
3333	int err;
3334
3335	lock_sock(sk);
3336	err = tcp_sock_set_keepidle_locked(sk, val);
3337	release_sock(sk);
3338	return err;
3339}
3340EXPORT_SYMBOL(tcp_sock_set_keepidle);
3341
3342int tcp_sock_set_keepintvl(struct sock *sk, int val)
3343{
3344	if (val < 1 || val > MAX_TCP_KEEPINTVL)
3345		return -EINVAL;
3346
3347	lock_sock(sk);
3348	WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ);
3349	release_sock(sk);
3350	return 0;
3351}
3352EXPORT_SYMBOL(tcp_sock_set_keepintvl);
3353
3354int tcp_sock_set_keepcnt(struct sock *sk, int val)
3355{
3356	if (val < 1 || val > MAX_TCP_KEEPCNT)
3357		return -EINVAL;
3358
3359	lock_sock(sk);
3360	/* Paired with READ_ONCE() in keepalive_probes() */
3361	WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val);
3362	release_sock(sk);
3363	return 0;
3364}
3365EXPORT_SYMBOL(tcp_sock_set_keepcnt);
3366
3367int tcp_set_window_clamp(struct sock *sk, int val)
3368{
3369	struct tcp_sock *tp = tcp_sk(sk);
3370
3371	if (!val) {
3372		if (sk->sk_state != TCP_CLOSE)
3373			return -EINVAL;
3374		tp->window_clamp = 0;
3375	} else {
3376		tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
3377			SOCK_MIN_RCVBUF / 2 : val;
3378		tp->rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp);
3379	}
3380	return 0;
3381}
3382
3383/*
3384 *	Socket option code for TCP.
3385 */
3386int do_tcp_setsockopt(struct sock *sk, int level, int optname,
3387		      sockptr_t optval, unsigned int optlen)
3388{
3389	struct tcp_sock *tp = tcp_sk(sk);
3390	struct inet_connection_sock *icsk = inet_csk(sk);
3391	struct net *net = sock_net(sk);
3392	int val;
3393	int err = 0;
3394
3395	/* These are data/string values, all the others are ints */
3396	switch (optname) {
3397	case TCP_CONGESTION: {
3398		char name[TCP_CA_NAME_MAX];
3399
3400		if (optlen < 1)
3401			return -EINVAL;
3402
3403		val = strncpy_from_sockptr(name, optval,
3404					min_t(long, TCP_CA_NAME_MAX-1, optlen));
3405		if (val < 0)
3406			return -EFAULT;
3407		name[val] = 0;
3408
3409		sockopt_lock_sock(sk);
3410		err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(),
3411						 sockopt_ns_capable(sock_net(sk)->user_ns,
3412								    CAP_NET_ADMIN));
3413		sockopt_release_sock(sk);
3414		return err;
3415	}
3416	case TCP_ULP: {
3417		char name[TCP_ULP_NAME_MAX];
3418
3419		if (optlen < 1)
3420			return -EINVAL;
3421
3422		val = strncpy_from_sockptr(name, optval,
3423					min_t(long, TCP_ULP_NAME_MAX - 1,
3424					      optlen));
3425		if (val < 0)
3426			return -EFAULT;
3427		name[val] = 0;
3428
3429		sockopt_lock_sock(sk);
3430		err = tcp_set_ulp(sk, name);
3431		sockopt_release_sock(sk);
3432		return err;
3433	}
3434	case TCP_FASTOPEN_KEY: {
3435		__u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
3436		__u8 *backup_key = NULL;
3437
3438		/* Allow a backup key as well to facilitate key rotation
3439		 * First key is the active one.
3440		 */
3441		if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
3442		    optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
3443			return -EINVAL;
3444
3445		if (copy_from_sockptr(key, optval, optlen))
3446			return -EFAULT;
3447
3448		if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
3449			backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
3450
3451		return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
3452	}
3453	default:
3454		/* fallthru */
3455		break;
3456	}
3457
3458	if (optlen < sizeof(int))
3459		return -EINVAL;
3460
3461	if (copy_from_sockptr(&val, optval, sizeof(val)))
3462		return -EFAULT;
3463
3464	sockopt_lock_sock(sk);
3465
3466	switch (optname) {
3467	case TCP_MAXSEG:
3468		/* Values greater than interface MTU won't take effect. However
3469		 * at the point when this call is done we typically don't yet
3470		 * know which interface is going to be used
3471		 */
3472		if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
3473			err = -EINVAL;
3474			break;
3475		}
3476		tp->rx_opt.user_mss = val;
3477		break;
3478
3479	case TCP_NODELAY:
3480		__tcp_sock_set_nodelay(sk, val);
3481		break;
3482
3483	case TCP_THIN_LINEAR_TIMEOUTS:
3484		if (val < 0 || val > 1)
3485			err = -EINVAL;
3486		else
3487			tp->thin_lto = val;
3488		break;
3489
3490	case TCP_THIN_DUPACK:
3491		if (val < 0 || val > 1)
3492			err = -EINVAL;
3493		break;
3494
3495	case TCP_REPAIR:
3496		if (!tcp_can_repair_sock(sk))
3497			err = -EPERM;
3498		else if (val == TCP_REPAIR_ON) {
3499			tp->repair = 1;
3500			sk->sk_reuse = SK_FORCE_REUSE;
3501			tp->repair_queue = TCP_NO_QUEUE;
3502		} else if (val == TCP_REPAIR_OFF) {
3503			tp->repair = 0;
3504			sk->sk_reuse = SK_NO_REUSE;
3505			tcp_send_window_probe(sk);
3506		} else if (val == TCP_REPAIR_OFF_NO_WP) {
3507			tp->repair = 0;
3508			sk->sk_reuse = SK_NO_REUSE;
3509		} else
3510			err = -EINVAL;
3511
3512		break;
3513
3514	case TCP_REPAIR_QUEUE:
3515		if (!tp->repair)
3516			err = -EPERM;
3517		else if ((unsigned int)val < TCP_QUEUES_NR)
3518			tp->repair_queue = val;
3519		else
3520			err = -EINVAL;
3521		break;
3522
3523	case TCP_QUEUE_SEQ:
3524		if (sk->sk_state != TCP_CLOSE) {
3525			err = -EPERM;
3526		} else if (tp->repair_queue == TCP_SEND_QUEUE) {
3527			if (!tcp_rtx_queue_empty(sk))
3528				err = -EPERM;
3529			else
3530				WRITE_ONCE(tp->write_seq, val);
3531		} else if (tp->repair_queue == TCP_RECV_QUEUE) {
3532			if (tp->rcv_nxt != tp->copied_seq) {
3533				err = -EPERM;
3534			} else {
3535				WRITE_ONCE(tp->rcv_nxt, val);
3536				WRITE_ONCE(tp->copied_seq, val);
3537			}
3538		} else {
3539			err = -EINVAL;
3540		}
3541		break;
3542
3543	case TCP_REPAIR_OPTIONS:
3544		if (!tp->repair)
3545			err = -EINVAL;
3546		else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent)
3547			err = tcp_repair_options_est(sk, optval, optlen);
3548		else
3549			err = -EPERM;
3550		break;
3551
3552	case TCP_CORK:
3553		__tcp_sock_set_cork(sk, val);
3554		break;
3555
3556	case TCP_KEEPIDLE:
3557		err = tcp_sock_set_keepidle_locked(sk, val);
3558		break;
3559	case TCP_KEEPINTVL:
3560		if (val < 1 || val > MAX_TCP_KEEPINTVL)
3561			err = -EINVAL;
3562		else
3563			WRITE_ONCE(tp->keepalive_intvl, val * HZ);
3564		break;
3565	case TCP_KEEPCNT:
3566		if (val < 1 || val > MAX_TCP_KEEPCNT)
3567			err = -EINVAL;
3568		else
3569			WRITE_ONCE(tp->keepalive_probes, val);
3570		break;
3571	case TCP_SYNCNT:
3572		if (val < 1 || val > MAX_TCP_SYNCNT)
3573			err = -EINVAL;
3574		else
3575			WRITE_ONCE(icsk->icsk_syn_retries, val);
3576		break;
3577
3578	case TCP_SAVE_SYN:
3579		/* 0: disable, 1: enable, 2: start from ether_header */
3580		if (val < 0 || val > 2)
3581			err = -EINVAL;
3582		else
3583			tp->save_syn = val;
3584		break;
3585
3586	case TCP_LINGER2:
3587		if (val < 0)
3588			WRITE_ONCE(tp->linger2, -1);
3589		else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
3590			WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
3591		else
3592			WRITE_ONCE(tp->linger2, val * HZ);
3593		break;
3594
3595	case TCP_DEFER_ACCEPT:
3596		/* Translate value in seconds to number of retransmits */
3597		WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
3598			   secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3599					   TCP_RTO_MAX / HZ));
3600		break;
3601
3602	case TCP_WINDOW_CLAMP:
3603		err = tcp_set_window_clamp(sk, val);
3604		break;
3605
3606	case TCP_QUICKACK:
3607		__tcp_sock_set_quickack(sk, val);
3608		break;
3609
3610#ifdef CONFIG_TCP_MD5SIG
3611	case TCP_MD5SIG:
3612	case TCP_MD5SIG_EXT:
3613		err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
3614		break;
3615#endif
3616	case TCP_USER_TIMEOUT:
3617		/* Cap the max time in ms TCP will retry or probe the window
3618		 * before giving up and aborting (ETIMEDOUT) a connection.
3619		 */
3620		if (val < 0)
3621			err = -EINVAL;
3622		else
3623			WRITE_ONCE(icsk->icsk_user_timeout, val);
3624		break;
3625
3626	case TCP_FASTOPEN:
3627		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
3628		    TCPF_LISTEN))) {
3629			tcp_fastopen_init_key_once(net);
3630
3631			fastopen_queue_tune(sk, val);
3632		} else {
3633			err = -EINVAL;
3634		}
3635		break;
3636	case TCP_FASTOPEN_CONNECT:
3637		if (val > 1 || val < 0) {
3638			err = -EINVAL;
3639		} else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) &
3640			   TFO_CLIENT_ENABLE) {
3641			if (sk->sk_state == TCP_CLOSE)
3642				tp->fastopen_connect = val;
3643			else
3644				err = -EINVAL;
3645		} else {
3646			err = -EOPNOTSUPP;
3647		}
3648		break;
3649	case TCP_FASTOPEN_NO_COOKIE:
3650		if (val > 1 || val < 0)
3651			err = -EINVAL;
3652		else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3653			err = -EINVAL;
3654		else
3655			tp->fastopen_no_cookie = val;
3656		break;
3657	case TCP_TIMESTAMP:
3658		if (!tp->repair)
3659			err = -EPERM;
3660		else
3661			WRITE_ONCE(tp->tsoffset, val - tcp_time_stamp_raw());
3662		break;
3663	case TCP_REPAIR_WINDOW:
3664		err = tcp_repair_set_window(tp, optval, optlen);
3665		break;
3666	case TCP_NOTSENT_LOWAT:
3667		WRITE_ONCE(tp->notsent_lowat, val);
3668		sk->sk_write_space(sk);
3669		break;
3670	case TCP_INQ:
3671		if (val > 1 || val < 0)
3672			err = -EINVAL;
3673		else
3674			tp->recvmsg_inq = val;
3675		break;
3676	case TCP_TX_DELAY:
3677		if (val)
3678			tcp_enable_tx_delay();
3679		WRITE_ONCE(tp->tcp_tx_delay, val);
3680		break;
3681	default:
3682		err = -ENOPROTOOPT;
3683		break;
3684	}
3685
3686	sockopt_release_sock(sk);
3687	return err;
3688}
3689
3690int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
3691		   unsigned int optlen)
3692{
3693	const struct inet_connection_sock *icsk = inet_csk(sk);
3694
3695	if (level != SOL_TCP)
3696		/* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */
3697		return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname,
3698								optval, optlen);
3699	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3700}
3701EXPORT_SYMBOL(tcp_setsockopt);
3702
3703static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
3704				      struct tcp_info *info)
3705{
3706	u64 stats[__TCP_CHRONO_MAX], total = 0;
3707	enum tcp_chrono i;
3708
3709	for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
3710		stats[i] = tp->chrono_stat[i - 1];
3711		if (i == tp->chrono_type)
3712			stats[i] += tcp_jiffies32 - tp->chrono_start;
3713		stats[i] *= USEC_PER_SEC / HZ;
3714		total += stats[i];
3715	}
3716
3717	info->tcpi_busy_time = total;
3718	info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
3719	info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
3720}
3721
3722/* Return information about state of tcp endpoint in API format. */
3723void tcp_get_info(struct sock *sk, struct tcp_info *info)
3724{
3725	const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
3726	const struct inet_connection_sock *icsk = inet_csk(sk);
3727	unsigned long rate;
3728	u32 now;
3729	u64 rate64;
3730	bool slow;
3731
3732	memset(info, 0, sizeof(*info));
3733	if (sk->sk_type != SOCK_STREAM)
3734		return;
3735
3736	info->tcpi_state = inet_sk_state_load(sk);
3737
3738	/* Report meaningful fields for all TCP states, including listeners */
3739	rate = READ_ONCE(sk->sk_pacing_rate);
3740	rate64 = (rate != ~0UL) ? rate : ~0ULL;
3741	info->tcpi_pacing_rate = rate64;
3742
3743	rate = READ_ONCE(sk->sk_max_pacing_rate);
3744	rate64 = (rate != ~0UL) ? rate : ~0ULL;
3745	info->tcpi_max_pacing_rate = rate64;
3746
3747	info->tcpi_reordering = tp->reordering;
3748	info->tcpi_snd_cwnd = tcp_snd_cwnd(tp);
3749
3750	if (info->tcpi_state == TCP_LISTEN) {
3751		/* listeners aliased fields :
3752		 * tcpi_unacked -> Number of children ready for accept()
3753		 * tcpi_sacked  -> max backlog
3754		 */
3755		info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
3756		info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
3757		return;
3758	}
3759
3760	slow = lock_sock_fast(sk);
3761
3762	info->tcpi_ca_state = icsk->icsk_ca_state;
3763	info->tcpi_retransmits = icsk->icsk_retransmits;
3764	info->tcpi_probes = icsk->icsk_probes_out;
3765	info->tcpi_backoff = icsk->icsk_backoff;
3766
3767	if (tp->rx_opt.tstamp_ok)
3768		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
3769	if (tcp_is_sack(tp))
3770		info->tcpi_options |= TCPI_OPT_SACK;
3771	if (tp->rx_opt.wscale_ok) {
3772		info->tcpi_options |= TCPI_OPT_WSCALE;
3773		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
3774		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
3775	}
3776
3777	if (tp->ecn_flags & TCP_ECN_OK)
3778		info->tcpi_options |= TCPI_OPT_ECN;
3779	if (tp->ecn_flags & TCP_ECN_SEEN)
3780		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
3781	if (tp->syn_data_acked)
3782		info->tcpi_options |= TCPI_OPT_SYN_DATA;
3783
3784	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
3785	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
3786	info->tcpi_snd_mss = tp->mss_cache;
3787	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
3788
3789	info->tcpi_unacked = tp->packets_out;
3790	info->tcpi_sacked = tp->sacked_out;
3791
3792	info->tcpi_lost = tp->lost_out;
3793	info->tcpi_retrans = tp->retrans_out;
3794
3795	now = tcp_jiffies32;
3796	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
3797	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
3798	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
3799
3800	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
3801	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
3802	info->tcpi_rtt = tp->srtt_us >> 3;
3803	info->tcpi_rttvar = tp->mdev_us >> 2;
3804	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
3805	info->tcpi_advmss = tp->advmss;
3806
3807	info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
3808	info->tcpi_rcv_space = tp->rcvq_space.space;
3809
3810	info->tcpi_total_retrans = tp->total_retrans;
3811
3812	info->tcpi_bytes_acked = tp->bytes_acked;
3813	info->tcpi_bytes_received = tp->bytes_received;
3814	info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
3815	tcp_get_info_chrono_stats(tp, info);
3816
3817	info->tcpi_segs_out = tp->segs_out;
3818
3819	/* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */
3820	info->tcpi_segs_in = READ_ONCE(tp->segs_in);
3821	info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in);
3822
3823	info->tcpi_min_rtt = tcp_min_rtt(tp);
3824	info->tcpi_data_segs_out = tp->data_segs_out;
3825
3826	info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
3827	rate64 = tcp_compute_delivery_rate(tp);
3828	if (rate64)
3829		info->tcpi_delivery_rate = rate64;
3830	info->tcpi_delivered = tp->delivered;
3831	info->tcpi_delivered_ce = tp->delivered_ce;
3832	info->tcpi_bytes_sent = tp->bytes_sent;
3833	info->tcpi_bytes_retrans = tp->bytes_retrans;
3834	info->tcpi_dsack_dups = tp->dsack_dups;
3835	info->tcpi_reord_seen = tp->reord_seen;
3836	info->tcpi_rcv_ooopack = tp->rcv_ooopack;
3837	info->tcpi_snd_wnd = tp->snd_wnd;
3838	info->tcpi_rcv_wnd = tp->rcv_wnd;
3839	info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash;
3840	info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
3841	unlock_sock_fast(sk, slow);
3842}
3843EXPORT_SYMBOL_GPL(tcp_get_info);
3844
3845static size_t tcp_opt_stats_get_size(void)
3846{
3847	return
3848		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */
3849		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */
3850		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */
3851		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */
3852		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */
3853		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */
3854		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */
3855		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */
3856		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */
3857		nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */
3858		nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */
3859		nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */
3860		nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */
3861		nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */
3862		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
3863		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
3864		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
3865		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
3866		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
3867		nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
3868		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
3869		nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
3870		nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
3871		nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
3872		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
3873		nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */
3874		nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */
3875		0;
3876}
3877
3878/* Returns TTL or hop limit of an incoming packet from skb. */
3879static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
3880{
3881	if (skb->protocol == htons(ETH_P_IP))
3882		return ip_hdr(skb)->ttl;
3883	else if (skb->protocol == htons(ETH_P_IPV6))
3884		return ipv6_hdr(skb)->hop_limit;
3885	else
3886		return 0;
3887}
3888
3889struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
3890					       const struct sk_buff *orig_skb,
3891					       const struct sk_buff *ack_skb)
3892{
3893	const struct tcp_sock *tp = tcp_sk(sk);
3894	struct sk_buff *stats;
3895	struct tcp_info info;
3896	unsigned long rate;
3897	u64 rate64;
3898
3899	stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
3900	if (!stats)
3901		return NULL;
3902
3903	tcp_get_info_chrono_stats(tp, &info);
3904	nla_put_u64_64bit(stats, TCP_NLA_BUSY,
3905			  info.tcpi_busy_time, TCP_NLA_PAD);
3906	nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
3907			  info.tcpi_rwnd_limited, TCP_NLA_PAD);
3908	nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
3909			  info.tcpi_sndbuf_limited, TCP_NLA_PAD);
3910	nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
3911			  tp->data_segs_out, TCP_NLA_PAD);
3912	nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
3913			  tp->total_retrans, TCP_NLA_PAD);
3914
3915	rate = READ_ONCE(sk->sk_pacing_rate);
3916	rate64 = (rate != ~0UL) ? rate : ~0ULL;
3917	nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
3918
3919	rate64 = tcp_compute_delivery_rate(tp);
3920	nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
3921
3922	nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp));
3923	nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
3924	nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
3925
3926	nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
3927	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
3928	nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
3929	nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
3930	nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
3931
3932	nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
3933	nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
3934
3935	nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
3936			  TCP_NLA_PAD);
3937	nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
3938			  TCP_NLA_PAD);
3939	nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
3940	nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
3941	nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
3942	nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
3943	nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
3944		    max_t(int, 0, tp->write_seq - tp->snd_nxt));
3945	nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
3946			  TCP_NLA_PAD);
3947	if (ack_skb)
3948		nla_put_u8(stats, TCP_NLA_TTL,
3949			   tcp_skb_ttl_or_hop_limit(ack_skb));
3950
3951	nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash);
3952	return stats;
3953}
3954
3955int do_tcp_getsockopt(struct sock *sk, int level,
3956		      int optname, sockptr_t optval, sockptr_t optlen)
3957{
3958	struct inet_connection_sock *icsk = inet_csk(sk);
3959	struct tcp_sock *tp = tcp_sk(sk);
3960	struct net *net = sock_net(sk);
3961	int val, len;
3962
3963	if (copy_from_sockptr(&len, optlen, sizeof(int)))
3964		return -EFAULT;
3965
3966	len = min_t(unsigned int, len, sizeof(int));
3967
3968	if (len < 0)
3969		return -EINVAL;
3970
3971	switch (optname) {
3972	case TCP_MAXSEG:
3973		val = tp->mss_cache;
3974		if (tp->rx_opt.user_mss &&
3975		    ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3976			val = tp->rx_opt.user_mss;
3977		if (tp->repair)
3978			val = tp->rx_opt.mss_clamp;
3979		break;
3980	case TCP_NODELAY:
3981		val = !!(tp->nonagle&TCP_NAGLE_OFF);
3982		break;
3983	case TCP_CORK:
3984		val = !!(tp->nonagle&TCP_NAGLE_CORK);
3985		break;
3986	case TCP_KEEPIDLE:
3987		val = keepalive_time_when(tp) / HZ;
3988		break;
3989	case TCP_KEEPINTVL:
3990		val = keepalive_intvl_when(tp) / HZ;
3991		break;
3992	case TCP_KEEPCNT:
3993		val = keepalive_probes(tp);
3994		break;
3995	case TCP_SYNCNT:
3996		val = READ_ONCE(icsk->icsk_syn_retries) ? :
3997			READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
3998		break;
3999	case TCP_LINGER2:
4000		val = READ_ONCE(tp->linger2);
4001		if (val >= 0)
4002			val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
4003		break;
4004	case TCP_DEFER_ACCEPT:
4005		val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept);
4006		val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ,
4007				      TCP_RTO_MAX / HZ);
4008		break;
4009	case TCP_WINDOW_CLAMP:
4010		val = tp->window_clamp;
4011		break;
4012	case TCP_INFO: {
4013		struct tcp_info info;
4014
4015		if (copy_from_sockptr(&len, optlen, sizeof(int)))
4016			return -EFAULT;
4017
4018		tcp_get_info(sk, &info);
4019
4020		len = min_t(unsigned int, len, sizeof(info));
4021		if (copy_to_sockptr(optlen, &len, sizeof(int)))
4022			return -EFAULT;
4023		if (copy_to_sockptr(optval, &info, len))
4024			return -EFAULT;
4025		return 0;
4026	}
4027	case TCP_CC_INFO: {
4028		const struct tcp_congestion_ops *ca_ops;
4029		union tcp_cc_info info;
4030		size_t sz = 0;
4031		int attr;
4032
4033		if (copy_from_sockptr(&len, optlen, sizeof(int)))
4034			return -EFAULT;
4035
4036		ca_ops = icsk->icsk_ca_ops;
4037		if (ca_ops && ca_ops->get_info)
4038			sz = ca_ops->get_info(sk, ~0U, &attr, &info);
4039
4040		len = min_t(unsigned int, len, sz);
4041		if (copy_to_sockptr(optlen, &len, sizeof(int)))
4042			return -EFAULT;
4043		if (copy_to_sockptr(optval, &info, len))
4044			return -EFAULT;
4045		return 0;
4046	}
4047	case TCP_QUICKACK:
4048		val = !inet_csk_in_pingpong_mode(sk);
4049		break;
4050
4051	case TCP_CONGESTION:
4052		if (copy_from_sockptr(&len, optlen, sizeof(int)))
4053			return -EFAULT;
4054		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
4055		if (copy_to_sockptr(optlen, &len, sizeof(int)))
4056			return -EFAULT;
4057		if (copy_to_sockptr(optval, icsk->icsk_ca_ops->name, len))
4058			return -EFAULT;
4059		return 0;
4060
4061	case TCP_ULP:
4062		if (copy_from_sockptr(&len, optlen, sizeof(int)))
4063			return -EFAULT;
4064		len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
4065		if (!icsk->icsk_ulp_ops) {
4066			len = 0;
4067			if (copy_to_sockptr(optlen, &len, sizeof(int)))
4068				return -EFAULT;
4069			return 0;
4070		}
4071		if (copy_to_sockptr(optlen, &len, sizeof(int)))
4072			return -EFAULT;
4073		if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len))
4074			return -EFAULT;
4075		return 0;
4076
4077	case TCP_FASTOPEN_KEY: {
4078		u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
4079		unsigned int key_len;
4080
4081		if (copy_from_sockptr(&len, optlen, sizeof(int)))
4082			return -EFAULT;
4083
4084		key_len = tcp_fastopen_get_cipher(net, icsk, key) *
4085				TCP_FASTOPEN_KEY_LENGTH;
4086		len = min_t(unsigned int, len, key_len);
4087		if (copy_to_sockptr(optlen, &len, sizeof(int)))
4088			return -EFAULT;
4089		if (copy_to_sockptr(optval, key, len))
4090			return -EFAULT;
4091		return 0;
4092	}
4093	case TCP_THIN_LINEAR_TIMEOUTS:
4094		val = tp->thin_lto;
4095		break;
4096
4097	case TCP_THIN_DUPACK:
4098		val = 0;
4099		break;
4100
4101	case TCP_REPAIR:
4102		val = tp->repair;
4103		break;
4104
4105	case TCP_REPAIR_QUEUE:
4106		if (tp->repair)
4107			val = tp->repair_queue;
4108		else
4109			return -EINVAL;
4110		break;
4111
4112	case TCP_REPAIR_WINDOW: {
4113		struct tcp_repair_window opt;
4114
4115		if (copy_from_sockptr(&len, optlen, sizeof(int)))
4116			return -EFAULT;
4117
4118		if (len != sizeof(opt))
4119			return -EINVAL;
4120
4121		if (!tp->repair)
4122			return -EPERM;
4123
4124		opt.snd_wl1	= tp->snd_wl1;
4125		opt.snd_wnd	= tp->snd_wnd;
4126		opt.max_window	= tp->max_window;
4127		opt.rcv_wnd	= tp->rcv_wnd;
4128		opt.rcv_wup	= tp->rcv_wup;
4129
4130		if (copy_to_sockptr(optval, &opt, len))
4131			return -EFAULT;
4132		return 0;
4133	}
4134	case TCP_QUEUE_SEQ:
4135		if (tp->repair_queue == TCP_SEND_QUEUE)
4136			val = tp->write_seq;
4137		else if (tp->repair_queue == TCP_RECV_QUEUE)
4138			val = tp->rcv_nxt;
4139		else
4140			return -EINVAL;
4141		break;
4142
4143	case TCP_USER_TIMEOUT:
4144		val = READ_ONCE(icsk->icsk_user_timeout);
4145		break;
4146
4147	case TCP_FASTOPEN:
4148		val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen);
4149		break;
4150
4151	case TCP_FASTOPEN_CONNECT:
4152		val = tp->fastopen_connect;
4153		break;
4154
4155	case TCP_FASTOPEN_NO_COOKIE:
4156		val = tp->fastopen_no_cookie;
4157		break;
4158
4159	case TCP_TX_DELAY:
4160		val = READ_ONCE(tp->tcp_tx_delay);
4161		break;
4162
4163	case TCP_TIMESTAMP:
4164		val = tcp_time_stamp_raw() + READ_ONCE(tp->tsoffset);
4165		break;
4166	case TCP_NOTSENT_LOWAT:
4167		val = READ_ONCE(tp->notsent_lowat);
4168		break;
4169	case TCP_INQ:
4170		val = tp->recvmsg_inq;
4171		break;
4172	case TCP_SAVE_SYN:
4173		val = tp->save_syn;
4174		break;
4175	case TCP_SAVED_SYN: {
4176		if (copy_from_sockptr(&len, optlen, sizeof(int)))
4177			return -EFAULT;
4178
4179		sockopt_lock_sock(sk);
4180		if (tp->saved_syn) {
4181			if (len < tcp_saved_syn_len(tp->saved_syn)) {
4182				len = tcp_saved_syn_len(tp->saved_syn);
4183				if (copy_to_sockptr(optlen, &len, sizeof(int))) {
4184					sockopt_release_sock(sk);
4185					return -EFAULT;
4186				}
4187				sockopt_release_sock(sk);
4188				return -EINVAL;
4189			}
4190			len = tcp_saved_syn_len(tp->saved_syn);
4191			if (copy_to_sockptr(optlen, &len, sizeof(int))) {
4192				sockopt_release_sock(sk);
4193				return -EFAULT;
4194			}
4195			if (copy_to_sockptr(optval, tp->saved_syn->data, len)) {
4196				sockopt_release_sock(sk);
4197				return -EFAULT;
4198			}
4199			tcp_saved_syn_free(tp);
4200			sockopt_release_sock(sk);
4201		} else {
4202			sockopt_release_sock(sk);
4203			len = 0;
4204			if (copy_to_sockptr(optlen, &len, sizeof(int)))
4205				return -EFAULT;
4206		}
4207		return 0;
4208	}
4209#ifdef CONFIG_MMU
4210	case TCP_ZEROCOPY_RECEIVE: {
4211		struct scm_timestamping_internal tss;
4212		struct tcp_zerocopy_receive zc = {};
4213		int err;
4214
4215		if (copy_from_sockptr(&len, optlen, sizeof(int)))
4216			return -EFAULT;
4217		if (len < 0 ||
4218		    len < offsetofend(struct tcp_zerocopy_receive, length))
4219			return -EINVAL;
4220		if (unlikely(len > sizeof(zc))) {
4221			err = check_zeroed_sockptr(optval, sizeof(zc),
4222						   len - sizeof(zc));
4223			if (err < 1)
4224				return err == 0 ? -EINVAL : err;
4225			len = sizeof(zc);
4226			if (copy_to_sockptr(optlen, &len, sizeof(int)))
4227				return -EFAULT;
4228		}
4229		if (copy_from_sockptr(&zc, optval, len))
4230			return -EFAULT;
4231		if (zc.reserved)
4232			return -EINVAL;
4233		if (zc.msg_flags &  ~(TCP_VALID_ZC_MSG_FLAGS))
4234			return -EINVAL;
4235		sockopt_lock_sock(sk);
4236		err = tcp_zerocopy_receive(sk, &zc, &tss);
4237		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
4238							  &zc, &len, err);
4239		sockopt_release_sock(sk);
4240		if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
4241			goto zerocopy_rcv_cmsg;
4242		switch (len) {
4243		case offsetofend(struct tcp_zerocopy_receive, msg_flags):
4244			goto zerocopy_rcv_cmsg;
4245		case offsetofend(struct tcp_zerocopy_receive, msg_controllen):
4246		case offsetofend(struct tcp_zerocopy_receive, msg_control):
4247		case offsetofend(struct tcp_zerocopy_receive, flags):
4248		case offsetofend(struct tcp_zerocopy_receive, copybuf_len):
4249		case offsetofend(struct tcp_zerocopy_receive, copybuf_address):
4250		case offsetofend(struct tcp_zerocopy_receive, err):
4251			goto zerocopy_rcv_sk_err;
4252		case offsetofend(struct tcp_zerocopy_receive, inq):
4253			goto zerocopy_rcv_inq;
4254		case offsetofend(struct tcp_zerocopy_receive, length):
4255		default:
4256			goto zerocopy_rcv_out;
4257		}
4258zerocopy_rcv_cmsg:
4259		if (zc.msg_flags & TCP_CMSG_TS)
4260			tcp_zc_finalize_rx_tstamp(sk, &zc, &tss);
4261		else
4262			zc.msg_flags = 0;
4263zerocopy_rcv_sk_err:
4264		if (!err)
4265			zc.err = sock_error(sk);
4266zerocopy_rcv_inq:
4267		zc.inq = tcp_inq_hint(sk);
4268zerocopy_rcv_out:
4269		if (!err && copy_to_sockptr(optval, &zc, len))
4270			err = -EFAULT;
4271		return err;
4272	}
4273#endif
4274	default:
4275		return -ENOPROTOOPT;
4276	}
4277
4278	if (copy_to_sockptr(optlen, &len, sizeof(int)))
4279		return -EFAULT;
4280	if (copy_to_sockptr(optval, &val, len))
4281		return -EFAULT;
4282	return 0;
4283}
4284
4285bool tcp_bpf_bypass_getsockopt(int level, int optname)
4286{
4287	/* TCP do_tcp_getsockopt has optimized getsockopt implementation
4288	 * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE.
4289	 */
4290	if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
4291		return true;
4292
4293	return false;
4294}
4295EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
4296
4297int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
4298		   int __user *optlen)
4299{
4300	struct inet_connection_sock *icsk = inet_csk(sk);
4301
4302	if (level != SOL_TCP)
4303		/* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */
4304		return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname,
4305								optval, optlen);
4306	return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval),
4307				 USER_SOCKPTR(optlen));
4308}
4309EXPORT_SYMBOL(tcp_getsockopt);
4310
4311#ifdef CONFIG_TCP_MD5SIG
4312static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
4313static DEFINE_MUTEX(tcp_md5sig_mutex);
4314static bool tcp_md5sig_pool_populated = false;
4315
4316static void __tcp_alloc_md5sig_pool(void)
4317{
4318	struct crypto_ahash *hash;
4319	int cpu;
4320
4321	hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
4322	if (IS_ERR(hash))
4323		return;
4324
4325	for_each_possible_cpu(cpu) {
4326		void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
4327		struct ahash_request *req;
4328
4329		if (!scratch) {
4330			scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
4331					       sizeof(struct tcphdr),
4332					       GFP_KERNEL,
4333					       cpu_to_node(cpu));
4334			if (!scratch)
4335				return;
4336			per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
4337		}
4338		if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
4339			continue;
4340
4341		req = ahash_request_alloc(hash, GFP_KERNEL);
4342		if (!req)
4343			return;
4344
4345		ahash_request_set_callback(req, 0, NULL, NULL);
4346
4347		per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
4348	}
4349	/* before setting tcp_md5sig_pool_populated, we must commit all writes
4350	 * to memory. See smp_rmb() in tcp_get_md5sig_pool()
4351	 */
4352	smp_wmb();
4353	/* Paired with READ_ONCE() from tcp_alloc_md5sig_pool()
4354	 * and tcp_get_md5sig_pool().
4355	*/
4356	WRITE_ONCE(tcp_md5sig_pool_populated, true);
4357}
4358
4359bool tcp_alloc_md5sig_pool(void)
4360{
4361	/* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
4362	if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) {
4363		mutex_lock(&tcp_md5sig_mutex);
4364
4365		if (!tcp_md5sig_pool_populated)
4366			__tcp_alloc_md5sig_pool();
4367
4368		mutex_unlock(&tcp_md5sig_mutex);
4369	}
4370	/* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
4371	return READ_ONCE(tcp_md5sig_pool_populated);
4372}
4373EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
4374
4375
4376/**
4377 *	tcp_get_md5sig_pool - get md5sig_pool for this user
4378 *
4379 *	We use percpu structure, so if we succeed, we exit with preemption
4380 *	and BH disabled, to make sure another thread or softirq handling
4381 *	wont try to get same context.
4382 */
4383struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
4384{
4385	local_bh_disable();
4386
4387	/* Paired with WRITE_ONCE() from __tcp_alloc_md5sig_pool() */
4388	if (READ_ONCE(tcp_md5sig_pool_populated)) {
4389		/* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
4390		smp_rmb();
4391		return this_cpu_ptr(&tcp_md5sig_pool);
4392	}
4393	local_bh_enable();
4394	return NULL;
4395}
4396EXPORT_SYMBOL(tcp_get_md5sig_pool);
4397
4398int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
4399			  const struct sk_buff *skb, unsigned int header_len)
4400{
4401	struct scatterlist sg;
4402	const struct tcphdr *tp = tcp_hdr(skb);
4403	struct ahash_request *req = hp->md5_req;
4404	unsigned int i;
4405	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
4406					   skb_headlen(skb) - header_len : 0;
4407	const struct skb_shared_info *shi = skb_shinfo(skb);
4408	struct sk_buff *frag_iter;
4409
4410	sg_init_table(&sg, 1);
4411
4412	sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
4413	ahash_request_set_crypt(req, &sg, NULL, head_data_len);
4414	if (crypto_ahash_update(req))
4415		return 1;
4416
4417	for (i = 0; i < shi->nr_frags; ++i) {
4418		const skb_frag_t *f = &shi->frags[i];
4419		unsigned int offset = skb_frag_off(f);
4420		struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
4421
4422		sg_set_page(&sg, page, skb_frag_size(f),
4423			    offset_in_page(offset));
4424		ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
4425		if (crypto_ahash_update(req))
4426			return 1;
4427	}
4428
4429	skb_walk_frags(skb, frag_iter)
4430		if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
4431			return 1;
4432
4433	return 0;
4434}
4435EXPORT_SYMBOL(tcp_md5_hash_skb_data);
4436
4437int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
4438{
4439	u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */
4440	struct scatterlist sg;
4441
4442	sg_init_one(&sg, key->key, keylen);
4443	ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen);
4444
4445	/* We use data_race() because tcp_md5_do_add() might change key->key under us */
4446	return data_race(crypto_ahash_update(hp->md5_req));
4447}
4448EXPORT_SYMBOL(tcp_md5_hash_key);
4449
4450/* Called with rcu_read_lock() */
4451enum skb_drop_reason
4452tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
4453		     const void *saddr, const void *daddr,
4454		     int family, int dif, int sdif)
4455{
4456	/*
4457	 * This gets called for each TCP segment that arrives
4458	 * so we want to be efficient.
4459	 * We have 3 drop cases:
4460	 * o No MD5 hash and one expected.
4461	 * o MD5 hash and we're not expecting one.
4462	 * o MD5 hash and its wrong.
4463	 */
4464	const __u8 *hash_location = NULL;
4465	struct tcp_md5sig_key *hash_expected;
4466	const struct tcphdr *th = tcp_hdr(skb);
4467	const struct tcp_sock *tp = tcp_sk(sk);
4468	int genhash, l3index;
4469	u8 newhash[16];
4470
4471	/* sdif set, means packet ingressed via a device
4472	 * in an L3 domain and dif is set to the l3mdev
4473	 */
4474	l3index = sdif ? dif : 0;
4475
4476	hash_expected = tcp_md5_do_lookup(sk, l3index, saddr, family);
4477	hash_location = tcp_parse_md5sig_option(th);
4478
4479	/* We've parsed the options - do we have a hash? */
4480	if (!hash_expected && !hash_location)
4481		return SKB_NOT_DROPPED_YET;
4482
4483	if (hash_expected && !hash_location) {
4484		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
4485		return SKB_DROP_REASON_TCP_MD5NOTFOUND;
4486	}
4487
4488	if (!hash_expected && hash_location) {
4489		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
4490		return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
4491	}
4492
4493	/* Check the signature.
4494	 * To support dual stack listeners, we need to handle
4495	 * IPv4-mapped case.
4496	 */
4497	if (family == AF_INET)
4498		genhash = tcp_v4_md5_hash_skb(newhash,
4499					      hash_expected,
4500					      NULL, skb);
4501	else
4502		genhash = tp->af_specific->calc_md5_hash(newhash,
4503							 hash_expected,
4504							 NULL, skb);
4505
4506	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
4507		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
4508		if (family == AF_INET) {
4509			net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
4510					saddr, ntohs(th->source),
4511					daddr, ntohs(th->dest),
4512					genhash ? " tcp_v4_calc_md5_hash failed"
4513					: "", l3index);
4514		} else {
4515			net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n",
4516					genhash ? "failed" : "mismatch",
4517					saddr, ntohs(th->source),
4518					daddr, ntohs(th->dest), l3index);
4519		}
4520		return SKB_DROP_REASON_TCP_MD5FAILURE;
4521	}
4522	return SKB_NOT_DROPPED_YET;
4523}
4524EXPORT_SYMBOL(tcp_inbound_md5_hash);
4525
4526#endif
4527
4528void tcp_done(struct sock *sk)
4529{
4530	struct request_sock *req;
4531
4532	/* We might be called with a new socket, after
4533	 * inet_csk_prepare_forced_close() has been called
4534	 * so we can not use lockdep_sock_is_held(sk)
4535	 */
4536	req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
4537
4538	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
4539		TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
4540
4541	tcp_set_state(sk, TCP_CLOSE);
4542	tcp_clear_xmit_timers(sk);
4543	if (req)
4544		reqsk_fastopen_remove(sk, req, false);
4545
4546	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
4547
4548	if (!sock_flag(sk, SOCK_DEAD))
4549		sk->sk_state_change(sk);
4550	else
4551		inet_csk_destroy_sock(sk);
4552}
4553EXPORT_SYMBOL_GPL(tcp_done);
4554
4555int tcp_abort(struct sock *sk, int err)
4556{
4557	int state = inet_sk_state_load(sk);
4558
4559	if (state == TCP_NEW_SYN_RECV) {
4560		struct request_sock *req = inet_reqsk(sk);
4561
4562		local_bh_disable();
4563		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
4564		local_bh_enable();
4565		return 0;
4566	}
4567	if (state == TCP_TIME_WAIT) {
4568		struct inet_timewait_sock *tw = inet_twsk(sk);
4569
4570		refcount_inc(&tw->tw_refcnt);
4571		local_bh_disable();
4572		inet_twsk_deschedule_put(tw);
4573		local_bh_enable();
4574		return 0;
4575	}
4576
4577	/* BPF context ensures sock locking. */
4578	if (!has_current_bpf_ctx())
4579		/* Don't race with userspace socket closes such as tcp_close. */
4580		lock_sock(sk);
4581
4582	if (sk->sk_state == TCP_LISTEN) {
4583		tcp_set_state(sk, TCP_CLOSE);
4584		inet_csk_listen_stop(sk);
4585	}
4586
4587	/* Don't race with BH socket closes such as inet_csk_listen_stop. */
4588	local_bh_disable();
4589	bh_lock_sock(sk);
4590
4591	if (!sock_flag(sk, SOCK_DEAD)) {
4592		WRITE_ONCE(sk->sk_err, err);
4593		/* This barrier is coupled with smp_rmb() in tcp_poll() */
4594		smp_wmb();
4595		sk_error_report(sk);
4596		if (tcp_need_reset(sk->sk_state))
4597			tcp_send_active_reset(sk, GFP_ATOMIC);
4598		tcp_done(sk);
4599	}
4600
4601	bh_unlock_sock(sk);
4602	local_bh_enable();
4603	tcp_write_queue_purge(sk);
4604	if (!has_current_bpf_ctx())
4605		release_sock(sk);
4606	return 0;
4607}
4608EXPORT_SYMBOL_GPL(tcp_abort);
4609
4610extern struct tcp_congestion_ops tcp_reno;
4611
4612static __initdata unsigned long thash_entries;
4613static int __init set_thash_entries(char *str)
4614{
4615	ssize_t ret;
4616
4617	if (!str)
4618		return 0;
4619
4620	ret = kstrtoul(str, 0, &thash_entries);
4621	if (ret)
4622		return 0;
4623
4624	return 1;
4625}
4626__setup("thash_entries=", set_thash_entries);
4627
4628static void __init tcp_init_mem(void)
4629{
4630	unsigned long limit = nr_free_buffer_pages() / 16;
4631
4632	limit = max(limit, 128UL);
4633	sysctl_tcp_mem[0] = limit / 4 * 3;		/* 4.68 % */
4634	sysctl_tcp_mem[1] = limit;			/* 6.25 % */
4635	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;	/* 9.37 % */
4636}
4637
4638void __init tcp_init(void)
4639{
4640	int max_rshare, max_wshare, cnt;
4641	unsigned long limit;
4642	unsigned int i;
4643
4644	BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
4645	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
4646		     sizeof_field(struct sk_buff, cb));
4647
4648	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
4649
4650	timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
4651	mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
4652
4653	inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
4654			    thash_entries, 21,  /* one slot per 2 MB*/
4655			    0, 64 * 1024);
4656	tcp_hashinfo.bind_bucket_cachep =
4657		kmem_cache_create("tcp_bind_bucket",
4658				  sizeof(struct inet_bind_bucket), 0,
4659				  SLAB_HWCACHE_ALIGN | SLAB_PANIC |
4660				  SLAB_ACCOUNT,
4661				  NULL);
4662	tcp_hashinfo.bind2_bucket_cachep =
4663		kmem_cache_create("tcp_bind2_bucket",
4664				  sizeof(struct inet_bind2_bucket), 0,
4665				  SLAB_HWCACHE_ALIGN | SLAB_PANIC |
4666				  SLAB_ACCOUNT,
4667				  NULL);
4668
4669	/* Size and allocate the main established and bind bucket
4670	 * hash tables.
4671	 *
4672	 * The methodology is similar to that of the buffer cache.
4673	 */
4674	tcp_hashinfo.ehash =
4675		alloc_large_system_hash("TCP established",
4676					sizeof(struct inet_ehash_bucket),
4677					thash_entries,
4678					17, /* one slot per 128 KB of memory */
4679					0,
4680					NULL,
4681					&tcp_hashinfo.ehash_mask,
4682					0,
4683					thash_entries ? 0 : 512 * 1024);
4684	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
4685		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
4686
4687	if (inet_ehash_locks_alloc(&tcp_hashinfo))
4688		panic("TCP: failed to alloc ehash_locks");
4689	tcp_hashinfo.bhash =
4690		alloc_large_system_hash("TCP bind",
4691					2 * sizeof(struct inet_bind_hashbucket),
4692					tcp_hashinfo.ehash_mask + 1,
4693					17, /* one slot per 128 KB of memory */
4694					0,
4695					&tcp_hashinfo.bhash_size,
4696					NULL,
4697					0,
4698					64 * 1024);
4699	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
4700	tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size;
4701	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
4702		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
4703		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
4704		spin_lock_init(&tcp_hashinfo.bhash2[i].lock);
4705		INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
4706	}
4707
4708	tcp_hashinfo.pernet = false;
4709
4710	cnt = tcp_hashinfo.ehash_mask + 1;
4711	sysctl_tcp_max_orphans = cnt / 2;
4712
4713	tcp_init_mem();
4714	/* Set per-socket limits to no more than 1/128 the pressure threshold */
4715	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
4716	max_wshare = min(4UL*1024*1024, limit);
4717	max_rshare = min(6UL*1024*1024, limit);
4718
4719	init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
4720	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
4721	init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
4722
4723	init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE;
4724	init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
4725	init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
4726
4727	pr_info("Hash tables configured (established %u bind %u)\n",
4728		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
4729
4730	tcp_v4_init();
4731	tcp_metrics_init();
4732	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
4733	tcp_tasklet_init();
4734	mptcp_init();
4735}
Configure Feed

Configure Feed