1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88#include <linux/unaligned.h>
89#include <linux/capability.h>
90#include <linux/errno.h>
91#include <linux/errqueue.h>
92#include <linux/types.h>
93#include <linux/socket.h>
94#include <linux/in.h>
95#include <linux/kernel.h>
96#include <linux/module.h>
97#include <linux/proc_fs.h>
98#include <linux/seq_file.h>
99#include <linux/sched.h>
100#include <linux/sched/mm.h>
101#include <linux/timer.h>
102#include <linux/string.h>
103#include <linux/sockios.h>
104#include <linux/net.h>
105#include <linux/mm.h>
106#include <linux/slab.h>
107#include <linux/interrupt.h>
108#include <linux/poll.h>
109#include <linux/tcp.h>
110#include <linux/udp.h>
111#include <linux/init.h>
112#include <linux/highmem.h>
113#include <linux/user_namespace.h>
114#include <linux/static_key.h>
115#include <linux/memcontrol.h>
116#include <linux/prefetch.h>
117#include <linux/compat.h>
118#include <linux/mroute.h>
119#include <linux/mroute6.h>
120#include <linux/icmpv6.h>
121
122#include <linux/uaccess.h>
123
124#include <linux/netdevice.h>
125#include <net/protocol.h>
126#include <linux/skbuff.h>
127#include <linux/skbuff_ref.h>
128#include <net/net_namespace.h>
129#include <net/request_sock.h>
130#include <net/sock.h>
131#include <net/proto_memory.h>
132#include <linux/net_tstamp.h>
133#include <net/xfrm.h>
134#include <linux/ipsec.h>
135#include <net/cls_cgroup.h>
136#include <net/netprio_cgroup.h>
137#include <linux/sock_diag.h>
138
139#include <linux/filter.h>
140#include <net/sock_reuseport.h>
141#include <net/bpf_sk_storage.h>
142
143#include <trace/events/sock.h>
144
145#include <net/tcp.h>
146#include <net/busy_poll.h>
147#include <net/phonet/phonet.h>
148
149#include <linux/ethtool.h>
150
151#include <uapi/linux/pidfd.h>
152
153#include "dev.h"
154
155static DEFINE_MUTEX(proto_list_mutex);
156static LIST_HEAD(proto_list);
157
158static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc);
159static void sock_def_write_space(struct sock *sk);
160
161/**
162 * sk_ns_capable - General socket capability test
163 * @sk: Socket to use a capability on or through
164 * @user_ns: The user namespace of the capability to use
165 * @cap: The capability to use
166 *
167 * Test to see if the opener of the socket had the capability @cap when
168 * the socket was created and if the current process has the capability
169 * @cap in the user namespace @user_ns.
170 */
171bool sk_ns_capable(const struct sock *sk,
172 struct user_namespace *user_ns, int cap)
173{
174 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
175 ns_capable(user_ns, cap);
176}
177EXPORT_SYMBOL(sk_ns_capable);
178
179/**
180 * sk_capable - Socket global capability test
181 * @sk: Socket to use a capability on or through
182 * @cap: The global capability to use
183 *
184 * Test to see if the opener of the socket had the capability @cap when
185 * the socket was created and if the current process has the capability
186 * @cap in all user namespaces.
187 */
188bool sk_capable(const struct sock *sk, int cap)
189{
190 return sk_ns_capable(sk, &init_user_ns, cap);
191}
192EXPORT_SYMBOL(sk_capable);
193
194/**
195 * sk_net_capable - Network namespace socket capability test
196 * @sk: Socket to use a capability on or through
197 * @cap: The capability to use
198 *
199 * Test to see if the opener of the socket had the capability @cap when the
200 * socket was created and if the current process has the capability @cap over
201 * the network namespace the socket is a member of.
202 */
203bool sk_net_capable(const struct sock *sk, int cap)
204{
205 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
206}
207EXPORT_SYMBOL(sk_net_capable);
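
/* A minimal, hypothetical illustration of how a protocol might use the
 * helpers above (sketch only; the function below is made up and is not part
 * of this file): gate a privileged operation on the capability in the
 * socket's own network namespace.
 *
 *	static int example_priv_request(struct sock *sk)
 *	{
 *		// Opener and current task both need CAP_NET_ADMIN here.
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		return 0;
 *	}
 *
 * sk_capable() is the stricter variant (initial user namespace), while
 * sk_ns_capable() lets the caller name the namespace explicitly.
 */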
208
209/*
210 * Each address family might have different locking rules, so we have
211 * one slock key per address family and separate keys for internal and
212 * userspace sockets.
213 */
214static struct lock_class_key af_family_keys[AF_MAX];
215static struct lock_class_key af_family_kern_keys[AF_MAX];
216static struct lock_class_key af_family_slock_keys[AF_MAX];
217static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
218
219/*
220 * Make lock validator output more readable. (We pre-construct these
221 * strings at build time, so that runtime initialization of socket
222 * locks is fast):
223 */
224
225#define _sock_locks(x) \
226 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
227 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
228 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
229 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
230 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
231 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
232 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
233 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
234 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
235 x "27" , x "28" , x "AF_CAN" , \
236 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
237 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
238 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
239 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
240 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
241 x "AF_MCTP" , \
242 x "AF_MAX"
243
244static const char *const af_family_key_strings[AF_MAX+1] = {
245 _sock_locks("sk_lock-")
246};
247static const char *const af_family_slock_key_strings[AF_MAX+1] = {
248 _sock_locks("slock-")
249};
250static const char *const af_family_clock_key_strings[AF_MAX+1] = {
251 _sock_locks("clock-")
252};
253
254static const char *const af_family_kern_key_strings[AF_MAX+1] = {
255 _sock_locks("k-sk_lock-")
256};
257static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
258 _sock_locks("k-slock-")
259};
260static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
261 _sock_locks("k-clock-")
262};
263static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
264 _sock_locks("rlock-")
265};
266static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
267 _sock_locks("wlock-")
268};
269static const char *const af_family_elock_key_strings[AF_MAX+1] = {
270 _sock_locks("elock-")
271};
272
273/*
274 * sk_callback_lock and sk queues locking rules are per-address-family,
275 * so split the lock classes by using a per-AF key:
276 */
277static struct lock_class_key af_callback_keys[AF_MAX];
278static struct lock_class_key af_rlock_keys[AF_MAX];
279static struct lock_class_key af_wlock_keys[AF_MAX];
280static struct lock_class_key af_elock_keys[AF_MAX];
281static struct lock_class_key af_kern_callback_keys[AF_MAX];
282
283/* Run time adjustable parameters. */
284__u32 sysctl_wmem_max __read_mostly = 4 << 20;
285EXPORT_SYMBOL(sysctl_wmem_max);
286__u32 sysctl_rmem_max __read_mostly = 4 << 20;
287EXPORT_SYMBOL(sysctl_rmem_max);
288__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT;
289__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT;
290
291DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
292EXPORT_SYMBOL_GPL(memalloc_socks_key);
293
294/**
295 * sk_set_memalloc - sets %SOCK_MEMALLOC
296 * @sk: socket to set it on
297 *
298 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
299 * It's the responsibility of the admin to adjust min_free_kbytes
300 * to meet the requirements.
301 */
302void sk_set_memalloc(struct sock *sk)
303{
304 sock_set_flag(sk, SOCK_MEMALLOC);
305 sk->sk_allocation |= __GFP_MEMALLOC;
306 static_branch_inc(&memalloc_socks_key);
307}
308EXPORT_SYMBOL_GPL(sk_set_memalloc);
309
310void sk_clear_memalloc(struct sock *sk)
311{
312 sock_reset_flag(sk, SOCK_MEMALLOC);
313 sk->sk_allocation &= ~__GFP_MEMALLOC;
314 static_branch_dec(&memalloc_socks_key);
315
316 /*
317 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
318 * progress of swapping. SOCK_MEMALLOC may be cleared while
319 * it has rmem allocations due to the last swapfile being deactivated
320 * but there is a risk that the socket is unusable due to exceeding
321 * the rmem limits. Reclaim the reserves and obey rmem limits again.
322 */
323 sk_mem_reclaim(sk);
324}
325EXPORT_SYMBOL_GPL(sk_clear_memalloc);
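
/* Hypothetical in-kernel usage sketch (not part of this file): a transport
 * that carries swap traffic, e.g. a swap-over-network block driver, would
 * flag its socket so it can still make progress under memory pressure, and
 * drop the flag once the last such swap file goes away.
 *
 *	static void example_swap_transport_start(struct sock *sk)
 *	{
 *		// Allow dipping into emergency reserves; the admin is
 *		// expected to raise min_free_kbytes accordingly.
 *		sk_set_memalloc(sk);
 *	}
 *
 *	static void example_swap_transport_stop(struct sock *sk)
 *	{
 *		sk_clear_memalloc(sk);	// also reclaims over-limit rmem
 *	}
 */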
326
327int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
328{
329 int ret;
330 unsigned int noreclaim_flag;
331
332 /* these should have been dropped before queueing */
333 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
334
335 noreclaim_flag = memalloc_noreclaim_save();
336 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
337 tcp_v6_do_rcv,
338 tcp_v4_do_rcv,
339 sk, skb);
340 memalloc_noreclaim_restore(noreclaim_flag);
341
342 return ret;
343}
344EXPORT_SYMBOL(__sk_backlog_rcv);
345
346void sk_error_report(struct sock *sk)
347{
348 sk->sk_error_report(sk);
349
350 switch (sk->sk_family) {
351 case AF_INET:
352 fallthrough;
353 case AF_INET6:
354 trace_inet_sk_error_report(sk);
355 break;
356 default:
357 break;
358 }
359}
360EXPORT_SYMBOL(sk_error_report);
361
362int sock_get_timeout(long timeo, void *optval, bool old_timeval)
363{
364 struct __kernel_sock_timeval tv;
365
366 if (timeo == MAX_SCHEDULE_TIMEOUT) {
367 tv.tv_sec = 0;
368 tv.tv_usec = 0;
369 } else {
370 tv.tv_sec = timeo / HZ;
371 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
372 }
373
374 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
375 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
376 *(struct old_timeval32 *)optval = tv32;
377 return sizeof(tv32);
378 }
379
380 if (old_timeval) {
381 struct __kernel_old_timeval old_tv;
382 old_tv.tv_sec = tv.tv_sec;
383 old_tv.tv_usec = tv.tv_usec;
384 *(struct __kernel_old_timeval *)optval = old_tv;
385 return sizeof(old_tv);
386 }
387
388 *(struct __kernel_sock_timeval *)optval = tv;
389 return sizeof(tv);
390}
391EXPORT_SYMBOL(sock_get_timeout);
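
/* Worked example of the conversion above, assuming HZ == 1000 (HZ is a
 * build-time choice, so the numbers are illustrative only):
 * timeo = 2500 jiffies yields tv_sec = 2500 / 1000 = 2 and
 * tv_usec = ((2500 % 1000) * 1000000) / 1000 = 500000, i.e. 2.5 seconds.
 * MAX_SCHEDULE_TIMEOUT is reported as {0, 0}, which userspace reads back as
 * "no timeout".
 */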
392
393int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
394 sockptr_t optval, int optlen, bool old_timeval)
395{
396 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
397 struct old_timeval32 tv32;
398
399 if (optlen < sizeof(tv32))
400 return -EINVAL;
401
402 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
403 return -EFAULT;
404 tv->tv_sec = tv32.tv_sec;
405 tv->tv_usec = tv32.tv_usec;
406 } else if (old_timeval) {
407 struct __kernel_old_timeval old_tv;
408
409 if (optlen < sizeof(old_tv))
410 return -EINVAL;
411 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
412 return -EFAULT;
413 tv->tv_sec = old_tv.tv_sec;
414 tv->tv_usec = old_tv.tv_usec;
415 } else {
416 if (optlen < sizeof(*tv))
417 return -EINVAL;
418 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
419 return -EFAULT;
420 }
421
422 return 0;
423}
424EXPORT_SYMBOL(sock_copy_user_timeval);
425
426static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
427 bool old_timeval)
428{
429 struct __kernel_sock_timeval tv;
430 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
431 long val;
432
433 if (err)
434 return err;
435
436 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
437 return -EDOM;
438
439 if (tv.tv_sec < 0) {
440 static int warned __read_mostly;
441
442 WRITE_ONCE(*timeo_p, 0);
443 if (warned < 10 && net_ratelimit()) {
444 warned++;
445 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
446 __func__, current->comm, task_pid_nr(current));
447 }
448 return 0;
449 }
450 val = MAX_SCHEDULE_TIMEOUT;
451 if ((tv.tv_sec || tv.tv_usec) &&
452 (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
453 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
454 USEC_PER_SEC / HZ);
455 WRITE_ONCE(*timeo_p, val);
456 return 0;
457}
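
/* Userspace view of the parsing above (hypothetical snippet, not built as
 * part of the kernel): set a 2.5 second receive timeout using the classic
 * struct timeval form of the option.
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt(SO_RCVTIMEO)");
 *
 * A zero timeval means "block forever" (MAX_SCHEDULE_TIMEOUT), a tv_usec
 * outside [0, USEC_PER_SEC) is rejected with EDOM, and a negative tv_sec is
 * clamped to a zero timeout with the rate-limited warning above.
 */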
458
459static bool sk_set_prio_allowed(const struct sock *sk, int val)
460{
461 return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
462 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
463 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
464}
465
466static bool sock_needs_netstamp(const struct sock *sk)
467{
468 switch (sk->sk_family) {
469 case AF_UNSPEC:
470 case AF_UNIX:
471 return false;
472 default:
473 return true;
474 }
475}
476
477static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
478{
479 if (sk->sk_flags & flags) {
480 sk->sk_flags &= ~flags;
481 if (sock_needs_netstamp(sk) &&
482 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
483 net_disable_timestamp();
484 }
485}
486
487
488int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
489{
490 unsigned long flags;
491 struct sk_buff_head *list = &sk->sk_receive_queue;
492
493 if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
494 sk_drops_inc(sk);
495 trace_sock_rcvqueue_full(sk, skb);
496 return -ENOMEM;
497 }
498
499 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
500 sk_drops_inc(sk);
501 return -ENOBUFS;
502 }
503
504 skb->dev = NULL;
505 skb_set_owner_r(skb, sk);
506
507 /* We escape from the RCU protected region, make sure we don't leak
508 * a non-refcounted dst.
509 */
510 skb_dst_force(skb);
511
512 spin_lock_irqsave(&list->lock, flags);
513 sock_skb_set_dropcount(sk, skb);
514 __skb_queue_tail(list, skb);
515 spin_unlock_irqrestore(&list->lock, flags);
516
517 if (!sock_flag(sk, SOCK_DEAD))
518 sk->sk_data_ready(sk);
519 return 0;
520}
521EXPORT_SYMBOL(__sock_queue_rcv_skb);
522
523int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
524 enum skb_drop_reason *reason)
525{
526 enum skb_drop_reason drop_reason;
527 int err;
528
529 err = sk_filter_reason(sk, skb, &drop_reason);
530 if (err)
531 goto out;
532
533 err = __sock_queue_rcv_skb(sk, skb);
534 switch (err) {
535 case -ENOMEM:
536 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
537 break;
538 case -ENOBUFS:
539 drop_reason = SKB_DROP_REASON_PROTO_MEM;
540 break;
541 default:
542 drop_reason = SKB_NOT_DROPPED_YET;
543 break;
544 }
545out:
546 if (reason)
547 *reason = drop_reason;
548 return err;
549}
550EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
551
552int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
553 const int nested, unsigned int trim_cap, bool refcounted)
554{
555 enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
556 int rc = NET_RX_SUCCESS;
557 int err;
558
559 if (sk_filter_trim_cap(sk, skb, trim_cap, &reason))
560 goto discard_and_relse;
561
562 skb->dev = NULL;
563
564 if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
565 sk_drops_inc(sk);
566 reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
567 goto discard_and_relse;
568 }
569 if (nested)
570 bh_lock_sock_nested(sk);
571 else
572 bh_lock_sock(sk);
573 if (!sock_owned_by_user(sk)) {
574 /*
575 * trylock + unlock semantics:
576 */
577 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
578
579 rc = sk_backlog_rcv(sk, skb);
580
581 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
582 } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) {
583 bh_unlock_sock(sk);
584 if (err == -ENOMEM)
585 reason = SKB_DROP_REASON_PFMEMALLOC;
586 if (err == -ENOBUFS)
587 reason = SKB_DROP_REASON_SOCKET_BACKLOG;
588 sk_drops_inc(sk);
589 goto discard_and_relse;
590 }
591
592 bh_unlock_sock(sk);
593out:
594 if (refcounted)
595 sock_put(sk);
596 return rc;
597discard_and_relse:
598 sk_skb_reason_drop(sk, skb, reason);
599 goto out;
600}
601EXPORT_SYMBOL(__sk_receive_skb);
602
603INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
604 u32));
605INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
606 u32));
607struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
608{
609 struct dst_entry *dst = __sk_dst_get(sk);
610
611 if (dst && READ_ONCE(dst->obsolete) &&
612 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
613 dst, cookie) == NULL) {
614 sk_tx_queue_clear(sk);
615 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
616 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
617 dst_release(dst);
618 return NULL;
619 }
620
621 return dst;
622}
623EXPORT_SYMBOL(__sk_dst_check);
624
625struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
626{
627 struct dst_entry *dst = sk_dst_get(sk);
628
629 if (dst && READ_ONCE(dst->obsolete) &&
630 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
631 dst, cookie) == NULL) {
632 sk_dst_reset(sk);
633 dst_release(dst);
634 return NULL;
635 }
636
637 return dst;
638}
639EXPORT_SYMBOL(sk_dst_check);
640
641static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
642{
643 int ret = -ENOPROTOOPT;
644#ifdef CONFIG_NETDEVICES
645 struct net *net = sock_net(sk);
646
647 /* Sorry... */
648 ret = -EPERM;
649 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
650 goto out;
651
652 ret = -EINVAL;
653 if (ifindex < 0)
654 goto out;
655
656 /* Paired with all READ_ONCE() done locklessly. */
657 WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
658
659 if (sk->sk_prot->rehash)
660 sk->sk_prot->rehash(sk);
661 sk_dst_reset(sk);
662
663 ret = 0;
664
665out:
666#endif
667
668 return ret;
669}
670
671int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
672{
673 int ret;
674
675 if (lock_sk)
676 lock_sock(sk);
677 ret = sock_bindtoindex_locked(sk, ifindex);
678 if (lock_sk)
679 release_sock(sk);
680
681 return ret;
682}
683EXPORT_SYMBOL(sock_bindtoindex);
684
685static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
686{
687 int ret = -ENOPROTOOPT;
688#ifdef CONFIG_NETDEVICES
689 struct net *net = sock_net(sk);
690 char devname[IFNAMSIZ];
691 int index;
692
693 ret = -EINVAL;
694 if (optlen < 0)
695 goto out;
696
697 /* Bind this socket to a particular device like "eth0",
698 * as specified in the passed interface name. If the
699 * name is "" or the option length is zero the socket
700 * is not bound.
701 */
702 if (optlen > IFNAMSIZ - 1)
703 optlen = IFNAMSIZ - 1;
704 memset(devname, 0, sizeof(devname));
705
706 ret = -EFAULT;
707 if (copy_from_sockptr(devname, optval, optlen))
708 goto out;
709
710 index = 0;
711 if (devname[0] != '\0') {
712 struct net_device *dev;
713
714 rcu_read_lock();
715 dev = dev_get_by_name_rcu(net, devname);
716 if (dev)
717 index = dev->ifindex;
718 rcu_read_unlock();
719 ret = -ENODEV;
720 if (!dev)
721 goto out;
722 }
723
724 sockopt_lock_sock(sk);
725 ret = sock_bindtoindex_locked(sk, index);
726 sockopt_release_sock(sk);
727out:
728#endif
729
730 return ret;
731}
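
/* Hypothetical userspace counterpart of the helper above (sketch only):
 * bind the socket to "eth0" by name, then detach it again by passing an
 * empty name.
 *
 *	const char ifname[] = "eth0";
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, sizeof(ifname)) < 0)
 *		perror("setsockopt(SO_BINDTODEVICE)");
 *
 *	// Unbind: an empty name (or optlen == 0) clears sk_bound_dev_if.
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 *
 * Note that rebinding an already bound socket requires CAP_NET_RAW, as
 * sock_bindtoindex_locked() above enforces.
 */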
732
733static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
734 sockptr_t optlen, int len)
735{
736 int ret = -ENOPROTOOPT;
737#ifdef CONFIG_NETDEVICES
738 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
739 struct net *net = sock_net(sk);
740 char devname[IFNAMSIZ];
741
742 if (bound_dev_if == 0) {
743 len = 0;
744 goto zero;
745 }
746
747 ret = -EINVAL;
748 if (len < IFNAMSIZ)
749 goto out;
750
751 ret = netdev_get_name(net, devname, bound_dev_if);
752 if (ret)
753 goto out;
754
755 len = strlen(devname) + 1;
756
757 ret = -EFAULT;
758 if (copy_to_sockptr(optval, devname, len))
759 goto out;
760
761zero:
762 ret = -EFAULT;
763 if (copy_to_sockptr(optlen, &len, sizeof(int)))
764 goto out;
765
766 ret = 0;
767
768out:
769#endif
770
771 return ret;
772}
773
774bool sk_mc_loop(const struct sock *sk)
775{
776 if (dev_recursion_level())
777 return false;
778 if (!sk)
779 return true;
780 /* IPV6_ADDRFORM can change sk->sk_family under us. */
781 switch (READ_ONCE(sk->sk_family)) {
782 case AF_INET:
783 return inet_test_bit(MC_LOOP, sk);
784#if IS_ENABLED(CONFIG_IPV6)
785 case AF_INET6:
786 return inet6_test_bit(MC6_LOOP, sk);
787#endif
788 }
789 WARN_ON_ONCE(1);
790 return true;
791}
792EXPORT_SYMBOL(sk_mc_loop);
793
794void sock_set_reuseaddr(struct sock *sk)
795{
796 lock_sock(sk);
797 sk->sk_reuse = SK_CAN_REUSE;
798 release_sock(sk);
799}
800EXPORT_SYMBOL(sock_set_reuseaddr);
801
802void sock_set_reuseport(struct sock *sk)
803{
804 lock_sock(sk);
805 sk->sk_reuseport = true;
806 release_sock(sk);
807}
808EXPORT_SYMBOL(sock_set_reuseport);
809
810void sock_no_linger(struct sock *sk)
811{
812 lock_sock(sk);
813 WRITE_ONCE(sk->sk_lingertime, 0);
814 sock_set_flag(sk, SOCK_LINGER);
815 release_sock(sk);
816}
817EXPORT_SYMBOL(sock_no_linger);
818
819void sock_set_priority(struct sock *sk, u32 priority)
820{
821 WRITE_ONCE(sk->sk_priority, priority);
822}
823EXPORT_SYMBOL(sock_set_priority);
824
825void sock_set_sndtimeo(struct sock *sk, s64 secs)
826{
827 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
828 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
829 else
830 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
831}
832EXPORT_SYMBOL(sock_set_sndtimeo);
833
834static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
835{
836 sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
837 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
838 if (val) {
839 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
840 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
841 }
842}
843
844void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
845{
846 switch (optname) {
847 case SO_TIMESTAMP_OLD:
848 __sock_set_timestamps(sk, valbool, false, false);
849 break;
850 case SO_TIMESTAMP_NEW:
851 __sock_set_timestamps(sk, valbool, true, false);
852 break;
853 case SO_TIMESTAMPNS_OLD:
854 __sock_set_timestamps(sk, valbool, false, true);
855 break;
856 case SO_TIMESTAMPNS_NEW:
857 __sock_set_timestamps(sk, valbool, true, true);
858 break;
859 }
860}
861
862static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
863{
864 struct net *net = sock_net(sk);
865 struct net_device *dev = NULL;
866 bool match = false;
867 int *vclock_index;
868 int i, num;
869
870 if (sk->sk_bound_dev_if)
871 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
872
873 if (!dev) {
874 pr_err("%s: sock not bind to device\n", __func__);
875 return -EOPNOTSUPP;
876 }
877
878 num = ethtool_get_phc_vclocks(dev, &vclock_index);
879 dev_put(dev);
880
881 for (i = 0; i < num; i++) {
882 if (*(vclock_index + i) == phc_index) {
883 match = true;
884 break;
885 }
886 }
887
888 if (num > 0)
889 kfree(vclock_index);
890
891 if (!match)
892 return -EINVAL;
893
894 WRITE_ONCE(sk->sk_bind_phc, phc_index);
895
896 return 0;
897}
898
899int sock_set_timestamping(struct sock *sk, int optname,
900 struct so_timestamping timestamping)
901{
902 int val = timestamping.flags;
903 int ret;
904
905 if (val & ~SOF_TIMESTAMPING_MASK)
906 return -EINVAL;
907
908 if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
909 !(val & SOF_TIMESTAMPING_OPT_ID))
910 return -EINVAL;
911
912 if (val & SOF_TIMESTAMPING_OPT_ID &&
913 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
914 if (sk_is_tcp(sk)) {
915 if ((1 << sk->sk_state) &
916 (TCPF_CLOSE | TCPF_LISTEN))
917 return -EINVAL;
918 if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
919 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
920 else
921 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
922 } else {
923 atomic_set(&sk->sk_tskey, 0);
924 }
925 }
926
927 if (val & SOF_TIMESTAMPING_OPT_STATS &&
928 !(val & SOF_TIMESTAMPING_OPT_TSONLY))
929 return -EINVAL;
930
931 if (val & SOF_TIMESTAMPING_BIND_PHC) {
932 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
933 if (ret)
934 return ret;
935 }
936
937 WRITE_ONCE(sk->sk_tsflags, val);
938 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
939 sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));
940
941 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
942 sock_enable_timestamp(sk,
943 SOCK_TIMESTAMPING_RX_SOFTWARE);
944 else
945 sock_disable_timestamp(sk,
946 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
947 return 0;
948}
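
/* Hypothetical userspace usage of the flags handled above (sketch only):
 * request software TX/RX timestamps plus OPT_ID so completions on the error
 * queue can be matched to individual send calls.
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_SOFTWARE |
 *			 SOF_TIMESTAMPING_RX_SOFTWARE |
 *			 SOF_TIMESTAMPING_SOFTWARE |
 *			 SOF_TIMESTAMPING_OPT_ID,
 *	};
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts)) < 0)
 *		perror("setsockopt(SO_TIMESTAMPING)");
 *
 * Passing a plain int with the same flag bits is also accepted; the struct
 * form additionally carries bind_phc for SOF_TIMESTAMPING_BIND_PHC.
 */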
949
950#if defined(CONFIG_CGROUP_BPF)
951void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
952{
953 struct bpf_sock_ops_kern sock_ops;
954
955 memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
956 sock_ops.op = op;
957 sock_ops.is_fullsock = 1;
958 sock_ops.sk = sk;
959 bpf_skops_init_skb(&sock_ops, skb, 0);
960 __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
961}
962#endif
963
964void sock_set_keepalive(struct sock *sk)
965{
966 lock_sock(sk);
967 if (sk->sk_prot->keepalive)
968 sk->sk_prot->keepalive(sk, true);
969 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
970 release_sock(sk);
971}
972EXPORT_SYMBOL(sock_set_keepalive);
973
974static void __sock_set_rcvbuf(struct sock *sk, int val)
975{
976 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
977 * as a negative value.
978 */
979 val = min_t(int, val, INT_MAX / 2);
980 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
981
982 /* We double it on the way in to account for "struct sk_buff" etc.
983 * overhead. Applications assume that the SO_RCVBUF setting they make
984 * will allow that much actual data to be received on that socket.
985 *
986 * Applications are unaware that "struct sk_buff" and other overheads
987 * allocate from the receive buffer during socket buffer allocation.
988 *
989 * And after considering the possible alternatives, returning the value
990 * we actually used in getsockopt is the most desirable behavior.
991 */
992 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
993}
994
995void sock_set_rcvbuf(struct sock *sk, int val)
996{
997 lock_sock(sk);
998 __sock_set_rcvbuf(sk, val);
999 release_sock(sk);
1000}
1001EXPORT_SYMBOL(sock_set_rcvbuf);
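
/* The doubling described above is visible from userspace (hypothetical
 * snippet): a request for 128 kB reads back as 256 kB, capped by
 * net.core.rmem_max unless SO_RCVBUFFORCE is used.
 *
 *	int req = 128 * 1024, got = 0;
 *	socklen_t len = sizeof(got);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
 *	// got is now 2 * req (262144), assuming rmem_max allows it.
 */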
1002
1003static void __sock_set_mark(struct sock *sk, u32 val)
1004{
1005 if (val != sk->sk_mark) {
1006 WRITE_ONCE(sk->sk_mark, val);
1007 sk_dst_reset(sk);
1008 }
1009}
1010
1011void sock_set_mark(struct sock *sk, u32 val)
1012{
1013 lock_sock(sk);
1014 __sock_set_mark(sk, val);
1015 release_sock(sk);
1016}
1017EXPORT_SYMBOL(sock_set_mark);
1018
1019static void sock_release_reserved_memory(struct sock *sk, int bytes)
1020{
1021 /* Round down bytes to multiple of pages */
1022 bytes = round_down(bytes, PAGE_SIZE);
1023
1024 WARN_ON(bytes > sk->sk_reserved_mem);
1025 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1026 sk_mem_reclaim(sk);
1027}
1028
1029static int sock_reserve_memory(struct sock *sk, int bytes)
1030{
1031 long allocated;
1032 bool charged;
1033 int pages;
1034
1035 if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk))
1036 return -EOPNOTSUPP;
1037
1038 if (!bytes)
1039 return 0;
1040
1041 pages = sk_mem_pages(bytes);
1042
1043 /* pre-charge to memcg */
1044 charged = mem_cgroup_sk_charge(sk, pages,
1045 GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1046 if (!charged)
1047 return -ENOMEM;
1048
1049 if (sk->sk_bypass_prot_mem)
1050 goto success;
1051
1052 /* pre-charge to forward_alloc */
1053 sk_memory_allocated_add(sk, pages);
1054 allocated = sk_memory_allocated(sk);
1055
1056 /* If the system goes into memory pressure with this
1057 * precharge, give up and return an error.
1058 */
1059 if (allocated > sk_prot_mem_limits(sk, 1)) {
1060 sk_memory_allocated_sub(sk, pages);
1061 mem_cgroup_sk_uncharge(sk, pages);
1062 return -ENOMEM;
1063 }
1064
1065success:
1066 sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1067
1068 WRITE_ONCE(sk->sk_reserved_mem,
1069 sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1070
1071 return 0;
1072}
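
/* Userspace side of SO_RESERVE_MEM (hypothetical sketch): pre-charge roughly
 * one megabyte of forward-alloc for this socket. The kernel rounds the
 * request up to whole pages and records it in sk_reserved_mem; sockets that
 * are not under a memory cgroup get EOPNOTSUPP.
 *
 *	int reserve = 1 << 20;
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RESERVE_MEM,
 *		       &reserve, sizeof(reserve)) < 0)
 *		perror("setsockopt(SO_RESERVE_MEM)");
 *
 * Setting a smaller value later releases the difference, as the
 * SO_RESERVE_MEM case in sk_setsockopt() below shows.
 */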
1073
1074#ifdef CONFIG_PAGE_POOL
1075
1076/* This is the maximum number of tokens and frags that the user can pass to
1077 * SO_DEVMEM_DONTNEED in one syscall. The limit exists to bound the amount of
1078 * memory the kernel allocates to copy these tokens, and to prevent looping
1079 * over the frags for too long.
1080 */
1081#define MAX_DONTNEED_TOKENS 128
1082#define MAX_DONTNEED_FRAGS 1024
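
/* Hypothetical userspace usage (sketch only): hand two consumed devmem TCP
 * fragments back to the kernel. The token values come from the devmem
 * control messages delivered alongside the received data; frag_token below
 * is a made-up variable standing in for such a value.
 *
 *	struct dmabuf_token tok = {
 *		.token_start = frag_token,
 *		.token_count = 2,
 *	};
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED,
 *		       &tok, sizeof(tok)) < 0)
 *		perror("setsockopt(SO_DEVMEM_DONTNEED)");
 *
 * sock_devmem_dontneed() below reports how many frags it actually freed, so
 * the caller must not assume every token was released.
 */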
1083
1084static noinline_for_stack int
1085sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1086{
1087 unsigned int num_tokens, i, j, k, netmem_num = 0;
1088 struct dmabuf_token *tokens;
1089 int ret = 0, num_frags = 0;
1090 netmem_ref netmems[16];
1091
1092 if (!sk_is_tcp(sk))
1093 return -EBADF;
1094
1095 if (optlen % sizeof(*tokens) ||
1096 optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1097 return -EINVAL;
1098
1099 num_tokens = optlen / sizeof(*tokens);
1100 tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
1101 if (!tokens)
1102 return -ENOMEM;
1103
1104 if (copy_from_sockptr(tokens, optval, optlen)) {
1105 kvfree(tokens);
1106 return -EFAULT;
1107 }
1108
1109 xa_lock_bh(&sk->sk_user_frags);
1110 for (i = 0; i < num_tokens; i++) {
1111 for (j = 0; j < tokens[i].token_count; j++) {
1112 if (++num_frags > MAX_DONTNEED_FRAGS)
1113 goto frag_limit_reached;
1114
1115 netmem_ref netmem = (__force netmem_ref)__xa_erase(
1116 &sk->sk_user_frags, tokens[i].token_start + j);
1117
1118 if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
1119 continue;
1120
1121 netmems[netmem_num++] = netmem;
1122 if (netmem_num == ARRAY_SIZE(netmems)) {
1123 xa_unlock_bh(&sk->sk_user_frags);
1124 for (k = 0; k < netmem_num; k++)
1125 WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1126 netmem_num = 0;
1127 xa_lock_bh(&sk->sk_user_frags);
1128 }
1129 ret++;
1130 }
1131 }
1132
1133frag_limit_reached:
1134 xa_unlock_bh(&sk->sk_user_frags);
1135 for (k = 0; k < netmem_num; k++)
1136 WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1137
1138 kvfree(tokens);
1139 return ret;
1140}
1141#endif
1142
1143void sockopt_lock_sock(struct sock *sk)
1144{
1145 /* When current->bpf_ctx is set, the setsockopt is called from
1146 * a bpf prog. bpf has ensured the sk lock has been
1147 * acquired before calling setsockopt().
1148 */
1149 if (has_current_bpf_ctx())
1150 return;
1151
1152 lock_sock(sk);
1153}
1154EXPORT_SYMBOL(sockopt_lock_sock);
1155
1156void sockopt_release_sock(struct sock *sk)
1157{
1158 if (has_current_bpf_ctx())
1159 return;
1160
1161 release_sock(sk);
1162}
1163EXPORT_SYMBOL(sockopt_release_sock);
1164
1165bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1166{
1167 return has_current_bpf_ctx() || ns_capable(ns, cap);
1168}
1169EXPORT_SYMBOL(sockopt_ns_capable);
1170
1171bool sockopt_capable(int cap)
1172{
1173 return has_current_bpf_ctx() || capable(cap);
1174}
1175EXPORT_SYMBOL(sockopt_capable);
1176
1177static int sockopt_validate_clockid(__kernel_clockid_t value)
1178{
1179 switch (value) {
1180 case CLOCK_REALTIME:
1181 case CLOCK_MONOTONIC:
1182 case CLOCK_TAI:
1183 return 0;
1184 }
1185 return -EINVAL;
1186}
1187
1188/*
1189 * This is meant for all protocols to use and covers goings on
1190 * at the socket level. Everything here is generic.
1191 */
1192
1193int sk_setsockopt(struct sock *sk, int level, int optname,
1194 sockptr_t optval, unsigned int optlen)
1195{
1196 struct so_timestamping timestamping;
1197 struct socket *sock = sk->sk_socket;
1198 struct sock_txtime sk_txtime;
1199 int val;
1200 int valbool;
1201 struct linger ling;
1202 int ret = 0;
1203
1204 /*
1205 * Options without arguments
1206 */
1207
1208 if (optname == SO_BINDTODEVICE)
1209 return sock_setbindtodevice(sk, optval, optlen);
1210
1211 if (optlen < sizeof(int))
1212 return -EINVAL;
1213
1214 if (copy_from_sockptr(&val, optval, sizeof(val)))
1215 return -EFAULT;
1216
1217 valbool = val ? 1 : 0;
1218
1219 /* handle options which do not require locking the socket. */
1220 switch (optname) {
1221 case SO_PRIORITY:
1222 if (sk_set_prio_allowed(sk, val)) {
1223 sock_set_priority(sk, val);
1224 return 0;
1225 }
1226 return -EPERM;
1227 case SO_TYPE:
1228 case SO_PROTOCOL:
1229 case SO_DOMAIN:
1230 case SO_ERROR:
1231 return -ENOPROTOOPT;
1232#ifdef CONFIG_NET_RX_BUSY_POLL
1233 case SO_BUSY_POLL:
1234 if (val < 0)
1235 return -EINVAL;
1236 WRITE_ONCE(sk->sk_ll_usec, val);
1237 return 0;
1238 case SO_PREFER_BUSY_POLL:
1239 if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1240 return -EPERM;
1241 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1242 return 0;
1243 case SO_BUSY_POLL_BUDGET:
1244 if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1245 !sockopt_capable(CAP_NET_ADMIN))
1246 return -EPERM;
1247 if (val < 0 || val > U16_MAX)
1248 return -EINVAL;
1249 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1250 return 0;
1251#endif
1252 case SO_MAX_PACING_RATE:
1253 {
1254 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1255 unsigned long pacing_rate;
1256
1257 if (sizeof(ulval) != sizeof(val) &&
1258 optlen >= sizeof(ulval) &&
1259 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1260 return -EFAULT;
1261 }
1262 if (ulval != ~0UL)
1263 cmpxchg(&sk->sk_pacing_status,
1264 SK_PACING_NONE,
1265 SK_PACING_NEEDED);
1266 /* Pairs with READ_ONCE() from sk_getsockopt() */
1267 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1268 pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1269 if (ulval < pacing_rate)
1270 WRITE_ONCE(sk->sk_pacing_rate, ulval);
1271 return 0;
1272 }
1273 case SO_TXREHASH:
1274 if (!sk_is_tcp(sk))
1275 return -EOPNOTSUPP;
1276 if (val < -1 || val > 1)
1277 return -EINVAL;
1278 if ((u8)val == SOCK_TXREHASH_DEFAULT)
1279 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1280 /* Paired with READ_ONCE() in tcp_rtx_synack()
1281 * and sk_getsockopt().
1282 */
1283 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1284 return 0;
1285 case SO_PEEK_OFF:
1286 {
1287 int (*set_peek_off)(struct sock *sk, int val);
1288
1289 set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1290 if (set_peek_off)
1291 ret = set_peek_off(sk, val);
1292 else
1293 ret = -EOPNOTSUPP;
1294 return ret;
1295 }
1296#ifdef CONFIG_PAGE_POOL
1297 case SO_DEVMEM_DONTNEED:
1298 return sock_devmem_dontneed(sk, optval, optlen);
1299#endif
1300 case SO_SNDTIMEO_OLD:
1301 case SO_SNDTIMEO_NEW:
1302 return sock_set_timeout(&sk->sk_sndtimeo, optval,
1303 optlen, optname == SO_SNDTIMEO_OLD);
1304 case SO_RCVTIMEO_OLD:
1305 case SO_RCVTIMEO_NEW:
1306 return sock_set_timeout(&sk->sk_rcvtimeo, optval,
1307 optlen, optname == SO_RCVTIMEO_OLD);
1308 }
1309
1310 sockopt_lock_sock(sk);
1311
1312 switch (optname) {
1313 case SO_DEBUG:
1314 if (val && !sockopt_capable(CAP_NET_ADMIN))
1315 ret = -EACCES;
1316 else
1317 sock_valbool_flag(sk, SOCK_DBG, valbool);
1318 break;
1319 case SO_REUSEADDR:
1320 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1321 break;
1322 case SO_REUSEPORT:
1323 if (valbool && !sk_is_inet(sk))
1324 ret = -EOPNOTSUPP;
1325 else
1326 sk->sk_reuseport = valbool;
1327 break;
1328 case SO_DONTROUTE:
1329 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1330 sk_dst_reset(sk);
1331 break;
1332 case SO_BROADCAST:
1333 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1334 break;
1335 case SO_SNDBUF:
1336 /* Don't error on this; BSD doesn't, and if you think
1337 * about it, this is right. Otherwise apps have to
1338 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1339 * are treated in BSD as hints.
1340 */
1341 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1342set_sndbuf:
1343 /* Ensure val * 2 fits into an int, to prevent max_t()
1344 * from treating it as a negative value.
1345 */
1346 val = min_t(int, val, INT_MAX / 2);
1347 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1348 WRITE_ONCE(sk->sk_sndbuf,
1349 max_t(int, val * 2, SOCK_MIN_SNDBUF));
1350 /* Wake up sending tasks if we upped the value. */
1351 sk->sk_write_space(sk);
1352 break;
1353
1354 case SO_SNDBUFFORCE:
1355 if (!sockopt_capable(CAP_NET_ADMIN)) {
1356 ret = -EPERM;
1357 break;
1358 }
1359
1360 /* No negative values (to prevent underflow, as val will be
1361 * multiplied by 2).
1362 */
1363 if (val < 0)
1364 val = 0;
1365 goto set_sndbuf;
1366
1367 case SO_RCVBUF:
1368 /* Don't error on this; BSD doesn't, and if you think
1369 * about it, this is right. Otherwise apps have to
1370 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1371 * are treated in BSD as hints.
1372 */
1373 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1374 break;
1375
1376 case SO_RCVBUFFORCE:
1377 if (!sockopt_capable(CAP_NET_ADMIN)) {
1378 ret = -EPERM;
1379 break;
1380 }
1381
1382 /* No negative values (to prevent underflow, as val will be
1383 * multiplied by 2).
1384 */
1385 __sock_set_rcvbuf(sk, max(val, 0));
1386 break;
1387
1388 case SO_KEEPALIVE:
1389 if (sk->sk_prot->keepalive)
1390 sk->sk_prot->keepalive(sk, valbool);
1391 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1392 break;
1393
1394 case SO_OOBINLINE:
1395 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1396 break;
1397
1398 case SO_NO_CHECK:
1399 sk->sk_no_check_tx = valbool;
1400 break;
1401
1402 case SO_LINGER:
1403 if (optlen < sizeof(ling)) {
1404 ret = -EINVAL; /* 1003.1g */
1405 break;
1406 }
1407 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1408 ret = -EFAULT;
1409 break;
1410 }
1411 if (!ling.l_onoff) {
1412 sock_reset_flag(sk, SOCK_LINGER);
1413 } else {
1414 unsigned long t_sec = ling.l_linger;
1415
1416 if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1417 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1418 else
1419 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1420 sock_set_flag(sk, SOCK_LINGER);
1421 }
1422 break;
1423
1424 case SO_BSDCOMPAT:
1425 break;
1426
1427 case SO_TIMESTAMP_OLD:
1428 case SO_TIMESTAMP_NEW:
1429 case SO_TIMESTAMPNS_OLD:
1430 case SO_TIMESTAMPNS_NEW:
1431 sock_set_timestamp(sk, optname, valbool);
1432 break;
1433
1434 case SO_TIMESTAMPING_NEW:
1435 case SO_TIMESTAMPING_OLD:
1436 if (optlen == sizeof(timestamping)) {
1437 if (copy_from_sockptr(&timestamping, optval,
1438 sizeof(timestamping))) {
1439 ret = -EFAULT;
1440 break;
1441 }
1442 } else {
1443 memset(&timestamping, 0, sizeof(timestamping));
1444 timestamping.flags = val;
1445 }
1446 ret = sock_set_timestamping(sk, optname, timestamping);
1447 break;
1448
1449 case SO_RCVLOWAT:
1450 {
1451 int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1452
1453 if (val < 0)
1454 val = INT_MAX;
1455 if (sock)
1456 set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1457 if (set_rcvlowat)
1458 ret = set_rcvlowat(sk, val);
1459 else
1460 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1461 break;
1462 }
1463 case SO_ATTACH_FILTER: {
1464 struct sock_fprog fprog;
1465
1466 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1467 if (!ret)
1468 ret = sk_attach_filter(&fprog, sk);
1469 break;
1470 }
1471 case SO_ATTACH_BPF:
1472 ret = -EINVAL;
1473 if (optlen == sizeof(u32)) {
1474 u32 ufd;
1475
1476 ret = -EFAULT;
1477 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1478 break;
1479
1480 ret = sk_attach_bpf(ufd, sk);
1481 }
1482 break;
1483
1484 case SO_ATTACH_REUSEPORT_CBPF: {
1485 struct sock_fprog fprog;
1486
1487 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1488 if (!ret)
1489 ret = sk_reuseport_attach_filter(&fprog, sk);
1490 break;
1491 }
1492 case SO_ATTACH_REUSEPORT_EBPF:
1493 ret = -EINVAL;
1494 if (optlen == sizeof(u32)) {
1495 u32 ufd;
1496
1497 ret = -EFAULT;
1498 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1499 break;
1500
1501 ret = sk_reuseport_attach_bpf(ufd, sk);
1502 }
1503 break;
1504
1505 case SO_DETACH_REUSEPORT_BPF:
1506 ret = reuseport_detach_prog(sk);
1507 break;
1508
1509 case SO_DETACH_FILTER:
1510 ret = sk_detach_filter(sk);
1511 break;
1512
1513 case SO_LOCK_FILTER:
1514 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1515 ret = -EPERM;
1516 else
1517 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1518 break;
1519
1520 case SO_MARK:
1521 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1522 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1523 ret = -EPERM;
1524 break;
1525 }
1526
1527 __sock_set_mark(sk, val);
1528 break;
1529 case SO_RCVMARK:
1530 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1531 break;
1532
1533 case SO_RCVPRIORITY:
1534 sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
1535 break;
1536
1537 case SO_RXQ_OVFL:
1538 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1539 break;
1540
1541 case SO_WIFI_STATUS:
1542 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1543 break;
1544
1545 case SO_NOFCS:
1546 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1547 break;
1548
1549 case SO_SELECT_ERR_QUEUE:
1550 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1551 break;
1552
1553 case SO_PASSCRED:
1554 if (sk_may_scm_recv(sk))
1555 sk->sk_scm_credentials = valbool;
1556 else
1557 ret = -EOPNOTSUPP;
1558 break;
1559
1560 case SO_PASSSEC:
1561 if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk))
1562 sk->sk_scm_security = valbool;
1563 else
1564 ret = -EOPNOTSUPP;
1565 break;
1566
1567 case SO_PASSPIDFD:
1568 if (sk_is_unix(sk))
1569 sk->sk_scm_pidfd = valbool;
1570 else
1571 ret = -EOPNOTSUPP;
1572 break;
1573
1574 case SO_PASSRIGHTS:
1575 if (sk_is_unix(sk))
1576 sk->sk_scm_rights = valbool;
1577 else
1578 ret = -EOPNOTSUPP;
1579 break;
1580
1581 case SO_INCOMING_CPU:
1582 reuseport_update_incoming_cpu(sk, val);
1583 break;
1584
1585 case SO_CNX_ADVICE:
1586 if (val == 1)
1587 dst_negative_advice(sk);
1588 break;
1589
1590 case SO_ZEROCOPY:
1591 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1592 if (!(sk_is_tcp(sk) ||
1593 (sk->sk_type == SOCK_DGRAM &&
1594 sk->sk_protocol == IPPROTO_UDP)))
1595 ret = -EOPNOTSUPP;
1596 } else if (sk->sk_family != PF_RDS) {
1597 ret = -EOPNOTSUPP;
1598 }
1599 if (!ret) {
1600 if (val < 0 || val > 1)
1601 ret = -EINVAL;
1602 else
1603 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1604 }
1605 break;
1606
1607 case SO_TXTIME:
1608 if (optlen != sizeof(struct sock_txtime)) {
1609 ret = -EINVAL;
1610 break;
1611 } else if (copy_from_sockptr(&sk_txtime, optval,
1612 sizeof(struct sock_txtime))) {
1613 ret = -EFAULT;
1614 break;
1615 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1616 ret = -EINVAL;
1617 break;
1618 }
1619 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1620 * scheduler has enough safeguards.
1621 */
1622 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1623 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1624 ret = -EPERM;
1625 break;
1626 }
1627
1628 ret = sockopt_validate_clockid(sk_txtime.clockid);
1629 if (ret)
1630 break;
1631
1632 sock_valbool_flag(sk, SOCK_TXTIME, true);
1633 sk->sk_clockid = sk_txtime.clockid;
1634 sk->sk_txtime_deadline_mode =
1635 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1636 sk->sk_txtime_report_errors =
1637 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1638 break;
1639
1640 case SO_BINDTOIFINDEX:
1641 ret = sock_bindtoindex_locked(sk, val);
1642 break;
1643
1644 case SO_BUF_LOCK:
1645 if (val & ~SOCK_BUF_LOCK_MASK) {
1646 ret = -EINVAL;
1647 break;
1648 }
1649 sk->sk_userlocks = val | (sk->sk_userlocks &
1650 ~SOCK_BUF_LOCK_MASK);
1651 break;
1652
1653 case SO_RESERVE_MEM:
1654 {
1655 int delta;
1656
1657 if (val < 0) {
1658 ret = -EINVAL;
1659 break;
1660 }
1661
1662 delta = val - sk->sk_reserved_mem;
1663 if (delta < 0)
1664 sock_release_reserved_memory(sk, -delta);
1665 else
1666 ret = sock_reserve_memory(sk, delta);
1667 break;
1668 }
1669
1670 default:
1671 ret = -ENOPROTOOPT;
1672 break;
1673 }
1674 sockopt_release_sock(sk);
1675 return ret;
1676}
1677
1678int sock_setsockopt(struct socket *sock, int level, int optname,
1679 sockptr_t optval, unsigned int optlen)
1680{
1681 return sk_setsockopt(sock->sk, level, optname,
1682 optval, optlen);
1683}
1684EXPORT_SYMBOL(sock_setsockopt);
1685
1686static const struct cred *sk_get_peer_cred(struct sock *sk)
1687{
1688 const struct cred *cred;
1689
1690 spin_lock(&sk->sk_peer_lock);
1691 cred = get_cred(sk->sk_peer_cred);
1692 spin_unlock(&sk->sk_peer_lock);
1693
1694 return cred;
1695}
1696
1697static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1698 struct ucred *ucred)
1699{
1700 ucred->pid = pid_vnr(pid);
1701 ucred->uid = ucred->gid = -1;
1702 if (cred) {
1703 struct user_namespace *current_ns = current_user_ns();
1704
1705 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1706 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1707 }
1708}
1709
1710static int groups_to_user(sockptr_t dst, const struct group_info *src)
1711{
1712 struct user_namespace *user_ns = current_user_ns();
1713 int i;
1714
1715 for (i = 0; i < src->ngroups; i++) {
1716 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1717
1718 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1719 return -EFAULT;
1720 }
1721
1722 return 0;
1723}
1724
1725int sk_getsockopt(struct sock *sk, int level, int optname,
1726 sockptr_t optval, sockptr_t optlen)
1727{
1728 struct socket *sock = sk->sk_socket;
1729
1730 union {
1731 int val;
1732 u64 val64;
1733 unsigned long ulval;
1734 struct linger ling;
1735 struct old_timeval32 tm32;
1736 struct __kernel_old_timeval tm;
1737 struct __kernel_sock_timeval stm;
1738 struct sock_txtime txtime;
1739 struct so_timestamping timestamping;
1740 } v;
1741
1742 int lv = sizeof(int);
1743 int len;
1744
1745 if (copy_from_sockptr(&len, optlen, sizeof(int)))
1746 return -EFAULT;
1747 if (len < 0)
1748 return -EINVAL;
1749
1750 memset(&v, 0, sizeof(v));
1751
1752 switch (optname) {
1753 case SO_DEBUG:
1754 v.val = sock_flag(sk, SOCK_DBG);
1755 break;
1756
1757 case SO_DONTROUTE:
1758 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1759 break;
1760
1761 case SO_BROADCAST:
1762 v.val = sock_flag(sk, SOCK_BROADCAST);
1763 break;
1764
1765 case SO_SNDBUF:
1766 v.val = READ_ONCE(sk->sk_sndbuf);
1767 break;
1768
1769 case SO_RCVBUF:
1770 v.val = READ_ONCE(sk->sk_rcvbuf);
1771 break;
1772
1773 case SO_REUSEADDR:
1774 v.val = sk->sk_reuse;
1775 break;
1776
1777 case SO_REUSEPORT:
1778 v.val = sk->sk_reuseport;
1779 break;
1780
1781 case SO_KEEPALIVE:
1782 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1783 break;
1784
1785 case SO_TYPE:
1786 v.val = sk->sk_type;
1787 break;
1788
1789 case SO_PROTOCOL:
1790 v.val = sk->sk_protocol;
1791 break;
1792
1793 case SO_DOMAIN:
1794 v.val = sk->sk_family;
1795 break;
1796
1797 case SO_ERROR:
1798 v.val = -sock_error(sk);
1799 if (v.val == 0)
1800 v.val = xchg(&sk->sk_err_soft, 0);
1801 break;
1802
1803 case SO_OOBINLINE:
1804 v.val = sock_flag(sk, SOCK_URGINLINE);
1805 break;
1806
1807 case SO_NO_CHECK:
1808 v.val = sk->sk_no_check_tx;
1809 break;
1810
1811 case SO_PRIORITY:
1812 v.val = READ_ONCE(sk->sk_priority);
1813 break;
1814
1815 case SO_LINGER:
1816 lv = sizeof(v.ling);
1817 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1818 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
1819 break;
1820
1821 case SO_BSDCOMPAT:
1822 break;
1823
1824 case SO_TIMESTAMP_OLD:
1825 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1826 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1827 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1828 break;
1829
1830 case SO_TIMESTAMPNS_OLD:
1831 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1832 break;
1833
1834 case SO_TIMESTAMP_NEW:
1835 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1836 break;
1837
1838 case SO_TIMESTAMPNS_NEW:
1839 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1840 break;
1841
1842 case SO_TIMESTAMPING_OLD:
1843 case SO_TIMESTAMPING_NEW:
1844 lv = sizeof(v.timestamping);
1845 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1846 * returning the flags when they were set through the same option.
1847 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1848 */
1849 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1850 v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1851 v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1852 }
1853 break;
1854
1855 case SO_RCVTIMEO_OLD:
1856 case SO_RCVTIMEO_NEW:
1857 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1858 SO_RCVTIMEO_OLD == optname);
1859 break;
1860
1861 case SO_SNDTIMEO_OLD:
1862 case SO_SNDTIMEO_NEW:
1863 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1864 SO_SNDTIMEO_OLD == optname);
1865 break;
1866
1867 case SO_RCVLOWAT:
1868 v.val = READ_ONCE(sk->sk_rcvlowat);
1869 break;
1870
1871 case SO_SNDLOWAT:
1872 v.val = 1;
1873 break;
1874
1875 case SO_PASSCRED:
1876 if (!sk_may_scm_recv(sk))
1877 return -EOPNOTSUPP;
1878
1879 v.val = sk->sk_scm_credentials;
1880 break;
1881
1882 case SO_PASSPIDFD:
1883 if (!sk_is_unix(sk))
1884 return -EOPNOTSUPP;
1885
1886 v.val = sk->sk_scm_pidfd;
1887 break;
1888
1889 case SO_PASSRIGHTS:
1890 if (!sk_is_unix(sk))
1891 return -EOPNOTSUPP;
1892
1893 v.val = sk->sk_scm_rights;
1894 break;
1895
1896 case SO_PEERCRED:
1897 {
1898 struct ucred peercred;
1899 if (len > sizeof(peercred))
1900 len = sizeof(peercred);
1901
1902 spin_lock(&sk->sk_peer_lock);
1903 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1904 spin_unlock(&sk->sk_peer_lock);
1905
1906 if (copy_to_sockptr(optval, &peercred, len))
1907 return -EFAULT;
1908 goto lenout;
1909 }
1910
1911 case SO_PEERPIDFD:
1912 {
1913 struct pid *peer_pid;
1914 struct file *pidfd_file = NULL;
1915 unsigned int flags = 0;
1916 int pidfd;
1917
1918 if (len > sizeof(pidfd))
1919 len = sizeof(pidfd);
1920
1921 spin_lock(&sk->sk_peer_lock);
1922 peer_pid = get_pid(sk->sk_peer_pid);
1923 spin_unlock(&sk->sk_peer_lock);
1924
1925 if (!peer_pid)
1926 return -ENODATA;
1927
1928 /* The use of PIDFD_STALE requires stashing of struct pid
1929 * on pidfs with pidfs_register_pid(), and only AF_UNIX
1930 * sockets were prepared for this.
1931 */
1932 if (sk->sk_family == AF_UNIX)
1933 flags = PIDFD_STALE;
1934
1935 pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
1936 put_pid(peer_pid);
1937 if (pidfd < 0)
1938 return pidfd;
1939
1940 if (copy_to_sockptr(optval, &pidfd, len) ||
1941 copy_to_sockptr(optlen, &len, sizeof(int))) {
1942 put_unused_fd(pidfd);
1943 fput(pidfd_file);
1944
1945 return -EFAULT;
1946 }
1947
1948 fd_install(pidfd, pidfd_file);
1949 return 0;
1950 }
1951
1952 case SO_PEERGROUPS:
1953 {
1954 const struct cred *cred;
1955 int ret, n;
1956
1957 cred = sk_get_peer_cred(sk);
1958 if (!cred)
1959 return -ENODATA;
1960
1961 n = cred->group_info->ngroups;
1962 if (len < n * sizeof(gid_t)) {
1963 len = n * sizeof(gid_t);
1964 put_cred(cred);
1965 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1966 }
1967 len = n * sizeof(gid_t);
1968
1969 ret = groups_to_user(optval, cred->group_info);
1970 put_cred(cred);
1971 if (ret)
1972 return ret;
1973 goto lenout;
1974 }
1975
1976 case SO_PEERNAME:
1977 {
1978 struct sockaddr_storage address;
1979
1980 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1981 if (lv < 0)
1982 return -ENOTCONN;
1983 if (lv < len)
1984 return -EINVAL;
1985 if (copy_to_sockptr(optval, &address, len))
1986 return -EFAULT;
1987 goto lenout;
1988 }
1989
1990 /* Dubious BSD thing... Probably nobody even uses it, but
1991 * the UNIX standard wants it for whatever reason... -DaveM
1992 */
1993 case SO_ACCEPTCONN:
1994 v.val = sk->sk_state == TCP_LISTEN;
1995 break;
1996
1997 case SO_PASSSEC:
1998 if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk))
1999 return -EOPNOTSUPP;
2000
2001 v.val = sk->sk_scm_security;
2002 break;
2003
2004 case SO_PEERSEC:
2005 return security_socket_getpeersec_stream(sock,
2006 optval, optlen, len);
2007
2008 case SO_MARK:
2009 v.val = READ_ONCE(sk->sk_mark);
2010 break;
2011
2012 case SO_RCVMARK:
2013 v.val = sock_flag(sk, SOCK_RCVMARK);
2014 break;
2015
2016 case SO_RCVPRIORITY:
2017 v.val = sock_flag(sk, SOCK_RCVPRIORITY);
2018 break;
2019
2020 case SO_RXQ_OVFL:
2021 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
2022 break;
2023
2024 case SO_WIFI_STATUS:
2025 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
2026 break;
2027
2028 case SO_PEEK_OFF:
2029 if (!READ_ONCE(sock->ops)->set_peek_off)
2030 return -EOPNOTSUPP;
2031
2032 v.val = READ_ONCE(sk->sk_peek_off);
2033 break;
2034 case SO_NOFCS:
2035 v.val = sock_flag(sk, SOCK_NOFCS);
2036 break;
2037
2038 case SO_BINDTODEVICE:
2039 return sock_getbindtodevice(sk, optval, optlen, len);
2040
2041 case SO_GET_FILTER:
2042 len = sk_get_filter(sk, optval, len);
2043 if (len < 0)
2044 return len;
2045
2046 goto lenout;
2047
2048 case SO_LOCK_FILTER:
2049 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
2050 break;
2051
2052 case SO_BPF_EXTENSIONS:
2053 v.val = bpf_tell_extensions();
2054 break;
2055
2056 case SO_SELECT_ERR_QUEUE:
2057 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
2058 break;
2059
2060#ifdef CONFIG_NET_RX_BUSY_POLL
2061 case SO_BUSY_POLL:
2062 v.val = READ_ONCE(sk->sk_ll_usec);
2063 break;
2064 case SO_PREFER_BUSY_POLL:
2065 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
2066 break;
2067#endif
2068
2069 case SO_MAX_PACING_RATE:
2070 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
2071 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
2072 lv = sizeof(v.ulval);
2073 v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
2074 } else {
2075 /* 32bit version */
2076 v.val = min_t(unsigned long, ~0U,
2077 READ_ONCE(sk->sk_max_pacing_rate));
2078 }
2079 break;
2080
2081 case SO_INCOMING_CPU:
2082 v.val = READ_ONCE(sk->sk_incoming_cpu);
2083 break;
2084
2085 case SO_MEMINFO:
2086 {
2087 u32 meminfo[SK_MEMINFO_VARS];
2088
2089 sk_get_meminfo(sk, meminfo);
2090
2091 len = min_t(unsigned int, len, sizeof(meminfo));
2092 if (copy_to_sockptr(optval, &meminfo, len))
2093 return -EFAULT;
2094
2095 goto lenout;
2096 }
2097
2098#ifdef CONFIG_NET_RX_BUSY_POLL
2099 case SO_INCOMING_NAPI_ID:
2100 v.val = READ_ONCE(sk->sk_napi_id);
2101
2102 /* aggregate non-NAPI IDs down to 0 */
2103 if (!napi_id_valid(v.val))
2104 v.val = 0;
2105
2106 break;
2107#endif
2108
2109 case SO_COOKIE:
2110 lv = sizeof(u64);
2111 if (len < lv)
2112 return -EINVAL;
2113 v.val64 = sock_gen_cookie(sk);
2114 break;
2115
2116 case SO_ZEROCOPY:
2117 v.val = sock_flag(sk, SOCK_ZEROCOPY);
2118 break;
2119
2120 case SO_TXTIME:
2121 lv = sizeof(v.txtime);
2122 v.txtime.clockid = sk->sk_clockid;
2123 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2124 SOF_TXTIME_DEADLINE_MODE : 0;
2125 v.txtime.flags |= sk->sk_txtime_report_errors ?
2126 SOF_TXTIME_REPORT_ERRORS : 0;
2127 break;
2128
2129 case SO_BINDTOIFINDEX:
2130 v.val = READ_ONCE(sk->sk_bound_dev_if);
2131 break;
2132
2133 case SO_NETNS_COOKIE:
2134 lv = sizeof(u64);
2135 if (len != lv)
2136 return -EINVAL;
2137 v.val64 = sock_net(sk)->net_cookie;
2138 break;
2139
2140 case SO_BUF_LOCK:
2141 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2142 break;
2143
2144 case SO_RESERVE_MEM:
2145 v.val = READ_ONCE(sk->sk_reserved_mem);
2146 break;
2147
2148 case SO_TXREHASH:
2149 if (!sk_is_tcp(sk))
2150 return -EOPNOTSUPP;
2151
2152 /* Paired with WRITE_ONCE() in sk_setsockopt() */
2153 v.val = READ_ONCE(sk->sk_txrehash);
2154 break;
2155
2156 default:
2157 /* We implement the SO_SNDLOWAT etc to not be settable
2158 * (1003.1g 7).
2159 */
2160 return -ENOPROTOOPT;
2161 }
2162
2163 if (len > lv)
2164 len = lv;
2165 if (copy_to_sockptr(optval, &v, len))
2166 return -EFAULT;
2167lenout:
2168 if (copy_to_sockptr(optlen, &len, sizeof(int)))
2169 return -EFAULT;
2170 return 0;
2171}
2172
2173/*
2174 * Initialize an sk_lock.
2175 *
2176 * (We also register the sk_lock with the lock validator.)
2177 */
2178static inline void sock_lock_init(struct sock *sk)
2179{
2180 sk_owner_clear(sk);
2181
2182 if (sk->sk_kern_sock)
2183 sock_lock_init_class_and_name(
2184 sk,
2185 af_family_kern_slock_key_strings[sk->sk_family],
2186 af_family_kern_slock_keys + sk->sk_family,
2187 af_family_kern_key_strings[sk->sk_family],
2188 af_family_kern_keys + sk->sk_family);
2189 else
2190 sock_lock_init_class_and_name(
2191 sk,
2192 af_family_slock_key_strings[sk->sk_family],
2193 af_family_slock_keys + sk->sk_family,
2194 af_family_key_strings[sk->sk_family],
2195 af_family_keys + sk->sk_family);
2196}
2197
2198/*
2199 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2200 * even temporarily, because of RCU lookups. sk_node should also be left as is.
2201 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
2202 */
2203static void sock_copy(struct sock *nsk, const struct sock *osk)
2204{
2205 const struct proto *prot = READ_ONCE(osk->sk_prot);
2206#ifdef CONFIG_SECURITY_NETWORK
2207 void *sptr = nsk->sk_security;
2208#endif
2209
2210 /* If we move sk_tx_queue_mapping out of the private section,
2211 * we must check if sk_tx_queue_clear() is called after
2212 * sock_copy() in sk_clone_lock().
2213 */
2214 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2215 offsetof(struct sock, sk_dontcopy_begin) ||
2216 offsetof(struct sock, sk_tx_queue_mapping) >=
2217 offsetof(struct sock, sk_dontcopy_end));
2218
2219 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2220
2221 unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2222 prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2223 /* alloc is larger than struct, see sk_prot_alloc() */);
2224
2225#ifdef CONFIG_SECURITY_NETWORK
2226 nsk->sk_security = sptr;
2227 security_sk_clone(osk, nsk);
2228#endif
2229}
2230
2231static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2232 int family)
2233{
2234 struct sock *sk;
2235 struct kmem_cache *slab;
2236
2237 slab = prot->slab;
2238 if (slab != NULL) {
2239 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2240 if (!sk)
2241 return sk;
2242 if (want_init_on_alloc(priority))
2243 sk_prot_clear_nulls(sk, prot->obj_size);
2244 } else
2245 sk = kmalloc(prot->obj_size, priority);
2246
2247 if (sk != NULL) {
2248 if (security_sk_alloc(sk, family, priority))
2249 goto out_free;
2250
2251 if (!try_module_get(prot->owner))
2252 goto out_free_sec;
2253 }
2254
2255 return sk;
2256
2257out_free_sec:
2258 security_sk_free(sk);
2259out_free:
2260 if (slab != NULL)
2261 kmem_cache_free(slab, sk);
2262 else
2263 kfree(sk);
2264 return NULL;
2265}
2266
2267static void sk_prot_free(struct proto *prot, struct sock *sk)
2268{
2269 struct kmem_cache *slab;
2270 struct module *owner;
2271
2272 owner = prot->owner;
2273 slab = prot->slab;
2274
2275 cgroup_sk_free(&sk->sk_cgrp_data);
2276 mem_cgroup_sk_free(sk);
2277 security_sk_free(sk);
2278
2279 sk_owner_put(sk);
2280
2281 if (slab != NULL)
2282 kmem_cache_free(slab, sk);
2283 else
2284 kfree(sk);
2285 module_put(owner);
2286}
2287
2288/**
2289 * sk_alloc - All socket objects are allocated here
2290 * @net: the applicable net namespace
2291 * @family: protocol family
2292 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2293 * @prot: struct proto associated with this new sock instance
2294 * @kern: is this to be a kernel socket?
2295 */
2296struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2297 struct proto *prot, int kern)
2298{
2299 struct sock *sk;
2300
2301 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2302 if (sk) {
2303 sk->sk_family = family;
2304 /*
2305 * See comment in struct sock definition to understand
2306 * why we need sk_prot_creator -acme
2307 */
2308 sk->sk_prot = sk->sk_prot_creator = prot;
2309
2310 if (READ_ONCE(net->core.sysctl_bypass_prot_mem))
2311 sk->sk_bypass_prot_mem = 1;
2312
2313 sk->sk_kern_sock = kern;
2314 sock_lock_init(sk);
2315
2316 sk->sk_net_refcnt = kern ? 0 : 1;
2317 if (likely(sk->sk_net_refcnt)) {
2318 get_net_track(net, &sk->ns_tracker, priority);
2319 sock_inuse_add(net, 1);
2320 } else {
2321 net_passive_inc(net);
2322 __netns_tracker_alloc(net, &sk->ns_tracker,
2323 false, priority);
2324 }
2325
2326 sock_net_set(sk, net);
2327 refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
2328
2329 mem_cgroup_sk_alloc(sk);
2330 cgroup_sk_alloc(&sk->sk_cgrp_data);
2331 sock_update_classid(&sk->sk_cgrp_data);
2332 sock_update_netprioidx(&sk->sk_cgrp_data);
2333 sk_tx_queue_clear(sk);
2334 }
2335
2336 return sk;
2337}
2338EXPORT_SYMBOL(sk_alloc);
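
/* For illustration, a minimal sketch of how a protocol family's ->create()
 * handler typically pairs sk_alloc() with sock_init_data() (defined later in
 * this file); "my_proto" is a placeholder struct proto, not a real one:
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_protocol = protocol;
 */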
2339
2340/* Sockets having SOCK_RCU_FREE will call this function after one RCU
2341 * grace period. This is the case for UDP sockets and TCP listeners.
2342 */
2343static void __sk_destruct(struct rcu_head *head)
2344{
2345 struct sock *sk = container_of(head, struct sock, sk_rcu);
2346 struct net *net = sock_net(sk);
2347 struct sk_filter *filter;
2348
2349 if (sk->sk_destruct)
2350 sk->sk_destruct(sk);
2351
2352 filter = rcu_dereference_check(sk->sk_filter,
2353 refcount_read(&sk->sk_wmem_alloc) == 0);
2354 if (filter) {
2355 sk_filter_uncharge(sk, filter);
2356 RCU_INIT_POINTER(sk->sk_filter, NULL);
2357 }
2358
2359 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2360
2361#ifdef CONFIG_BPF_SYSCALL
2362 bpf_sk_storage_free(sk);
2363#endif
2364
2365 if (atomic_read(&sk->sk_omem_alloc))
2366 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2367 __func__, atomic_read(&sk->sk_omem_alloc));
2368
2369 if (sk->sk_frag.page) {
2370 put_page(sk->sk_frag.page);
2371 sk->sk_frag.page = NULL;
2372 }
2373
2374 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2375 put_cred(sk->sk_peer_cred);
2376 put_pid(sk->sk_peer_pid);
2377
2378 if (likely(sk->sk_net_refcnt)) {
2379 put_net_track(net, &sk->ns_tracker);
2380 } else {
2381 __netns_tracker_free(net, &sk->ns_tracker, false);
2382 net_passive_dec(net);
2383 }
2384 sk_prot_free(sk->sk_prot_creator, sk);
2385}
2386
2387void sk_net_refcnt_upgrade(struct sock *sk)
2388{
2389 struct net *net = sock_net(sk);
2390
2391 WARN_ON_ONCE(sk->sk_net_refcnt);
2392 __netns_tracker_free(net, &sk->ns_tracker, false);
2393 net_passive_dec(net);
2394 sk->sk_net_refcnt = 1;
2395 get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
2396 sock_inuse_add(net, 1);
2397}
2398EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
2399
2400void sk_destruct(struct sock *sk)
2401{
2402 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2403
2404 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2405 reuseport_detach_sock(sk);
2406 use_call_rcu = true;
2407 }
2408
2409 if (use_call_rcu)
2410 call_rcu(&sk->sk_rcu, __sk_destruct);
2411 else
2412 __sk_destruct(&sk->sk_rcu);
2413}
2414
2415static void __sk_free(struct sock *sk)
2416{
2417 if (likely(sk->sk_net_refcnt))
2418 sock_inuse_add(sock_net(sk), -1);
2419
2420 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2421 sock_diag_broadcast_destroy(sk);
2422 else
2423 sk_destruct(sk);
2424}
2425
2426void sk_free(struct sock *sk)
2427{
2428 /*
2429	 * We subtract one from sk_wmem_alloc so we can tell whether
2430	 * some packets are still in a tx queue.
2431	 * If the count does not reach zero, sock_wfree() will call __sk_free(sk) later.
2432 */
2433 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2434 __sk_free(sk);
2435}
2436EXPORT_SYMBOL(sk_free);
2437
2438static void sk_init_common(struct sock *sk)
2439{
2440 skb_queue_head_init(&sk->sk_receive_queue);
2441 skb_queue_head_init(&sk->sk_write_queue);
2442 skb_queue_head_init(&sk->sk_error_queue);
2443
2444 rwlock_init(&sk->sk_callback_lock);
2445 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2446 af_rlock_keys + sk->sk_family,
2447 af_family_rlock_key_strings[sk->sk_family]);
2448 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2449 af_wlock_keys + sk->sk_family,
2450 af_family_wlock_key_strings[sk->sk_family]);
2451 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2452 af_elock_keys + sk->sk_family,
2453 af_family_elock_key_strings[sk->sk_family]);
2454 if (sk->sk_kern_sock)
2455 lockdep_set_class_and_name(&sk->sk_callback_lock,
2456 af_kern_callback_keys + sk->sk_family,
2457 af_family_kern_clock_key_strings[sk->sk_family]);
2458 else
2459 lockdep_set_class_and_name(&sk->sk_callback_lock,
2460 af_callback_keys + sk->sk_family,
2461 af_family_clock_key_strings[sk->sk_family]);
2462}
2463
2464/**
2465 * sk_clone - clone a socket
2466 * @sk: the socket to clone
2467 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2468 * @lock: if true, lock the cloned sk
2469 *
2470 * If @lock is true, the clone is locked by bh_lock_sock(), and
2471 * caller must unlock socket even in error path by bh_unlock_sock().
2472 */
2473struct sock *sk_clone(const struct sock *sk, const gfp_t priority,
2474 bool lock)
2475{
2476 struct proto *prot = READ_ONCE(sk->sk_prot);
2477 struct sk_filter *filter;
2478 bool is_charged = true;
2479 struct sock *newsk;
2480
2481 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2482 if (!newsk)
2483 goto out;
2484
2485 sock_copy(newsk, sk);
2486
2487 newsk->sk_prot_creator = prot;
2488
2489 /* SANITY */
2490 if (likely(newsk->sk_net_refcnt)) {
2491 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2492 sock_inuse_add(sock_net(newsk), 1);
2493 } else {
2494 /* Kernel sockets are not elevating the struct net refcount.
2495 * Instead, use a tracker to more easily detect if a layer
2496 * is not properly dismantling its kernel sockets at netns
2497 * destroy time.
2498 */
2499 net_passive_inc(sock_net(newsk));
2500 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2501 false, priority);
2502 }
2503
2504 sk_node_init(&newsk->sk_node);
2505 sock_lock_init(newsk);
2506
2507 if (lock)
2508 bh_lock_sock(newsk);
2509
2510 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
2511 newsk->sk_backlog.len = 0;
2512
2513 atomic_set(&newsk->sk_rmem_alloc, 0);
2514
2515 refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
2516
2517 atomic_set(&newsk->sk_omem_alloc, 0);
2518 sk_init_common(newsk);
2519
2520 newsk->sk_dst_cache = NULL;
2521 newsk->sk_dst_pending_confirm = 0;
2522 newsk->sk_wmem_queued = 0;
2523 newsk->sk_forward_alloc = 0;
2524 newsk->sk_reserved_mem = 0;
2525 DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters);
2526 sk_drops_reset(newsk);
2527 newsk->sk_send_head = NULL;
2528 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2529 atomic_set(&newsk->sk_zckey, 0);
2530
2531 sock_reset_flag(newsk, SOCK_DONE);
2532
2533#ifdef CONFIG_MEMCG
2534 /* sk->sk_memcg will be populated at accept() time */
2535 newsk->sk_memcg = NULL;
2536#endif
2537
2538 cgroup_sk_clone(&newsk->sk_cgrp_data);
2539
2540 rcu_read_lock();
2541 filter = rcu_dereference(sk->sk_filter);
2542 if (filter != NULL)
2543		/* though it's an empty new sock, the charging may fail
2544		 * if sysctl_optmem_max was changed between the creation of
2545		 * the original socket and this clone
2546 */
2547 is_charged = sk_filter_charge(newsk, filter);
2548 RCU_INIT_POINTER(newsk->sk_filter, filter);
2549 rcu_read_unlock();
2550
2551 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2552 /* We need to make sure that we don't uncharge the new
2553 * socket if we couldn't charge it in the first place
2554 * as otherwise we uncharge the parent's filter.
2555 */
2556 if (!is_charged)
2557 RCU_INIT_POINTER(newsk->sk_filter, NULL);
2558
2559 goto free;
2560 }
2561
2562 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2563
2564 if (bpf_sk_storage_clone(sk, newsk))
2565 goto free;
2566
2567 /* Clear sk_user_data if parent had the pointer tagged
2568 * as not suitable for copying when cloning.
2569 */
2570 if (sk_user_data_is_nocopy(newsk))
2571 newsk->sk_user_data = NULL;
2572
2573 newsk->sk_err = 0;
2574 newsk->sk_err_soft = 0;
2575 newsk->sk_priority = 0;
2576 newsk->sk_incoming_cpu = raw_smp_processor_id();
2577
2578 /* Before updating sk_refcnt, we must commit prior changes to memory
2579 * (Documentation/RCU/rculist_nulls.rst for details)
2580 */
2581 smp_wmb();
2582 refcount_set(&newsk->sk_refcnt, 2);
2583
2584 sk_set_socket(newsk, NULL);
2585 sk_tx_queue_clear(newsk);
2586 RCU_INIT_POINTER(newsk->sk_wq, NULL);
2587
2588 if (newsk->sk_prot->sockets_allocated)
2589 sk_sockets_allocated_inc(newsk);
2590
2591 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2592 net_enable_timestamp();
2593out:
2594 return newsk;
2595free:
2596	/* It is still a raw copy of the parent, so invalidate
2597	 * the destructor and do a plain sk_free()
2598 */
2599 newsk->sk_destruct = NULL;
2600 if (lock)
2601 bh_unlock_sock(newsk);
2602 sk_free(newsk);
2603 newsk = NULL;
2604 goto out;
2605}
2606EXPORT_SYMBOL_GPL(sk_clone);
2607
2608static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev)
2609{
2610 bool is_ipv6 = false;
2611 u32 max_size;
2612
2613#if IS_ENABLED(CONFIG_IPV6)
2614 is_ipv6 = (sk->sk_family == AF_INET6 &&
2615 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2616#endif
2617 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2618 max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) :
2619 READ_ONCE(dev->gso_ipv4_max_size);
2620 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2621 max_size = GSO_LEGACY_MAX_SIZE;
2622
2623 return max_size - (MAX_TCP_HEADER + 1);
2624}
2625
2626void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2627{
2628 const struct net_device *dev;
2629 u32 max_segs = 1;
2630
2631 rcu_read_lock();
2632 dev = dst_dev_rcu(dst);
2633 sk->sk_route_caps = dev->features;
2634 if (sk_is_tcp(sk)) {
2635 struct inet_connection_sock *icsk = inet_csk(sk);
2636
2637 sk->sk_route_caps |= NETIF_F_GSO;
2638 icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
2639 }
2640 if (sk->sk_route_caps & NETIF_F_GSO)
2641 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2642 if (unlikely(sk->sk_gso_disabled))
2643 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2644 if (sk_can_gso(sk)) {
2645 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2646 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2647 } else {
2648 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2649 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev);
2650 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2651 max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1);
2652 }
2653 }
2654 sk->sk_gso_max_segs = max_segs;
2655 sk_dst_set(sk, dst);
2656 rcu_read_unlock();
2657}
2658EXPORT_SYMBOL_GPL(sk_setup_caps);
2659
2660/*
2661 * Simple resource managers for sockets.
2662 */
2663
2664
2665/*
2666 * Write buffer destructor automatically called from kfree_skb.
2667 */
2668void sock_wfree(struct sk_buff *skb)
2669{
2670 unsigned int len = skb->truesize;
2671 struct sock *sk = skb->sk;
2672 bool free;
2673 int old;
2674
2675 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2676 if (sock_flag(sk, SOCK_RCU_FREE) &&
2677 sk->sk_write_space == sock_def_write_space) {
2678 rcu_read_lock();
2679 free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc,
2680 &old);
2681 sock_def_write_space_wfree(sk, old - len);
2682 rcu_read_unlock();
2683 if (unlikely(free))
2684 __sk_free(sk);
2685 return;
2686 }
2687
2688 /*
2689 * Keep a reference on sk_wmem_alloc, this will be released
2690 * after sk_write_space() call
2691 */
2692 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2693 sk->sk_write_space(sk);
2694 len = 1;
2695 }
2696 /*
2697 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2698 * could not do because of in-flight packets
2699 */
2700 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2701 __sk_free(sk);
2702}
2703EXPORT_SYMBOL(sock_wfree);
2704
2705/* This variant of sock_wfree() is used by TCP,
2706 * since it sets SOCK_USE_WRITE_QUEUE.
2707 */
2708void __sock_wfree(struct sk_buff *skb)
2709{
2710 struct sock *sk = skb->sk;
2711
2712 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2713 __sk_free(sk);
2714}
2715
2716void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2717{
2718 int old_wmem;
2719
2720 skb_orphan(skb);
2721#ifdef CONFIG_INET
2722 if (unlikely(!sk_fullsock(sk)))
2723 return skb_set_owner_edemux(skb, sk);
2724#endif
2725 skb->sk = sk;
2726 skb->destructor = sock_wfree;
2727 skb_set_hash_from_sk(skb, sk);
2728 /*
2729	 * We used to take a refcount on sk, but the following operation
2730	 * is enough to guarantee sk_free() won't free this sock until
2731	 * all in-flight packets have completed
2732 */
2733 __refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem);
2734
2735 /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket
2736 * is in a host queue (qdisc, NIC queue).
2737 * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue
2738 * based on XPS for better performance.
2739 * Otherwise clear ooo_okay to not risk Out Of Order delivery.
2740 */
2741 skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS);
2742}
2743EXPORT_SYMBOL(skb_set_owner_w);
2744
2745static bool can_skb_orphan_partial(const struct sk_buff *skb)
2746{
2747	/* Drivers depend on in-order delivery for crypto offload;
2748	 * a partial orphan breaks the out-of-order-OK logic.
2749 */
2750 if (skb_is_decrypted(skb))
2751 return false;
2752
2753 return (skb->destructor == sock_wfree ||
2754 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2755}
2756
2757/* This helper is used by netem, as it can hold packets in its
2758 * delay queue. We want to allow the owner socket to send more
2759 * packets, as if they were already TX completed by a typical driver.
2760 * But we also want to keep skb->sk set because some packet schedulers
2761 * rely on it (sch_fq for example).
2762 */
2763void skb_orphan_partial(struct sk_buff *skb)
2764{
2765 if (skb_is_tcp_pure_ack(skb))
2766 return;
2767
2768 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2769 return;
2770
2771 skb_orphan(skb);
2772}
2773EXPORT_SYMBOL(skb_orphan_partial);
2774
2775/*
2776 * Read buffer destructor automatically called from kfree_skb.
2777 */
2778void sock_rfree(struct sk_buff *skb)
2779{
2780 struct sock *sk = skb->sk;
2781 unsigned int len = skb->truesize;
2782
2783 atomic_sub(len, &sk->sk_rmem_alloc);
2784 sk_mem_uncharge(sk, len);
2785}
2786EXPORT_SYMBOL(sock_rfree);
2787
2788/*
2789 * Buffer destructor for skbs that are not used directly in read or write
2790 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2791 */
2792void sock_efree(struct sk_buff *skb)
2793{
2794 sock_put(skb->sk);
2795}
2796EXPORT_SYMBOL(sock_efree);
2797
2798/* Buffer destructor for prefetch/receive path where reference count may
2799 * not be held, e.g. for listen sockets.
2800 */
2801#ifdef CONFIG_INET
2802void sock_pfree(struct sk_buff *skb)
2803{
2804 struct sock *sk = skb->sk;
2805
2806 if (!sk_is_refcounted(sk))
2807 return;
2808
2809 if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2810 inet_reqsk(sk)->rsk_listener = NULL;
2811 reqsk_free(inet_reqsk(sk));
2812 return;
2813 }
2814
2815 sock_gen_put(sk);
2816}
2817EXPORT_SYMBOL(sock_pfree);
2818#endif /* CONFIG_INET */
2819
2820/*
2821 * Allocate a skb from the socket's send buffer.
2822 */
2823struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2824 gfp_t priority)
2825{
2826 if (force ||
2827 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2828 struct sk_buff *skb = alloc_skb(size, priority);
2829
2830 if (skb) {
2831 skb_set_owner_w(skb, sk);
2832 return skb;
2833 }
2834 }
2835 return NULL;
2836}
2837EXPORT_SYMBOL(sock_wmalloc);
2838
2839static void sock_ofree(struct sk_buff *skb)
2840{
2841 struct sock *sk = skb->sk;
2842
2843 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2844}
2845
2846struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2847 gfp_t priority)
2848{
2849 struct sk_buff *skb;
2850
2851 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2852 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2853 READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2854 return NULL;
2855
2856 skb = alloc_skb(size, priority);
2857 if (!skb)
2858 return NULL;
2859
2860 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2861 skb->sk = sk;
2862 skb->destructor = sock_ofree;
2863 return skb;
2864}
2865
2866/*
2867 * Allocate a memory block from the socket's option memory buffer.
2868 */
2869void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2870{
2871 int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2872
2873 if ((unsigned int)size <= optmem_max &&
2874 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2875 void *mem;
2876 /* First do the add, to avoid the race if kmalloc
2877 * might sleep.
2878 */
2879 atomic_add(size, &sk->sk_omem_alloc);
2880 mem = kmalloc(size, priority);
2881 if (mem)
2882 return mem;
2883 atomic_sub(size, &sk->sk_omem_alloc);
2884 }
2885 return NULL;
2886}
2887EXPORT_SYMBOL(sock_kmalloc);
2888
2889/*
2890 * Duplicate the input "src" memory block using the socket's
2891 * option memory buffer.
2892 */
2893void *sock_kmemdup(struct sock *sk, const void *src,
2894 int size, gfp_t priority)
2895{
2896 void *mem;
2897
2898 mem = sock_kmalloc(sk, size, priority);
2899 if (mem)
2900 memcpy(mem, src, size);
2901 return mem;
2902}
2903EXPORT_SYMBOL(sock_kmemdup);
2904
2905/* Free an option memory block. Note, we actually want the inline
2906 * here as this allows gcc to detect the nullify and fold away the
2907 * condition entirely.
2908 */
2909static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2910 const bool nullify)
2911{
2912 if (WARN_ON_ONCE(!mem))
2913 return;
2914 if (nullify)
2915 kfree_sensitive(mem);
2916 else
2917 kfree(mem);
2918 atomic_sub(size, &sk->sk_omem_alloc);
2919}
2920
2921void sock_kfree_s(struct sock *sk, void *mem, int size)
2922{
2923 __sock_kfree_s(sk, mem, size, false);
2924}
2925EXPORT_SYMBOL(sock_kfree_s);
2926
2927void sock_kzfree_s(struct sock *sk, void *mem, int size)
2928{
2929 __sock_kfree_s(sk, mem, size, true);
2930}
2931EXPORT_SYMBOL(sock_kzfree_s);
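
/* For illustration, the expected pairing: memory taken from the option buffer
 * with sock_kmalloc() must be returned with sock_kfree_s() (or sock_kzfree_s()
 * for sensitive data), passing the same size so sk_omem_alloc stays balanced:
 *
 *	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 */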
2932
2933/* This is almost wait_for_tcp_memory() minus release_sock()/lock_sock().
2934   I think these locks should be removed for datagram sockets.
2935 */
2936static long sock_wait_for_wmem(struct sock *sk, long timeo)
2937{
2938 DEFINE_WAIT(wait);
2939
2940 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2941 for (;;) {
2942 if (!timeo)
2943 break;
2944 if (signal_pending(current))
2945 break;
2946 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2947 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2948 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2949 break;
2950 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2951 break;
2952 if (READ_ONCE(sk->sk_err))
2953 break;
2954 timeo = schedule_timeout(timeo);
2955 }
2956 finish_wait(sk_sleep(sk), &wait);
2957 return timeo;
2958}
2959
2960
2961/*
2962 * Generic send/receive buffer handlers
2963 */
2964
2965struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2966 unsigned long data_len, int noblock,
2967 int *errcode, int max_page_order)
2968{
2969 struct sk_buff *skb;
2970 long timeo;
2971 int err;
2972
2973 timeo = sock_sndtimeo(sk, noblock);
2974 for (;;) {
2975 err = sock_error(sk);
2976 if (err != 0)
2977 goto failure;
2978
2979 err = -EPIPE;
2980 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2981 goto failure;
2982
2983 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2984 break;
2985
2986 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2987 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2988 err = -EAGAIN;
2989 if (!timeo)
2990 goto failure;
2991 if (signal_pending(current))
2992 goto interrupted;
2993 timeo = sock_wait_for_wmem(sk, timeo);
2994 }
2995 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2996 errcode, sk->sk_allocation);
2997 if (skb)
2998 skb_set_owner_w(skb, sk);
2999 return skb;
3000
3001interrupted:
3002 err = sock_intr_errno(timeo);
3003failure:
3004 *errcode = err;
3005 return NULL;
3006}
3007EXPORT_SYMBOL(sock_alloc_send_pskb);
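
/* Most callers do not invoke sock_alloc_send_pskb() directly; at the time of
 * writing they go through the sock_alloc_send_skb() inline from
 * include/net/sock.h, which is roughly:
 *
 *	skb = sock_alloc_send_pskb(sk, size, 0, noblock, &err, 0);
 *
 * i.e. all data in the linear area and no high-order page fragments.
 */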
3008
3009int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
3010 struct sockcm_cookie *sockc)
3011{
3012 u32 tsflags;
3013
3014 BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
3015
3016 switch (cmsg->cmsg_type) {
3017 case SO_MARK:
3018 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
3019 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3020 return -EPERM;
3021 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3022 return -EINVAL;
3023 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
3024 break;
3025 case SO_TIMESTAMPING_OLD:
3026 case SO_TIMESTAMPING_NEW:
3027 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3028 return -EINVAL;
3029
3030 tsflags = *(u32 *)CMSG_DATA(cmsg);
3031 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
3032 return -EINVAL;
3033
3034 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
3035 sockc->tsflags |= tsflags;
3036 break;
3037 case SCM_TXTIME:
3038 if (!sock_flag(sk, SOCK_TXTIME))
3039 return -EINVAL;
3040 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
3041 return -EINVAL;
3042 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
3043 break;
3044 case SCM_TS_OPT_ID:
3045 if (sk_is_tcp(sk))
3046 return -EINVAL;
3047 tsflags = READ_ONCE(sk->sk_tsflags);
3048 if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
3049 return -EINVAL;
3050 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3051 return -EINVAL;
3052 sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
3053 sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
3054 break;
3055 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
3056 case SCM_RIGHTS:
3057 case SCM_CREDENTIALS:
3058 break;
3059 case SO_PRIORITY:
3060 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3061 return -EINVAL;
3062 if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
3063 return -EPERM;
3064 sockc->priority = *(u32 *)CMSG_DATA(cmsg);
3065 break;
3066 case SCM_DEVMEM_DMABUF:
3067 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3068 return -EINVAL;
3069 sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
3070 break;
3071 default:
3072 return -EINVAL;
3073 }
3074 return 0;
3075}
3076EXPORT_SYMBOL(__sock_cmsg_send);
3077
3078int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
3079 struct sockcm_cookie *sockc)
3080{
3081 struct cmsghdr *cmsg;
3082 int ret;
3083
3084 for_each_cmsghdr(cmsg, msg) {
3085 if (!CMSG_OK(msg, cmsg))
3086 return -EINVAL;
3087 if (cmsg->cmsg_level != SOL_SOCKET)
3088 continue;
3089 ret = __sock_cmsg_send(sk, cmsg, sockc);
3090 if (ret)
3091 return ret;
3092 }
3093 return 0;
3094}
3095EXPORT_SYMBOL(sock_cmsg_send);
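
/* For illustration, a sketch of how a sendmsg() implementation typically
 * consumes SOL_SOCKET control messages: sockcm_init() seeds the cookie with
 * the socket's defaults before sock_cmsg_send() applies any overrides:
 *
 *	struct sockcm_cookie sockc;
 *
 *	sockcm_init(&sockc, sk);
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */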
3096
3097static void sk_enter_memory_pressure(struct sock *sk)
3098{
3099 if (!sk->sk_prot->enter_memory_pressure)
3100 return;
3101
3102 sk->sk_prot->enter_memory_pressure(sk);
3103}
3104
3105static void sk_leave_memory_pressure(struct sock *sk)
3106{
3107 if (sk->sk_prot->leave_memory_pressure) {
3108 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
3109 tcp_leave_memory_pressure, sk);
3110 } else {
3111 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
3112
3113 if (memory_pressure && READ_ONCE(*memory_pressure))
3114 WRITE_ONCE(*memory_pressure, 0);
3115 }
3116}
3117
3118DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
3119
3120/**
3121 * skb_page_frag_refill - check that a page_frag contains enough room
3122 * @sz: minimum size of the fragment we want to get
3123 * @pfrag: pointer to page_frag
3124 * @gfp: priority for memory allocation
3125 *
3126 * Note: While this allocator tries to use high order pages, there is
3127 * no guarantee that allocations succeed. Therefore, @sz MUST be
3128 * less than or equal to PAGE_SIZE.
3129 */
3130bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3131{
3132 if (pfrag->page) {
3133 if (page_ref_count(pfrag->page) == 1) {
3134 pfrag->offset = 0;
3135 return true;
3136 }
3137 if (pfrag->offset + sz <= pfrag->size)
3138 return true;
3139 put_page(pfrag->page);
3140 }
3141
3142 pfrag->offset = 0;
3143 if (SKB_FRAG_PAGE_ORDER &&
3144 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3145 /* Avoid direct reclaim but allow kswapd to wake */
3146 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3147 __GFP_COMP | __GFP_NOWARN |
3148 __GFP_NORETRY,
3149 SKB_FRAG_PAGE_ORDER);
3150 if (likely(pfrag->page)) {
3151 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3152 return true;
3153 }
3154 }
3155 pfrag->page = alloc_page(gfp);
3156 if (likely(pfrag->page)) {
3157 pfrag->size = PAGE_SIZE;
3158 return true;
3159 }
3160 return false;
3161}
3162EXPORT_SYMBOL(skb_page_frag_refill);
3163
3164bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3165{
3166 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3167 return true;
3168
3169 if (!sk->sk_bypass_prot_mem)
3170 sk_enter_memory_pressure(sk);
3171
3172 sk_stream_moderate_sndbuf(sk);
3173
3174 return false;
3175}
3176EXPORT_SYMBOL(sk_page_frag_refill);
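
/* For illustration, a sketch of the intended caller pattern: refill the
 * per-socket (or per-task) page_frag, then copy payload at pfrag->offset and
 * advance the offset by the amount consumed:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	(copy data into pfrag->page at pfrag->offset)
 *	pfrag->offset += copy;
 */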
3177
3178void __lock_sock(struct sock *sk)
3179 __releases(&sk->sk_lock.slock)
3180 __acquires(&sk->sk_lock.slock)
3181{
3182 DEFINE_WAIT(wait);
3183
3184 for (;;) {
3185 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3186 TASK_UNINTERRUPTIBLE);
3187 spin_unlock_bh(&sk->sk_lock.slock);
3188 schedule();
3189 spin_lock_bh(&sk->sk_lock.slock);
3190 if (!sock_owned_by_user(sk))
3191 break;
3192 }
3193 finish_wait(&sk->sk_lock.wq, &wait);
3194}
3195
3196void __release_sock(struct sock *sk)
3197 __releases(&sk->sk_lock.slock)
3198 __acquires(&sk->sk_lock.slock)
3199{
3200 struct sk_buff *skb, *next;
3201 int nb = 0;
3202
3203 while ((skb = sk->sk_backlog.head) != NULL) {
3204 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3205
3206 spin_unlock_bh(&sk->sk_lock.slock);
3207
3208 while (1) {
3209 next = skb->next;
3210 prefetch(next);
3211 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3212 skb_mark_not_on_list(skb);
3213 sk_backlog_rcv(sk, skb);
3214
3215 skb = next;
3216 if (!skb)
3217 break;
3218
3219 if (!(++nb & 15))
3220 cond_resched();
3221 }
3222
3223 spin_lock_bh(&sk->sk_lock.slock);
3224 }
3225
3226 /*
3227	 * Doing the zeroing here guarantees we cannot loop forever
3228	 * while a wild producer attempts to flood us.
3229 */
3230 sk->sk_backlog.len = 0;
3231}
3232
3233void __sk_flush_backlog(struct sock *sk)
3234{
3235 spin_lock_bh(&sk->sk_lock.slock);
3236 __release_sock(sk);
3237
3238 if (sk->sk_prot->release_cb)
3239 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3240 tcp_release_cb, sk);
3241
3242 spin_unlock_bh(&sk->sk_lock.slock);
3243}
3244EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3245
3246/**
3247 * sk_wait_data - wait for data to arrive at sk_receive_queue
3248 * @sk: sock to wait on
3249 * @timeo: for how long
3250 * @skb: last skb seen on sk_receive_queue
3251 *
3252 * Now the socket state, including sk->sk_err, is changed only under the lock,
3253 * hence we may omit checks after joining the wait queue.
3254 * We check the receive queue before schedule() only as an optimization;
3255 * it is very likely that release_sock() added new data.
3256 */
3257int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3258{
3259 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3260 int rc;
3261
3262 add_wait_queue(sk_sleep(sk), &wait);
3263 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3264 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3265 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3266 remove_wait_queue(sk_sleep(sk), &wait);
3267 return rc;
3268}
3269EXPORT_SYMBOL(sk_wait_data);
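
/* For illustration, a sketch of the usual blocking recvmsg() loop around
 * sk_wait_data(); the caller holds the socket lock and rechecks the receive
 * queue after the wait returns:
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */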
3270
3271/**
3272 * __sk_mem_raise_allocated - increase memory_allocated
3273 * @sk: socket
3274 * @size: memory size to allocate
3275 * @amt: pages to allocate
3276 * @kind: allocation type
3277 *
3278 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3279 *
3280 * Unlike the globally shared limits among the sockets under the same protocol,
3281 * consuming the budget of a memcg won't have a direct effect on other memcgs.
3282 * So be optimistic about the memcg's tolerance, and leave it to the callers to
3283 * decide whether or not to raise allocated through sk_under_memory_pressure() or
3284 * its variants.
3285 */
3286int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3287{
3288 bool memcg_enabled = false, charged = false;
3289 struct proto *prot = sk->sk_prot;
3290 long allocated = 0;
3291
3292 if (!sk->sk_bypass_prot_mem) {
3293 sk_memory_allocated_add(sk, amt);
3294 allocated = sk_memory_allocated(sk);
3295 }
3296
3297 if (mem_cgroup_sk_enabled(sk)) {
3298 memcg_enabled = true;
3299 charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
3300 if (!charged)
3301 goto suppress_allocation;
3302 }
3303
3304 if (!allocated)
3305 return 1;
3306
3307 /* Under limit. */
3308 if (allocated <= sk_prot_mem_limits(sk, 0)) {
3309 sk_leave_memory_pressure(sk);
3310 return 1;
3311 }
3312
3313 /* Under pressure. */
3314 if (allocated > sk_prot_mem_limits(sk, 1))
3315 sk_enter_memory_pressure(sk);
3316
3317 /* Over hard limit. */
3318 if (allocated > sk_prot_mem_limits(sk, 2))
3319 goto suppress_allocation;
3320
3321 /* Guarantee minimum buffer size under pressure (either global
3322 * or memcg) to make sure features described in RFC 7323 (TCP
3323 * Extensions for High Performance) work properly.
3324 *
3325	 * This rule does NOT hold once the global or memcg hard limit is
3326	 * exceeded, or else a DoS attack could be mounted by spawning
3327	 * lots of sockets whose usage stays under the minimum buffer size.
3328 */
3329 if (kind == SK_MEM_RECV) {
3330 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3331 return 1;
3332
3333 } else { /* SK_MEM_SEND */
3334 int wmem0 = sk_get_wmem0(sk, prot);
3335
3336 if (sk->sk_type == SOCK_STREAM) {
3337 if (sk->sk_wmem_queued < wmem0)
3338 return 1;
3339 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3340 return 1;
3341 }
3342 }
3343
3344 if (sk_has_memory_pressure(sk)) {
3345 u64 alloc;
3346
3347 /* The following 'average' heuristic is within the
3348 * scope of global accounting, so it only makes
3349 * sense for global memory pressure.
3350 */
3351 if (!sk_under_global_memory_pressure(sk))
3352 return 1;
3353
3354		/* Try to be fair among all the sockets under global
3355		 * pressure by allowing the ones whose usage is below
3356		 * average to grow.
3357 */
3358 alloc = sk_sockets_allocated_read_positive(sk);
3359 if (sk_prot_mem_limits(sk, 2) > alloc *
3360 sk_mem_pages(sk->sk_wmem_queued +
3361 atomic_read(&sk->sk_rmem_alloc) +
3362 sk->sk_forward_alloc))
3363 return 1;
3364 }
3365
3366suppress_allocation:
3367
3368 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3369 sk_stream_moderate_sndbuf(sk);
3370
3371		/* Fail only if the socket is _under_ its sndbuf.
3372		 * In this case we cannot block, so we have to fail.
3373 */
3374 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3375 /* Force charge with __GFP_NOFAIL */
3376 if (memcg_enabled && !charged)
3377 mem_cgroup_sk_charge(sk, amt,
3378 gfp_memcg_charge() | __GFP_NOFAIL);
3379 return 1;
3380 }
3381 }
3382
3383 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3384
3385 if (allocated)
3386 sk_memory_allocated_sub(sk, amt);
3387
3388 if (charged)
3389 mem_cgroup_sk_uncharge(sk, amt);
3390
3391 return 0;
3392}
3393
3394/**
3395 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3396 * @sk: socket
3397 * @size: memory size to allocate
3398 * @kind: allocation type
3399 *
3400 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3401 * rmem allocation. This function assumes that protocols which have
3402 * memory_pressure use sk_wmem_queued as write buffer accounting.
3403 */
3404int __sk_mem_schedule(struct sock *sk, int size, int kind)
3405{
3406 int ret, amt = sk_mem_pages(size);
3407
3408 sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3409 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3410 if (!ret)
3411 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3412 return ret;
3413}
3414EXPORT_SYMBOL(__sk_mem_schedule);
3415
3416/**
3417 * __sk_mem_reduce_allocated - reclaim memory_allocated
3418 * @sk: socket
3419 * @amount: number of quanta
3420 *
3421 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3422 */
3423void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3424{
3425 if (mem_cgroup_sk_enabled(sk))
3426 mem_cgroup_sk_uncharge(sk, amount);
3427
3428 if (sk->sk_bypass_prot_mem)
3429 return;
3430
3431 sk_memory_allocated_sub(sk, amount);
3432
3433 if (sk_under_global_memory_pressure(sk) &&
3434 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3435 sk_leave_memory_pressure(sk);
3436}
3437
3438/**
3439 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3440 * @sk: socket
3441 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3442 */
3443void __sk_mem_reclaim(struct sock *sk, int amount)
3444{
3445 amount >>= PAGE_SHIFT;
3446 sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3447 __sk_mem_reduce_allocated(sk, amount);
3448}
3449EXPORT_SYMBOL(__sk_mem_reclaim);
3450
3451void __sk_charge(struct sock *sk, gfp_t gfp)
3452{
3453 int amt;
3454
3455 gfp |= __GFP_NOFAIL;
3456 if (mem_cgroup_from_sk(sk)) {
3457 /* The socket has not been accepted yet, no need
3458 * to look at newsk->sk_wmem_queued.
3459 */
3460 amt = sk_mem_pages(sk->sk_forward_alloc +
3461 atomic_read(&sk->sk_rmem_alloc));
3462 if (amt)
3463 mem_cgroup_sk_charge(sk, amt, gfp);
3464 }
3465
3466 kmem_cache_charge(sk, gfp);
3467}
3468
3469int sk_set_peek_off(struct sock *sk, int val)
3470{
3471 WRITE_ONCE(sk->sk_peek_off, val);
3472 return 0;
3473}
3474EXPORT_SYMBOL_GPL(sk_set_peek_off);
3475
3476/*
3477 * Set of default routines for initialising struct proto_ops when
3478 * the protocol does not support a particular function. In certain
3479 * cases where it makes no sense for a protocol to have a "do nothing"
3480 * function, some default processing is provided.
3481 */
3482
3483int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len)
3484{
3485 return -EOPNOTSUPP;
3486}
3487EXPORT_SYMBOL(sock_no_bind);
3488
3489int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr,
3490 int len, int flags)
3491{
3492 return -EOPNOTSUPP;
3493}
3494EXPORT_SYMBOL(sock_no_connect);
3495
3496int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3497{
3498 return -EOPNOTSUPP;
3499}
3500EXPORT_SYMBOL(sock_no_socketpair);
3501
3502int sock_no_accept(struct socket *sock, struct socket *newsock,
3503 struct proto_accept_arg *arg)
3504{
3505 return -EOPNOTSUPP;
3506}
3507EXPORT_SYMBOL(sock_no_accept);
3508
3509int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3510 int peer)
3511{
3512 return -EOPNOTSUPP;
3513}
3514EXPORT_SYMBOL(sock_no_getname);
3515
3516int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3517{
3518 return -EOPNOTSUPP;
3519}
3520EXPORT_SYMBOL(sock_no_ioctl);
3521
3522int sock_no_listen(struct socket *sock, int backlog)
3523{
3524 return -EOPNOTSUPP;
3525}
3526EXPORT_SYMBOL(sock_no_listen);
3527
3528int sock_no_shutdown(struct socket *sock, int how)
3529{
3530 return -EOPNOTSUPP;
3531}
3532EXPORT_SYMBOL(sock_no_shutdown);
3533
3534int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3535{
3536 return -EOPNOTSUPP;
3537}
3538EXPORT_SYMBOL(sock_no_sendmsg);
3539
3540int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3541{
3542 return -EOPNOTSUPP;
3543}
3544EXPORT_SYMBOL(sock_no_sendmsg_locked);
3545
3546int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3547 int flags)
3548{
3549 return -EOPNOTSUPP;
3550}
3551EXPORT_SYMBOL(sock_no_recvmsg);
3552
3553int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3554{
3555 /* Mirror missing mmap method error code */
3556 return -ENODEV;
3557}
3558EXPORT_SYMBOL(sock_no_mmap);
3559
3560/*
3561 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3562 * various sock-based usage counts.
3563 */
3564void __receive_sock(struct file *file)
3565{
3566 struct socket *sock;
3567
3568 sock = sock_from_file(file);
3569 if (sock) {
3570 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3571 sock_update_classid(&sock->sk->sk_cgrp_data);
3572 }
3573}
3574
3575/*
3576 * Default Socket Callbacks
3577 */
3578
3579static void sock_def_wakeup(struct sock *sk)
3580{
3581 struct socket_wq *wq;
3582
3583 rcu_read_lock();
3584 wq = rcu_dereference(sk->sk_wq);
3585 if (skwq_has_sleeper(wq))
3586 wake_up_interruptible_all(&wq->wait);
3587 rcu_read_unlock();
3588}
3589
3590static void sock_def_error_report(struct sock *sk)
3591{
3592 struct socket_wq *wq;
3593
3594 rcu_read_lock();
3595 wq = rcu_dereference(sk->sk_wq);
3596 if (skwq_has_sleeper(wq))
3597 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3598 sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3599 rcu_read_unlock();
3600}
3601
3602void sock_def_readable(struct sock *sk)
3603{
3604 struct socket_wq *wq;
3605
3606 trace_sk_data_ready(sk);
3607
3608 rcu_read_lock();
3609 wq = rcu_dereference(sk->sk_wq);
3610 if (skwq_has_sleeper(wq))
3611 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3612 EPOLLRDNORM | EPOLLRDBAND);
3613 sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3614 rcu_read_unlock();
3615}
3616
3617static void sock_def_write_space(struct sock *sk)
3618{
3619 struct socket_wq *wq;
3620
3621 rcu_read_lock();
3622
3623 /* Do not wake up a writer until he can make "significant"
3624 * progress. --DaveM
3625 */
3626 if (sock_writeable(sk)) {
3627 wq = rcu_dereference(sk->sk_wq);
3628 if (skwq_has_sleeper(wq))
3629 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3630 EPOLLWRNORM | EPOLLWRBAND);
3631
3632 /* Should agree with poll, otherwise some programs break */
3633 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3634 }
3635
3636 rcu_read_unlock();
3637}
3638
3639/* An optimised version of sock_def_write_space(); it should only be called
3640 * for SOCK_RCU_FREE sockets, under an RCU read-side section, and after
3641 * putting ->sk_wmem_alloc.
3642 */
3643static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc)
3644{
3645 /* Do not wake up a writer until he can make "significant"
3646 * progress. --DaveM
3647 */
3648 if (__sock_writeable(sk, wmem_alloc)) {
3649 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3650
3651 /* rely on refcount_sub from sock_wfree() */
3652 smp_mb__after_atomic();
3653 if (wq && waitqueue_active(&wq->wait))
3654 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3655 EPOLLWRNORM | EPOLLWRBAND);
3656
3657 /* Should agree with poll, otherwise some programs break */
3658 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3659 }
3660}
3661
3662static void sock_def_destruct(struct sock *sk)
3663{
3664}
3665
3666void sk_send_sigurg(struct sock *sk)
3667{
3668 if (sk->sk_socket && sk->sk_socket->file)
3669 if (send_sigurg(sk->sk_socket->file))
3670 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3671}
3672EXPORT_SYMBOL(sk_send_sigurg);
3673
3674void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3675 unsigned long expires)
3676{
3677 if (!mod_timer(timer, expires))
3678 sock_hold(sk);
3679}
3680EXPORT_SYMBOL(sk_reset_timer);
3681
3682void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3683{
3684 if (timer_delete(timer))
3685 __sock_put(sk);
3686}
3687EXPORT_SYMBOL(sk_stop_timer);
3688
3689void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3690{
3691 if (timer_delete_sync(timer))
3692 __sock_put(sk);
3693}
3694EXPORT_SYMBOL(sk_stop_timer_sync);
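
/* For illustration, the reference-counting contract behind the helpers above:
 * each armed timer holds one reference on the socket (sock_hold() in
 * sk_reset_timer()), dropped either by sk_stop_timer()/sk_stop_timer_sync()
 * when a pending timer is deleted, or by the timer handler itself, which
 * typically ends with:
 *
 *	sock_put(sk);
 */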
3695
3696void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3697{
3698 sk_init_common(sk);
3699 sk->sk_send_head = NULL;
3700
3701 timer_setup(&sk->sk_timer, NULL, 0);
3702
3703 sk->sk_allocation = GFP_KERNEL;
3704 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
3705 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
3706 sk->sk_state = TCP_CLOSE;
3707 sk->sk_use_task_frag = true;
3708 sk_set_socket(sk, sock);
3709
3710 sock_set_flag(sk, SOCK_ZAPPED);
3711
3712 if (sock) {
3713 sk->sk_type = sock->type;
3714 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3715 sock->sk = sk;
3716 } else {
3717 RCU_INIT_POINTER(sk->sk_wq, NULL);
3718 }
3719 sk->sk_uid = uid;
3720
3721 sk->sk_state_change = sock_def_wakeup;
3722 sk->sk_data_ready = sock_def_readable;
3723 sk->sk_write_space = sock_def_write_space;
3724 sk->sk_error_report = sock_def_error_report;
3725 sk->sk_destruct = sock_def_destruct;
3726
3727 sk->sk_frag.page = NULL;
3728 sk->sk_frag.offset = 0;
3729 sk->sk_peek_off = -1;
3730
3731 sk->sk_peer_pid = NULL;
3732 sk->sk_peer_cred = NULL;
3733 spin_lock_init(&sk->sk_peer_lock);
3734
3735 sk->sk_write_pending = 0;
3736 sk->sk_rcvlowat = 1;
3737 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3738 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3739
3740 sk->sk_stamp = SK_DEFAULT_STAMP;
3741#if BITS_PER_LONG==32
3742 seqlock_init(&sk->sk_stamp_seq);
3743#endif
3744 atomic_set(&sk->sk_zckey, 0);
3745
3746#ifdef CONFIG_NET_RX_BUSY_POLL
3747 sk->sk_napi_id = 0;
3748 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
3749#endif
3750
3751 sk->sk_max_pacing_rate = ~0UL;
3752 sk->sk_pacing_rate = ~0UL;
3753 WRITE_ONCE(sk->sk_pacing_shift, 10);
3754 sk->sk_incoming_cpu = -1;
3755
3756 sk_rx_queue_clear(sk);
3757 /*
3758 * Before updating sk_refcnt, we must commit prior changes to memory
3759 * (Documentation/RCU/rculist_nulls.rst for details)
3760 */
3761 smp_wmb();
3762 refcount_set(&sk->sk_refcnt, 1);
3763 sk_drops_reset(sk);
3764}
3765EXPORT_SYMBOL(sock_init_data_uid);
3766
3767void sock_init_data(struct socket *sock, struct sock *sk)
3768{
3769 kuid_t uid = sock ?
3770 SOCK_INODE(sock)->i_uid :
3771 make_kuid(sock_net(sk)->user_ns, 0);
3772
3773 sock_init_data_uid(sock, sk, uid);
3774}
3775EXPORT_SYMBOL(sock_init_data);
3776
3777void lock_sock_nested(struct sock *sk, int subclass)
3778{
3779 /* The sk_lock has mutex_lock() semantics here. */
3780 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3781
3782 might_sleep();
3783 spin_lock_bh(&sk->sk_lock.slock);
3784 if (sock_owned_by_user_nocheck(sk))
3785 __lock_sock(sk);
3786 sk->sk_lock.owned = 1;
3787 spin_unlock_bh(&sk->sk_lock.slock);
3788}
3789EXPORT_SYMBOL(lock_sock_nested);
3790
3791void release_sock(struct sock *sk)
3792{
3793 spin_lock_bh(&sk->sk_lock.slock);
3794 if (sk->sk_backlog.tail)
3795 __release_sock(sk);
3796
3797 if (sk->sk_prot->release_cb)
3798 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3799 tcp_release_cb, sk);
3800
3801 sock_release_ownership(sk);
3802 if (waitqueue_active(&sk->sk_lock.wq))
3803 wake_up(&sk->sk_lock.wq);
3804 spin_unlock_bh(&sk->sk_lock.slock);
3805}
3806EXPORT_SYMBOL(release_sock);
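
/* For illustration, the standard process-context locking pattern these
 * helpers implement; lock_sock() is the usual entry point and maps to
 * lock_sock_nested(sk, 0):
 *
 *	lock_sock(sk);
 *	(modify socket state; incoming packets are backlogged meanwhile)
 *	release_sock(sk);
 *
 * release_sock() then processes the backlog accumulated while the lock
 * was owned.
 */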
3807
3808bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3809{
3810 might_sleep();
3811 spin_lock_bh(&sk->sk_lock.slock);
3812
3813 if (!sock_owned_by_user_nocheck(sk)) {
3814 /*
3815 * Fast path return with bottom halves disabled and
3816 * sock::sk_lock.slock held.
3817 *
3818 * The 'mutex' is not contended and holding
3819 * sock::sk_lock.slock prevents all other lockers to
3820 * proceed so the corresponding unlock_sock_fast() can
3821 * avoid the slow path of release_sock() completely and
3822 * just release slock.
3823 *
3824 * From a semantical POV this is equivalent to 'acquiring'
3825 * the 'mutex', hence the corresponding lockdep
3826 * mutex_release() has to happen in the fast path of
3827 * unlock_sock_fast().
3828 */
3829 return false;
3830 }
3831
3832 __lock_sock(sk);
3833 sk->sk_lock.owned = 1;
3834 __acquire(&sk->sk_lock.slock);
3835 spin_unlock_bh(&sk->sk_lock.slock);
3836 return true;
3837}
3838EXPORT_SYMBOL(__lock_sock_fast);
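
/* For illustration, how the fast-lock pair is meant to be used; the boolean
 * returned by lock_sock_fast() tells unlock_sock_fast() whether the slow
 * path (a full release_sock()) is needed:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	(short, non-sleeping critical section)
 *	unlock_sock_fast(sk, slow);
 */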
3839
3840int sock_gettstamp(struct socket *sock, void __user *userstamp,
3841 bool timeval, bool time32)
3842{
3843 struct sock *sk = sock->sk;
3844 struct timespec64 ts;
3845
3846 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3847 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3848 if (ts.tv_sec == -1)
3849 return -ENOENT;
3850 if (ts.tv_sec == 0) {
3851 ktime_t kt = ktime_get_real();
3852 sock_write_timestamp(sk, kt);
3853 ts = ktime_to_timespec64(kt);
3854 }
3855
3856 if (timeval)
3857 ts.tv_nsec /= 1000;
3858
3859#ifdef CONFIG_COMPAT_32BIT_TIME
3860 if (time32)
3861 return put_old_timespec32(&ts, userstamp);
3862#endif
3863#ifdef CONFIG_SPARC64
3864 /* beware of padding in sparc64 timeval */
3865 if (timeval && !in_compat_syscall()) {
3866 struct __kernel_old_timeval __user tv = {
3867 .tv_sec = ts.tv_sec,
3868 .tv_usec = ts.tv_nsec,
3869 };
3870 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3871 return -EFAULT;
3872 return 0;
3873 }
3874#endif
3875 return put_timespec64(&ts, userstamp);
3876}
3877EXPORT_SYMBOL(sock_gettstamp);
3878
3879void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3880{
3881 if (!sock_flag(sk, flag)) {
3882 unsigned long previous_flags = sk->sk_flags;
3883
3884 sock_set_flag(sk, flag);
3885 /*
3886 * we just set one of the two flags which require net
3887 * time stamping, but time stamping might have been on
3888 * already because of the other one
3889 */
3890 if (sock_needs_netstamp(sk) &&
3891 !(previous_flags & SK_FLAGS_TIMESTAMP))
3892 net_enable_timestamp();
3893 }
3894}
3895
3896int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3897 int level, int type)
3898{
3899 struct sock_extended_err ee;
3900 struct sk_buff *skb;
3901 int copied, err;
3902
3903 err = -EAGAIN;
3904 skb = sock_dequeue_err_skb(sk);
3905 if (skb == NULL)
3906 goto out;
3907
3908 copied = skb->len;
3909 if (copied > len) {
3910 msg->msg_flags |= MSG_TRUNC;
3911 copied = len;
3912 }
3913 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3914 if (err)
3915 goto out_free_skb;
3916
3917 sock_recv_timestamp(msg, sk, skb);
3918
3919 /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */
3920 ee = SKB_EXT_ERR(skb)->ee;
3921 put_cmsg(msg, level, type, sizeof(ee), &ee);
3922
3923 msg->msg_flags |= MSG_ERRQUEUE;
3924 err = copied;
3925
3926out_free_skb:
3927 kfree_skb(skb);
3928out:
3929 return err;
3930}
3931EXPORT_SYMBOL(sock_recv_errqueue);
3932
3933/*
3934 * Get a socket option on a socket.
3935 *
3936 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3937 * asynchronous errors should be reported by getsockopt. We assume
3938 * this means if you specify SO_ERROR (otherwise what is the point of it).
3939 */
3940int sock_common_getsockopt(struct socket *sock, int level, int optname,
3941 char __user *optval, int __user *optlen)
3942{
3943 struct sock *sk = sock->sk;
3944
3945 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3946 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3947}
3948EXPORT_SYMBOL(sock_common_getsockopt);
3949
3950int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3951 int flags)
3952{
3953 struct sock *sk = sock->sk;
3954 int addr_len = 0;
3955 int err;
3956
3957 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3958 if (err >= 0)
3959 msg->msg_namelen = addr_len;
3960 return err;
3961}
3962EXPORT_SYMBOL(sock_common_recvmsg);
3963
3964/*
3965 * Set socket options on an inet socket.
3966 */
3967int sock_common_setsockopt(struct socket *sock, int level, int optname,
3968 sockptr_t optval, unsigned int optlen)
3969{
3970 struct sock *sk = sock->sk;
3971
3972 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3973 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3974}
3975EXPORT_SYMBOL(sock_common_setsockopt);
3976
3977void sk_common_release(struct sock *sk)
3978{
3979 if (sk->sk_prot->destroy)
3980 sk->sk_prot->destroy(sk);
3981
3982 /*
3983	 * Observation: when sk_common_release() is called, processes no longer
3984	 * have access to the socket, but the network stack still does.
3985 * Step one, detach it from networking:
3986 *
3987 * A. Remove from hash tables.
3988 */
3989
3990 sk->sk_prot->unhash(sk);
3991
3992 /*
3993	 * At this point the socket cannot receive new packets, but it is possible
3994	 * that some packets are still in flight because another CPU ran the receiver
3995	 * and did the hash table lookup before we unhashed the socket. They will
3996	 * reach the receive queue and be purged by the socket destructor.
3997	 *
3998	 * Also we still have packets pending on the receive queue and probably
3999	 * our own packets waiting in device queues. sock_destroy will drain the
4000	 * receive queue, but transmitted packets will delay socket destruction
4001	 * until the last reference is released.
4002 */
4003
4004 sock_orphan(sk);
4005
4006 xfrm_sk_free_policy(sk);
4007
4008 sock_put(sk);
4009}
4010EXPORT_SYMBOL(sk_common_release);
4011
4012void sk_get_meminfo(const struct sock *sk, u32 *mem)
4013{
4014 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
4015
4016 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
4017 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
4018 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
4019 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
4020 mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
4021 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
4022 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
4023 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
4024 mem[SK_MEMINFO_DROPS] = sk_drops_read(sk);
4025}
4026
4027#ifdef CONFIG_PROC_FS
4028static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
4029
4030int sock_prot_inuse_get(struct net *net, struct proto *prot)
4031{
4032 int cpu, idx = prot->inuse_idx;
4033 int res = 0;
4034
4035 for_each_possible_cpu(cpu)
4036 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
4037
4038 return res >= 0 ? res : 0;
4039}
4040EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
4041
4042int sock_inuse_get(struct net *net)
4043{
4044 int cpu, res = 0;
4045
4046 for_each_possible_cpu(cpu)
4047 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
4048
4049 return res;
4050}
4051
4052EXPORT_SYMBOL_GPL(sock_inuse_get);
4053
4054static int __net_init sock_inuse_init_net(struct net *net)
4055{
4056 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
4057 if (net->core.prot_inuse == NULL)
4058 return -ENOMEM;
4059 return 0;
4060}
4061
4062static void __net_exit sock_inuse_exit_net(struct net *net)
4063{
4064 free_percpu(net->core.prot_inuse);
4065}
4066
4067static struct pernet_operations net_inuse_ops = {
4068 .init = sock_inuse_init_net,
4069 .exit = sock_inuse_exit_net,
4070};
4071
4072static __init int net_inuse_init(void)
4073{
4074 if (register_pernet_subsys(&net_inuse_ops))
4075 panic("Cannot initialize net inuse counters");
4076
4077 return 0;
4078}
4079
4080core_initcall(net_inuse_init);
4081
4082static int assign_proto_idx(struct proto *prot)
4083{
4084 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
4085
4086 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
4087 pr_err("PROTO_INUSE_NR exhausted\n");
4088 return -ENOSPC;
4089 }
4090
4091 set_bit(prot->inuse_idx, proto_inuse_idx);
4092 return 0;
4093}
4094
4095static void release_proto_idx(struct proto *prot)
4096{
4097 if (prot->inuse_idx != PROTO_INUSE_NR)
4098 clear_bit(prot->inuse_idx, proto_inuse_idx);
4099}
4100#else
4101static inline int assign_proto_idx(struct proto *prot)
4102{
4103 return 0;
4104}
4105
4106static inline void release_proto_idx(struct proto *prot)
4107{
4108}
4109
4110#endif

static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
        if (!twsk_prot)
                return;
        kfree(twsk_prot->twsk_slab_name);
        twsk_prot->twsk_slab_name = NULL;
        kmem_cache_destroy(twsk_prot->twsk_slab);
        twsk_prot->twsk_slab = NULL;
}

static int tw_prot_init(const struct proto *prot)
{
        struct timewait_sock_ops *twsk_prot = prot->twsk_prot;

        if (!twsk_prot)
                return 0;

        twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
                                              prot->name);
        if (!twsk_prot->twsk_slab_name)
                return -ENOMEM;

        twsk_prot->twsk_slab =
                kmem_cache_create(twsk_prot->twsk_slab_name,
                                  twsk_prot->twsk_obj_size, 0,
                                  SLAB_ACCOUNT | prot->slab_flags,
                                  NULL);
        if (!twsk_prot->twsk_slab) {
                pr_crit("%s: Can't create timewait sock SLAB cache!\n",
                        prot->name);
                return -ENOMEM;
        }

        return 0;
}

static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
        if (!rsk_prot)
                return;
        kfree(rsk_prot->slab_name);
        rsk_prot->slab_name = NULL;
        kmem_cache_destroy(rsk_prot->slab);
        rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
        struct request_sock_ops *rsk_prot = prot->rsk_prot;

        if (!rsk_prot)
                return 0;

        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
                                        prot->name);
        if (!rsk_prot->slab_name)
                return -ENOMEM;

        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
                                           rsk_prot->obj_size, 0,
                                           SLAB_ACCOUNT | prot->slab_flags,
                                           NULL);

        if (!rsk_prot->slab) {
                pr_crit("%s: Can't create request sock SLAB cache!\n",
                        prot->name);
                return -ENOMEM;
        }
        return 0;
}

int proto_register(struct proto *prot, int alloc_slab)
{
        int ret = -ENOBUFS;

        if (prot->memory_allocated && !prot->sysctl_mem) {
                pr_err("%s: missing sysctl_mem\n", prot->name);
                return -EINVAL;
        }
        if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
                pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
                return -EINVAL;
        }
        if (alloc_slab) {
                prot->slab = kmem_cache_create_usercopy(prot->name,
                                        prot->obj_size, 0,
                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
                                        prot->slab_flags,
                                        prot->useroffset, prot->usersize,
                                        NULL);

                if (prot->slab == NULL) {
                        pr_crit("%s: Can't create sock SLAB cache!\n",
                                prot->name);
                        goto out;
                }

                if (req_prot_init(prot))
                        goto out_free_request_sock_slab;

                if (tw_prot_init(prot))
                        goto out_free_timewait_sock_slab;
        }

        mutex_lock(&proto_list_mutex);
        ret = assign_proto_idx(prot);
        if (ret) {
                mutex_unlock(&proto_list_mutex);
                goto out_free_timewait_sock_slab;
        }
        list_add(&prot->node, &proto_list);
        mutex_unlock(&proto_list_mutex);
        return ret;

out_free_timewait_sock_slab:
        if (alloc_slab)
                tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
        if (alloc_slab) {
                req_prot_cleanup(prot->rsk_prot);

                kmem_cache_destroy(prot->slab);
                prot->slab = NULL;
        }
out:
        return ret;
}
EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
        mutex_lock(&proto_list_mutex);
        release_proto_idx(prot);
        list_del(&prot->node);
        mutex_unlock(&proto_list_mutex);

        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;

        req_prot_cleanup(prot->rsk_prot);
        tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
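
/* Illustrative sketch (editorial addition): minimal use of proto_register()
 * and proto_unregister() for a hypothetical protocol. example_proto and
 * struct example_sock are invented for this sketch; a real protocol would
 * also provide .close/.sendmsg/.recvmsg etc. and register its socket family
 * separately.
 */
struct example_sock {
        struct sock sk;
        /* protocol-private fields would follow */
};

static struct proto example_proto __maybe_unused = {
        .name     = "EXAMPLE",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct example_sock),
};

static int __maybe_unused example_proto_load(void)
{
        /* alloc_slab == 1: let the core create a dedicated kmem cache */
        return proto_register(&example_proto, 1);
}

static void __maybe_unused example_proto_unload(void)
{
        proto_unregister(&example_proto);
}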

int sock_load_diag_module(int family, int protocol)
{
        if (!protocol) {
                if (!sock_is_registered(family))
                        return -ENOENT;

                return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
                                      NETLINK_SOCK_DIAG, family);
        }

#ifdef CONFIG_INET
        if (family == AF_INET &&
            protocol != IPPROTO_RAW &&
            protocol < MAX_INET_PROTOS &&
            !rcu_access_pointer(inet_protos[protocol]))
                return -ENOENT;
#endif

        return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
                              NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);
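
/* Editorial note (illustrative, not from the upstream file): the
 * request_module() strings built above are matched by aliases that diag
 * modules declare with MODULE_ALIAS_NET_PF_PROTO_TYPE() from <linux/net.h>,
 * for example (family value invented for illustration):
 *
 *      MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 27);
 *
 * so that sock_load_diag_module(27, 0) can load the handler on demand.
 */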

#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(proto_list_mutex)
{
        mutex_lock(&proto_list_mutex);
        return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
        __releases(proto_list_mutex)
{
        mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
        return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
        return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static const char *sock_prot_memory_pressure(struct proto *proto)
{
        return proto->memory_pressure != NULL ?
        proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
        seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
                   sock_prot_inuse_get(seq_file_net(seq), proto),
                   sock_prot_memory_allocated(proto),
                   sock_prot_memory_pressure(proto),
                   proto->max_header,
                   proto->slab == NULL ? "no" : "yes",
                   module_name(proto->owner),
                   proto_method_implemented(proto->close),
                   proto_method_implemented(proto->connect),
                   proto_method_implemented(proto->disconnect),
                   proto_method_implemented(proto->accept),
                   proto_method_implemented(proto->ioctl),
                   proto_method_implemented(proto->init),
                   proto_method_implemented(proto->destroy),
                   proto_method_implemented(proto->shutdown),
                   proto_method_implemented(proto->setsockopt),
                   proto_method_implemented(proto->getsockopt),
                   proto_method_implemented(proto->sendmsg),
                   proto_method_implemented(proto->recvmsg),
                   proto_method_implemented(proto->bind),
                   proto_method_implemented(proto->backlog_rcv),
                   proto_method_implemented(proto->hash),
                   proto_method_implemented(proto->unhash),
                   proto_method_implemented(proto->get_port),
                   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
        if (v == &proto_list)
                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
                           "protocol",
                           "size",
                           "sockets",
                           "memory",
                           "press",
                           "maxhdr",
                           "slab",
                           "module",
                           "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
        else
                proto_seq_printf(seq, list_entry(v, struct proto, node));
        return 0;
}

static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};

static __net_init int proto_init_net(struct net *net)
{
        if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
                             sizeof(struct seq_net_private)))
                return -ENOMEM;

        return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
        remove_proc_entry("protocols", net->proc_net);
}

static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};

static int __init proto_init(void)
{
        return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */

#ifdef CONFIG_NET_RX_BUSY_POLL
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
        struct sock *sk = p;

        if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
                return true;

        if (sk_is_udp(sk) &&
            !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
                return true;

        return sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */
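
#ifdef CONFIG_NET_RX_BUSY_POLL
/* Illustrative sketch (editorial addition): sk_busy_loop_end() is meant to be
 * used as a loop_end callback by the busy-polling helpers. This hypothetical
 * example_busy_wait() shows the calling convention only; it spins without
 * doing the NAPI polling a real busy-poll loop performs.
 */
static void __maybe_unused example_busy_wait(struct sock *sk)
{
        unsigned long start_time = busy_loop_current_time();

        while (!sk_busy_loop_end(sk, start_time))
                cpu_relax();
}
#endif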

int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len)
{
        if (!sk->sk_prot->bind_add)
                return -EOPNOTSUPP;
        return sk->sk_prot->bind_add(sk, addr, addr_len);
}
EXPORT_SYMBOL(sock_bind_add);

/* Copy 'size' bytes from userspace into a kernel buffer, run the protocol
 * ioctl handler on it, and copy the 'size'-byte result back to userspace.
 */
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
                     void __user *arg, void *karg, size_t size)
{
        int ret;

        if (copy_from_user(karg, arg, size))
                return -EFAULT;

        ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
        if (ret)
                return ret;

        if (copy_to_user(arg, karg, size))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL(sock_ioctl_inout);
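
/* Illustrative sketch (editorial addition): how a per-protocol ioctl
 * dispatcher might use sock_ioctl_inout() for a command that exchanges a
 * fixed-size structure with userspace. struct example_req and
 * example_sk_ioctl() are hypothetical.
 */
static int __maybe_unused example_sk_ioctl(struct sock *sk, unsigned int cmd,
                                           void __user *arg)
{
        struct example_req {
                u32 packets;
                u32 bytes;
        } req;

        /* copy in, run sk->sk_prot->ioctl() on kernel memory, copy back out */
        return sock_ioctl_inout(sk, cmd, arg, &req, sizeof(req));
}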

/* This is the most common ioctl prep function: the 4-byte result is copied
 * back to userspace if the ioctl() succeeds, and no input is copied from
 * userspace.
 */
static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
{
        int ret, karg = 0;

        ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
        if (ret)
                return ret;

        return put_user(karg, (int __user *)arg);
}

/* A wrapper around protocol ioctls which copies data in from userspace
 * (depending on the protocol/ioctl) and copies the result back to userspace.
 * Its main purpose is to let protocol ioctl callbacks operate on kernel
 * memory instead of userspace memory.
 */
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
        int rc = 1;

        if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
                rc = ipmr_sk_ioctl(sk, cmd, arg);
        else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
                rc = ip6mr_sk_ioctl(sk, cmd, arg);
        else if (sk_is_phonet(sk))
                rc = phonet_sk_ioctl(sk, cmd, arg);

        /* If the ioctl was handled above, return its result */
        if (rc <= 0)
                return rc;

        /* Otherwise fall back to the default handler */
        return sock_ioctl_out(sk, cmd, arg);
}
EXPORT_SYMBOL(sk_ioctl);
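
/* Illustrative sketch (editorial addition): with sk_ioctl() handing kernel
 * memory to the protocol, a ->ioctl() callback following the int *karg
 * convention used by sock_ioctl_out() above simply fills in the integer it
 * is given. example_proto_ioctl() is hypothetical and ignores cmd for
 * brevity.
 */
static int __maybe_unused example_proto_ioctl(struct sock *sk, int cmd,
                                              int *karg)
{
        /* report bytes of queued receive memory; the caller copies it out */
        *karg = sk_rmem_alloc_get(sk);
        return 0;
}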

static int __init sock_struct_check(void)
{
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
#ifdef CONFIG_MEMCG
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
#endif

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
        return 0;
}

core_initcall(sock_struct_check);
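
/* Illustrative sketch (editorial addition): the CACHELINE_ASSERT_GROUP_MEMBER()
 * checks above pair with __cacheline_group_begin()/__cacheline_group_end()
 * markers placed inside the structure definition. A hypothetical struct using
 * the same idiom:
 */
struct example_hot_struct {
        __cacheline_group_begin(example_fast_path);
        u32 rx_count;
        u32 tx_count;
        __cacheline_group_end(example_fast_path);
        /* rarely-touched fields follow */
        char name[32];
};

static void __maybe_unused example_struct_check(void)
{
        /* compile-time failure if a field ever moves out of its group */
        CACHELINE_ASSERT_GROUP_MEMBER(struct example_hot_struct,
                                      example_fast_path, rx_count);
        CACHELINE_ASSERT_GROUP_MEMBER(struct example_hot_struct,
                                      example_fast_path, tx_count);
}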