net/core/sock.c at v5.18-rc5 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / sock.c
at v5.18-rc5 3948 lines 96 kB view raw
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   4 *		operating system.  INET is implemented using the  BSD Socket
   5 *		interface as the means of communication with the user level.
   6 *
   7 *		Generic socket support routines. Memory allocators, socket lock/release
   8 *		handler for protocols to use and generic option handler.
   9 *
  10 * Authors:	Ross Biro
  11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *		Florian La Roche, <flla@stud.uni-sb.de>
  13 *		Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *		Alan Cox	: 	Numerous verify_area() problems
  17 *		Alan Cox	:	Connecting on a connecting socket
  18 *					now returns an error for tcp.
  19 *		Alan Cox	:	sock->protocol is set correctly.
  20 *					and is not sometimes left as 0.
  21 *		Alan Cox	:	connect handles icmp errors on a
  22 *					connect properly. Unfortunately there
  23 *					is a restart syscall nasty there. I
  24 *					can't match BSD without hacking the C
  25 *					library. Ideas urgently sought!
  26 *		Alan Cox	:	Disallow bind() to addresses that are
  27 *					not ours - especially broadcast ones!!
  28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
  29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
  30 *					instead they leave that for the DESTROY timer.
  31 *		Alan Cox	:	Clean up error flag in accept
  32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
  33 *					was buggy. Put a remove_sock() in the handler
  34 *					for memory when we hit 0. Also altered the timer
  35 *					code. The ACK stuff can wait and needs major
  36 *					TCP layer surgery.
  37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
  38 *					and fixed timer/inet_bh race.
  39 *		Alan Cox	:	Added zapped flag for TCP
  40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
  41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
  46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
  47 *	Pauline Middelink	:	identd support
  48 *		Alan Cox	:	Fixed connect() taking signals I think.
  49 *		Alan Cox	:	SO_LINGER supported
  50 *		Alan Cox	:	Error reporting fixes
  51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
  52 *		Alan Cox	:	inet sockets don't set sk->type!
  53 *		Alan Cox	:	Split socket option code
  54 *		Alan Cox	:	Callbacks
  55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
  56 *		Alex		:	Removed restriction on inet fioctl
  57 *		Alan Cox	:	Splitting INET from NET core
  58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
  59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *		Alan Cox	:	Split IP from generic code
  61 *		Alan Cox	:	New kfree_skbmem()
  62 *		Alan Cox	:	Make SO_DEBUG superuser only.
  63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
  64 *					(compatibility fix)
  65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
  66 *		Alan Cox	:	Allocator for a socket is settable.
  67 *		Alan Cox	:	SO_ERROR includes soft errors.
  68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
  69 *		Alan Cox	: 	Generic socket allocation to make hooks
  70 *					easier (suggested by Craig Metz).
  71 *		Michael Pall	:	SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
  79 *		Andi Kleen	:	Fix write_space callback
  80 *		Chris Evans	:	Security fixes - signedness again
  81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 */
  85
  86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88#include <asm/unaligned.h>
  89#include <linux/capability.h>
  90#include <linux/errno.h>
  91#include <linux/errqueue.h>
  92#include <linux/types.h>
  93#include <linux/socket.h>
  94#include <linux/in.h>
  95#include <linux/kernel.h>
  96#include <linux/module.h>
  97#include <linux/proc_fs.h>
  98#include <linux/seq_file.h>
  99#include <linux/sched.h>
 100#include <linux/sched/mm.h>
 101#include <linux/timer.h>
 102#include <linux/string.h>
 103#include <linux/sockios.h>
 104#include <linux/net.h>
 105#include <linux/mm.h>
 106#include <linux/slab.h>
 107#include <linux/interrupt.h>
 108#include <linux/poll.h>
 109#include <linux/tcp.h>
 110#include <linux/init.h>
 111#include <linux/highmem.h>
 112#include <linux/user_namespace.h>
 113#include <linux/static_key.h>
 114#include <linux/memcontrol.h>
 115#include <linux/prefetch.h>
 116#include <linux/compat.h>
 117
 118#include <linux/uaccess.h>
 119
 120#include <linux/netdevice.h>
 121#include <net/protocol.h>
 122#include <linux/skbuff.h>
 123#include <net/net_namespace.h>
 124#include <net/request_sock.h>
 125#include <net/sock.h>
 126#include <linux/net_tstamp.h>
 127#include <net/xfrm.h>
 128#include <linux/ipsec.h>
 129#include <net/cls_cgroup.h>
 130#include <net/netprio_cgroup.h>
 131#include <linux/sock_diag.h>
 132
 133#include <linux/filter.h>
 134#include <net/sock_reuseport.h>
 135#include <net/bpf_sk_storage.h>
 136
 137#include <trace/events/sock.h>
 138
 139#include <net/tcp.h>
 140#include <net/busy_poll.h>
 141
 142#include <linux/ethtool.h>
 143
/*
 * File-local protocol list and the mutex declared alongside it.
 * NOTE(review): the users of both are outside this part of the file;
 * presumably proto_list_mutex serializes access to proto_list - confirm.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
 146
 147/**
 148 * sk_ns_capable - General socket capability test
 149 * @sk: Socket to use a capability on or through
 150 * @user_ns: The user namespace of the capability to use
 151 * @cap: The capability to use
 152 *
 153 * Test to see if the opener of the socket had when the socket was
 154 * created and the current process has the capability @cap in the user
 155 * namespace @user_ns.
 156 */
 157bool sk_ns_capable(const struct sock *sk,
 158		   struct user_namespace *user_ns, int cap)
 159{
 160	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 161		ns_capable(user_ns, cap);
 162}
 163EXPORT_SYMBOL(sk_ns_capable);
 164
 165/**
 166 * sk_capable - Socket global capability test
 167 * @sk: Socket to use a capability on or through
 168 * @cap: The global capability to use
 169 *
 170 * Test to see if the opener of the socket had when the socket was
 171 * created and the current process has the capability @cap in all user
 172 * namespaces.
 173 */
 174bool sk_capable(const struct sock *sk, int cap)
 175{
 176	return sk_ns_capable(sk, &init_user_ns, cap);
 177}
 178EXPORT_SYMBOL(sk_capable);
 179
 180/**
 181 * sk_net_capable - Network namespace socket capability test
 182 * @sk: Socket to use a capability on or through
 183 * @cap: The capability to use
 184 *
 185 * Test to see if the opener of the socket had when the socket was created
 186 * and the current process has the capability @cap over the network namespace
 187 * the socket is a member of.
 188 */
 189bool sk_net_capable(const struct sock *sk, int cap)
 190{
 191	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 192}
 193EXPORT_SYMBOL(sk_net_capable);
 194
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 *
 * All four tables have AF_MAX entries: the "kern" variants are the ones
 * for internal sockets, the others are for userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 204
/*
 * Make lock validator output more readable. (We pre-construct these
 * strings at build time, so that runtime initialization of socket
 * locks is fast.)
 *
 * _sock_locks(x) expands to a comma-separated list of string literals,
 * each formed by pasting the prefix literal @x in front of one name.
 * The list ends with an "AF_MAX" entry, which is why every table below
 * is declared with AF_MAX+1 elements.
 *
 * NOTE(review): the tables are presumably indexed by address family
 * number, so the order of the names must follow the AF_* numbering and
 * the bare "27"/"28" entries stand in for families that have no name
 * listed here - confirm against the AF_* definitions.
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
  x "AF_MCTP"  , \
  x "AF_MAX"

/* Name tables for userspace sockets: sk_lock, slock and clock classes. */
static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

/* The same three tables with a "k-" prefix, for the "kern" variants. */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
/* Name tables for the rlock, wlock and elock classes. */
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};
 258
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key.
 *
 * Only the callback key has a separate "kern" table here; the rlock,
 * wlock and elock keys each have a single table.
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
 268
/*
 * Run time adjustable parameters.
 *
 * The *_max values bound what SO_SNDBUF/SO_RCVBUF may request (see the
 * min_t() clamps in sock_setsockopt()). The *_default values start out
 * with the same initial value as the corresponding maximum.
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/* Enabled (1) by default. NOTE(review): consumers are outside this chunk. */
int sysctl_tstamp_allow_data __read_mostly = 1;

/*
 * Static branch key, initially false. sk_set_memalloc() increments it
 * and sk_clear_memalloc() decrements it.
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
 285
 286/**
 287 * sk_set_memalloc - sets %SOCK_MEMALLOC
 288 * @sk: socket to set it on
 289 *
 290 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 291 * It's the responsibility of the admin to adjust min_free_kbytes
 292 * to meet the requirements
 293 */
 294void sk_set_memalloc(struct sock *sk)
 295{
 296	sock_set_flag(sk, SOCK_MEMALLOC);
 297	sk->sk_allocation |= __GFP_MEMALLOC;
 298	static_branch_inc(&memalloc_socks_key);
 299}
 300EXPORT_SYMBOL_GPL(sk_set_memalloc);
 301
 302void sk_clear_memalloc(struct sock *sk)
 303{
 304	sock_reset_flag(sk, SOCK_MEMALLOC);
 305	sk->sk_allocation &= ~__GFP_MEMALLOC;
 306	static_branch_dec(&memalloc_socks_key);
 307
 308	/*
 309	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 310	 * progress of swapping. SOCK_MEMALLOC may be cleared while
 311	 * it has rmem allocations due to the last swapfile being deactivated
 312	 * but there is a risk that the socket is unusable due to exceeding
 313	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
 314	 */
 315	sk_mem_reclaim(sk);
 316}
 317EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 318
 319int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 320{
 321	int ret;
 322	unsigned int noreclaim_flag;
 323
 324	/* these should have been dropped before queueing */
 325	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 326
 327	noreclaim_flag = memalloc_noreclaim_save();
 328	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
 329				 tcp_v6_do_rcv,
 330				 tcp_v4_do_rcv,
 331				 sk, skb);
 332	memalloc_noreclaim_restore(noreclaim_flag);
 333
 334	return ret;
 335}
 336EXPORT_SYMBOL(__sk_backlog_rcv);
 337
 338void sk_error_report(struct sock *sk)
 339{
 340	sk->sk_error_report(sk);
 341
 342	switch (sk->sk_family) {
 343	case AF_INET:
 344		fallthrough;
 345	case AF_INET6:
 346		trace_inet_sk_error_report(sk);
 347		break;
 348	default:
 349		break;
 350	}
 351}
 352EXPORT_SYMBOL(sk_error_report);
 353
 354int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 355{
 356	struct __kernel_sock_timeval tv;
 357
 358	if (timeo == MAX_SCHEDULE_TIMEOUT) {
 359		tv.tv_sec = 0;
 360		tv.tv_usec = 0;
 361	} else {
 362		tv.tv_sec = timeo / HZ;
 363		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 364	}
 365
 366	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 367		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 368		*(struct old_timeval32 *)optval = tv32;
 369		return sizeof(tv32);
 370	}
 371
 372	if (old_timeval) {
 373		struct __kernel_old_timeval old_tv;
 374		old_tv.tv_sec = tv.tv_sec;
 375		old_tv.tv_usec = tv.tv_usec;
 376		*(struct __kernel_old_timeval *)optval = old_tv;
 377		return sizeof(old_tv);
 378	}
 379
 380	*(struct __kernel_sock_timeval *)optval = tv;
 381	return sizeof(tv);
 382}
 383EXPORT_SYMBOL(sock_get_timeout);
 384
 385int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
 386			   sockptr_t optval, int optlen, bool old_timeval)
 387{
 388	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 389		struct old_timeval32 tv32;
 390
 391		if (optlen < sizeof(tv32))
 392			return -EINVAL;
 393
 394		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 395			return -EFAULT;
 396		tv->tv_sec = tv32.tv_sec;
 397		tv->tv_usec = tv32.tv_usec;
 398	} else if (old_timeval) {
 399		struct __kernel_old_timeval old_tv;
 400
 401		if (optlen < sizeof(old_tv))
 402			return -EINVAL;
 403		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 404			return -EFAULT;
 405		tv->tv_sec = old_tv.tv_sec;
 406		tv->tv_usec = old_tv.tv_usec;
 407	} else {
 408		if (optlen < sizeof(*tv))
 409			return -EINVAL;
 410		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
 411			return -EFAULT;
 412	}
 413
 414	return 0;
 415}
 416EXPORT_SYMBOL(sock_copy_user_timeval);
 417
 418static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 419			    bool old_timeval)
 420{
 421	struct __kernel_sock_timeval tv;
 422	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
 423
 424	if (err)
 425		return err;
 426
 427	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 428		return -EDOM;
 429
 430	if (tv.tv_sec < 0) {
 431		static int warned __read_mostly;
 432
 433		*timeo_p = 0;
 434		if (warned < 10 && net_ratelimit()) {
 435			warned++;
 436			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 437				__func__, current->comm, task_pid_nr(current));
 438		}
 439		return 0;
 440	}
 441	*timeo_p = MAX_SCHEDULE_TIMEOUT;
 442	if (tv.tv_sec == 0 && tv.tv_usec == 0)
 443		return 0;
 444	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 445		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 446	return 0;
 447}
 448
 449static bool sock_needs_netstamp(const struct sock *sk)
 450{
 451	switch (sk->sk_family) {
 452	case AF_UNSPEC:
 453	case AF_UNIX:
 454		return false;
 455	default:
 456		return true;
 457	}
 458}
 459
 460static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 461{
 462	if (sk->sk_flags & flags) {
 463		sk->sk_flags &= ~flags;
 464		if (sock_needs_netstamp(sk) &&
 465		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 466			net_disable_timestamp();
 467	}
 468}
 469
 470
 471int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 472{
 473	unsigned long flags;
 474	struct sk_buff_head *list = &sk->sk_receive_queue;
 475
 476	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 477		atomic_inc(&sk->sk_drops);
 478		trace_sock_rcvqueue_full(sk, skb);
 479		return -ENOMEM;
 480	}
 481
 482	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 483		atomic_inc(&sk->sk_drops);
 484		return -ENOBUFS;
 485	}
 486
 487	skb->dev = NULL;
 488	skb_set_owner_r(skb, sk);
 489
 490	/* we escape from rcu protected region, make sure we dont leak
 491	 * a norefcounted dst
 492	 */
 493	skb_dst_force(skb);
 494
 495	spin_lock_irqsave(&list->lock, flags);
 496	sock_skb_set_dropcount(sk, skb);
 497	__skb_queue_tail(list, skb);
 498	spin_unlock_irqrestore(&list->lock, flags);
 499
 500	if (!sock_flag(sk, SOCK_DEAD))
 501		sk->sk_data_ready(sk);
 502	return 0;
 503}
 504EXPORT_SYMBOL(__sock_queue_rcv_skb);
 505
 506int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 507{
 508	int err;
 509
 510	err = sk_filter(sk, skb);
 511	if (err)
 512		return err;
 513
 514	return __sock_queue_rcv_skb(sk, skb);
 515}
 516EXPORT_SYMBOL(sock_queue_rcv_skb);
 517
 518int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 519		     const int nested, unsigned int trim_cap, bool refcounted)
 520{
 521	int rc = NET_RX_SUCCESS;
 522
 523	if (sk_filter_trim_cap(sk, skb, trim_cap))
 524		goto discard_and_relse;
 525
 526	skb->dev = NULL;
 527
 528	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 529		atomic_inc(&sk->sk_drops);
 530		goto discard_and_relse;
 531	}
 532	if (nested)
 533		bh_lock_sock_nested(sk);
 534	else
 535		bh_lock_sock(sk);
 536	if (!sock_owned_by_user(sk)) {
 537		/*
 538		 * trylock + unlock semantics:
 539		 */
 540		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 541
 542		rc = sk_backlog_rcv(sk, skb);
 543
 544		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 545	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 546		bh_unlock_sock(sk);
 547		atomic_inc(&sk->sk_drops);
 548		goto discard_and_relse;
 549	}
 550
 551	bh_unlock_sock(sk);
 552out:
 553	if (refcounted)
 554		sock_put(sk);
 555	return rc;
 556discard_and_relse:
 557	kfree_skb(skb);
 558	goto out;
 559}
 560EXPORT_SYMBOL(__sk_receive_skb);
 561
 562INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 563							  u32));
 564INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 565							   u32));
 566struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 567{
 568	struct dst_entry *dst = __sk_dst_get(sk);
 569
 570	if (dst && dst->obsolete &&
 571	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 572			       dst, cookie) == NULL) {
 573		sk_tx_queue_clear(sk);
 574		sk->sk_dst_pending_confirm = 0;
 575		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 576		dst_release(dst);
 577		return NULL;
 578	}
 579
 580	return dst;
 581}
 582EXPORT_SYMBOL(__sk_dst_check);
 583
 584struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 585{
 586	struct dst_entry *dst = sk_dst_get(sk);
 587
 588	if (dst && dst->obsolete &&
 589	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 590			       dst, cookie) == NULL) {
 591		sk_dst_reset(sk);
 592		dst_release(dst);
 593		return NULL;
 594	}
 595
 596	return dst;
 597}
 598EXPORT_SYMBOL(sk_dst_check);
 599
 600static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 601{
 602	int ret = -ENOPROTOOPT;
 603#ifdef CONFIG_NETDEVICES
 604	struct net *net = sock_net(sk);
 605
 606	/* Sorry... */
 607	ret = -EPERM;
 608	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 609		goto out;
 610
 611	ret = -EINVAL;
 612	if (ifindex < 0)
 613		goto out;
 614
 615	sk->sk_bound_dev_if = ifindex;
 616	if (sk->sk_prot->rehash)
 617		sk->sk_prot->rehash(sk);
 618	sk_dst_reset(sk);
 619
 620	ret = 0;
 621
 622out:
 623#endif
 624
 625	return ret;
 626}
 627
 628int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 629{
 630	int ret;
 631
 632	if (lock_sk)
 633		lock_sock(sk);
 634	ret = sock_bindtoindex_locked(sk, ifindex);
 635	if (lock_sk)
 636		release_sock(sk);
 637
 638	return ret;
 639}
 640EXPORT_SYMBOL(sock_bindtoindex);
 641
 642static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 643{
 644	int ret = -ENOPROTOOPT;
 645#ifdef CONFIG_NETDEVICES
 646	struct net *net = sock_net(sk);
 647	char devname[IFNAMSIZ];
 648	int index;
 649
 650	ret = -EINVAL;
 651	if (optlen < 0)
 652		goto out;
 653
 654	/* Bind this socket to a particular device like "eth0",
 655	 * as specified in the passed interface name. If the
 656	 * name is "" or the option length is zero the socket
 657	 * is not bound.
 658	 */
 659	if (optlen > IFNAMSIZ - 1)
 660		optlen = IFNAMSIZ - 1;
 661	memset(devname, 0, sizeof(devname));
 662
 663	ret = -EFAULT;
 664	if (copy_from_sockptr(devname, optval, optlen))
 665		goto out;
 666
 667	index = 0;
 668	if (devname[0] != '\0') {
 669		struct net_device *dev;
 670
 671		rcu_read_lock();
 672		dev = dev_get_by_name_rcu(net, devname);
 673		if (dev)
 674			index = dev->ifindex;
 675		rcu_read_unlock();
 676		ret = -ENODEV;
 677		if (!dev)
 678			goto out;
 679	}
 680
 681	return sock_bindtoindex(sk, index, true);
 682out:
 683#endif
 684
 685	return ret;
 686}
 687
 688static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 689				int __user *optlen, int len)
 690{
 691	int ret = -ENOPROTOOPT;
 692#ifdef CONFIG_NETDEVICES
 693	struct net *net = sock_net(sk);
 694	char devname[IFNAMSIZ];
 695
 696	if (sk->sk_bound_dev_if == 0) {
 697		len = 0;
 698		goto zero;
 699	}
 700
 701	ret = -EINVAL;
 702	if (len < IFNAMSIZ)
 703		goto out;
 704
 705	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 706	if (ret)
 707		goto out;
 708
 709	len = strlen(devname) + 1;
 710
 711	ret = -EFAULT;
 712	if (copy_to_user(optval, devname, len))
 713		goto out;
 714
 715zero:
 716	ret = -EFAULT;
 717	if (put_user(len, optlen))
 718		goto out;
 719
 720	ret = 0;
 721
 722out:
 723#endif
 724
 725	return ret;
 726}
 727
 728bool sk_mc_loop(struct sock *sk)
 729{
 730	if (dev_recursion_level())
 731		return false;
 732	if (!sk)
 733		return true;
 734	switch (sk->sk_family) {
 735	case AF_INET:
 736		return inet_sk(sk)->mc_loop;
 737#if IS_ENABLED(CONFIG_IPV6)
 738	case AF_INET6:
 739		return inet6_sk(sk)->mc_loop;
 740#endif
 741	}
 742	WARN_ON_ONCE(1);
 743	return true;
 744}
 745EXPORT_SYMBOL(sk_mc_loop);
 746
 747void sock_set_reuseaddr(struct sock *sk)
 748{
 749	lock_sock(sk);
 750	sk->sk_reuse = SK_CAN_REUSE;
 751	release_sock(sk);
 752}
 753EXPORT_SYMBOL(sock_set_reuseaddr);
 754
 755void sock_set_reuseport(struct sock *sk)
 756{
 757	lock_sock(sk);
 758	sk->sk_reuseport = true;
 759	release_sock(sk);
 760}
 761EXPORT_SYMBOL(sock_set_reuseport);
 762
 763void sock_no_linger(struct sock *sk)
 764{
 765	lock_sock(sk);
 766	sk->sk_lingertime = 0;
 767	sock_set_flag(sk, SOCK_LINGER);
 768	release_sock(sk);
 769}
 770EXPORT_SYMBOL(sock_no_linger);
 771
 772void sock_set_priority(struct sock *sk, u32 priority)
 773{
 774	lock_sock(sk);
 775	sk->sk_priority = priority;
 776	release_sock(sk);
 777}
 778EXPORT_SYMBOL(sock_set_priority);
 779
 780void sock_set_sndtimeo(struct sock *sk, s64 secs)
 781{
 782	lock_sock(sk);
 783	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 784		sk->sk_sndtimeo = secs * HZ;
 785	else
 786		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 787	release_sock(sk);
 788}
 789EXPORT_SYMBOL(sock_set_sndtimeo);
 790
 791static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 792{
 793	if (val)  {
 794		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 795		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 796		sock_set_flag(sk, SOCK_RCVTSTAMP);
 797		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 798	} else {
 799		sock_reset_flag(sk, SOCK_RCVTSTAMP);
 800		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 801	}
 802}
 803
 804void sock_enable_timestamps(struct sock *sk)
 805{
 806	lock_sock(sk);
 807	__sock_set_timestamps(sk, true, false, true);
 808	release_sock(sk);
 809}
 810EXPORT_SYMBOL(sock_enable_timestamps);
 811
 812void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 813{
 814	switch (optname) {
 815	case SO_TIMESTAMP_OLD:
 816		__sock_set_timestamps(sk, valbool, false, false);
 817		break;
 818	case SO_TIMESTAMP_NEW:
 819		__sock_set_timestamps(sk, valbool, true, false);
 820		break;
 821	case SO_TIMESTAMPNS_OLD:
 822		__sock_set_timestamps(sk, valbool, false, true);
 823		break;
 824	case SO_TIMESTAMPNS_NEW:
 825		__sock_set_timestamps(sk, valbool, true, true);
 826		break;
 827	}
 828}
 829
 830static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 831{
 832	struct net *net = sock_net(sk);
 833	struct net_device *dev = NULL;
 834	bool match = false;
 835	int *vclock_index;
 836	int i, num;
 837
 838	if (sk->sk_bound_dev_if)
 839		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 840
 841	if (!dev) {
 842		pr_err("%s: sock not bind to device\n", __func__);
 843		return -EOPNOTSUPP;
 844	}
 845
 846	num = ethtool_get_phc_vclocks(dev, &vclock_index);
 847	dev_put(dev);
 848
 849	for (i = 0; i < num; i++) {
 850		if (*(vclock_index + i) == phc_index) {
 851			match = true;
 852			break;
 853		}
 854	}
 855
 856	if (num > 0)
 857		kfree(vclock_index);
 858
 859	if (!match)
 860		return -EINVAL;
 861
 862	sk->sk_bind_phc = phc_index;
 863
 864	return 0;
 865}
 866
 867int sock_set_timestamping(struct sock *sk, int optname,
 868			  struct so_timestamping timestamping)
 869{
 870	int val = timestamping.flags;
 871	int ret;
 872
 873	if (val & ~SOF_TIMESTAMPING_MASK)
 874		return -EINVAL;
 875
 876	if (val & SOF_TIMESTAMPING_OPT_ID &&
 877	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 878		if (sk_is_tcp(sk)) {
 879			if ((1 << sk->sk_state) &
 880			    (TCPF_CLOSE | TCPF_LISTEN))
 881				return -EINVAL;
 882			atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
 883		} else {
 884			atomic_set(&sk->sk_tskey, 0);
 885		}
 886	}
 887
 888	if (val & SOF_TIMESTAMPING_OPT_STATS &&
 889	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 890		return -EINVAL;
 891
 892	if (val & SOF_TIMESTAMPING_BIND_PHC) {
 893		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 894		if (ret)
 895			return ret;
 896	}
 897
 898	sk->sk_tsflags = val;
 899	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 900
 901	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 902		sock_enable_timestamp(sk,
 903				      SOCK_TIMESTAMPING_RX_SOFTWARE);
 904	else
 905		sock_disable_timestamp(sk,
 906				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 907	return 0;
 908}
 909
 910void sock_set_keepalive(struct sock *sk)
 911{
 912	lock_sock(sk);
 913	if (sk->sk_prot->keepalive)
 914		sk->sk_prot->keepalive(sk, true);
 915	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 916	release_sock(sk);
 917}
 918EXPORT_SYMBOL(sock_set_keepalive);
 919
 920static void __sock_set_rcvbuf(struct sock *sk, int val)
 921{
 922	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 923	 * as a negative value.
 924	 */
 925	val = min_t(int, val, INT_MAX / 2);
 926	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 927
 928	/* We double it on the way in to account for "struct sk_buff" etc.
 929	 * overhead.   Applications assume that the SO_RCVBUF setting they make
 930	 * will allow that much actual data to be received on that socket.
 931	 *
 932	 * Applications are unaware that "struct sk_buff" and other overheads
 933	 * allocate from the receive buffer during socket buffer allocation.
 934	 *
 935	 * And after considering the possible alternatives, returning the value
 936	 * we actually used in getsockopt is the most desirable behavior.
 937	 */
 938	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 939}
 940
 941void sock_set_rcvbuf(struct sock *sk, int val)
 942{
 943	lock_sock(sk);
 944	__sock_set_rcvbuf(sk, val);
 945	release_sock(sk);
 946}
 947EXPORT_SYMBOL(sock_set_rcvbuf);
 948
 949static void __sock_set_mark(struct sock *sk, u32 val)
 950{
 951	if (val != sk->sk_mark) {
 952		sk->sk_mark = val;
 953		sk_dst_reset(sk);
 954	}
 955}
 956
 957void sock_set_mark(struct sock *sk, u32 val)
 958{
 959	lock_sock(sk);
 960	__sock_set_mark(sk, val);
 961	release_sock(sk);
 962}
 963EXPORT_SYMBOL(sock_set_mark);
 964
 965static void sock_release_reserved_memory(struct sock *sk, int bytes)
 966{
 967	/* Round down bytes to multiple of pages */
 968	bytes &= ~(SK_MEM_QUANTUM - 1);
 969
 970	WARN_ON(bytes > sk->sk_reserved_mem);
 971	sk->sk_reserved_mem -= bytes;
 972	sk_mem_reclaim(sk);
 973}
 974
 975static int sock_reserve_memory(struct sock *sk, int bytes)
 976{
 977	long allocated;
 978	bool charged;
 979	int pages;
 980
 981	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
 982		return -EOPNOTSUPP;
 983
 984	if (!bytes)
 985		return 0;
 986
 987	pages = sk_mem_pages(bytes);
 988
 989	/* pre-charge to memcg */
 990	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
 991					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 992	if (!charged)
 993		return -ENOMEM;
 994
 995	/* pre-charge to forward_alloc */
 996	allocated = sk_memory_allocated_add(sk, pages);
 997	/* If the system goes into memory pressure with this
 998	 * precharge, give up and return error.
 999	 */
1000	if (allocated > sk_prot_mem_limits(sk, 1)) {
1001		sk_memory_allocated_sub(sk, pages);
1002		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1003		return -ENOMEM;
1004	}
1005	sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
1006
1007	sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
1008
1009	return 0;
1010}
1011
1012/*
1013 *	This is meant for all protocols to use and covers goings on
1014 *	at the socket level. Everything here is generic.
1015 */
1016
1017int sock_setsockopt(struct socket *sock, int level, int optname,
1018		    sockptr_t optval, unsigned int optlen)
1019{
1020	struct so_timestamping timestamping;
1021	struct sock_txtime sk_txtime;
1022	struct sock *sk = sock->sk;
1023	int val;
1024	int valbool;
1025	struct linger ling;
1026	int ret = 0;
1027
1028	/*
1029	 *	Options without arguments
1030	 */
1031
1032	if (optname == SO_BINDTODEVICE)
1033		return sock_setbindtodevice(sk, optval, optlen);
1034
1035	if (optlen < sizeof(int))
1036		return -EINVAL;
1037
1038	if (copy_from_sockptr(&val, optval, sizeof(val)))
1039		return -EFAULT;
1040
1041	valbool = val ? 1 : 0;
1042
1043	lock_sock(sk);
1044
1045	switch (optname) {
1046	case SO_DEBUG:
1047		if (val && !capable(CAP_NET_ADMIN))
1048			ret = -EACCES;
1049		else
1050			sock_valbool_flag(sk, SOCK_DBG, valbool);
1051		break;
1052	case SO_REUSEADDR:
1053		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1054		break;
1055	case SO_REUSEPORT:
1056		sk->sk_reuseport = valbool;
1057		break;
1058	case SO_TYPE:
1059	case SO_PROTOCOL:
1060	case SO_DOMAIN:
1061	case SO_ERROR:
1062		ret = -ENOPROTOOPT;
1063		break;
1064	case SO_DONTROUTE:
1065		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1066		sk_dst_reset(sk);
1067		break;
1068	case SO_BROADCAST:
1069		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1070		break;
1071	case SO_SNDBUF:
1072		/* Don't error on this BSD doesn't and if you think
1073		 * about it this is right. Otherwise apps have to
1074		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1075		 * are treated in BSD as hints
1076		 */
1077		val = min_t(u32, val, sysctl_wmem_max);
1078set_sndbuf:
1079		/* Ensure val * 2 fits into an int, to prevent max_t()
1080		 * from treating it as a negative value.
1081		 */
1082		val = min_t(int, val, INT_MAX / 2);
1083		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1084		WRITE_ONCE(sk->sk_sndbuf,
1085			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1086		/* Wake up sending tasks if we upped the value. */
1087		sk->sk_write_space(sk);
1088		break;
1089
1090	case SO_SNDBUFFORCE:
1091		if (!capable(CAP_NET_ADMIN)) {
1092			ret = -EPERM;
1093			break;
1094		}
1095
1096		/* No negative values (to prevent underflow, as val will be
1097		 * multiplied by 2).
1098		 */
1099		if (val < 0)
1100			val = 0;
1101		goto set_sndbuf;
1102
1103	case SO_RCVBUF:
1104		/* Don't error on this BSD doesn't and if you think
1105		 * about it this is right. Otherwise apps have to
1106		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1107		 * are treated in BSD as hints
1108		 */
1109		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1110		break;
1111
1112	case SO_RCVBUFFORCE:
1113		if (!capable(CAP_NET_ADMIN)) {
1114			ret = -EPERM;
1115			break;
1116		}
1117
1118		/* No negative values (to prevent underflow, as val will be
1119		 * multiplied by 2).
1120		 */
1121		__sock_set_rcvbuf(sk, max(val, 0));
1122		break;
1123
1124	case SO_KEEPALIVE:
1125		if (sk->sk_prot->keepalive)
1126			sk->sk_prot->keepalive(sk, valbool);
1127		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1128		break;
1129
1130	case SO_OOBINLINE:
1131		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1132		break;
1133
1134	case SO_NO_CHECK:
1135		sk->sk_no_check_tx = valbool;
1136		break;
1137
1138	case SO_PRIORITY:
1139		if ((val >= 0 && val <= 6) ||
1140		    ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1141		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1142			sk->sk_priority = val;
1143		else
1144			ret = -EPERM;
1145		break;
1146
1147	case SO_LINGER:
1148		if (optlen < sizeof(ling)) {
1149			ret = -EINVAL;	/* 1003.1g */
1150			break;
1151		}
1152		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1153			ret = -EFAULT;
1154			break;
1155		}
1156		if (!ling.l_onoff)
1157			sock_reset_flag(sk, SOCK_LINGER);
1158		else {
1159#if (BITS_PER_LONG == 32)
1160			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1161				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1162			else
1163#endif
1164				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1165			sock_set_flag(sk, SOCK_LINGER);
1166		}
1167		break;
1168
1169	case SO_BSDCOMPAT:
1170		break;
1171
1172	case SO_PASSCRED:
1173		if (valbool)
1174			set_bit(SOCK_PASSCRED, &sock->flags);
1175		else
1176			clear_bit(SOCK_PASSCRED, &sock->flags);
1177		break;
1178
1179	case SO_TIMESTAMP_OLD:
1180	case SO_TIMESTAMP_NEW:
1181	case SO_TIMESTAMPNS_OLD:
1182	case SO_TIMESTAMPNS_NEW:
1183		sock_set_timestamp(sk, optname, valbool);
1184		break;
1185
1186	case SO_TIMESTAMPING_NEW:
1187	case SO_TIMESTAMPING_OLD:
1188		if (optlen == sizeof(timestamping)) {
1189			if (copy_from_sockptr(&timestamping, optval,
1190					      sizeof(timestamping))) {
1191				ret = -EFAULT;
1192				break;
1193			}
1194		} else {
1195			memset(&timestamping, 0, sizeof(timestamping));
1196			timestamping.flags = val;
1197		}
1198		ret = sock_set_timestamping(sk, optname, timestamping);
1199		break;
1200
1201	case SO_RCVLOWAT:
1202		if (val < 0)
1203			val = INT_MAX;
1204		if (sock->ops->set_rcvlowat)
1205			ret = sock->ops->set_rcvlowat(sk, val);
1206		else
1207			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1208		break;
1209
1210	case SO_RCVTIMEO_OLD:
1211	case SO_RCVTIMEO_NEW:
1212		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1213				       optlen, optname == SO_RCVTIMEO_OLD);
1214		break;
1215
1216	case SO_SNDTIMEO_OLD:
1217	case SO_SNDTIMEO_NEW:
1218		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1219				       optlen, optname == SO_SNDTIMEO_OLD);
1220		break;
1221
1222	case SO_ATTACH_FILTER: {
1223		struct sock_fprog fprog;
1224
1225		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1226		if (!ret)
1227			ret = sk_attach_filter(&fprog, sk);
1228		break;
1229	}
1230	case SO_ATTACH_BPF:
1231		ret = -EINVAL;
1232		if (optlen == sizeof(u32)) {
1233			u32 ufd;
1234
1235			ret = -EFAULT;
1236			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1237				break;
1238
1239			ret = sk_attach_bpf(ufd, sk);
1240		}
1241		break;
1242
1243	case SO_ATTACH_REUSEPORT_CBPF: {
1244		struct sock_fprog fprog;
1245
1246		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1247		if (!ret)
1248			ret = sk_reuseport_attach_filter(&fprog, sk);
1249		break;
1250	}
1251	case SO_ATTACH_REUSEPORT_EBPF:
1252		ret = -EINVAL;
1253		if (optlen == sizeof(u32)) {
1254			u32 ufd;
1255
1256			ret = -EFAULT;
1257			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1258				break;
1259
1260			ret = sk_reuseport_attach_bpf(ufd, sk);
1261		}
1262		break;
1263
1264	case SO_DETACH_REUSEPORT_BPF:
1265		ret = reuseport_detach_prog(sk);
1266		break;
1267
1268	case SO_DETACH_FILTER:
1269		ret = sk_detach_filter(sk);
1270		break;
1271
1272	case SO_LOCK_FILTER:
1273		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1274			ret = -EPERM;
1275		else
1276			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1277		break;
1278
1279	case SO_PASSSEC:
1280		if (valbool)
1281			set_bit(SOCK_PASSSEC, &sock->flags);
1282		else
1283			clear_bit(SOCK_PASSSEC, &sock->flags);
1284		break;
1285	case SO_MARK:
1286		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1287		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1288			ret = -EPERM;
1289			break;
1290		}
1291
1292		__sock_set_mark(sk, val);
1293		break;
1294
1295	case SO_RXQ_OVFL:
1296		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1297		break;
1298
1299	case SO_WIFI_STATUS:
1300		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1301		break;
1302
1303	case SO_PEEK_OFF:
1304		if (sock->ops->set_peek_off)
1305			ret = sock->ops->set_peek_off(sk, val);
1306		else
1307			ret = -EOPNOTSUPP;
1308		break;
1309
1310	case SO_NOFCS:
1311		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1312		break;
1313
1314	case SO_SELECT_ERR_QUEUE:
1315		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1316		break;
1317
1318#ifdef CONFIG_NET_RX_BUSY_POLL
1319	case SO_BUSY_POLL:
1320		/* allow unprivileged users to decrease the value */
1321		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1322			ret = -EPERM;
1323		else {
1324			if (val < 0)
1325				ret = -EINVAL;
1326			else
1327				WRITE_ONCE(sk->sk_ll_usec, val);
1328		}
1329		break;
1330	case SO_PREFER_BUSY_POLL:
1331		if (valbool && !capable(CAP_NET_ADMIN))
1332			ret = -EPERM;
1333		else
1334			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1335		break;
1336	case SO_BUSY_POLL_BUDGET:
1337		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1338			ret = -EPERM;
1339		} else {
1340			if (val < 0 || val > U16_MAX)
1341				ret = -EINVAL;
1342			else
1343				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1344		}
1345		break;
1346#endif
1347
1348	case SO_MAX_PACING_RATE:
1349		{
1350		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1351
1352		if (sizeof(ulval) != sizeof(val) &&
1353		    optlen >= sizeof(ulval) &&
1354		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1355			ret = -EFAULT;
1356			break;
1357		}
1358		if (ulval != ~0UL)
1359			cmpxchg(&sk->sk_pacing_status,
1360				SK_PACING_NONE,
1361				SK_PACING_NEEDED);
1362		sk->sk_max_pacing_rate = ulval;
1363		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1364		break;
1365		}
1366	case SO_INCOMING_CPU:
1367		WRITE_ONCE(sk->sk_incoming_cpu, val);
1368		break;
1369
1370	case SO_CNX_ADVICE:
1371		if (val == 1)
1372			dst_negative_advice(sk);
1373		break;
1374
1375	case SO_ZEROCOPY:
1376		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1377			if (!(sk_is_tcp(sk) ||
1378			      (sk->sk_type == SOCK_DGRAM &&
1379			       sk->sk_protocol == IPPROTO_UDP)))
1380				ret = -EOPNOTSUPP;
1381		} else if (sk->sk_family != PF_RDS) {
1382			ret = -EOPNOTSUPP;
1383		}
1384		if (!ret) {
1385			if (val < 0 || val > 1)
1386				ret = -EINVAL;
1387			else
1388				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1389		}
1390		break;
1391
1392	case SO_TXTIME:
1393		if (optlen != sizeof(struct sock_txtime)) {
1394			ret = -EINVAL;
1395			break;
1396		} else if (copy_from_sockptr(&sk_txtime, optval,
1397			   sizeof(struct sock_txtime))) {
1398			ret = -EFAULT;
1399			break;
1400		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1401			ret = -EINVAL;
1402			break;
1403		}
1404		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1405		 * scheduler has enough safe guards.
1406		 */
1407		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1408		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1409			ret = -EPERM;
1410			break;
1411		}
1412		sock_valbool_flag(sk, SOCK_TXTIME, true);
1413		sk->sk_clockid = sk_txtime.clockid;
1414		sk->sk_txtime_deadline_mode =
1415			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1416		sk->sk_txtime_report_errors =
1417			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1418		break;
1419
1420	case SO_BINDTOIFINDEX:
1421		ret = sock_bindtoindex_locked(sk, val);
1422		break;
1423
1424	case SO_BUF_LOCK:
1425		if (val & ~SOCK_BUF_LOCK_MASK) {
1426			ret = -EINVAL;
1427			break;
1428		}
1429		sk->sk_userlocks = val | (sk->sk_userlocks &
1430					  ~SOCK_BUF_LOCK_MASK);
1431		break;
1432
1433	case SO_RESERVE_MEM:
1434	{
1435		int delta;
1436
1437		if (val < 0) {
1438			ret = -EINVAL;
1439			break;
1440		}
1441
1442		delta = val - sk->sk_reserved_mem;
1443		if (delta < 0)
1444			sock_release_reserved_memory(sk, -delta);
1445		else
1446			ret = sock_reserve_memory(sk, delta);
1447		break;
1448	}
1449
1450	case SO_TXREHASH:
1451		if (val < -1 || val > 1) {
1452			ret = -EINVAL;
1453			break;
1454		}
1455		/* Paired with READ_ONCE() in tcp_rtx_synack() */
1456		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1457		break;
1458
1459	default:
1460		ret = -ENOPROTOOPT;
1461		break;
1462	}
1463	release_sock(sk);
1464	return ret;
1465}
1466EXPORT_SYMBOL(sock_setsockopt);
1467
1468static const struct cred *sk_get_peer_cred(struct sock *sk)
1469{
1470	const struct cred *cred;
1471
1472	spin_lock(&sk->sk_peer_lock);
1473	cred = get_cred(sk->sk_peer_cred);
1474	spin_unlock(&sk->sk_peer_lock);
1475
1476	return cred;
1477}
1478
1479static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1480			  struct ucred *ucred)
1481{
1482	ucred->pid = pid_vnr(pid);
1483	ucred->uid = ucred->gid = -1;
1484	if (cred) {
1485		struct user_namespace *current_ns = current_user_ns();
1486
1487		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1488		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1489	}
1490}
1491
1492static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1493{
1494	struct user_namespace *user_ns = current_user_ns();
1495	int i;
1496
1497	for (i = 0; i < src->ngroups; i++)
1498		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1499			return -EFAULT;
1500
1501	return 0;
1502}
1503
1504int sock_getsockopt(struct socket *sock, int level, int optname,
1505		    char __user *optval, int __user *optlen)
1506{
1507	struct sock *sk = sock->sk;
1508
1509	union {
1510		int val;
1511		u64 val64;
1512		unsigned long ulval;
1513		struct linger ling;
1514		struct old_timeval32 tm32;
1515		struct __kernel_old_timeval tm;
1516		struct  __kernel_sock_timeval stm;
1517		struct sock_txtime txtime;
1518		struct so_timestamping timestamping;
1519	} v;
1520
1521	int lv = sizeof(int);
1522	int len;
1523
1524	if (get_user(len, optlen))
1525		return -EFAULT;
1526	if (len < 0)
1527		return -EINVAL;
1528
1529	memset(&v, 0, sizeof(v));
1530
1531	switch (optname) {
1532	case SO_DEBUG:
1533		v.val = sock_flag(sk, SOCK_DBG);
1534		break;
1535
1536	case SO_DONTROUTE:
1537		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1538		break;
1539
1540	case SO_BROADCAST:
1541		v.val = sock_flag(sk, SOCK_BROADCAST);
1542		break;
1543
1544	case SO_SNDBUF:
1545		v.val = sk->sk_sndbuf;
1546		break;
1547
1548	case SO_RCVBUF:
1549		v.val = sk->sk_rcvbuf;
1550		break;
1551
1552	case SO_REUSEADDR:
1553		v.val = sk->sk_reuse;
1554		break;
1555
1556	case SO_REUSEPORT:
1557		v.val = sk->sk_reuseport;
1558		break;
1559
1560	case SO_KEEPALIVE:
1561		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1562		break;
1563
1564	case SO_TYPE:
1565		v.val = sk->sk_type;
1566		break;
1567
1568	case SO_PROTOCOL:
1569		v.val = sk->sk_protocol;
1570		break;
1571
1572	case SO_DOMAIN:
1573		v.val = sk->sk_family;
1574		break;
1575
1576	case SO_ERROR:
1577		v.val = -sock_error(sk);
1578		if (v.val == 0)
1579			v.val = xchg(&sk->sk_err_soft, 0);
1580		break;
1581
1582	case SO_OOBINLINE:
1583		v.val = sock_flag(sk, SOCK_URGINLINE);
1584		break;
1585
1586	case SO_NO_CHECK:
1587		v.val = sk->sk_no_check_tx;
1588		break;
1589
1590	case SO_PRIORITY:
1591		v.val = sk->sk_priority;
1592		break;
1593
1594	case SO_LINGER:
1595		lv		= sizeof(v.ling);
1596		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1597		v.ling.l_linger	= sk->sk_lingertime / HZ;
1598		break;
1599
1600	case SO_BSDCOMPAT:
1601		break;
1602
1603	case SO_TIMESTAMP_OLD:
1604		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1605				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1606				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1607		break;
1608
1609	case SO_TIMESTAMPNS_OLD:
1610		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1611		break;
1612
1613	case SO_TIMESTAMP_NEW:
1614		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1615		break;
1616
1617	case SO_TIMESTAMPNS_NEW:
1618		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1619		break;
1620
1621	case SO_TIMESTAMPING_OLD:
1622		lv = sizeof(v.timestamping);
1623		v.timestamping.flags = sk->sk_tsflags;
1624		v.timestamping.bind_phc = sk->sk_bind_phc;
1625		break;
1626
1627	case SO_RCVTIMEO_OLD:
1628	case SO_RCVTIMEO_NEW:
1629		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1630		break;
1631
1632	case SO_SNDTIMEO_OLD:
1633	case SO_SNDTIMEO_NEW:
1634		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1635		break;
1636
1637	case SO_RCVLOWAT:
1638		v.val = sk->sk_rcvlowat;
1639		break;
1640
1641	case SO_SNDLOWAT:
1642		v.val = 1;
1643		break;
1644
1645	case SO_PASSCRED:
1646		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1647		break;
1648
1649	case SO_PEERCRED:
1650	{
1651		struct ucred peercred;
1652		if (len > sizeof(peercred))
1653			len = sizeof(peercred);
1654
1655		spin_lock(&sk->sk_peer_lock);
1656		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1657		spin_unlock(&sk->sk_peer_lock);
1658
1659		if (copy_to_user(optval, &peercred, len))
1660			return -EFAULT;
1661		goto lenout;
1662	}
1663
1664	case SO_PEERGROUPS:
1665	{
1666		const struct cred *cred;
1667		int ret, n;
1668
1669		cred = sk_get_peer_cred(sk);
1670		if (!cred)
1671			return -ENODATA;
1672
1673		n = cred->group_info->ngroups;
1674		if (len < n * sizeof(gid_t)) {
1675			len = n * sizeof(gid_t);
1676			put_cred(cred);
1677			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1678		}
1679		len = n * sizeof(gid_t);
1680
1681		ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1682		put_cred(cred);
1683		if (ret)
1684			return ret;
1685		goto lenout;
1686	}
1687
1688	case SO_PEERNAME:
1689	{
1690		char address[128];
1691
1692		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1693		if (lv < 0)
1694			return -ENOTCONN;
1695		if (lv < len)
1696			return -EINVAL;
1697		if (copy_to_user(optval, address, len))
1698			return -EFAULT;
1699		goto lenout;
1700	}
1701
1702	/* Dubious BSD thing... Probably nobody even uses it, but
1703	 * the UNIX standard wants it for whatever reason... -DaveM
1704	 */
1705	case SO_ACCEPTCONN:
1706		v.val = sk->sk_state == TCP_LISTEN;
1707		break;
1708
1709	case SO_PASSSEC:
1710		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1711		break;
1712
1713	case SO_PEERSEC:
1714		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1715
1716	case SO_MARK:
1717		v.val = sk->sk_mark;
1718		break;
1719
1720	case SO_RXQ_OVFL:
1721		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1722		break;
1723
1724	case SO_WIFI_STATUS:
1725		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1726		break;
1727
1728	case SO_PEEK_OFF:
1729		if (!sock->ops->set_peek_off)
1730			return -EOPNOTSUPP;
1731
1732		v.val = sk->sk_peek_off;
1733		break;
1734	case SO_NOFCS:
1735		v.val = sock_flag(sk, SOCK_NOFCS);
1736		break;
1737
1738	case SO_BINDTODEVICE:
1739		return sock_getbindtodevice(sk, optval, optlen, len);
1740
1741	case SO_GET_FILTER:
1742		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1743		if (len < 0)
1744			return len;
1745
1746		goto lenout;
1747
1748	case SO_LOCK_FILTER:
1749		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1750		break;
1751
1752	case SO_BPF_EXTENSIONS:
1753		v.val = bpf_tell_extensions();
1754		break;
1755
1756	case SO_SELECT_ERR_QUEUE:
1757		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1758		break;
1759
1760#ifdef CONFIG_NET_RX_BUSY_POLL
1761	case SO_BUSY_POLL:
1762		v.val = sk->sk_ll_usec;
1763		break;
1764	case SO_PREFER_BUSY_POLL:
1765		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1766		break;
1767#endif
1768
1769	case SO_MAX_PACING_RATE:
1770		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1771			lv = sizeof(v.ulval);
1772			v.ulval = sk->sk_max_pacing_rate;
1773		} else {
1774			/* 32bit version */
1775			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1776		}
1777		break;
1778
1779	case SO_INCOMING_CPU:
1780		v.val = READ_ONCE(sk->sk_incoming_cpu);
1781		break;
1782
1783	case SO_MEMINFO:
1784	{
1785		u32 meminfo[SK_MEMINFO_VARS];
1786
1787		sk_get_meminfo(sk, meminfo);
1788
1789		len = min_t(unsigned int, len, sizeof(meminfo));
1790		if (copy_to_user(optval, &meminfo, len))
1791			return -EFAULT;
1792
1793		goto lenout;
1794	}
1795
1796#ifdef CONFIG_NET_RX_BUSY_POLL
1797	case SO_INCOMING_NAPI_ID:
1798		v.val = READ_ONCE(sk->sk_napi_id);
1799
1800		/* aggregate non-NAPI IDs down to 0 */
1801		if (v.val < MIN_NAPI_ID)
1802			v.val = 0;
1803
1804		break;
1805#endif
1806
1807	case SO_COOKIE:
1808		lv = sizeof(u64);
1809		if (len < lv)
1810			return -EINVAL;
1811		v.val64 = sock_gen_cookie(sk);
1812		break;
1813
1814	case SO_ZEROCOPY:
1815		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1816		break;
1817
1818	case SO_TXTIME:
1819		lv = sizeof(v.txtime);
1820		v.txtime.clockid = sk->sk_clockid;
1821		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1822				  SOF_TXTIME_DEADLINE_MODE : 0;
1823		v.txtime.flags |= sk->sk_txtime_report_errors ?
1824				  SOF_TXTIME_REPORT_ERRORS : 0;
1825		break;
1826
1827	case SO_BINDTOIFINDEX:
1828		v.val = sk->sk_bound_dev_if;
1829		break;
1830
1831	case SO_NETNS_COOKIE:
1832		lv = sizeof(u64);
1833		if (len != lv)
1834			return -EINVAL;
1835		v.val64 = sock_net(sk)->net_cookie;
1836		break;
1837
1838	case SO_BUF_LOCK:
1839		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1840		break;
1841
1842	case SO_RESERVE_MEM:
1843		v.val = sk->sk_reserved_mem;
1844		break;
1845
1846	case SO_TXREHASH:
1847		v.val = sk->sk_txrehash;
1848		break;
1849
1850	default:
1851		/* We implement the SO_SNDLOWAT etc to not be settable
1852		 * (1003.1g 7).
1853		 */
1854		return -ENOPROTOOPT;
1855	}
1856
1857	if (len > lv)
1858		len = lv;
1859	if (copy_to_user(optval, &v, len))
1860		return -EFAULT;
1861lenout:
1862	if (put_user(len, optlen))
1863		return -EFAULT;
1864	return 0;
1865}
1866
1867/*
1868 * Initialize an sk_lock.
1869 *
1870 * (We also register the sk_lock with the lock validator.)
1871 */
1872static inline void sock_lock_init(struct sock *sk)
1873{
1874	if (sk->sk_kern_sock)
1875		sock_lock_init_class_and_name(
1876			sk,
1877			af_family_kern_slock_key_strings[sk->sk_family],
1878			af_family_kern_slock_keys + sk->sk_family,
1879			af_family_kern_key_strings[sk->sk_family],
1880			af_family_kern_keys + sk->sk_family);
1881	else
1882		sock_lock_init_class_and_name(
1883			sk,
1884			af_family_slock_key_strings[sk->sk_family],
1885			af_family_slock_keys + sk->sk_family,
1886			af_family_key_strings[sk->sk_family],
1887			af_family_keys + sk->sk_family);
1888}
1889
1890/*
1891 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1892 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1893 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1894 */
1895static void sock_copy(struct sock *nsk, const struct sock *osk)
1896{
1897	const struct proto *prot = READ_ONCE(osk->sk_prot);
1898#ifdef CONFIG_SECURITY_NETWORK
1899	void *sptr = nsk->sk_security;
1900#endif
1901
1902	/* If we move sk_tx_queue_mapping out of the private section,
1903	 * we must check if sk_tx_queue_clear() is called after
1904	 * sock_copy() in sk_clone_lock().
1905	 */
1906	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1907		     offsetof(struct sock, sk_dontcopy_begin) ||
1908		     offsetof(struct sock, sk_tx_queue_mapping) >=
1909		     offsetof(struct sock, sk_dontcopy_end));
1910
1911	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1912
1913	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1914	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1915
1916#ifdef CONFIG_SECURITY_NETWORK
1917	nsk->sk_security = sptr;
1918	security_sk_clone(osk, nsk);
1919#endif
1920}
1921
1922static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1923		int family)
1924{
1925	struct sock *sk;
1926	struct kmem_cache *slab;
1927
1928	slab = prot->slab;
1929	if (slab != NULL) {
1930		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1931		if (!sk)
1932			return sk;
1933		if (want_init_on_alloc(priority))
1934			sk_prot_clear_nulls(sk, prot->obj_size);
1935	} else
1936		sk = kmalloc(prot->obj_size, priority);
1937
1938	if (sk != NULL) {
1939		if (security_sk_alloc(sk, family, priority))
1940			goto out_free;
1941
1942		if (!try_module_get(prot->owner))
1943			goto out_free_sec;
1944	}
1945
1946	return sk;
1947
1948out_free_sec:
1949	security_sk_free(sk);
1950out_free:
1951	if (slab != NULL)
1952		kmem_cache_free(slab, sk);
1953	else
1954		kfree(sk);
1955	return NULL;
1956}
1957
1958static void sk_prot_free(struct proto *prot, struct sock *sk)
1959{
1960	struct kmem_cache *slab;
1961	struct module *owner;
1962
1963	owner = prot->owner;
1964	slab = prot->slab;
1965
1966	cgroup_sk_free(&sk->sk_cgrp_data);
1967	mem_cgroup_sk_free(sk);
1968	security_sk_free(sk);
1969	if (slab != NULL)
1970		kmem_cache_free(slab, sk);
1971	else
1972		kfree(sk);
1973	module_put(owner);
1974}
1975
1976/**
1977 *	sk_alloc - All socket objects are allocated here
1978 *	@net: the applicable net namespace
1979 *	@family: protocol family
1980 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1981 *	@prot: struct proto associated with this new sock instance
1982 *	@kern: is this to be a kernel socket?
1983 */
1984struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1985		      struct proto *prot, int kern)
1986{
1987	struct sock *sk;
1988
1989	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1990	if (sk) {
1991		sk->sk_family = family;
1992		/*
1993		 * See comment in struct sock definition to understand
1994		 * why we need sk_prot_creator -acme
1995		 */
1996		sk->sk_prot = sk->sk_prot_creator = prot;
1997		sk->sk_kern_sock = kern;
1998		sock_lock_init(sk);
1999		sk->sk_net_refcnt = kern ? 0 : 1;
2000		if (likely(sk->sk_net_refcnt)) {
2001			get_net_track(net, &sk->ns_tracker, priority);
2002			sock_inuse_add(net, 1);
2003		}
2004
2005		sock_net_set(sk, net);
2006		refcount_set(&sk->sk_wmem_alloc, 1);
2007
2008		mem_cgroup_sk_alloc(sk);
2009		cgroup_sk_alloc(&sk->sk_cgrp_data);
2010		sock_update_classid(&sk->sk_cgrp_data);
2011		sock_update_netprioidx(&sk->sk_cgrp_data);
2012		sk_tx_queue_clear(sk);
2013	}
2014
2015	return sk;
2016}
2017EXPORT_SYMBOL(sk_alloc);
2018
2019/* Sockets having SOCK_RCU_FREE will call this function after one RCU
2020 * grace period. This is the case for UDP sockets and TCP listeners.
2021 */
2022static void __sk_destruct(struct rcu_head *head)
2023{
2024	struct sock *sk = container_of(head, struct sock, sk_rcu);
2025	struct sk_filter *filter;
2026
2027	if (sk->sk_destruct)
2028		sk->sk_destruct(sk);
2029
2030	filter = rcu_dereference_check(sk->sk_filter,
2031				       refcount_read(&sk->sk_wmem_alloc) == 0);
2032	if (filter) {
2033		sk_filter_uncharge(sk, filter);
2034		RCU_INIT_POINTER(sk->sk_filter, NULL);
2035	}
2036
2037	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2038
2039#ifdef CONFIG_BPF_SYSCALL
2040	bpf_sk_storage_free(sk);
2041#endif
2042
2043	if (atomic_read(&sk->sk_omem_alloc))
2044		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2045			 __func__, atomic_read(&sk->sk_omem_alloc));
2046
2047	if (sk->sk_frag.page) {
2048		put_page(sk->sk_frag.page);
2049		sk->sk_frag.page = NULL;
2050	}
2051
2052	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2053	put_cred(sk->sk_peer_cred);
2054	put_pid(sk->sk_peer_pid);
2055
2056	if (likely(sk->sk_net_refcnt))
2057		put_net_track(sock_net(sk), &sk->ns_tracker);
2058	sk_prot_free(sk->sk_prot_creator, sk);
2059}
2060
2061void sk_destruct(struct sock *sk)
2062{
2063	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2064
2065	WARN_ON_ONCE(!llist_empty(&sk->defer_list));
2066	sk_defer_free_flush(sk);
2067
2068	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2069		reuseport_detach_sock(sk);
2070		use_call_rcu = true;
2071	}
2072
2073	if (use_call_rcu)
2074		call_rcu(&sk->sk_rcu, __sk_destruct);
2075	else
2076		__sk_destruct(&sk->sk_rcu);
2077}
2078
2079static void __sk_free(struct sock *sk)
2080{
2081	if (likely(sk->sk_net_refcnt))
2082		sock_inuse_add(sock_net(sk), -1);
2083
2084	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2085		sock_diag_broadcast_destroy(sk);
2086	else
2087		sk_destruct(sk);
2088}
2089
2090void sk_free(struct sock *sk)
2091{
2092	/*
2093	 * We subtract one from sk_wmem_alloc and can know if
2094	 * some packets are still in some tx queue.
2095	 * If not null, sock_wfree() will call __sk_free(sk) later
2096	 */
2097	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2098		__sk_free(sk);
2099}
2100EXPORT_SYMBOL(sk_free);
2101
2102static void sk_init_common(struct sock *sk)
2103{
2104	skb_queue_head_init(&sk->sk_receive_queue);
2105	skb_queue_head_init(&sk->sk_write_queue);
2106	skb_queue_head_init(&sk->sk_error_queue);
2107
2108	rwlock_init(&sk->sk_callback_lock);
2109	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2110			af_rlock_keys + sk->sk_family,
2111			af_family_rlock_key_strings[sk->sk_family]);
2112	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2113			af_wlock_keys + sk->sk_family,
2114			af_family_wlock_key_strings[sk->sk_family]);
2115	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2116			af_elock_keys + sk->sk_family,
2117			af_family_elock_key_strings[sk->sk_family]);
2118	lockdep_set_class_and_name(&sk->sk_callback_lock,
2119			af_callback_keys + sk->sk_family,
2120			af_family_clock_key_strings[sk->sk_family]);
2121}
2122
/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 *
 *	Return: the new socket with its bh lock held and sk_refcnt set to 2,
 *	or NULL if allocation, filter charging, xfrm policy cloning or bpf
 *	storage cloning failed (in which case the partial clone has already
 *	been unlocked and freed).
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct proto *prot = READ_ONCE(sk->sk_prot);
	struct sk_filter *filter;
	bool is_charged = true;
	struct sock *newsk;

	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
	if (!newsk)
		goto out;

	/* Start from a raw copy of the parent, then reset per-socket state. */
	sock_copy(newsk, sk);

	newsk->sk_prot_creator = prot;

	/* SANITY */
	if (likely(newsk->sk_net_refcnt)) {
		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
		sock_inuse_add(sock_net(newsk), 1);
	}
	sk_node_init(&newsk->sk_node);
	sock_lock_init(newsk);
	bh_lock_sock(newsk);
	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
	newsk->sk_backlog.len = 0;

	atomic_set(&newsk->sk_rmem_alloc, 0);

	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
	refcount_set(&newsk->sk_wmem_alloc, 1);

	atomic_set(&newsk->sk_omem_alloc, 0);
	sk_init_common(newsk);

	/* The clone starts with no cached route, no queued or reserved
	 * memory, no drops and nothing pending to send.
	 */
	newsk->sk_dst_cache	= NULL;
	newsk->sk_dst_pending_confirm = 0;
	newsk->sk_wmem_queued	= 0;
	newsk->sk_forward_alloc = 0;
	newsk->sk_reserved_mem  = 0;
	atomic_set(&newsk->sk_drops, 0);
	newsk->sk_send_head	= NULL;
	/* Inherit the parent's user locks except SOCK_BINDPORT_LOCK. */
	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
	atomic_set(&newsk->sk_zckey, 0);

	sock_reset_flag(newsk, SOCK_DONE);

	/* sk->sk_memcg will be populated at accept() time */
	newsk->sk_memcg = NULL;

	cgroup_sk_clone(&newsk->sk_cgrp_data);

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		/* though it's an empty new sock, the charging may fail
		 * if sysctl_optmem_max was changed between creation of
		 * original socket and cloning
		 */
		is_charged = sk_filter_charge(newsk, filter);
	RCU_INIT_POINTER(newsk->sk_filter, filter);
	rcu_read_unlock();

	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
		/* We need to make sure that we don't uncharge the new
		 * socket if we couldn't charge it in the first place
		 * as otherwise we uncharge the parent's filter.
		 */
		if (!is_charged)
			RCU_INIT_POINTER(newsk->sk_filter, NULL);
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}
	/* The clone does not join the parent's reuseport group. */
	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

	if (bpf_sk_storage_clone(sk, newsk)) {
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}

	/* Clear sk_user_data if parent had the pointer tagged
	 * as not suitable for copying when cloning.
	 */
	if (sk_user_data_is_nocopy(newsk))
		newsk->sk_user_data = NULL;

	newsk->sk_err	   = 0;
	newsk->sk_err_soft = 0;
	newsk->sk_priority = 0;
	newsk->sk_incoming_cpu = raw_smp_processor_id();

	/* Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&newsk->sk_refcnt, 2);

	/* Increment the counter in the same struct proto as the master
	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
	 * is the same as sk->sk_prot->socks, as this field was copied
	 * with memcpy).
	 *
	 * This _changes_ the previous behaviour, where
	 * tcp_create_openreq_child always was incrementing the
	 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
	 * to be taken into account in all callers. -acme
	 */
	sk_refcnt_debug_inc(newsk);
	/* The clone has no struct socket and no wait queue attached yet. */
	sk_set_socket(newsk, NULL);
	sk_tx_queue_clear(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, NULL);

	if (newsk->sk_prot->sockets_allocated)
		sk_sockets_allocated_inc(newsk);

	/* Timestamp flags were copied from the parent; enable network
	 * timestamping on behalf of the clone as well.
	 */
	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
		net_enable_timestamp();
out:
	return newsk;
}
2251EXPORT_SYMBOL_GPL(sk_clone_lock);
2252
/* Dispose of a socket produced by sk_clone_lock() that is being abandoned:
 * release its bh lock and free it without running the inherited destructor.
 */
void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still raw copy of parent, so invalidate
	 * destructor and make plain sk_free() */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
2261EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2262
2263void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2264{
2265	u32 max_segs = 1;
2266
2267	sk_dst_set(sk, dst);
2268	sk->sk_route_caps = dst->dev->features;
2269	if (sk_is_tcp(sk))
2270		sk->sk_route_caps |= NETIF_F_GSO;
2271	if (sk->sk_route_caps & NETIF_F_GSO)
2272		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2273	if (unlikely(sk->sk_gso_disabled))
2274		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2275	if (sk_can_gso(sk)) {
2276		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2277			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2278		} else {
2279			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2280			/* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
2281			sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
2282			sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
2283			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2284			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2285		}
2286	}
2287	sk->sk_gso_max_segs = max_segs;
2288}
2289EXPORT_SYMBOL_GPL(sk_setup_caps);
2290
2291/*
2292 *	Simple resource managers for sockets.
2293 */
2294
2295
2296/*
2297 * Write buffer destructor automatically called from kfree_skb.
2298 */
2299void sock_wfree(struct sk_buff *skb)
2300{
2301	struct sock *sk = skb->sk;
2302	unsigned int len = skb->truesize;
2303
2304	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2305		/*
2306		 * Keep a reference on sk_wmem_alloc, this will be released
2307		 * after sk_write_space() call
2308		 */
2309		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2310		sk->sk_write_space(sk);
2311		len = 1;
2312	}
2313	/*
2314	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2315	 * could not do because of in-flight packets
2316	 */
2317	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2318		__sk_free(sk);
2319}
2320EXPORT_SYMBOL(sock_wfree);
2321
2322/* This variant of sock_wfree() is used by TCP,
2323 * since it sets SOCK_USE_WRITE_QUEUE.
2324 */
2325void __sock_wfree(struct sk_buff *skb)
2326{
2327	struct sock *sk = skb->sk;
2328
2329	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2330		__sk_free(sk);
2331}
2332
2333void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2334{
2335	skb_orphan(skb);
2336	skb->sk = sk;
2337#ifdef CONFIG_INET
2338	if (unlikely(!sk_fullsock(sk))) {
2339		skb->destructor = sock_edemux;
2340		sock_hold(sk);
2341		return;
2342	}
2343#endif
2344	skb->destructor = sock_wfree;
2345	skb_set_hash_from_sk(skb, sk);
2346	/*
2347	 * We used to take a refcount on sk, but following operation
2348	 * is enough to guarantee sk_free() wont free this sock until
2349	 * all in-flight packets are completed
2350	 */
2351	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2352}
2353EXPORT_SYMBOL(skb_set_owner_w);
2354
2355static bool can_skb_orphan_partial(const struct sk_buff *skb)
2356{
2357#ifdef CONFIG_TLS_DEVICE
2358	/* Drivers depend on in-order delivery for crypto offload,
2359	 * partial orphan breaks out-of-order-OK logic.
2360	 */
2361	if (skb->decrypted)
2362		return false;
2363#endif
2364	return (skb->destructor == sock_wfree ||
2365		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2366}
2367
2368/* This helper is used by netem, as it can hold packets in its
2369 * delay queue. We want to allow the owner socket to send more
2370 * packets, as if they were already TX completed by a typical driver.
2371 * But we also want to keep skb->sk set because some packet schedulers
2372 * rely on it (sch_fq for example).
2373 */
2374void skb_orphan_partial(struct sk_buff *skb)
2375{
2376	if (skb_is_tcp_pure_ack(skb))
2377		return;
2378
2379	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2380		return;
2381
2382	skb_orphan(skb);
2383}
2384EXPORT_SYMBOL(skb_orphan_partial);
2385
2386/*
2387 * Read buffer destructor automatically called from kfree_skb.
2388 */
2389void sock_rfree(struct sk_buff *skb)
2390{
2391	struct sock *sk = skb->sk;
2392	unsigned int len = skb->truesize;
2393
2394	atomic_sub(len, &sk->sk_rmem_alloc);
2395	sk_mem_uncharge(sk, len);
2396}
2397EXPORT_SYMBOL(sock_rfree);
2398
2399/*
2400 * Buffer destructor for skbs that are not used directly in read or write
2401 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2402 */
2403void sock_efree(struct sk_buff *skb)
2404{
2405	sock_put(skb->sk);
2406}
2407EXPORT_SYMBOL(sock_efree);
2408
/* Destructor for the prefetch/receive path, where a reference on the sock
 * may not be held (listen sockets, for example): the reference is only
 * dropped when the sock is actually refcounted.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (!sk_is_refcounted(sk))
		return;

	sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2420
2421kuid_t sock_i_uid(struct sock *sk)
2422{
2423	kuid_t uid;
2424
2425	read_lock_bh(&sk->sk_callback_lock);
2426	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2427	read_unlock_bh(&sk->sk_callback_lock);
2428	return uid;
2429}
2430EXPORT_SYMBOL(sock_i_uid);
2431
2432unsigned long sock_i_ino(struct sock *sk)
2433{
2434	unsigned long ino;
2435
2436	read_lock_bh(&sk->sk_callback_lock);
2437	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2438	read_unlock_bh(&sk->sk_callback_lock);
2439	return ino;
2440}
2441EXPORT_SYMBOL(sock_i_ino);
2442
2443/*
2444 * Allocate a skb from the socket's send buffer.
2445 */
2446struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2447			     gfp_t priority)
2448{
2449	if (force ||
2450	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2451		struct sk_buff *skb = alloc_skb(size, priority);
2452
2453		if (skb) {
2454			skb_set_owner_w(skb, sk);
2455			return skb;
2456		}
2457	}
2458	return NULL;
2459}
2460EXPORT_SYMBOL(sock_wmalloc);
2461
2462static void sock_ofree(struct sk_buff *skb)
2463{
2464	struct sock *sk = skb->sk;
2465
2466	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2467}
2468
2469struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2470			     gfp_t priority)
2471{
2472	struct sk_buff *skb;
2473
2474	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2475	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2476	    sysctl_optmem_max)
2477		return NULL;
2478
2479	skb = alloc_skb(size, priority);
2480	if (!skb)
2481		return NULL;
2482
2483	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2484	skb->sk = sk;
2485	skb->destructor = sock_ofree;
2486	return skb;
2487}
2488
2489/*
2490 * Allocate a memory block from the socket's option memory buffer.
2491 */
2492void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2493{
2494	if ((unsigned int)size <= sysctl_optmem_max &&
2495	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2496		void *mem;
2497		/* First do the add, to avoid the race if kmalloc
2498		 * might sleep.
2499		 */
2500		atomic_add(size, &sk->sk_omem_alloc);
2501		mem = kmalloc(size, priority);
2502		if (mem)
2503			return mem;
2504		atomic_sub(size, &sk->sk_omem_alloc);
2505	}
2506	return NULL;
2507}
2508EXPORT_SYMBOL(sock_kmalloc);
2509
2510/* Free an option memory block. Note, we actually want the inline
2511 * here as this allows gcc to detect the nullify and fold away the
2512 * condition entirely.
2513 */
2514static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2515				  const bool nullify)
2516{
2517	if (WARN_ON_ONCE(!mem))
2518		return;
2519	if (nullify)
2520		kfree_sensitive(mem);
2521	else
2522		kfree(mem);
2523	atomic_sub(size, &sk->sk_omem_alloc);
2524}
2525
2526void sock_kfree_s(struct sock *sk, void *mem, int size)
2527{
2528	__sock_kfree_s(sk, mem, size, false);
2529}
2530EXPORT_SYMBOL(sock_kfree_s);
2531
2532void sock_kzfree_s(struct sock *sk, void *mem, int size)
2533{
2534	__sock_kfree_s(sk, mem, size, true);
2535}
2536EXPORT_SYMBOL(sock_kzfree_s);
2537
2538/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2539   I think, these locks should be removed for datagram sockets.
2540 */
2541static long sock_wait_for_wmem(struct sock *sk, long timeo)
2542{
2543	DEFINE_WAIT(wait);
2544
2545	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2546	for (;;) {
2547		if (!timeo)
2548			break;
2549		if (signal_pending(current))
2550			break;
2551		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2552		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2553		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2554			break;
2555		if (sk->sk_shutdown & SEND_SHUTDOWN)
2556			break;
2557		if (sk->sk_err)
2558			break;
2559		timeo = schedule_timeout(timeo);
2560	}
2561	finish_wait(sk_sleep(sk), &wait);
2562	return timeo;
2563}
2564
2565
2566/*
2567 *	Generic send/receive buffer handlers
2568 */
2569
2570struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2571				     unsigned long data_len, int noblock,
2572				     int *errcode, int max_page_order)
2573{
2574	struct sk_buff *skb;
2575	long timeo;
2576	int err;
2577
2578	timeo = sock_sndtimeo(sk, noblock);
2579	for (;;) {
2580		err = sock_error(sk);
2581		if (err != 0)
2582			goto failure;
2583
2584		err = -EPIPE;
2585		if (sk->sk_shutdown & SEND_SHUTDOWN)
2586			goto failure;
2587
2588		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2589			break;
2590
2591		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2592		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2593		err = -EAGAIN;
2594		if (!timeo)
2595			goto failure;
2596		if (signal_pending(current))
2597			goto interrupted;
2598		timeo = sock_wait_for_wmem(sk, timeo);
2599	}
2600	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2601				   errcode, sk->sk_allocation);
2602	if (skb)
2603		skb_set_owner_w(skb, sk);
2604	return skb;
2605
2606interrupted:
2607	err = sock_intr_errno(timeo);
2608failure:
2609	*errcode = err;
2610	return NULL;
2611}
2612EXPORT_SYMBOL(sock_alloc_send_pskb);
2613
/**
 * sock_alloc_send_skb - allocate a purely linear send skb
 * @sk: sending sock
 * @size: size of the linear part
 * @noblock: non-zero to fail instead of sleeping
 * @errcode: where the error code is stored on failure
 *
 * Shorthand for sock_alloc_send_pskb() with no paged data and a maximum
 * page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
2619EXPORT_SYMBOL(sock_alloc_send_skb);
2620
2621int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2622		     struct sockcm_cookie *sockc)
2623{
2624	u32 tsflags;
2625
2626	switch (cmsg->cmsg_type) {
2627	case SO_MARK:
2628		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2629		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2630			return -EPERM;
2631		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2632			return -EINVAL;
2633		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2634		break;
2635	case SO_TIMESTAMPING_OLD:
2636		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2637			return -EINVAL;
2638
2639		tsflags = *(u32 *)CMSG_DATA(cmsg);
2640		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2641			return -EINVAL;
2642
2643		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2644		sockc->tsflags |= tsflags;
2645		break;
2646	case SCM_TXTIME:
2647		if (!sock_flag(sk, SOCK_TXTIME))
2648			return -EINVAL;
2649		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2650			return -EINVAL;
2651		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2652		break;
2653	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2654	case SCM_RIGHTS:
2655	case SCM_CREDENTIALS:
2656		break;
2657	default:
2658		return -EINVAL;
2659	}
2660	return 0;
2661}
2662EXPORT_SYMBOL(__sock_cmsg_send);
2663
2664int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2665		   struct sockcm_cookie *sockc)
2666{
2667	struct cmsghdr *cmsg;
2668	int ret;
2669
2670	for_each_cmsghdr(cmsg, msg) {
2671		if (!CMSG_OK(msg, cmsg))
2672			return -EINVAL;
2673		if (cmsg->cmsg_level != SOL_SOCKET)
2674			continue;
2675		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2676		if (ret)
2677			return ret;
2678	}
2679	return 0;
2680}
2681EXPORT_SYMBOL(sock_cmsg_send);
2682
2683static void sk_enter_memory_pressure(struct sock *sk)
2684{
2685	if (!sk->sk_prot->enter_memory_pressure)
2686		return;
2687
2688	sk->sk_prot->enter_memory_pressure(sk);
2689}
2690
2691static void sk_leave_memory_pressure(struct sock *sk)
2692{
2693	if (sk->sk_prot->leave_memory_pressure) {
2694		sk->sk_prot->leave_memory_pressure(sk);
2695	} else {
2696		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2697
2698		if (memory_pressure && READ_ONCE(*memory_pressure))
2699			WRITE_ONCE(*memory_pressure, 0);
2700	}
2701}
2702
2703DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2704
2705/**
2706 * skb_page_frag_refill - check that a page_frag contains enough room
2707 * @sz: minimum size of the fragment we want to get
2708 * @pfrag: pointer to page_frag
2709 * @gfp: priority for memory allocation
2710 *
2711 * Note: While this allocator tries to use high order pages, there is
2712 * no guarantee that allocations succeed. Therefore, @sz MUST be
2713 * less or equal than PAGE_SIZE.
2714 */
2715bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2716{
2717	if (pfrag->page) {
2718		if (page_ref_count(pfrag->page) == 1) {
2719			pfrag->offset = 0;
2720			return true;
2721		}
2722		if (pfrag->offset + sz <= pfrag->size)
2723			return true;
2724		put_page(pfrag->page);
2725	}
2726
2727	pfrag->offset = 0;
2728	if (SKB_FRAG_PAGE_ORDER &&
2729	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2730		/* Avoid direct reclaim but allow kswapd to wake */
2731		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2732					  __GFP_COMP | __GFP_NOWARN |
2733					  __GFP_NORETRY,
2734					  SKB_FRAG_PAGE_ORDER);
2735		if (likely(pfrag->page)) {
2736			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2737			return true;
2738		}
2739	}
2740	pfrag->page = alloc_page(gfp);
2741	if (likely(pfrag->page)) {
2742		pfrag->size = PAGE_SIZE;
2743		return true;
2744	}
2745	return false;
2746}
2747EXPORT_SYMBOL(skb_page_frag_refill);
2748
2749bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2750{
2751	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2752		return true;
2753
2754	sk_enter_memory_pressure(sk);
2755	sk_stream_moderate_sndbuf(sk);
2756	return false;
2757}
2758EXPORT_SYMBOL(sk_page_frag_refill);
2759
2760void __lock_sock(struct sock *sk)
2761	__releases(&sk->sk_lock.slock)
2762	__acquires(&sk->sk_lock.slock)
2763{
2764	DEFINE_WAIT(wait);
2765
2766	for (;;) {
2767		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2768					TASK_UNINTERRUPTIBLE);
2769		spin_unlock_bh(&sk->sk_lock.slock);
2770		schedule();
2771		spin_lock_bh(&sk->sk_lock.slock);
2772		if (!sock_owned_by_user(sk))
2773			break;
2774	}
2775	finish_wait(&sk->sk_lock.wq, &wait);
2776}
2777
2778void __release_sock(struct sock *sk)
2779	__releases(&sk->sk_lock.slock)
2780	__acquires(&sk->sk_lock.slock)
2781{
2782	struct sk_buff *skb, *next;
2783
2784	while ((skb = sk->sk_backlog.head) != NULL) {
2785		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2786
2787		spin_unlock_bh(&sk->sk_lock.slock);
2788
2789		do {
2790			next = skb->next;
2791			prefetch(next);
2792			WARN_ON_ONCE(skb_dst_is_noref(skb));
2793			skb_mark_not_on_list(skb);
2794			sk_backlog_rcv(sk, skb);
2795
2796			cond_resched();
2797
2798			skb = next;
2799		} while (skb != NULL);
2800
2801		spin_lock_bh(&sk->sk_lock.slock);
2802	}
2803
2804	/*
2805	 * Doing the zeroing here guarantee we can not loop forever
2806	 * while a wild producer attempts to flood us.
2807	 */
2808	sk->sk_backlog.len = 0;
2809}
2810
2811void __sk_flush_backlog(struct sock *sk)
2812{
2813	spin_lock_bh(&sk->sk_lock.slock);
2814	__release_sock(sk);
2815	spin_unlock_bh(&sk->sk_lock.slock);
2816}
2817
2818/**
2819 * sk_wait_data - wait for data to arrive at sk_receive_queue
2820 * @sk:    sock to wait on
2821 * @timeo: for how long
2822 * @skb:   last skb seen on sk_receive_queue
2823 *
2824 * Now socket state including sk->sk_err is changed only under lock,
2825 * hence we may omit checks after joining wait queue.
2826 * We check receive queue before schedule() only as optimization;
2827 * it is very likely that release_sock() added new data.
2828 */
2829int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2830{
2831	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2832	int rc;
2833
2834	add_wait_queue(sk_sleep(sk), &wait);
2835	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2836	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2837	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2838	remove_wait_queue(sk_sleep(sk), &wait);
2839	return rc;
2840}
2841EXPORT_SYMBOL(sk_wait_data);
2842
2843/**
2844 *	__sk_mem_raise_allocated - increase memory_allocated
2845 *	@sk: socket
2846 *	@size: memory size to allocate
2847 *	@amt: pages to allocate
2848 *	@kind: allocation type
2849 *
2850 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2851 */
2852int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2853{
2854	struct proto *prot = sk->sk_prot;
2855	long allocated = sk_memory_allocated_add(sk, amt);
2856	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2857	bool charged = true;
2858
2859	if (memcg_charge &&
2860	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2861						gfp_memcg_charge())))
2862		goto suppress_allocation;
2863
2864	/* Under limit. */
2865	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2866		sk_leave_memory_pressure(sk);
2867		return 1;
2868	}
2869
2870	/* Under pressure. */
2871	if (allocated > sk_prot_mem_limits(sk, 1))
2872		sk_enter_memory_pressure(sk);
2873
2874	/* Over hard limit. */
2875	if (allocated > sk_prot_mem_limits(sk, 2))
2876		goto suppress_allocation;
2877
2878	/* guarantee minimum buffer size under pressure */
2879	if (kind == SK_MEM_RECV) {
2880		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2881			return 1;
2882
2883	} else { /* SK_MEM_SEND */
2884		int wmem0 = sk_get_wmem0(sk, prot);
2885
2886		if (sk->sk_type == SOCK_STREAM) {
2887			if (sk->sk_wmem_queued < wmem0)
2888				return 1;
2889		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2890				return 1;
2891		}
2892	}
2893
2894	if (sk_has_memory_pressure(sk)) {
2895		u64 alloc;
2896
2897		if (!sk_under_memory_pressure(sk))
2898			return 1;
2899		alloc = sk_sockets_allocated_read_positive(sk);
2900		if (sk_prot_mem_limits(sk, 2) > alloc *
2901		    sk_mem_pages(sk->sk_wmem_queued +
2902				 atomic_read(&sk->sk_rmem_alloc) +
2903				 sk->sk_forward_alloc))
2904			return 1;
2905	}
2906
2907suppress_allocation:
2908
2909	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2910		sk_stream_moderate_sndbuf(sk);
2911
2912		/* Fail only if socket is _under_ its sndbuf.
2913		 * In this case we cannot block, so that we have to fail.
2914		 */
2915		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2916			/* Force charge with __GFP_NOFAIL */
2917			if (memcg_charge && !charged) {
2918				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2919					gfp_memcg_charge() | __GFP_NOFAIL);
2920			}
2921			return 1;
2922		}
2923	}
2924
2925	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2926		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2927
2928	sk_memory_allocated_sub(sk, amt);
2929
2930	if (memcg_charge && charged)
2931		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2932
2933	return 0;
2934}
2935EXPORT_SYMBOL(__sk_mem_raise_allocated);
2936
2937/**
2938 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2939 *	@sk: socket
2940 *	@size: memory size to allocate
2941 *	@kind: allocation type
2942 *
2943 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2944 *	rmem allocation. This function assumes that protocols which have
2945 *	memory_pressure use sk_wmem_queued as write buffer accounting.
2946 */
2947int __sk_mem_schedule(struct sock *sk, int size, int kind)
2948{
2949	int ret, amt = sk_mem_pages(size);
2950
2951	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2952	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2953	if (!ret)
2954		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2955	return ret;
2956}
2957EXPORT_SYMBOL(__sk_mem_schedule);
2958
2959/**
2960 *	__sk_mem_reduce_allocated - reclaim memory_allocated
2961 *	@sk: socket
2962 *	@amount: number of quanta
2963 *
2964 *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2965 */
2966void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2967{
2968	sk_memory_allocated_sub(sk, amount);
2969
2970	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2971		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2972
2973	if (sk_under_memory_pressure(sk) &&
2974	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2975		sk_leave_memory_pressure(sk);
2976}
2977EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2978
2979/**
2980 *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2981 *	@sk: socket
2982 *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2983 */
2984void __sk_mem_reclaim(struct sock *sk, int amount)
2985{
2986	amount >>= SK_MEM_QUANTUM_SHIFT;
2987	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2988	__sk_mem_reduce_allocated(sk, amount);
2989}
2990EXPORT_SYMBOL(__sk_mem_reclaim);
2991
2992int sk_set_peek_off(struct sock *sk, int val)
2993{
2994	sk->sk_peek_off = val;
2995	return 0;
2996}
2997EXPORT_SYMBOL_GPL(sk_set_peek_off);
2998
2999/*
3000 * Set of default routines for initialising struct proto_ops when
3001 * the protocol does not support a particular function. In certain
3002 * cases where it makes no sense for a protocol to have a "do nothing"
3003 * function, some default processing is provided.
3004 */
3005
/* Default bind() for protocols that do not support it: -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
3010EXPORT_SYMBOL(sock_no_bind);
3011
/* Default connect() for protocols that do not support it: -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
3017EXPORT_SYMBOL(sock_no_connect);
3018
/* Default socketpair() for protocols that do not support it: -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
3023EXPORT_SYMBOL(sock_no_socketpair);
3024
/* Default accept() for protocols that do not support it: -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
3030EXPORT_SYMBOL(sock_no_accept);
3031
/* Default getname() for protocols that do not support it: -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
3037EXPORT_SYMBOL(sock_no_getname);
3038
/* Default ioctl() for protocols that do not support it: -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
3043EXPORT_SYMBOL(sock_no_ioctl);
3044
/* Default listen() for protocols that do not support it: -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
3049EXPORT_SYMBOL(sock_no_listen);
3050
/* Default shutdown() for protocols that do not support it: -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
3055EXPORT_SYMBOL(sock_no_shutdown);
3056
/* Default sendmsg() for protocols that do not support it: -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
3061EXPORT_SYMBOL(sock_no_sendmsg);
3062
/* Default sendmsg_locked() for protocols that do not support it:
 * -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
3067EXPORT_SYMBOL(sock_no_sendmsg_locked);
3068
/* Default recvmsg() for protocols that do not support it: -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
3074EXPORT_SYMBOL(sock_no_recvmsg);
3075
/* Default mmap() for protocols that do not support it. The error code
 * mirrors the one reported when an mmap method is missing: -ENODEV.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma)
{
	return -ENODEV;
}
3081EXPORT_SYMBOL(sock_no_mmap);
3082
3083/*
3084 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3085 * various sock-based usage counts.
3086 */
3087void __receive_sock(struct file *file)
3088{
3089	struct socket *sock;
3090
3091	sock = sock_from_file(file);
3092	if (sock) {
3093		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3094		sock_update_classid(&sock->sk->sk_cgrp_data);
3095	}
3096}
3097
3098ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3099{
3100	ssize_t res;
3101	struct msghdr msg = {.msg_flags = flags};
3102	struct kvec iov;
3103	char *kaddr = kmap(page);
3104	iov.iov_base = kaddr + offset;
3105	iov.iov_len = size;
3106	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3107	kunmap(page);
3108	return res;
3109}
3110EXPORT_SYMBOL(sock_no_sendpage);
3111
3112ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3113				int offset, size_t size, int flags)
3114{
3115	ssize_t res;
3116	struct msghdr msg = {.msg_flags = flags};
3117	struct kvec iov;
3118	char *kaddr = kmap(page);
3119
3120	iov.iov_base = kaddr + offset;
3121	iov.iov_len = size;
3122	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3123	kunmap(page);
3124	return res;
3125}
3126EXPORT_SYMBOL(sock_no_sendpage_locked);
3127
3128/*
3129 *	Default Socket Callbacks
3130 */
3131
3132static void sock_def_wakeup(struct sock *sk)
3133{
3134	struct socket_wq *wq;
3135
3136	rcu_read_lock();
3137	wq = rcu_dereference(sk->sk_wq);
3138	if (skwq_has_sleeper(wq))
3139		wake_up_interruptible_all(&wq->wait);
3140	rcu_read_unlock();
3141}
3142
3143static void sock_def_error_report(struct sock *sk)
3144{
3145	struct socket_wq *wq;
3146
3147	rcu_read_lock();
3148	wq = rcu_dereference(sk->sk_wq);
3149	if (skwq_has_sleeper(wq))
3150		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3151	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3152	rcu_read_unlock();
3153}
3154
3155void sock_def_readable(struct sock *sk)
3156{
3157	struct socket_wq *wq;
3158
3159	rcu_read_lock();
3160	wq = rcu_dereference(sk->sk_wq);
3161	if (skwq_has_sleeper(wq))
3162		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3163						EPOLLRDNORM | EPOLLRDBAND);
3164	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3165	rcu_read_unlock();
3166}
3167
3168static void sock_def_write_space(struct sock *sk)
3169{
3170	struct socket_wq *wq;
3171
3172	rcu_read_lock();
3173
3174	/* Do not wake up a writer until he can make "significant"
3175	 * progress.  --DaveM
3176	 */
3177	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3178		wq = rcu_dereference(sk->sk_wq);
3179		if (skwq_has_sleeper(wq))
3180			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3181						EPOLLWRNORM | EPOLLWRBAND);
3182
3183		/* Should agree with poll, otherwise some programs break */
3184		if (sock_writeable(sk))
3185			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3186	}
3187
3188	rcu_read_unlock();
3189}
3190
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
3194
3195void sk_send_sigurg(struct sock *sk)
3196{
3197	if (sk->sk_socket && sk->sk_socket->file)
3198		if (send_sigurg(&sk->sk_socket->file->f_owner))
3199			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3200}
3201EXPORT_SYMBOL(sk_send_sigurg);
3202
/**
 * sk_reset_timer - (re)arm a sock timer
 * @sk: sock owning the timer
 * @timer: timer to modify
 * @expires: new expiry time
 *
 * A reference on @sk is taken when mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	int was_pending = mod_timer(timer, expires);

	if (!was_pending)
		sock_hold(sk);
}
3209EXPORT_SYMBOL(sk_reset_timer);
3210
/**
 * sk_stop_timer - cancel a sock timer
 * @sk: sock owning the timer
 * @timer: timer to delete
 *
 * The sock reference is dropped when del_timer() returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	int was_pending = del_timer(timer);

	if (was_pending)
		__sock_put(sk);
}
3216EXPORT_SYMBOL(sk_stop_timer);
3217
/**
 * sk_stop_timer_sync - cancel a sock timer with del_timer_sync()
 * @sk: sock owning the timer
 * @timer: timer to delete
 *
 * The sock reference is dropped when del_timer_sync() returns non-zero.
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	int was_pending = del_timer_sync(timer);

	if (was_pending)
		__sock_put(sk);
}
3223EXPORT_SYMBOL(sk_stop_timer_sync);
3224
3225void sock_init_data(struct socket *sock, struct sock *sk)
3226{
3227	sk_init_common(sk);
3228	sk->sk_send_head	=	NULL;
3229
3230	timer_setup(&sk->sk_timer, NULL, 0);
3231
3232	sk->sk_allocation	=	GFP_KERNEL;
3233	sk->sk_rcvbuf		=	sysctl_rmem_default;
3234	sk->sk_sndbuf		=	sysctl_wmem_default;
3235	sk->sk_state		=	TCP_CLOSE;
3236	sk_set_socket(sk, sock);
3237
3238	sock_set_flag(sk, SOCK_ZAPPED);
3239
3240	if (sock) {
3241		sk->sk_type	=	sock->type;
3242		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3243		sock->sk	=	sk;
3244		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
3245	} else {
3246		RCU_INIT_POINTER(sk->sk_wq, NULL);
3247		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
3248	}
3249
3250	rwlock_init(&sk->sk_callback_lock);
3251	if (sk->sk_kern_sock)
3252		lockdep_set_class_and_name(
3253			&sk->sk_callback_lock,
3254			af_kern_callback_keys + sk->sk_family,
3255			af_family_kern_clock_key_strings[sk->sk_family]);
3256	else
3257		lockdep_set_class_and_name(
3258			&sk->sk_callback_lock,
3259			af_callback_keys + sk->sk_family,
3260			af_family_clock_key_strings[sk->sk_family]);
3261
3262	sk->sk_state_change	=	sock_def_wakeup;
3263	sk->sk_data_ready	=	sock_def_readable;
3264	sk->sk_write_space	=	sock_def_write_space;
3265	sk->sk_error_report	=	sock_def_error_report;
3266	sk->sk_destruct		=	sock_def_destruct;
3267
3268	sk->sk_frag.page	=	NULL;
3269	sk->sk_frag.offset	=	0;
3270	sk->sk_peek_off		=	-1;
3271
3272	sk->sk_peer_pid 	=	NULL;
3273	sk->sk_peer_cred	=	NULL;
3274	spin_lock_init(&sk->sk_peer_lock);
3275
3276	sk->sk_write_pending	=	0;
3277	sk->sk_rcvlowat		=	1;
3278	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3279	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3280
3281	sk->sk_stamp = SK_DEFAULT_STAMP;
3282#if BITS_PER_LONG==32
3283	seqlock_init(&sk->sk_stamp_seq);
3284#endif
3285	atomic_set(&sk->sk_zckey, 0);
3286
3287#ifdef CONFIG_NET_RX_BUSY_POLL
3288	sk->sk_napi_id		=	0;
3289	sk->sk_ll_usec		=	sysctl_net_busy_read;
3290#endif
3291
3292	sk->sk_max_pacing_rate = ~0UL;
3293	sk->sk_pacing_rate = ~0UL;
3294	WRITE_ONCE(sk->sk_pacing_shift, 10);
3295	sk->sk_incoming_cpu = -1;
3296	sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
3297
3298	sk_rx_queue_clear(sk);
3299	/*
3300	 * Before updating sk_refcnt, we must commit prior changes to memory
3301	 * (Documentation/RCU/rculist_nulls.rst for details)
3302	 */
3303	smp_wmb();
3304	refcount_set(&sk->sk_refcnt, 1);
3305	atomic_set(&sk->sk_drops, 0);
3306}
3307EXPORT_SYMBOL(sock_init_data);
3308
3309void lock_sock_nested(struct sock *sk, int subclass)
3310{
3311	/* The sk_lock has mutex_lock() semantics here. */
3312	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3313
3314	might_sleep();
3315	spin_lock_bh(&sk->sk_lock.slock);
3316	if (sock_owned_by_user_nocheck(sk))
3317		__lock_sock(sk);
3318	sk->sk_lock.owned = 1;
3319	spin_unlock_bh(&sk->sk_lock.slock);
3320}
3321EXPORT_SYMBOL(lock_sock_nested);
3322
3323void release_sock(struct sock *sk)
3324{
3325	spin_lock_bh(&sk->sk_lock.slock);
3326	if (sk->sk_backlog.tail)
3327		__release_sock(sk);
3328
3329	/* Warning : release_cb() might need to release sk ownership,
3330	 * ie call sock_release_ownership(sk) before us.
3331	 */
3332	if (sk->sk_prot->release_cb)
3333		sk->sk_prot->release_cb(sk);
3334
3335	sock_release_ownership(sk);
3336	if (waitqueue_active(&sk->sk_lock.wq))
3337		wake_up(&sk->sk_lock.wq);
3338	spin_unlock_bh(&sk->sk_lock.slock);
3339}
3340EXPORT_SYMBOL(release_sock);
3341
3342bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3343{
3344	might_sleep();
3345	spin_lock_bh(&sk->sk_lock.slock);
3346
3347	if (!sock_owned_by_user_nocheck(sk)) {
3348		/*
3349		 * Fast path return with bottom halves disabled and
3350		 * sock::sk_lock.slock held.
3351		 *
3352		 * The 'mutex' is not contended and holding
3353		 * sock::sk_lock.slock prevents all other lockers to
3354		 * proceed so the corresponding unlock_sock_fast() can
3355		 * avoid the slow path of release_sock() completely and
3356		 * just release slock.
3357		 *
3358		 * From a semantical POV this is equivalent to 'acquiring'
3359		 * the 'mutex', hence the corresponding lockdep
3360		 * mutex_release() has to happen in the fast path of
3361		 * unlock_sock_fast().
3362		 */
3363		return false;
3364	}
3365
3366	__lock_sock(sk);
3367	sk->sk_lock.owned = 1;
3368	__acquire(&sk->sk_lock.slock);
3369	spin_unlock_bh(&sk->sk_lock.slock);
3370	return true;
3371}
3372EXPORT_SYMBOL(__lock_sock_fast);
3373
3374int sock_gettstamp(struct socket *sock, void __user *userstamp,
3375		   bool timeval, bool time32)
3376{
3377	struct sock *sk = sock->sk;
3378	struct timespec64 ts;
3379
3380	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3381	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3382	if (ts.tv_sec == -1)
3383		return -ENOENT;
3384	if (ts.tv_sec == 0) {
3385		ktime_t kt = ktime_get_real();
3386		sock_write_timestamp(sk, kt);
3387		ts = ktime_to_timespec64(kt);
3388	}
3389
3390	if (timeval)
3391		ts.tv_nsec /= 1000;
3392
3393#ifdef CONFIG_COMPAT_32BIT_TIME
3394	if (time32)
3395		return put_old_timespec32(&ts, userstamp);
3396#endif
3397#ifdef CONFIG_SPARC64
3398	/* beware of padding in sparc64 timeval */
3399	if (timeval && !in_compat_syscall()) {
3400		struct __kernel_old_timeval __user tv = {
3401			.tv_sec = ts.tv_sec,
3402			.tv_usec = ts.tv_nsec,
3403		};
3404		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3405			return -EFAULT;
3406		return 0;
3407	}
3408#endif
3409	return put_timespec64(&ts, userstamp);
3410}
3411EXPORT_SYMBOL(sock_gettstamp);
3412
3413void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3414{
3415	if (!sock_flag(sk, flag)) {
3416		unsigned long previous_flags = sk->sk_flags;
3417
3418		sock_set_flag(sk, flag);
3419		/*
3420		 * we just set one of the two flags which require net
3421		 * time stamping, but time stamping might have been on
3422		 * already because of the other one
3423		 */
3424		if (sock_needs_netstamp(sk) &&
3425		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3426			net_enable_timestamp();
3427	}
3428}
3429
3430int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3431		       int level, int type)
3432{
3433	struct sock_exterr_skb *serr;
3434	struct sk_buff *skb;
3435	int copied, err;
3436
3437	err = -EAGAIN;
3438	skb = sock_dequeue_err_skb(sk);
3439	if (skb == NULL)
3440		goto out;
3441
3442	copied = skb->len;
3443	if (copied > len) {
3444		msg->msg_flags |= MSG_TRUNC;
3445		copied = len;
3446	}
3447	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3448	if (err)
3449		goto out_free_skb;
3450
3451	sock_recv_timestamp(msg, sk, skb);
3452
3453	serr = SKB_EXT_ERR(skb);
3454	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3455
3456	msg->msg_flags |= MSG_ERRQUEUE;
3457	err = copied;
3458
3459out_free_skb:
3460	kfree_skb(skb);
3461out:
3462	return err;
3463}
3464EXPORT_SYMBOL(sock_recv_errqueue);
3465
3466/*
3467 *	Get a socket option on an socket.
3468 *
3469 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3470 *	asynchronous errors should be reported by getsockopt. We assume
3471 *	this means if you specify SO_ERROR (otherwise whats the point of it).
3472 */
3473int sock_common_getsockopt(struct socket *sock, int level, int optname,
3474			   char __user *optval, int __user *optlen)
3475{
3476	struct sock *sk = sock->sk;
3477
3478	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3479}
3480EXPORT_SYMBOL(sock_common_getsockopt);
3481
3482int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3483			int flags)
3484{
3485	struct sock *sk = sock->sk;
3486	int addr_len = 0;
3487	int err;
3488
3489	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3490				   flags & ~MSG_DONTWAIT, &addr_len);
3491	if (err >= 0)
3492		msg->msg_namelen = addr_len;
3493	return err;
3494}
3495EXPORT_SYMBOL(sock_common_recvmsg);
3496
3497/*
3498 *	Set socket options on an inet socket.
3499 */
3500int sock_common_setsockopt(struct socket *sock, int level, int optname,
3501			   sockptr_t optval, unsigned int optlen)
3502{
3503	struct sock *sk = sock->sk;
3504
3505	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3506}
3507EXPORT_SYMBOL(sock_common_setsockopt);
3508
3509void sk_common_release(struct sock *sk)
3510{
3511	if (sk->sk_prot->destroy)
3512		sk->sk_prot->destroy(sk);
3513
3514	/*
3515	 * Observation: when sk_common_release is called, processes have
3516	 * no access to socket. But net still has.
3517	 * Step one, detach it from networking:
3518	 *
3519	 * A. Remove from hash tables.
3520	 */
3521
3522	sk->sk_prot->unhash(sk);
3523
3524	/*
3525	 * In this point socket cannot receive new packets, but it is possible
3526	 * that some packets are in flight because some CPU runs receiver and
3527	 * did hash table lookup before we unhashed socket. They will achieve
3528	 * receive queue and will be purged by socket destructor.
3529	 *
3530	 * Also we still have packets pending on receive queue and probably,
3531	 * our own packets waiting in device queues. sock_destroy will drain
3532	 * receive queue, but transmitted packets will delay socket destruction
3533	 * until the last reference will be released.
3534	 */
3535
3536	sock_orphan(sk);
3537
3538	xfrm_sk_free_policy(sk);
3539
3540	sk_refcnt_debug_release(sk);
3541
3542	sock_put(sk);
3543}
3544EXPORT_SYMBOL(sk_common_release);
3545
3546void sk_get_meminfo(const struct sock *sk, u32 *mem)
3547{
3548	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3549
3550	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3551	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3552	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3553	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3554	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3555	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3556	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3557	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3558	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3559}
3560
3561#ifdef CONFIG_PROC_FS
3562static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3563
3564int sock_prot_inuse_get(struct net *net, struct proto *prot)
3565{
3566	int cpu, idx = prot->inuse_idx;
3567	int res = 0;
3568
3569	for_each_possible_cpu(cpu)
3570		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3571
3572	return res >= 0 ? res : 0;
3573}
3574EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3575
3576int sock_inuse_get(struct net *net)
3577{
3578	int cpu, res = 0;
3579
3580	for_each_possible_cpu(cpu)
3581		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3582
3583	return res;
3584}
3585
3586EXPORT_SYMBOL_GPL(sock_inuse_get);
3587
3588static int __net_init sock_inuse_init_net(struct net *net)
3589{
3590	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3591	if (net->core.prot_inuse == NULL)
3592		return -ENOMEM;
3593	return 0;
3594}
3595
3596static void __net_exit sock_inuse_exit_net(struct net *net)
3597{
3598	free_percpu(net->core.prot_inuse);
3599}
3600
3601static struct pernet_operations net_inuse_ops = {
3602	.init = sock_inuse_init_net,
3603	.exit = sock_inuse_exit_net,
3604};
3605
3606static __init int net_inuse_init(void)
3607{
3608	if (register_pernet_subsys(&net_inuse_ops))
3609		panic("Cannot initialize net inuse counters");
3610
3611	return 0;
3612}
3613
3614core_initcall(net_inuse_init);
3615
3616static int assign_proto_idx(struct proto *prot)
3617{
3618	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3619
3620	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3621		pr_err("PROTO_INUSE_NR exhausted\n");
3622		return -ENOSPC;
3623	}
3624
3625	set_bit(prot->inuse_idx, proto_inuse_idx);
3626	return 0;
3627}
3628
3629static void release_proto_idx(struct proto *prot)
3630{
3631	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3632		clear_bit(prot->inuse_idx, proto_inuse_idx);
3633}
3634#else
/* !CONFIG_PROC_FS: no in-use index bookkeeping, always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}
3639
/* !CONFIG_PROC_FS: nothing was assigned, so nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3643
3644#endif
3645
3646static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3647{
3648	if (!twsk_prot)
3649		return;
3650	kfree(twsk_prot->twsk_slab_name);
3651	twsk_prot->twsk_slab_name = NULL;
3652	kmem_cache_destroy(twsk_prot->twsk_slab);
3653	twsk_prot->twsk_slab = NULL;
3654}
3655
3656static int tw_prot_init(const struct proto *prot)
3657{
3658	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3659
3660	if (!twsk_prot)
3661		return 0;
3662
3663	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3664					      prot->name);
3665	if (!twsk_prot->twsk_slab_name)
3666		return -ENOMEM;
3667
3668	twsk_prot->twsk_slab =
3669		kmem_cache_create(twsk_prot->twsk_slab_name,
3670				  twsk_prot->twsk_obj_size, 0,
3671				  SLAB_ACCOUNT | prot->slab_flags,
3672				  NULL);
3673	if (!twsk_prot->twsk_slab) {
3674		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3675			prot->name);
3676		return -ENOMEM;
3677	}
3678
3679	return 0;
3680}
3681
3682static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3683{
3684	if (!rsk_prot)
3685		return;
3686	kfree(rsk_prot->slab_name);
3687	rsk_prot->slab_name = NULL;
3688	kmem_cache_destroy(rsk_prot->slab);
3689	rsk_prot->slab = NULL;
3690}
3691
3692static int req_prot_init(const struct proto *prot)
3693{
3694	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3695
3696	if (!rsk_prot)
3697		return 0;
3698
3699	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3700					prot->name);
3701	if (!rsk_prot->slab_name)
3702		return -ENOMEM;
3703
3704	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3705					   rsk_prot->obj_size, 0,
3706					   SLAB_ACCOUNT | prot->slab_flags,
3707					   NULL);
3708
3709	if (!rsk_prot->slab) {
3710		pr_crit("%s: Can't create request sock SLAB cache!\n",
3711			prot->name);
3712		return -ENOMEM;
3713	}
3714	return 0;
3715}
3716
3717int proto_register(struct proto *prot, int alloc_slab)
3718{
3719	int ret = -ENOBUFS;
3720
3721	if (prot->memory_allocated && !prot->sysctl_mem) {
3722		pr_err("%s: missing sysctl_mem\n", prot->name);
3723		return -EINVAL;
3724	}
3725	if (alloc_slab) {
3726		prot->slab = kmem_cache_create_usercopy(prot->name,
3727					prot->obj_size, 0,
3728					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3729					prot->slab_flags,
3730					prot->useroffset, prot->usersize,
3731					NULL);
3732
3733		if (prot->slab == NULL) {
3734			pr_crit("%s: Can't create sock SLAB cache!\n",
3735				prot->name);
3736			goto out;
3737		}
3738
3739		if (req_prot_init(prot))
3740			goto out_free_request_sock_slab;
3741
3742		if (tw_prot_init(prot))
3743			goto out_free_timewait_sock_slab;
3744	}
3745
3746	mutex_lock(&proto_list_mutex);
3747	ret = assign_proto_idx(prot);
3748	if (ret) {
3749		mutex_unlock(&proto_list_mutex);
3750		goto out_free_timewait_sock_slab;
3751	}
3752	list_add(&prot->node, &proto_list);
3753	mutex_unlock(&proto_list_mutex);
3754	return ret;
3755
3756out_free_timewait_sock_slab:
3757	if (alloc_slab)
3758		tw_prot_cleanup(prot->twsk_prot);
3759out_free_request_sock_slab:
3760	if (alloc_slab) {
3761		req_prot_cleanup(prot->rsk_prot);
3762
3763		kmem_cache_destroy(prot->slab);
3764		prot->slab = NULL;
3765	}
3766out:
3767	return ret;
3768}
3769EXPORT_SYMBOL(proto_register);
3770
3771void proto_unregister(struct proto *prot)
3772{
3773	mutex_lock(&proto_list_mutex);
3774	release_proto_idx(prot);
3775	list_del(&prot->node);
3776	mutex_unlock(&proto_list_mutex);
3777
3778	kmem_cache_destroy(prot->slab);
3779	prot->slab = NULL;
3780
3781	req_prot_cleanup(prot->rsk_prot);
3782	tw_prot_cleanup(prot->twsk_prot);
3783}
3784EXPORT_SYMBOL(proto_unregister);
3785
3786int sock_load_diag_module(int family, int protocol)
3787{
3788	if (!protocol) {
3789		if (!sock_is_registered(family))
3790			return -ENOENT;
3791
3792		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3793				      NETLINK_SOCK_DIAG, family);
3794	}
3795
3796#ifdef CONFIG_INET
3797	if (family == AF_INET &&
3798	    protocol != IPPROTO_RAW &&
3799	    protocol < MAX_INET_PROTOS &&
3800	    !rcu_access_pointer(inet_protos[protocol]))
3801		return -ENOENT;
3802#endif
3803
3804	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3805			      NETLINK_SOCK_DIAG, family, protocol);
3806}
3807EXPORT_SYMBOL(sock_load_diag_module);
3808
3809#ifdef CONFIG_PROC_FS
3810static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3811	__acquires(proto_list_mutex)
3812{
3813	mutex_lock(&proto_list_mutex);
3814	return seq_list_start_head(&proto_list, *pos);
3815}
3816
3817static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3818{
3819	return seq_list_next(v, &proto_list, pos);
3820}
3821
3822static void proto_seq_stop(struct seq_file *seq, void *v)
3823	__releases(proto_list_mutex)
3824{
3825	mutex_unlock(&proto_list_mutex);
3826}
3827
/* Map a method pointer to 'y' (non-NULL) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
3832static long sock_prot_memory_allocated(struct proto *proto)
3833{
3834	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3835}
3836
3837static const char *sock_prot_memory_pressure(struct proto *proto)
3838{
3839	return proto->memory_pressure != NULL ?
3840	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3841}
3842
3843static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3844{
3845
3846	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3847			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3848		   proto->name,
3849		   proto->obj_size,
3850		   sock_prot_inuse_get(seq_file_net(seq), proto),
3851		   sock_prot_memory_allocated(proto),
3852		   sock_prot_memory_pressure(proto),
3853		   proto->max_header,
3854		   proto->slab == NULL ? "no" : "yes",
3855		   module_name(proto->owner),
3856		   proto_method_implemented(proto->close),
3857		   proto_method_implemented(proto->connect),
3858		   proto_method_implemented(proto->disconnect),
3859		   proto_method_implemented(proto->accept),
3860		   proto_method_implemented(proto->ioctl),
3861		   proto_method_implemented(proto->init),
3862		   proto_method_implemented(proto->destroy),
3863		   proto_method_implemented(proto->shutdown),
3864		   proto_method_implemented(proto->setsockopt),
3865		   proto_method_implemented(proto->getsockopt),
3866		   proto_method_implemented(proto->sendmsg),
3867		   proto_method_implemented(proto->recvmsg),
3868		   proto_method_implemented(proto->sendpage),
3869		   proto_method_implemented(proto->bind),
3870		   proto_method_implemented(proto->backlog_rcv),
3871		   proto_method_implemented(proto->hash),
3872		   proto_method_implemented(proto->unhash),
3873		   proto_method_implemented(proto->get_port),
3874		   proto_method_implemented(proto->enter_memory_pressure));
3875}
3876
3877static int proto_seq_show(struct seq_file *seq, void *v)
3878{
3879	if (v == &proto_list)
3880		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3881			   "protocol",
3882			   "size",
3883			   "sockets",
3884			   "memory",
3885			   "press",
3886			   "maxhdr",
3887			   "slab",
3888			   "module",
3889			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3890	else
3891		proto_seq_printf(seq, list_entry(v, struct proto, node));
3892	return 0;
3893}
3894
3895static const struct seq_operations proto_seq_ops = {
3896	.start  = proto_seq_start,
3897	.next   = proto_seq_next,
3898	.stop   = proto_seq_stop,
3899	.show   = proto_seq_show,
3900};
3901
3902static __net_init int proto_init_net(struct net *net)
3903{
3904	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3905			sizeof(struct seq_net_private)))
3906		return -ENOMEM;
3907
3908	return 0;
3909}
3910
3911static __net_exit void proto_exit_net(struct net *net)
3912{
3913	remove_proc_entry("protocols", net->proc_net);
3914}
3915
3916
3917static __net_initdata struct pernet_operations proto_net_ops = {
3918	.init = proto_init_net,
3919	.exit = proto_exit_net,
3920};
3921
3922static int __init proto_init(void)
3923{
3924	return register_pernet_subsys(&proto_net_ops);
3925}
3926
3927subsys_initcall(proto_init);
3928
3929#endif /* PROC_FS */
3930
3931#ifdef CONFIG_NET_RX_BUSY_POLL
3932bool sk_busy_loop_end(void *p, unsigned long start_time)
3933{
3934	struct sock *sk = p;
3935
3936	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3937	       sk_busy_loop_timeout(sk, start_time);
3938}
3939EXPORT_SYMBOL(sk_busy_loop_end);
3940#endif /* CONFIG_NET_RX_BUSY_POLL */
3941
3942int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3943{
3944	if (!sk->sk_prot->bind_add)
3945		return -EOPNOTSUPP;
3946	return sk->sk_prot->bind_add(sk, addr, addr_len);
3947}
3948EXPORT_SYMBOL(sock_bind_add);