Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88#include <asm/unaligned.h>
89#include <linux/capability.h>
90#include <linux/errno.h>
91#include <linux/errqueue.h>
92#include <linux/types.h>
93#include <linux/socket.h>
94#include <linux/in.h>
95#include <linux/kernel.h>
96#include <linux/module.h>
97#include <linux/proc_fs.h>
98#include <linux/seq_file.h>
99#include <linux/sched.h>
100#include <linux/sched/mm.h>
101#include <linux/timer.h>
102#include <linux/string.h>
103#include <linux/sockios.h>
104#include <linux/net.h>
105#include <linux/mm.h>
106#include <linux/slab.h>
107#include <linux/interrupt.h>
108#include <linux/poll.h>
109#include <linux/tcp.h>
110#include <linux/init.h>
111#include <linux/highmem.h>
112#include <linux/user_namespace.h>
113#include <linux/static_key.h>
114#include <linux/memcontrol.h>
115#include <linux/prefetch.h>
116#include <linux/compat.h>
117
118#include <linux/uaccess.h>
119
120#include <linux/netdevice.h>
121#include <net/protocol.h>
122#include <linux/skbuff.h>
123#include <net/net_namespace.h>
124#include <net/request_sock.h>
125#include <net/sock.h>
126#include <linux/net_tstamp.h>
127#include <net/xfrm.h>
128#include <linux/ipsec.h>
129#include <net/cls_cgroup.h>
130#include <net/netprio_cgroup.h>
131#include <linux/sock_diag.h>
132
133#include <linux/filter.h>
134#include <net/sock_reuseport.h>
135#include <net/bpf_sk_storage.h>
136
137#include <trace/events/sock.h>
138
139#include <net/tcp.h>
140#include <net/busy_poll.h>
141
142#include <linux/ethtool.h>
143
144static DEFINE_MUTEX(proto_list_mutex);
145static LIST_HEAD(proto_list);
146
147/**
148 * sk_ns_capable - General socket capability test
149 * @sk: Socket to use a capability on or through
150 * @user_ns: The user namespace of the capability to use
151 * @cap: The capability to use
152 *
153 * Test to see if the opener of the socket had the capability @cap when the
154 * socket was created and the current process has the capability @cap in the
155 * user namespace @user_ns.
156 */
157bool sk_ns_capable(const struct sock *sk,
158 struct user_namespace *user_ns, int cap)
159{
160 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 ns_capable(user_ns, cap);
162}
163EXPORT_SYMBOL(sk_ns_capable);
164
165/**
166 * sk_capable - Socket global capability test
167 * @sk: Socket to use a capability on or through
168 * @cap: The global capability to use
169 *
170 * Test to see if the opener of the socket had the capability @cap when the
171 * socket was created and the current process has the capability @cap in all
172 * user namespaces.
173 */
174bool sk_capable(const struct sock *sk, int cap)
175{
176 return sk_ns_capable(sk, &init_user_ns, cap);
177}
178EXPORT_SYMBOL(sk_capable);
179
180/**
181 * sk_net_capable - Network namespace socket capability test
182 * @sk: Socket to use a capability on or through
183 * @cap: The capability to use
184 *
185 * Test to see if the opener of the socket had the capability @cap when the
186 * socket was created and the current process has the capability @cap over
187 * the network namespace the socket is a member of.
188 */
189bool sk_net_capable(const struct sock *sk, int cap)
190{
191 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
192}
193EXPORT_SYMBOL(sk_net_capable);
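
/*
 * Example (illustrative sketch, not part of this file): a protocol's
 * setsockopt handler could gate a privileged, per-netns option with
 * sk_net_capable(). "example_set_priv_option" and its caller are
 * assumptions for the sketch.
 */
static int example_set_priv_option(struct sock *sk, int val)
{
	if (!sk_net_capable(sk, CAP_NET_ADMIN))
		return -EPERM;

	/* ... apply the privileged setting using val ... */
	return 0;
}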
194
195/*
196 * Each address family might have different locking rules, so we have
197 * one slock key per address family and separate keys for internal and
198 * userspace sockets.
199 */
200static struct lock_class_key af_family_keys[AF_MAX];
201static struct lock_class_key af_family_kern_keys[AF_MAX];
202static struct lock_class_key af_family_slock_keys[AF_MAX];
203static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
204
205/*
206 * Make lock validator output more readable. (We pre-construct these
207 * strings at build time, so that runtime initialization of socket
208 * locks is fast):
209 */
210
211#define _sock_locks(x) \
212 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
213 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
214 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
215 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
216 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
217 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
218 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
219 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
220 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
221 x "27" , x "28" , x "AF_CAN" , \
222 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
223 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
224 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
225 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
226 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
227 x "AF_MCTP" , \
228 x "AF_MAX"
229
230static const char *const af_family_key_strings[AF_MAX+1] = {
231 _sock_locks("sk_lock-")
232};
233static const char *const af_family_slock_key_strings[AF_MAX+1] = {
234 _sock_locks("slock-")
235};
236static const char *const af_family_clock_key_strings[AF_MAX+1] = {
237 _sock_locks("clock-")
238};
239
240static const char *const af_family_kern_key_strings[AF_MAX+1] = {
241 _sock_locks("k-sk_lock-")
242};
243static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
244 _sock_locks("k-slock-")
245};
246static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
247 _sock_locks("k-clock-")
248};
249static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
250 _sock_locks("rlock-")
251};
252static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
253 _sock_locks("wlock-")
254};
255static const char *const af_family_elock_key_strings[AF_MAX+1] = {
256 _sock_locks("elock-")
257};
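
/*
 * For reference: with the expansion above, af_family_key_strings[AF_INET]
 * is "sk_lock-AF_INET", af_family_slock_key_strings[AF_INET] is
 * "slock-AF_INET", and the kernel-socket variants use the "k-" prefixed
 * forms such as "k-sk_lock-AF_INET".
 */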
258
259/*
260 * sk_callback_lock and sk queues locking rules are per-address-family,
261 * so split the lock classes by using a per-AF key:
262 */
263static struct lock_class_key af_callback_keys[AF_MAX];
264static struct lock_class_key af_rlock_keys[AF_MAX];
265static struct lock_class_key af_wlock_keys[AF_MAX];
266static struct lock_class_key af_elock_keys[AF_MAX];
267static struct lock_class_key af_kern_callback_keys[AF_MAX];
268
269/* Run time adjustable parameters. */
270__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
271EXPORT_SYMBOL(sysctl_wmem_max);
272__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
273EXPORT_SYMBOL(sysctl_rmem_max);
274__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
275__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
276
277/* Maximal space eaten by iovec or ancillary data plus some space */
278int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
279EXPORT_SYMBOL(sysctl_optmem_max);
280
281int sysctl_tstamp_allow_data __read_mostly = 1;
282
283DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
284EXPORT_SYMBOL_GPL(memalloc_socks_key);
285
286/**
287 * sk_set_memalloc - sets %SOCK_MEMALLOC
288 * @sk: socket to set it on
289 *
290 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
291 * It's the responsibility of the admin to adjust min_free_kbytes
292 * to meet the requirements.
293 */
294void sk_set_memalloc(struct sock *sk)
295{
296 sock_set_flag(sk, SOCK_MEMALLOC);
297 sk->sk_allocation |= __GFP_MEMALLOC;
298 static_branch_inc(&memalloc_socks_key);
299}
300EXPORT_SYMBOL_GPL(sk_set_memalloc);
301
302void sk_clear_memalloc(struct sock *sk)
303{
304 sock_reset_flag(sk, SOCK_MEMALLOC);
305 sk->sk_allocation &= ~__GFP_MEMALLOC;
306 static_branch_dec(&memalloc_socks_key);
307
308 /*
309 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
310 * progress of swapping. SOCK_MEMALLOC may be cleared while
311 * it has rmem allocations due to the last swapfile being deactivated
312 * but there is a risk that the socket is unusable due to exceeding
313 * the rmem limits. Reclaim the reserves and obey rmem limits again.
314 */
315 sk_mem_reclaim(sk);
316}
317EXPORT_SYMBOL_GPL(sk_clear_memalloc);
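
/*
 * Example (illustrative sketch, not part of this file): a block or
 * filesystem transport that may sit under swap I/O would flag its kernel
 * socket so it can dip into the emergency reserves, and clear the flag
 * again once swap no longer depends on it. "transport_sk" is a
 * hypothetical socket owned by such a subsystem.
 */
static void example_swap_transport_memalloc(struct sock *transport_sk, bool on)
{
	if (on)
		sk_set_memalloc(transport_sk);
	else
		sk_clear_memalloc(transport_sk);
}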
318
319int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
320{
321 int ret;
322 unsigned int noreclaim_flag;
323
324 /* these should have been dropped before queueing */
325 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
326
327 noreclaim_flag = memalloc_noreclaim_save();
328 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
329 tcp_v6_do_rcv,
330 tcp_v4_do_rcv,
331 sk, skb);
332 memalloc_noreclaim_restore(noreclaim_flag);
333
334 return ret;
335}
336EXPORT_SYMBOL(__sk_backlog_rcv);
337
338void sk_error_report(struct sock *sk)
339{
340 sk->sk_error_report(sk);
341
342 switch (sk->sk_family) {
343 case AF_INET:
344 fallthrough;
345 case AF_INET6:
346 trace_inet_sk_error_report(sk);
347 break;
348 default:
349 break;
350 }
351}
352EXPORT_SYMBOL(sk_error_report);
353
354int sock_get_timeout(long timeo, void *optval, bool old_timeval)
355{
356 struct __kernel_sock_timeval tv;
357
358 if (timeo == MAX_SCHEDULE_TIMEOUT) {
359 tv.tv_sec = 0;
360 tv.tv_usec = 0;
361 } else {
362 tv.tv_sec = timeo / HZ;
363 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
364 }
365
366 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
367 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
368 *(struct old_timeval32 *)optval = tv32;
369 return sizeof(tv32);
370 }
371
372 if (old_timeval) {
373 struct __kernel_old_timeval old_tv;
374 old_tv.tv_sec = tv.tv_sec;
375 old_tv.tv_usec = tv.tv_usec;
376 *(struct __kernel_old_timeval *)optval = old_tv;
377 return sizeof(old_tv);
378 }
379
380 *(struct __kernel_sock_timeval *)optval = tv;
381 return sizeof(tv);
382}
383EXPORT_SYMBOL(sock_get_timeout);
384
385int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
386 sockptr_t optval, int optlen, bool old_timeval)
387{
388 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
389 struct old_timeval32 tv32;
390
391 if (optlen < sizeof(tv32))
392 return -EINVAL;
393
394 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
395 return -EFAULT;
396 tv->tv_sec = tv32.tv_sec;
397 tv->tv_usec = tv32.tv_usec;
398 } else if (old_timeval) {
399 struct __kernel_old_timeval old_tv;
400
401 if (optlen < sizeof(old_tv))
402 return -EINVAL;
403 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
404 return -EFAULT;
405 tv->tv_sec = old_tv.tv_sec;
406 tv->tv_usec = old_tv.tv_usec;
407 } else {
408 if (optlen < sizeof(*tv))
409 return -EINVAL;
410 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
411 return -EFAULT;
412 }
413
414 return 0;
415}
416EXPORT_SYMBOL(sock_copy_user_timeval);
417
418static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
419 bool old_timeval)
420{
421 struct __kernel_sock_timeval tv;
422 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
423
424 if (err)
425 return err;
426
427 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
428 return -EDOM;
429
430 if (tv.tv_sec < 0) {
431 static int warned __read_mostly;
432
433 *timeo_p = 0;
434 if (warned < 10 && net_ratelimit()) {
435 warned++;
436 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
437 __func__, current->comm, task_pid_nr(current));
438 }
439 return 0;
440 }
441 *timeo_p = MAX_SCHEDULE_TIMEOUT;
442 if (tv.tv_sec == 0 && tv.tv_usec == 0)
443 return 0;
444 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
445 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
446 return 0;
447}
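
/*
 * Worked example (assuming HZ == 1000): a userspace SO_RCVTIMEO_NEW value
 * of { .tv_sec = 2, .tv_usec = 500000 } is converted above to
 * *timeo_p == 2 * HZ + DIV_ROUND_UP(500000, 1000) == 2500 jiffies, and
 * sock_get_timeout() turns those 2500 jiffies back into 2.5 seconds when
 * the timeout is read with getsockopt().
 */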
448
449static bool sock_needs_netstamp(const struct sock *sk)
450{
451 switch (sk->sk_family) {
452 case AF_UNSPEC:
453 case AF_UNIX:
454 return false;
455 default:
456 return true;
457 }
458}
459
460static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
461{
462 if (sk->sk_flags & flags) {
463 sk->sk_flags &= ~flags;
464 if (sock_needs_netstamp(sk) &&
465 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
466 net_disable_timestamp();
467 }
468}
469
470
471int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
472{
473 unsigned long flags;
474 struct sk_buff_head *list = &sk->sk_receive_queue;
475
476 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
477 atomic_inc(&sk->sk_drops);
478 trace_sock_rcvqueue_full(sk, skb);
479 return -ENOMEM;
480 }
481
482 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
483 atomic_inc(&sk->sk_drops);
484 return -ENOBUFS;
485 }
486
487 skb->dev = NULL;
488 skb_set_owner_r(skb, sk);
489
490 /* we escape from the RCU-protected region, make sure we don't leak
491 * a non-refcounted dst
492 */
493 skb_dst_force(skb);
494
495 spin_lock_irqsave(&list->lock, flags);
496 sock_skb_set_dropcount(sk, skb);
497 __skb_queue_tail(list, skb);
498 spin_unlock_irqrestore(&list->lock, flags);
499
500 if (!sock_flag(sk, SOCK_DEAD))
501 sk->sk_data_ready(sk);
502 return 0;
503}
504EXPORT_SYMBOL(__sock_queue_rcv_skb);
505
506int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
507{
508 int err;
509
510 err = sk_filter(sk, skb);
511 if (err)
512 return err;
513
514 return __sock_queue_rcv_skb(sk, skb);
515}
516EXPORT_SYMBOL(sock_queue_rcv_skb);
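
/*
 * Example (illustrative sketch, not part of this file): a simple datagram
 * protocol's input routine hands packets to the owning socket with
 * sock_queue_rcv_skb() and drops them when the filter or the receive
 * queue rejects them. "example_lookup_sock()" is a hypothetical
 * per-protocol socket lookup helper.
 */
static int example_proto_rcv(struct sk_buff *skb)
{
	struct sock *sk = example_lookup_sock(skb);

	if (!sk || sock_queue_rcv_skb(sk, skb) < 0) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}
	return NET_RX_SUCCESS;
}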
517
518int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
519 const int nested, unsigned int trim_cap, bool refcounted)
520{
521 int rc = NET_RX_SUCCESS;
522
523 if (sk_filter_trim_cap(sk, skb, trim_cap))
524 goto discard_and_relse;
525
526 skb->dev = NULL;
527
528 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
529 atomic_inc(&sk->sk_drops);
530 goto discard_and_relse;
531 }
532 if (nested)
533 bh_lock_sock_nested(sk);
534 else
535 bh_lock_sock(sk);
536 if (!sock_owned_by_user(sk)) {
537 /*
538 * trylock + unlock semantics:
539 */
540 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
541
542 rc = sk_backlog_rcv(sk, skb);
543
544 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
545 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
546 bh_unlock_sock(sk);
547 atomic_inc(&sk->sk_drops);
548 goto discard_and_relse;
549 }
550
551 bh_unlock_sock(sk);
552out:
553 if (refcounted)
554 sock_put(sk);
555 return rc;
556discard_and_relse:
557 kfree_skb(skb);
558 goto out;
559}
560EXPORT_SYMBOL(__sk_receive_skb);
561
562INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
563 u32));
564INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
565 u32));
566struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
567{
568 struct dst_entry *dst = __sk_dst_get(sk);
569
570 if (dst && dst->obsolete &&
571 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
572 dst, cookie) == NULL) {
573 sk_tx_queue_clear(sk);
574 sk->sk_dst_pending_confirm = 0;
575 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
576 dst_release(dst);
577 return NULL;
578 }
579
580 return dst;
581}
582EXPORT_SYMBOL(__sk_dst_check);
583
584struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
585{
586 struct dst_entry *dst = sk_dst_get(sk);
587
588 if (dst && dst->obsolete &&
589 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
590 dst, cookie) == NULL) {
591 sk_dst_reset(sk);
592 dst_release(dst);
593 return NULL;
594 }
595
596 return dst;
597}
598EXPORT_SYMBOL(sk_dst_check);
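
/*
 * Example (illustrative sketch, not part of this file): an output path
 * revalidates the socket's cached route before using it and falls back to
 * a fresh lookup when the cached dst has become obsolete. The cookie
 * value 0 and "example_route_output()" are assumptions for the sketch.
 */
static struct dst_entry *example_get_route(struct sock *sk)
{
	struct dst_entry *dst = sk_dst_check(sk, 0);

	if (!dst)
		dst = example_route_output(sk);	/* hypothetical re-lookup */

	return dst;
}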
599
600static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
601{
602 int ret = -ENOPROTOOPT;
603#ifdef CONFIG_NETDEVICES
604 struct net *net = sock_net(sk);
605
606 /* Sorry... */
607 ret = -EPERM;
608 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
609 goto out;
610
611 ret = -EINVAL;
612 if (ifindex < 0)
613 goto out;
614
615 sk->sk_bound_dev_if = ifindex;
616 if (sk->sk_prot->rehash)
617 sk->sk_prot->rehash(sk);
618 sk_dst_reset(sk);
619
620 ret = 0;
621
622out:
623#endif
624
625 return ret;
626}
627
628int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
629{
630 int ret;
631
632 if (lock_sk)
633 lock_sock(sk);
634 ret = sock_bindtoindex_locked(sk, ifindex);
635 if (lock_sk)
636 release_sock(sk);
637
638 return ret;
639}
640EXPORT_SYMBOL(sock_bindtoindex);
641
642static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
643{
644 int ret = -ENOPROTOOPT;
645#ifdef CONFIG_NETDEVICES
646 struct net *net = sock_net(sk);
647 char devname[IFNAMSIZ];
648 int index;
649
650 ret = -EINVAL;
651 if (optlen < 0)
652 goto out;
653
654 /* Bind this socket to a particular device like "eth0",
655 * as specified in the passed interface name. If the
656 * name is "" or the option length is zero the socket
657 * is not bound.
658 */
659 if (optlen > IFNAMSIZ - 1)
660 optlen = IFNAMSIZ - 1;
661 memset(devname, 0, sizeof(devname));
662
663 ret = -EFAULT;
664 if (copy_from_sockptr(devname, optval, optlen))
665 goto out;
666
667 index = 0;
668 if (devname[0] != '\0') {
669 struct net_device *dev;
670
671 rcu_read_lock();
672 dev = dev_get_by_name_rcu(net, devname);
673 if (dev)
674 index = dev->ifindex;
675 rcu_read_unlock();
676 ret = -ENODEV;
677 if (!dev)
678 goto out;
679 }
680
681 return sock_bindtoindex(sk, index, true);
682out:
683#endif
684
685 return ret;
686}
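
/*
 * Example (illustrative, userspace side): binding a socket to a device by
 * name lands in sock_setbindtodevice() above, while binding by interface
 * index goes through SO_BINDTOIFINDEX. The name "eth0" and the index 4
 * are assumptions for the sketch.
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *
 *	int ifindex = 4;
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTOIFINDEX, &ifindex, sizeof(ifindex));
 */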
687
688static int sock_getbindtodevice(struct sock *sk, char __user *optval,
689 int __user *optlen, int len)
690{
691 int ret = -ENOPROTOOPT;
692#ifdef CONFIG_NETDEVICES
693 struct net *net = sock_net(sk);
694 char devname[IFNAMSIZ];
695
696 if (sk->sk_bound_dev_if == 0) {
697 len = 0;
698 goto zero;
699 }
700
701 ret = -EINVAL;
702 if (len < IFNAMSIZ)
703 goto out;
704
705 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
706 if (ret)
707 goto out;
708
709 len = strlen(devname) + 1;
710
711 ret = -EFAULT;
712 if (copy_to_user(optval, devname, len))
713 goto out;
714
715zero:
716 ret = -EFAULT;
717 if (put_user(len, optlen))
718 goto out;
719
720 ret = 0;
721
722out:
723#endif
724
725 return ret;
726}
727
728bool sk_mc_loop(struct sock *sk)
729{
730 if (dev_recursion_level())
731 return false;
732 if (!sk)
733 return true;
734 switch (sk->sk_family) {
735 case AF_INET:
736 return inet_sk(sk)->mc_loop;
737#if IS_ENABLED(CONFIG_IPV6)
738 case AF_INET6:
739 return inet6_sk(sk)->mc_loop;
740#endif
741 }
742 WARN_ON_ONCE(1);
743 return true;
744}
745EXPORT_SYMBOL(sk_mc_loop);
746
747void sock_set_reuseaddr(struct sock *sk)
748{
749 lock_sock(sk);
750 sk->sk_reuse = SK_CAN_REUSE;
751 release_sock(sk);
752}
753EXPORT_SYMBOL(sock_set_reuseaddr);
754
755void sock_set_reuseport(struct sock *sk)
756{
757 lock_sock(sk);
758 sk->sk_reuseport = true;
759 release_sock(sk);
760}
761EXPORT_SYMBOL(sock_set_reuseport);
762
763void sock_no_linger(struct sock *sk)
764{
765 lock_sock(sk);
766 sk->sk_lingertime = 0;
767 sock_set_flag(sk, SOCK_LINGER);
768 release_sock(sk);
769}
770EXPORT_SYMBOL(sock_no_linger);
771
772void sock_set_priority(struct sock *sk, u32 priority)
773{
774 lock_sock(sk);
775 sk->sk_priority = priority;
776 release_sock(sk);
777}
778EXPORT_SYMBOL(sock_set_priority);
779
780void sock_set_sndtimeo(struct sock *sk, s64 secs)
781{
782 lock_sock(sk);
783 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
784 sk->sk_sndtimeo = secs * HZ;
785 else
786 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
787 release_sock(sk);
788}
789EXPORT_SYMBOL(sock_set_sndtimeo);
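
/*
 * Example (illustrative sketch, not part of this file): in-kernel socket
 * users call these helpers instead of going through sock_setsockopt()
 * with sockptr wrappers. "srv" is assumed to be a kernel TCP socket
 * created with sock_create_kern().
 */
static void example_tune_kernel_listener(struct socket *srv)
{
	sock_set_reuseaddr(srv->sk);
	sock_no_linger(srv->sk);
	sock_set_sndtimeo(srv->sk, 5);	/* five second send timeout */
}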
790
791static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
792{
793 if (val) {
794 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
795 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
796 sock_set_flag(sk, SOCK_RCVTSTAMP);
797 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
798 } else {
799 sock_reset_flag(sk, SOCK_RCVTSTAMP);
800 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
801 }
802}
803
804void sock_enable_timestamps(struct sock *sk)
805{
806 lock_sock(sk);
807 __sock_set_timestamps(sk, true, false, true);
808 release_sock(sk);
809}
810EXPORT_SYMBOL(sock_enable_timestamps);
811
812void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
813{
814 switch (optname) {
815 case SO_TIMESTAMP_OLD:
816 __sock_set_timestamps(sk, valbool, false, false);
817 break;
818 case SO_TIMESTAMP_NEW:
819 __sock_set_timestamps(sk, valbool, true, false);
820 break;
821 case SO_TIMESTAMPNS_OLD:
822 __sock_set_timestamps(sk, valbool, false, true);
823 break;
824 case SO_TIMESTAMPNS_NEW:
825 __sock_set_timestamps(sk, valbool, true, true);
826 break;
827 }
828}
829
830static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
831{
832 struct net *net = sock_net(sk);
833 struct net_device *dev = NULL;
834 bool match = false;
835 int *vclock_index;
836 int i, num;
837
838 if (sk->sk_bound_dev_if)
839 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
840
841 if (!dev) {
842 pr_err("%s: sock not bound to a device\n", __func__);
843 return -EOPNOTSUPP;
844 }
845
846 num = ethtool_get_phc_vclocks(dev, &vclock_index);
847 dev_put(dev);
848
849 for (i = 0; i < num; i++) {
850 if (*(vclock_index + i) == phc_index) {
851 match = true;
852 break;
853 }
854 }
855
856 if (num > 0)
857 kfree(vclock_index);
858
859 if (!match)
860 return -EINVAL;
861
862 sk->sk_bind_phc = phc_index;
863
864 return 0;
865}
866
867int sock_set_timestamping(struct sock *sk, int optname,
868 struct so_timestamping timestamping)
869{
870 int val = timestamping.flags;
871 int ret;
872
873 if (val & ~SOF_TIMESTAMPING_MASK)
874 return -EINVAL;
875
876 if (val & SOF_TIMESTAMPING_OPT_ID &&
877 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
878 if (sk_is_tcp(sk)) {
879 if ((1 << sk->sk_state) &
880 (TCPF_CLOSE | TCPF_LISTEN))
881 return -EINVAL;
882 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
883 } else {
884 atomic_set(&sk->sk_tskey, 0);
885 }
886 }
887
888 if (val & SOF_TIMESTAMPING_OPT_STATS &&
889 !(val & SOF_TIMESTAMPING_OPT_TSONLY))
890 return -EINVAL;
891
892 if (val & SOF_TIMESTAMPING_BIND_PHC) {
893 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
894 if (ret)
895 return ret;
896 }
897
898 sk->sk_tsflags = val;
899 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
900
901 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
902 sock_enable_timestamp(sk,
903 SOCK_TIMESTAMPING_RX_SOFTWARE);
904 else
905 sock_disable_timestamp(sk,
906 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
907 return 0;
908}
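
/*
 * Example (illustrative, userspace side): requesting software RX/TX
 * timestamps uses the flags-only form handled above; passing a full
 * struct so_timestamping additionally lets the caller bind a PHC vclock
 * through SOF_TIMESTAMPING_BIND_PHC.
 *
 *	int flags = SOF_TIMESTAMPING_RX_SOFTWARE |
 *		    SOF_TIMESTAMPING_TX_SOFTWARE |
 *		    SOF_TIMESTAMPING_SOFTWARE;
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
 */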
909
910void sock_set_keepalive(struct sock *sk)
911{
912 lock_sock(sk);
913 if (sk->sk_prot->keepalive)
914 sk->sk_prot->keepalive(sk, true);
915 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
916 release_sock(sk);
917}
918EXPORT_SYMBOL(sock_set_keepalive);
919
920static void __sock_set_rcvbuf(struct sock *sk, int val)
921{
922 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
923 * as a negative value.
924 */
925 val = min_t(int, val, INT_MAX / 2);
926 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
927
928 /* We double it on the way in to account for "struct sk_buff" etc.
929 * overhead. Applications assume that the SO_RCVBUF setting they make
930 * will allow that much actual data to be received on that socket.
931 *
932 * Applications are unaware that "struct sk_buff" and other overheads
933 * allocate from the receive buffer during socket buffer allocation.
934 *
935 * And after considering the possible alternatives, returning the value
936 * we actually used in getsockopt is the most desirable behavior.
937 */
938 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
939}
940
941void sock_set_rcvbuf(struct sock *sk, int val)
942{
943 lock_sock(sk);
944 __sock_set_rcvbuf(sk, val);
945 release_sock(sk);
946}
947EXPORT_SYMBOL(sock_set_rcvbuf);
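
/*
 * Worked example: after setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &(int){65536},
 * sizeof(int)), and assuming the value is within sysctl_rmem_max, sk_rcvbuf
 * ends up as 131072, and that doubled value is what a later
 * getsockopt(SO_RCVBUF) reports back to the application.
 */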
948
949static void __sock_set_mark(struct sock *sk, u32 val)
950{
951 if (val != sk->sk_mark) {
952 sk->sk_mark = val;
953 sk_dst_reset(sk);
954 }
955}
956
957void sock_set_mark(struct sock *sk, u32 val)
958{
959 lock_sock(sk);
960 __sock_set_mark(sk, val);
961 release_sock(sk);
962}
963EXPORT_SYMBOL(sock_set_mark);
964
965static void sock_release_reserved_memory(struct sock *sk, int bytes)
966{
967 /* Round down bytes to multiple of pages */
968 bytes &= ~(SK_MEM_QUANTUM - 1);
969
970 WARN_ON(bytes > sk->sk_reserved_mem);
971 sk->sk_reserved_mem -= bytes;
972 sk_mem_reclaim(sk);
973}
974
975static int sock_reserve_memory(struct sock *sk, int bytes)
976{
977 long allocated;
978 bool charged;
979 int pages;
980
981 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
982 return -EOPNOTSUPP;
983
984 if (!bytes)
985 return 0;
986
987 pages = sk_mem_pages(bytes);
988
989 /* pre-charge to memcg */
990 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
991 GFP_KERNEL | __GFP_RETRY_MAYFAIL);
992 if (!charged)
993 return -ENOMEM;
994
995 /* pre-charge to forward_alloc */
996 allocated = sk_memory_allocated_add(sk, pages);
997 /* If the system goes into memory pressure with this
998 * precharge, give up and return an error.
999 */
1000 if (allocated > sk_prot_mem_limits(sk, 1)) {
1001 sk_memory_allocated_sub(sk, pages);
1002 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1003 return -ENOMEM;
1004 }
1005 sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
1006
1007 sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
1008
1009 return 0;
1010}
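
/*
 * Example (illustrative, userspace side): pre-reserving roughly 1 MB of
 * socket memory for a memcg-accounted socket; a later call with a smaller
 * value releases the difference through sock_release_reserved_memory().
 *
 *	int bytes = 1 << 20;
 *	setsockopt(fd, SOL_SOCKET, SO_RESERVE_MEM, &bytes, sizeof(bytes));
 */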
1011
1012/*
1013 * This is meant for all protocols to use and covers goings on
1014 * at the socket level. Everything here is generic.
1015 */
1016
1017int sock_setsockopt(struct socket *sock, int level, int optname,
1018 sockptr_t optval, unsigned int optlen)
1019{
1020 struct so_timestamping timestamping;
1021 struct sock_txtime sk_txtime;
1022 struct sock *sk = sock->sk;
1023 int val;
1024 int valbool;
1025 struct linger ling;
1026 int ret = 0;
1027
1028 /*
1029 * Options without arguments
1030 */
1031
1032 if (optname == SO_BINDTODEVICE)
1033 return sock_setbindtodevice(sk, optval, optlen);
1034
1035 if (optlen < sizeof(int))
1036 return -EINVAL;
1037
1038 if (copy_from_sockptr(&val, optval, sizeof(val)))
1039 return -EFAULT;
1040
1041 valbool = val ? 1 : 0;
1042
1043 lock_sock(sk);
1044
1045 switch (optname) {
1046 case SO_DEBUG:
1047 if (val && !capable(CAP_NET_ADMIN))
1048 ret = -EACCES;
1049 else
1050 sock_valbool_flag(sk, SOCK_DBG, valbool);
1051 break;
1052 case SO_REUSEADDR:
1053 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1054 break;
1055 case SO_REUSEPORT:
1056 sk->sk_reuseport = valbool;
1057 break;
1058 case SO_TYPE:
1059 case SO_PROTOCOL:
1060 case SO_DOMAIN:
1061 case SO_ERROR:
1062 ret = -ENOPROTOOPT;
1063 break;
1064 case SO_DONTROUTE:
1065 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1066 sk_dst_reset(sk);
1067 break;
1068 case SO_BROADCAST:
1069 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1070 break;
1071 case SO_SNDBUF:
1072 /* Don't return an error on this; BSD doesn't, and if you
1073 * think about it, this is right. Otherwise apps have to
1074 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1075 * are treated in BSD as hints.
1076 */
1077 val = min_t(u32, val, sysctl_wmem_max);
1078set_sndbuf:
1079 /* Ensure val * 2 fits into an int, to prevent max_t()
1080 * from treating it as a negative value.
1081 */
1082 val = min_t(int, val, INT_MAX / 2);
1083 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1084 WRITE_ONCE(sk->sk_sndbuf,
1085 max_t(int, val * 2, SOCK_MIN_SNDBUF));
1086 /* Wake up sending tasks if we upped the value. */
1087 sk->sk_write_space(sk);
1088 break;
1089
1090 case SO_SNDBUFFORCE:
1091 if (!capable(CAP_NET_ADMIN)) {
1092 ret = -EPERM;
1093 break;
1094 }
1095
1096 /* No negative values (to prevent underflow, as val will be
1097 * multiplied by 2).
1098 */
1099 if (val < 0)
1100 val = 0;
1101 goto set_sndbuf;
1102
1103 case SO_RCVBUF:
1104 /* Don't return an error on this; BSD doesn't, and if you
1105 * think about it, this is right. Otherwise apps have to
1106 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1107 * are treated in BSD as hints.
1108 */
1109 __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1110 break;
1111
1112 case SO_RCVBUFFORCE:
1113 if (!capable(CAP_NET_ADMIN)) {
1114 ret = -EPERM;
1115 break;
1116 }
1117
1118 /* No negative values (to prevent underflow, as val will be
1119 * multiplied by 2).
1120 */
1121 __sock_set_rcvbuf(sk, max(val, 0));
1122 break;
1123
1124 case SO_KEEPALIVE:
1125 if (sk->sk_prot->keepalive)
1126 sk->sk_prot->keepalive(sk, valbool);
1127 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1128 break;
1129
1130 case SO_OOBINLINE:
1131 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1132 break;
1133
1134 case SO_NO_CHECK:
1135 sk->sk_no_check_tx = valbool;
1136 break;
1137
1138 case SO_PRIORITY:
1139 if ((val >= 0 && val <= 6) ||
1140 ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1141 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1142 sk->sk_priority = val;
1143 else
1144 ret = -EPERM;
1145 break;
1146
1147 case SO_LINGER:
1148 if (optlen < sizeof(ling)) {
1149 ret = -EINVAL; /* 1003.1g */
1150 break;
1151 }
1152 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1153 ret = -EFAULT;
1154 break;
1155 }
1156 if (!ling.l_onoff)
1157 sock_reset_flag(sk, SOCK_LINGER);
1158 else {
1159#if (BITS_PER_LONG == 32)
1160 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1161 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1162 else
1163#endif
1164 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1165 sock_set_flag(sk, SOCK_LINGER);
1166 }
1167 break;
1168
1169 case SO_BSDCOMPAT:
1170 break;
1171
1172 case SO_PASSCRED:
1173 if (valbool)
1174 set_bit(SOCK_PASSCRED, &sock->flags);
1175 else
1176 clear_bit(SOCK_PASSCRED, &sock->flags);
1177 break;
1178
1179 case SO_TIMESTAMP_OLD:
1180 case SO_TIMESTAMP_NEW:
1181 case SO_TIMESTAMPNS_OLD:
1182 case SO_TIMESTAMPNS_NEW:
1183 sock_set_timestamp(sk, optname, valbool);
1184 break;
1185
1186 case SO_TIMESTAMPING_NEW:
1187 case SO_TIMESTAMPING_OLD:
1188 if (optlen == sizeof(timestamping)) {
1189 if (copy_from_sockptr(&timestamping, optval,
1190 sizeof(timestamping))) {
1191 ret = -EFAULT;
1192 break;
1193 }
1194 } else {
1195 memset(&timestamping, 0, sizeof(timestamping));
1196 timestamping.flags = val;
1197 }
1198 ret = sock_set_timestamping(sk, optname, timestamping);
1199 break;
1200
1201 case SO_RCVLOWAT:
1202 if (val < 0)
1203 val = INT_MAX;
1204 if (sock->ops->set_rcvlowat)
1205 ret = sock->ops->set_rcvlowat(sk, val);
1206 else
1207 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1208 break;
1209
1210 case SO_RCVTIMEO_OLD:
1211 case SO_RCVTIMEO_NEW:
1212 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1213 optlen, optname == SO_RCVTIMEO_OLD);
1214 break;
1215
1216 case SO_SNDTIMEO_OLD:
1217 case SO_SNDTIMEO_NEW:
1218 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1219 optlen, optname == SO_SNDTIMEO_OLD);
1220 break;
1221
1222 case SO_ATTACH_FILTER: {
1223 struct sock_fprog fprog;
1224
1225 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1226 if (!ret)
1227 ret = sk_attach_filter(&fprog, sk);
1228 break;
1229 }
1230 case SO_ATTACH_BPF:
1231 ret = -EINVAL;
1232 if (optlen == sizeof(u32)) {
1233 u32 ufd;
1234
1235 ret = -EFAULT;
1236 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1237 break;
1238
1239 ret = sk_attach_bpf(ufd, sk);
1240 }
1241 break;
1242
1243 case SO_ATTACH_REUSEPORT_CBPF: {
1244 struct sock_fprog fprog;
1245
1246 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1247 if (!ret)
1248 ret = sk_reuseport_attach_filter(&fprog, sk);
1249 break;
1250 }
1251 case SO_ATTACH_REUSEPORT_EBPF:
1252 ret = -EINVAL;
1253 if (optlen == sizeof(u32)) {
1254 u32 ufd;
1255
1256 ret = -EFAULT;
1257 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1258 break;
1259
1260 ret = sk_reuseport_attach_bpf(ufd, sk);
1261 }
1262 break;
1263
1264 case SO_DETACH_REUSEPORT_BPF:
1265 ret = reuseport_detach_prog(sk);
1266 break;
1267
1268 case SO_DETACH_FILTER:
1269 ret = sk_detach_filter(sk);
1270 break;
1271
1272 case SO_LOCK_FILTER:
1273 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1274 ret = -EPERM;
1275 else
1276 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1277 break;
1278
1279 case SO_PASSSEC:
1280 if (valbool)
1281 set_bit(SOCK_PASSSEC, &sock->flags);
1282 else
1283 clear_bit(SOCK_PASSSEC, &sock->flags);
1284 break;
1285 case SO_MARK:
1286 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1287 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1288 ret = -EPERM;
1289 break;
1290 }
1291
1292 __sock_set_mark(sk, val);
1293 break;
1294
1295 case SO_RXQ_OVFL:
1296 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1297 break;
1298
1299 case SO_WIFI_STATUS:
1300 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1301 break;
1302
1303 case SO_PEEK_OFF:
1304 if (sock->ops->set_peek_off)
1305 ret = sock->ops->set_peek_off(sk, val);
1306 else
1307 ret = -EOPNOTSUPP;
1308 break;
1309
1310 case SO_NOFCS:
1311 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1312 break;
1313
1314 case SO_SELECT_ERR_QUEUE:
1315 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1316 break;
1317
1318#ifdef CONFIG_NET_RX_BUSY_POLL
1319 case SO_BUSY_POLL:
1320 /* allow unprivileged users to decrease the value */
1321 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1322 ret = -EPERM;
1323 else {
1324 if (val < 0)
1325 ret = -EINVAL;
1326 else
1327 WRITE_ONCE(sk->sk_ll_usec, val);
1328 }
1329 break;
1330 case SO_PREFER_BUSY_POLL:
1331 if (valbool && !capable(CAP_NET_ADMIN))
1332 ret = -EPERM;
1333 else
1334 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1335 break;
1336 case SO_BUSY_POLL_BUDGET:
1337 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1338 ret = -EPERM;
1339 } else {
1340 if (val < 0 || val > U16_MAX)
1341 ret = -EINVAL;
1342 else
1343 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1344 }
1345 break;
1346#endif
1347
1348 case SO_MAX_PACING_RATE:
1349 {
1350 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1351
1352 if (sizeof(ulval) != sizeof(val) &&
1353 optlen >= sizeof(ulval) &&
1354 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1355 ret = -EFAULT;
1356 break;
1357 }
1358 if (ulval != ~0UL)
1359 cmpxchg(&sk->sk_pacing_status,
1360 SK_PACING_NONE,
1361 SK_PACING_NEEDED);
1362 sk->sk_max_pacing_rate = ulval;
1363 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1364 break;
1365 }
1366 case SO_INCOMING_CPU:
1367 WRITE_ONCE(sk->sk_incoming_cpu, val);
1368 break;
1369
1370 case SO_CNX_ADVICE:
1371 if (val == 1)
1372 dst_negative_advice(sk);
1373 break;
1374
1375 case SO_ZEROCOPY:
1376 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1377 if (!(sk_is_tcp(sk) ||
1378 (sk->sk_type == SOCK_DGRAM &&
1379 sk->sk_protocol == IPPROTO_UDP)))
1380 ret = -ENOTSUPP;
1381 } else if (sk->sk_family != PF_RDS) {
1382 ret = -ENOTSUPP;
1383 }
1384 if (!ret) {
1385 if (val < 0 || val > 1)
1386 ret = -EINVAL;
1387 else
1388 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1389 }
1390 break;
1391
1392 case SO_TXTIME:
1393 if (optlen != sizeof(struct sock_txtime)) {
1394 ret = -EINVAL;
1395 break;
1396 } else if (copy_from_sockptr(&sk_txtime, optval,
1397 sizeof(struct sock_txtime))) {
1398 ret = -EFAULT;
1399 break;
1400 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1401 ret = -EINVAL;
1402 break;
1403 }
1404 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1405 * scheduler has enough safeguards.
1406 */
1407 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1408 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1409 ret = -EPERM;
1410 break;
1411 }
1412 sock_valbool_flag(sk, SOCK_TXTIME, true);
1413 sk->sk_clockid = sk_txtime.clockid;
1414 sk->sk_txtime_deadline_mode =
1415 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1416 sk->sk_txtime_report_errors =
1417 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1418 break;
1419
1420 case SO_BINDTOIFINDEX:
1421 ret = sock_bindtoindex_locked(sk, val);
1422 break;
1423
1424 case SO_BUF_LOCK:
1425 if (val & ~SOCK_BUF_LOCK_MASK) {
1426 ret = -EINVAL;
1427 break;
1428 }
1429 sk->sk_userlocks = val | (sk->sk_userlocks &
1430 ~SOCK_BUF_LOCK_MASK);
1431 break;
1432
1433 case SO_RESERVE_MEM:
1434 {
1435 int delta;
1436
1437 if (val < 0) {
1438 ret = -EINVAL;
1439 break;
1440 }
1441
1442 delta = val - sk->sk_reserved_mem;
1443 if (delta < 0)
1444 sock_release_reserved_memory(sk, -delta);
1445 else
1446 ret = sock_reserve_memory(sk, delta);
1447 break;
1448 }
1449
1450 default:
1451 ret = -ENOPROTOOPT;
1452 break;
1453 }
1454 release_sock(sk);
1455 return ret;
1456}
1457EXPORT_SYMBOL(sock_setsockopt);
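
/*
 * Example (illustrative, userspace side): the SO_LINGER branch above
 * expects a struct linger; enabling a five second linger-on-close looks
 * like this.
 *
 *	struct linger ling = { .l_onoff = 1, .l_linger = 5 };
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling));
 */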
1458
1459static const struct cred *sk_get_peer_cred(struct sock *sk)
1460{
1461 const struct cred *cred;
1462
1463 spin_lock(&sk->sk_peer_lock);
1464 cred = get_cred(sk->sk_peer_cred);
1465 spin_unlock(&sk->sk_peer_lock);
1466
1467 return cred;
1468}
1469
1470static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1471 struct ucred *ucred)
1472{
1473 ucred->pid = pid_vnr(pid);
1474 ucred->uid = ucred->gid = -1;
1475 if (cred) {
1476 struct user_namespace *current_ns = current_user_ns();
1477
1478 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1479 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1480 }
1481}
1482
1483static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1484{
1485 struct user_namespace *user_ns = current_user_ns();
1486 int i;
1487
1488 for (i = 0; i < src->ngroups; i++)
1489 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1490 return -EFAULT;
1491
1492 return 0;
1493}
1494
1495int sock_getsockopt(struct socket *sock, int level, int optname,
1496 char __user *optval, int __user *optlen)
1497{
1498 struct sock *sk = sock->sk;
1499
1500 union {
1501 int val;
1502 u64 val64;
1503 unsigned long ulval;
1504 struct linger ling;
1505 struct old_timeval32 tm32;
1506 struct __kernel_old_timeval tm;
1507 struct __kernel_sock_timeval stm;
1508 struct sock_txtime txtime;
1509 struct so_timestamping timestamping;
1510 } v;
1511
1512 int lv = sizeof(int);
1513 int len;
1514
1515 if (get_user(len, optlen))
1516 return -EFAULT;
1517 if (len < 0)
1518 return -EINVAL;
1519
1520 memset(&v, 0, sizeof(v));
1521
1522 switch (optname) {
1523 case SO_DEBUG:
1524 v.val = sock_flag(sk, SOCK_DBG);
1525 break;
1526
1527 case SO_DONTROUTE:
1528 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1529 break;
1530
1531 case SO_BROADCAST:
1532 v.val = sock_flag(sk, SOCK_BROADCAST);
1533 break;
1534
1535 case SO_SNDBUF:
1536 v.val = sk->sk_sndbuf;
1537 break;
1538
1539 case SO_RCVBUF:
1540 v.val = sk->sk_rcvbuf;
1541 break;
1542
1543 case SO_REUSEADDR:
1544 v.val = sk->sk_reuse;
1545 break;
1546
1547 case SO_REUSEPORT:
1548 v.val = sk->sk_reuseport;
1549 break;
1550
1551 case SO_KEEPALIVE:
1552 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1553 break;
1554
1555 case SO_TYPE:
1556 v.val = sk->sk_type;
1557 break;
1558
1559 case SO_PROTOCOL:
1560 v.val = sk->sk_protocol;
1561 break;
1562
1563 case SO_DOMAIN:
1564 v.val = sk->sk_family;
1565 break;
1566
1567 case SO_ERROR:
1568 v.val = -sock_error(sk);
1569 if (v.val == 0)
1570 v.val = xchg(&sk->sk_err_soft, 0);
1571 break;
1572
1573 case SO_OOBINLINE:
1574 v.val = sock_flag(sk, SOCK_URGINLINE);
1575 break;
1576
1577 case SO_NO_CHECK:
1578 v.val = sk->sk_no_check_tx;
1579 break;
1580
1581 case SO_PRIORITY:
1582 v.val = sk->sk_priority;
1583 break;
1584
1585 case SO_LINGER:
1586 lv = sizeof(v.ling);
1587 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1588 v.ling.l_linger = sk->sk_lingertime / HZ;
1589 break;
1590
1591 case SO_BSDCOMPAT:
1592 break;
1593
1594 case SO_TIMESTAMP_OLD:
1595 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1596 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1597 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1598 break;
1599
1600 case SO_TIMESTAMPNS_OLD:
1601 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1602 break;
1603
1604 case SO_TIMESTAMP_NEW:
1605 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1606 break;
1607
1608 case SO_TIMESTAMPNS_NEW:
1609 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1610 break;
1611
1612 case SO_TIMESTAMPING_OLD:
1613 lv = sizeof(v.timestamping);
1614 v.timestamping.flags = sk->sk_tsflags;
1615 v.timestamping.bind_phc = sk->sk_bind_phc;
1616 break;
1617
1618 case SO_RCVTIMEO_OLD:
1619 case SO_RCVTIMEO_NEW:
1620 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1621 break;
1622
1623 case SO_SNDTIMEO_OLD:
1624 case SO_SNDTIMEO_NEW:
1625 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1626 break;
1627
1628 case SO_RCVLOWAT:
1629 v.val = sk->sk_rcvlowat;
1630 break;
1631
1632 case SO_SNDLOWAT:
1633 v.val = 1;
1634 break;
1635
1636 case SO_PASSCRED:
1637 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1638 break;
1639
1640 case SO_PEERCRED:
1641 {
1642 struct ucred peercred;
1643 if (len > sizeof(peercred))
1644 len = sizeof(peercred);
1645
1646 spin_lock(&sk->sk_peer_lock);
1647 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1648 spin_unlock(&sk->sk_peer_lock);
1649
1650 if (copy_to_user(optval, &peercred, len))
1651 return -EFAULT;
1652 goto lenout;
1653 }
1654
1655 case SO_PEERGROUPS:
1656 {
1657 const struct cred *cred;
1658 int ret, n;
1659
1660 cred = sk_get_peer_cred(sk);
1661 if (!cred)
1662 return -ENODATA;
1663
1664 n = cred->group_info->ngroups;
1665 if (len < n * sizeof(gid_t)) {
1666 len = n * sizeof(gid_t);
1667 put_cred(cred);
1668 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1669 }
1670 len = n * sizeof(gid_t);
1671
1672 ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1673 put_cred(cred);
1674 if (ret)
1675 return ret;
1676 goto lenout;
1677 }
1678
1679 case SO_PEERNAME:
1680 {
1681 char address[128];
1682
1683 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1684 if (lv < 0)
1685 return -ENOTCONN;
1686 if (lv < len)
1687 return -EINVAL;
1688 if (copy_to_user(optval, address, len))
1689 return -EFAULT;
1690 goto lenout;
1691 }
1692
1693 /* Dubious BSD thing... Probably nobody even uses it, but
1694 * the UNIX standard wants it for whatever reason... -DaveM
1695 */
1696 case SO_ACCEPTCONN:
1697 v.val = sk->sk_state == TCP_LISTEN;
1698 break;
1699
1700 case SO_PASSSEC:
1701 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1702 break;
1703
1704 case SO_PEERSEC:
1705 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1706
1707 case SO_MARK:
1708 v.val = sk->sk_mark;
1709 break;
1710
1711 case SO_RXQ_OVFL:
1712 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1713 break;
1714
1715 case SO_WIFI_STATUS:
1716 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1717 break;
1718
1719 case SO_PEEK_OFF:
1720 if (!sock->ops->set_peek_off)
1721 return -EOPNOTSUPP;
1722
1723 v.val = sk->sk_peek_off;
1724 break;
1725 case SO_NOFCS:
1726 v.val = sock_flag(sk, SOCK_NOFCS);
1727 break;
1728
1729 case SO_BINDTODEVICE:
1730 return sock_getbindtodevice(sk, optval, optlen, len);
1731
1732 case SO_GET_FILTER:
1733 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1734 if (len < 0)
1735 return len;
1736
1737 goto lenout;
1738
1739 case SO_LOCK_FILTER:
1740 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1741 break;
1742
1743 case SO_BPF_EXTENSIONS:
1744 v.val = bpf_tell_extensions();
1745 break;
1746
1747 case SO_SELECT_ERR_QUEUE:
1748 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1749 break;
1750
1751#ifdef CONFIG_NET_RX_BUSY_POLL
1752 case SO_BUSY_POLL:
1753 v.val = sk->sk_ll_usec;
1754 break;
1755 case SO_PREFER_BUSY_POLL:
1756 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1757 break;
1758#endif
1759
1760 case SO_MAX_PACING_RATE:
1761 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1762 lv = sizeof(v.ulval);
1763 v.ulval = sk->sk_max_pacing_rate;
1764 } else {
1765 /* 32bit version */
1766 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1767 }
1768 break;
1769
1770 case SO_INCOMING_CPU:
1771 v.val = READ_ONCE(sk->sk_incoming_cpu);
1772 break;
1773
1774 case SO_MEMINFO:
1775 {
1776 u32 meminfo[SK_MEMINFO_VARS];
1777
1778 sk_get_meminfo(sk, meminfo);
1779
1780 len = min_t(unsigned int, len, sizeof(meminfo));
1781 if (copy_to_user(optval, &meminfo, len))
1782 return -EFAULT;
1783
1784 goto lenout;
1785 }
1786
1787#ifdef CONFIG_NET_RX_BUSY_POLL
1788 case SO_INCOMING_NAPI_ID:
1789 v.val = READ_ONCE(sk->sk_napi_id);
1790
1791 /* aggregate non-NAPI IDs down to 0 */
1792 if (v.val < MIN_NAPI_ID)
1793 v.val = 0;
1794
1795 break;
1796#endif
1797
1798 case SO_COOKIE:
1799 lv = sizeof(u64);
1800 if (len < lv)
1801 return -EINVAL;
1802 v.val64 = sock_gen_cookie(sk);
1803 break;
1804
1805 case SO_ZEROCOPY:
1806 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1807 break;
1808
1809 case SO_TXTIME:
1810 lv = sizeof(v.txtime);
1811 v.txtime.clockid = sk->sk_clockid;
1812 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1813 SOF_TXTIME_DEADLINE_MODE : 0;
1814 v.txtime.flags |= sk->sk_txtime_report_errors ?
1815 SOF_TXTIME_REPORT_ERRORS : 0;
1816 break;
1817
1818 case SO_BINDTOIFINDEX:
1819 v.val = sk->sk_bound_dev_if;
1820 break;
1821
1822 case SO_NETNS_COOKIE:
1823 lv = sizeof(u64);
1824 if (len != lv)
1825 return -EINVAL;
1826 v.val64 = sock_net(sk)->net_cookie;
1827 break;
1828
1829 case SO_BUF_LOCK:
1830 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1831 break;
1832
1833 case SO_RESERVE_MEM:
1834 v.val = sk->sk_reserved_mem;
1835 break;
1836
1837 default:
1838 /* We implement the SO_SNDLOWAT etc to not be settable
1839 * (1003.1g 7).
1840 */
1841 return -ENOPROTOOPT;
1842 }
1843
1844 if (len > lv)
1845 len = lv;
1846 if (copy_to_user(optval, &v, len))
1847 return -EFAULT;
1848lenout:
1849 if (put_user(len, optlen))
1850 return -EFAULT;
1851 return 0;
1852}
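
/*
 * Example (illustrative, userspace side): reading the peer credentials of
 * a connected AF_UNIX socket through the SO_PEERCRED branch above.
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("pid=%d uid=%u gid=%u\n", peer.pid, peer.uid, peer.gid);
 */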
1853
1854/*
1855 * Initialize an sk_lock.
1856 *
1857 * (We also register the sk_lock with the lock validator.)
1858 */
1859static inline void sock_lock_init(struct sock *sk)
1860{
1861 if (sk->sk_kern_sock)
1862 sock_lock_init_class_and_name(
1863 sk,
1864 af_family_kern_slock_key_strings[sk->sk_family],
1865 af_family_kern_slock_keys + sk->sk_family,
1866 af_family_kern_key_strings[sk->sk_family],
1867 af_family_kern_keys + sk->sk_family);
1868 else
1869 sock_lock_init_class_and_name(
1870 sk,
1871 af_family_slock_key_strings[sk->sk_family],
1872 af_family_slock_keys + sk->sk_family,
1873 af_family_key_strings[sk->sk_family],
1874 af_family_keys + sk->sk_family);
1875}
1876
1877/*
1878 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1879 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1880 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1881 */
1882static void sock_copy(struct sock *nsk, const struct sock *osk)
1883{
1884 const struct proto *prot = READ_ONCE(osk->sk_prot);
1885#ifdef CONFIG_SECURITY_NETWORK
1886 void *sptr = nsk->sk_security;
1887#endif
1888
1889 /* If we move sk_tx_queue_mapping out of the private section,
1890 * we must check if sk_tx_queue_clear() is called after
1891 * sock_copy() in sk_clone_lock().
1892 */
1893 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1894 offsetof(struct sock, sk_dontcopy_begin) ||
1895 offsetof(struct sock, sk_tx_queue_mapping) >=
1896 offsetof(struct sock, sk_dontcopy_end));
1897
1898 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1899
1900 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1901 prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1902
1903#ifdef CONFIG_SECURITY_NETWORK
1904 nsk->sk_security = sptr;
1905 security_sk_clone(osk, nsk);
1906#endif
1907}
1908
1909static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1910 int family)
1911{
1912 struct sock *sk;
1913 struct kmem_cache *slab;
1914
1915 slab = prot->slab;
1916 if (slab != NULL) {
1917 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1918 if (!sk)
1919 return sk;
1920 if (want_init_on_alloc(priority))
1921 sk_prot_clear_nulls(sk, prot->obj_size);
1922 } else
1923 sk = kmalloc(prot->obj_size, priority);
1924
1925 if (sk != NULL) {
1926 if (security_sk_alloc(sk, family, priority))
1927 goto out_free;
1928
1929 if (!try_module_get(prot->owner))
1930 goto out_free_sec;
1931 }
1932
1933 return sk;
1934
1935out_free_sec:
1936 security_sk_free(sk);
1937out_free:
1938 if (slab != NULL)
1939 kmem_cache_free(slab, sk);
1940 else
1941 kfree(sk);
1942 return NULL;
1943}
1944
1945static void sk_prot_free(struct proto *prot, struct sock *sk)
1946{
1947 struct kmem_cache *slab;
1948 struct module *owner;
1949
1950 owner = prot->owner;
1951 slab = prot->slab;
1952
1953 cgroup_sk_free(&sk->sk_cgrp_data);
1954 mem_cgroup_sk_free(sk);
1955 security_sk_free(sk);
1956 if (slab != NULL)
1957 kmem_cache_free(slab, sk);
1958 else
1959 kfree(sk);
1960 module_put(owner);
1961}
1962
1963/**
1964 * sk_alloc - All socket objects are allocated here
1965 * @net: the applicable net namespace
1966 * @family: protocol family
1967 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1968 * @prot: struct proto associated with this new sock instance
1969 * @kern: is this to be a kernel socket?
1970 */
1971struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1972 struct proto *prot, int kern)
1973{
1974 struct sock *sk;
1975
1976 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1977 if (sk) {
1978 sk->sk_family = family;
1979 /*
1980 * See comment in struct sock definition to understand
1981 * why we need sk_prot_creator -acme
1982 */
1983 sk->sk_prot = sk->sk_prot_creator = prot;
1984 sk->sk_kern_sock = kern;
1985 sock_lock_init(sk);
1986 sk->sk_net_refcnt = kern ? 0 : 1;
1987 if (likely(sk->sk_net_refcnt)) {
1988 get_net_track(net, &sk->ns_tracker, priority);
1989 sock_inuse_add(net, 1);
1990 }
1991
1992 sock_net_set(sk, net);
1993 refcount_set(&sk->sk_wmem_alloc, 1);
1994
1995 mem_cgroup_sk_alloc(sk);
1996 cgroup_sk_alloc(&sk->sk_cgrp_data);
1997 sock_update_classid(&sk->sk_cgrp_data);
1998 sock_update_netprioidx(&sk->sk_cgrp_data);
1999 sk_tx_queue_clear(sk);
2000 }
2001
2002 return sk;
2003}
2004EXPORT_SYMBOL(sk_alloc);
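
/*
 * Example (illustrative sketch, not part of this file): a protocol
 * family's ->create() handler typically allocates its sock with sk_alloc()
 * and then performs the generic initialisation. "PF_EXAMPLE" and
 * "example_proto" are hypothetical; a real family passes its own protocol
 * family constant and its registered struct proto.
 */
static int example_family_create(struct net *net, struct socket *sock,
				 int protocol, int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);
	return 0;
}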
2005
2006/* Sockets having SOCK_RCU_FREE will call this function after one RCU
2007 * grace period. This is the case for UDP sockets and TCP listeners.
2008 */
2009static void __sk_destruct(struct rcu_head *head)
2010{
2011 struct sock *sk = container_of(head, struct sock, sk_rcu);
2012 struct sk_filter *filter;
2013
2014 if (sk->sk_destruct)
2015 sk->sk_destruct(sk);
2016
2017 filter = rcu_dereference_check(sk->sk_filter,
2018 refcount_read(&sk->sk_wmem_alloc) == 0);
2019 if (filter) {
2020 sk_filter_uncharge(sk, filter);
2021 RCU_INIT_POINTER(sk->sk_filter, NULL);
2022 }
2023
2024 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2025
2026#ifdef CONFIG_BPF_SYSCALL
2027 bpf_sk_storage_free(sk);
2028#endif
2029
2030 if (atomic_read(&sk->sk_omem_alloc))
2031 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2032 __func__, atomic_read(&sk->sk_omem_alloc));
2033
2034 if (sk->sk_frag.page) {
2035 put_page(sk->sk_frag.page);
2036 sk->sk_frag.page = NULL;
2037 }
2038
2039 /* We do not need to acquire sk->sk_peer_lock; we are the last user. */
2040 put_cred(sk->sk_peer_cred);
2041 put_pid(sk->sk_peer_pid);
2042
2043 if (likely(sk->sk_net_refcnt))
2044 put_net_track(sock_net(sk), &sk->ns_tracker);
2045 sk_prot_free(sk->sk_prot_creator, sk);
2046}
2047
2048void sk_destruct(struct sock *sk)
2049{
2050 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2051
2052 WARN_ON_ONCE(!llist_empty(&sk->defer_list));
2053 sk_defer_free_flush(sk);
2054
2055 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2056 reuseport_detach_sock(sk);
2057 use_call_rcu = true;
2058 }
2059
2060 if (use_call_rcu)
2061 call_rcu(&sk->sk_rcu, __sk_destruct);
2062 else
2063 __sk_destruct(&sk->sk_rcu);
2064}
2065
2066static void __sk_free(struct sock *sk)
2067{
2068 if (likely(sk->sk_net_refcnt))
2069 sock_inuse_add(sock_net(sk), -1);
2070
2071 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2072 sock_diag_broadcast_destroy(sk);
2073 else
2074 sk_destruct(sk);
2075}
2076
2077void sk_free(struct sock *sk)
2078{
2079 /*
2080 * We subtract one from sk_wmem_alloc; if the result is not zero,
2081 * some packets are still in some tx queue and sock_wfree()
2082 * will call __sk_free(sk) later.
2083 */
2084 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2085 __sk_free(sk);
2086}
2087EXPORT_SYMBOL(sk_free);
2088
2089static void sk_init_common(struct sock *sk)
2090{
2091 skb_queue_head_init(&sk->sk_receive_queue);
2092 skb_queue_head_init(&sk->sk_write_queue);
2093 skb_queue_head_init(&sk->sk_error_queue);
2094
2095 rwlock_init(&sk->sk_callback_lock);
2096 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2097 af_rlock_keys + sk->sk_family,
2098 af_family_rlock_key_strings[sk->sk_family]);
2099 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2100 af_wlock_keys + sk->sk_family,
2101 af_family_wlock_key_strings[sk->sk_family]);
2102 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2103 af_elock_keys + sk->sk_family,
2104 af_family_elock_key_strings[sk->sk_family]);
2105 lockdep_set_class_and_name(&sk->sk_callback_lock,
2106 af_callback_keys + sk->sk_family,
2107 af_family_clock_key_strings[sk->sk_family]);
2108}
2109
2110/**
2111 * sk_clone_lock - clone a socket, and lock its clone
2112 * @sk: the socket to clone
2113 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2114 *
2115 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2116 */
2117struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2118{
2119 struct proto *prot = READ_ONCE(sk->sk_prot);
2120 struct sk_filter *filter;
2121 bool is_charged = true;
2122 struct sock *newsk;
2123
2124 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2125 if (!newsk)
2126 goto out;
2127
2128 sock_copy(newsk, sk);
2129
2130 newsk->sk_prot_creator = prot;
2131
2132 /* SANITY */
2133 if (likely(newsk->sk_net_refcnt)) {
2134 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2135 sock_inuse_add(sock_net(newsk), 1);
2136 }
2137 sk_node_init(&newsk->sk_node);
2138 sock_lock_init(newsk);
2139 bh_lock_sock(newsk);
2140 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
2141 newsk->sk_backlog.len = 0;
2142
2143 atomic_set(&newsk->sk_rmem_alloc, 0);
2144
2145 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2146 refcount_set(&newsk->sk_wmem_alloc, 1);
2147
2148 atomic_set(&newsk->sk_omem_alloc, 0);
2149 sk_init_common(newsk);
2150
2151 newsk->sk_dst_cache = NULL;
2152 newsk->sk_dst_pending_confirm = 0;
2153 newsk->sk_wmem_queued = 0;
2154 newsk->sk_forward_alloc = 0;
2155 newsk->sk_reserved_mem = 0;
2156 atomic_set(&newsk->sk_drops, 0);
2157 newsk->sk_send_head = NULL;
2158 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2159 atomic_set(&newsk->sk_zckey, 0);
2160
2161 sock_reset_flag(newsk, SOCK_DONE);
2162
2163 /* sk->sk_memcg will be populated at accept() time */
2164 newsk->sk_memcg = NULL;
2165
2166 cgroup_sk_clone(&newsk->sk_cgrp_data);
2167
2168 rcu_read_lock();
2169 filter = rcu_dereference(sk->sk_filter);
2170 if (filter != NULL)
2171 /* though it's an empty new sock, the charging may fail
2172 * if sysctl_optmem_max was changed between the creation
2173 * of the original socket and the cloning
2174 */
2175 is_charged = sk_filter_charge(newsk, filter);
2176 RCU_INIT_POINTER(newsk->sk_filter, filter);
2177 rcu_read_unlock();
2178
2179 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2180 /* We need to make sure that we don't uncharge the new
2181 * socket if we couldn't charge it in the first place
2182 * as otherwise we uncharge the parent's filter.
2183 */
2184 if (!is_charged)
2185 RCU_INIT_POINTER(newsk->sk_filter, NULL);
2186 sk_free_unlock_clone(newsk);
2187 newsk = NULL;
2188 goto out;
2189 }
2190 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2191
2192 if (bpf_sk_storage_clone(sk, newsk)) {
2193 sk_free_unlock_clone(newsk);
2194 newsk = NULL;
2195 goto out;
2196 }
2197
2198 /* Clear sk_user_data if parent had the pointer tagged
2199 * as not suitable for copying when cloning.
2200 */
2201 if (sk_user_data_is_nocopy(newsk))
2202 newsk->sk_user_data = NULL;
2203
2204 newsk->sk_err = 0;
2205 newsk->sk_err_soft = 0;
2206 newsk->sk_priority = 0;
2207 newsk->sk_incoming_cpu = raw_smp_processor_id();
2208
2209 /* Before updating sk_refcnt, we must commit prior changes to memory
2210 * (Documentation/RCU/rculist_nulls.rst for details)
2211 */
2212 smp_wmb();
2213 refcount_set(&newsk->sk_refcnt, 2);
2214
2215 /* Increment the counter in the same struct proto as the master
2216 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, which
2217 * is the same as sk->sk_prot->socks, as this field was copied
2218 * with memcpy).
2219 *
2220 * This _changes_ the previous behaviour, where
2221 * tcp_create_openreq_child was always incrementing the
2222 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
2223 * to be taken into account in all callers. -acme
2224 */
2225 sk_refcnt_debug_inc(newsk);
2226 sk_set_socket(newsk, NULL);
2227 sk_tx_queue_clear(newsk);
2228 RCU_INIT_POINTER(newsk->sk_wq, NULL);
2229
2230 if (newsk->sk_prot->sockets_allocated)
2231 sk_sockets_allocated_inc(newsk);
2232
2233 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2234 net_enable_timestamp();
2235out:
2236 return newsk;
2237}
2238EXPORT_SYMBOL_GPL(sk_clone_lock);
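
/*
 * A minimal usage sketch for sk_clone_lock(); the protocol-specific
 * setup step below is a placeholder, not code from this file:
 *
 *	struct sock *child = sk_clone_lock(parent, GFP_ATOMIC);
 *
 *	if (!child)
 *		return NULL;
 *	(do protocol-specific setup on the locked clone; on failure use
 *	 sk_free_unlock_clone(child) instead of a plain sk_free())
 *	bh_unlock_sock(child);
 *	return child;
 *
 * The clone is returned with its bh lock held and sk_refcnt set to 2,
 * so the caller must unlock it on every path.
 */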
2239
2240void sk_free_unlock_clone(struct sock *sk)
2241{
2242 /* It is still a raw copy of the parent, so invalidate
2243 * the destructor and do a plain sk_free() */
2244 sk->sk_destruct = NULL;
2245 bh_unlock_sock(sk);
2246 sk_free(sk);
2247}
2248EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2249
2250void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2251{
2252 u32 max_segs = 1;
2253
2254 sk_dst_set(sk, dst);
2255 sk->sk_route_caps = dst->dev->features;
2256 if (sk_is_tcp(sk))
2257 sk->sk_route_caps |= NETIF_F_GSO;
2258 if (sk->sk_route_caps & NETIF_F_GSO)
2259 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2260 if (unlikely(sk->sk_gso_disabled))
2261 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2262 if (sk_can_gso(sk)) {
2263 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2264 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2265 } else {
2266 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2267 /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
2268 sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
2269 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2270 max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2271 }
2272 }
2273 sk->sk_gso_max_segs = max_segs;
2274}
2275EXPORT_SYMBOL_GPL(sk_setup_caps);
2276
2277/*
2278 * Simple resource managers for sockets.
2279 */
2280
2281
2282/*
2283 * Write buffer destructor automatically called from kfree_skb.
2284 */
2285void sock_wfree(struct sk_buff *skb)
2286{
2287 struct sock *sk = skb->sk;
2288 unsigned int len = skb->truesize;
2289
2290 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2291 /*
2292 * Keep a reference on sk_wmem_alloc; it will be released
2293 * after the sk_write_space() call
2294 */
2295 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2296 sk->sk_write_space(sk);
2297 len = 1;
2298 }
2299 /*
2300 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2301 * could not do because of in-flight packets
2302 */
2303 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2304 __sk_free(sk);
2305}
2306EXPORT_SYMBOL(sock_wfree);
2307
2308/* This variant of sock_wfree() is used by TCP,
2309 * since it sets SOCK_USE_WRITE_QUEUE.
2310 */
2311void __sock_wfree(struct sk_buff *skb)
2312{
2313 struct sock *sk = skb->sk;
2314
2315 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2316 __sk_free(sk);
2317}
2318
2319void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2320{
2321 skb_orphan(skb);
2322 skb->sk = sk;
2323#ifdef CONFIG_INET
2324 if (unlikely(!sk_fullsock(sk))) {
2325 skb->destructor = sock_edemux;
2326 sock_hold(sk);
2327 return;
2328 }
2329#endif
2330 skb->destructor = sock_wfree;
2331 skb_set_hash_from_sk(skb, sk);
2332 /*
2333 * We used to take a refcount on sk, but the following operation
2334 * is enough to guarantee sk_free() won't free this sock until
2335 * all in-flight packets are completed
2336 */
2337 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2338}
2339EXPORT_SYMBOL(skb_set_owner_w);
2340
2341static bool can_skb_orphan_partial(const struct sk_buff *skb)
2342{
2343#ifdef CONFIG_TLS_DEVICE
2344 /* Drivers depend on in-order delivery for crypto offload;
2345 * a partial orphan breaks the out-of-order-OK logic.
2346 */
2347 if (skb->decrypted)
2348 return false;
2349#endif
2350 return (skb->destructor == sock_wfree ||
2351 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2352}
2353
2354/* This helper is used by netem, as it can hold packets in its
2355 * delay queue. We want to allow the owner socket to send more
2356 * packets, as if they were already TX completed by a typical driver.
2357 * But we also want to keep skb->sk set because some packet schedulers
2358 * rely on it (sch_fq for example).
2359 */
2360void skb_orphan_partial(struct sk_buff *skb)
2361{
2362 if (skb_is_tcp_pure_ack(skb))
2363 return;
2364
2365 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2366 return;
2367
2368 skb_orphan(skb);
2369}
2370EXPORT_SYMBOL(skb_orphan_partial);
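
/*
 * Hedged sketch of the intended call site (a qdisc that delays
 * packets; the enqueue step is a placeholder, not a real qdisc):
 *
 *	skb_orphan_partial(skb);
 *	(enqueue skb in the delay queue; skb->sk stays set, so sch_fq
 *	 style schedulers keep working, but the sending socket is no
 *	 longer blocked on this skb's wmem charge)
 */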
2371
2372/*
2373 * Read buffer destructor automatically called from kfree_skb.
2374 */
2375void sock_rfree(struct sk_buff *skb)
2376{
2377 struct sock *sk = skb->sk;
2378 unsigned int len = skb->truesize;
2379
2380 atomic_sub(len, &sk->sk_rmem_alloc);
2381 sk_mem_uncharge(sk, len);
2382}
2383EXPORT_SYMBOL(sock_rfree);
2384
2385/*
2386 * Buffer destructor for skbs that are not used directly in read or write
2387 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2388 */
2389void sock_efree(struct sk_buff *skb)
2390{
2391 sock_put(skb->sk);
2392}
2393EXPORT_SYMBOL(sock_efree);
2394
2395/* Buffer destructor for prefetch/receive path where reference count may
2396 * not be held, e.g. for listen sockets.
2397 */
2398#ifdef CONFIG_INET
2399void sock_pfree(struct sk_buff *skb)
2400{
2401 if (sk_is_refcounted(skb->sk))
2402 sock_gen_put(skb->sk);
2403}
2404EXPORT_SYMBOL(sock_pfree);
2405#endif /* CONFIG_INET */
2406
2407kuid_t sock_i_uid(struct sock *sk)
2408{
2409 kuid_t uid;
2410
2411 read_lock_bh(&sk->sk_callback_lock);
2412 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2413 read_unlock_bh(&sk->sk_callback_lock);
2414 return uid;
2415}
2416EXPORT_SYMBOL(sock_i_uid);
2417
2418unsigned long sock_i_ino(struct sock *sk)
2419{
2420 unsigned long ino;
2421
2422 read_lock_bh(&sk->sk_callback_lock);
2423 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2424 read_unlock_bh(&sk->sk_callback_lock);
2425 return ino;
2426}
2427EXPORT_SYMBOL(sock_i_ino);
2428
2429/*
2430 * Allocate a skb from the socket's send buffer.
2431 */
2432struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2433 gfp_t priority)
2434{
2435 if (force ||
2436 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2437 struct sk_buff *skb = alloc_skb(size, priority);
2438
2439 if (skb) {
2440 skb_set_owner_w(skb, sk);
2441 return skb;
2442 }
2443 }
2444 return NULL;
2445}
2446EXPORT_SYMBOL(sock_wmalloc);
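
/*
 * A small usage sketch (the length and headroom values are made up):
 *
 *	struct sk_buff *skb;
 *
 *	skb = sock_wmalloc(sk, len + 128, 0, GFP_KERNEL);
 *	if (!skb)
 *		return -ENOBUFS;
 *	(build the packet; skb is charged to sk->sk_wmem_alloc and its
 *	 destructor is sock_wfree, so kfree_skb() or TX completion will
 *	 return the charge and possibly wake the writer)
 */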
2447
2448static void sock_ofree(struct sk_buff *skb)
2449{
2450 struct sock *sk = skb->sk;
2451
2452 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2453}
2454
2455struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2456 gfp_t priority)
2457{
2458 struct sk_buff *skb;
2459
2460 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2461 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2462 sysctl_optmem_max)
2463 return NULL;
2464
2465 skb = alloc_skb(size, priority);
2466 if (!skb)
2467 return NULL;
2468
2469 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2470 skb->sk = sk;
2471 skb->destructor = sock_ofree;
2472 return skb;
2473}
2474
2475/*
2476 * Allocate a memory block from the socket's option memory buffer.
2477 */
2478void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2479{
2480 if ((unsigned int)size <= sysctl_optmem_max &&
2481 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2482 void *mem;
2483 /* First do the add, to avoid a race in case
2484 * kmalloc sleeps.
2485 */
2486 atomic_add(size, &sk->sk_omem_alloc);
2487 mem = kmalloc(size, priority);
2488 if (mem)
2489 return mem;
2490 atomic_sub(size, &sk->sk_omem_alloc);
2491 }
2492 return NULL;
2493}
2494EXPORT_SYMBOL(sock_kmalloc);
2495
2496/* Free an option memory block. Note, we actually want the inline
2497 * here as this allows gcc to detect the nullify and fold away the
2498 * condition entirely.
2499 */
2500static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2501 const bool nullify)
2502{
2503 if (WARN_ON_ONCE(!mem))
2504 return;
2505 if (nullify)
2506 kfree_sensitive(mem);
2507 else
2508 kfree(mem);
2509 atomic_sub(size, &sk->sk_omem_alloc);
2510}
2511
2512void sock_kfree_s(struct sock *sk, void *mem, int size)
2513{
2514 __sock_kfree_s(sk, mem, size, false);
2515}
2516EXPORT_SYMBOL(sock_kfree_s);
2517
2518void sock_kzfree_s(struct sock *sk, void *mem, int size)
2519{
2520 __sock_kfree_s(sk, mem, size, true);
2521}
2522EXPORT_SYMBOL(sock_kzfree_s);
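
/*
 * Typical pairing, sketched for a hypothetical option holding key
 * material (the "key"/"keylen" names are illustrative):
 *
 *	u8 *key = sock_kmalloc(sk, keylen, GFP_KERNEL);
 *
 *	if (!key)
 *		return -ENOMEM;
 *	(copy the option in, use it, then release it with the same size)
 *	sock_kzfree_s(sk, key, keylen);
 *
 * sock_kzfree_s() zeroes the buffer before freeing, so it suits
 * sensitive data; sock_kfree_s() is the plain variant.
 */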
2523
2524/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2525 I think these locks should be removed for datagram sockets.
2526 */
2527static long sock_wait_for_wmem(struct sock *sk, long timeo)
2528{
2529 DEFINE_WAIT(wait);
2530
2531 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2532 for (;;) {
2533 if (!timeo)
2534 break;
2535 if (signal_pending(current))
2536 break;
2537 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2538 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2539 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2540 break;
2541 if (sk->sk_shutdown & SEND_SHUTDOWN)
2542 break;
2543 if (sk->sk_err)
2544 break;
2545 timeo = schedule_timeout(timeo);
2546 }
2547 finish_wait(sk_sleep(sk), &wait);
2548 return timeo;
2549}
2550
2551
2552/*
2553 * Generic send/receive buffer handlers
2554 */
2555
2556struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2557 unsigned long data_len, int noblock,
2558 int *errcode, int max_page_order)
2559{
2560 struct sk_buff *skb;
2561 long timeo;
2562 int err;
2563
2564 timeo = sock_sndtimeo(sk, noblock);
2565 for (;;) {
2566 err = sock_error(sk);
2567 if (err != 0)
2568 goto failure;
2569
2570 err = -EPIPE;
2571 if (sk->sk_shutdown & SEND_SHUTDOWN)
2572 goto failure;
2573
2574 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2575 break;
2576
2577 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2578 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2579 err = -EAGAIN;
2580 if (!timeo)
2581 goto failure;
2582 if (signal_pending(current))
2583 goto interrupted;
2584 timeo = sock_wait_for_wmem(sk, timeo);
2585 }
2586 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2587 errcode, sk->sk_allocation);
2588 if (skb)
2589 skb_set_owner_w(skb, sk);
2590 return skb;
2591
2592interrupted:
2593 err = sock_intr_errno(timeo);
2594failure:
2595 *errcode = err;
2596 return NULL;
2597}
2598EXPORT_SYMBOL(sock_alloc_send_pskb);
2599
2600struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2601 int noblock, int *errcode)
2602{
2603 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2604}
2605EXPORT_SYMBOL(sock_alloc_send_skb);
2606
2607int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2608 struct sockcm_cookie *sockc)
2609{
2610 u32 tsflags;
2611
2612 switch (cmsg->cmsg_type) {
2613 case SO_MARK:
2614 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2615 return -EPERM;
2616 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2617 return -EINVAL;
2618 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2619 break;
2620 case SO_TIMESTAMPING_OLD:
2621 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2622 return -EINVAL;
2623
2624 tsflags = *(u32 *)CMSG_DATA(cmsg);
2625 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2626 return -EINVAL;
2627
2628 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2629 sockc->tsflags |= tsflags;
2630 break;
2631 case SCM_TXTIME:
2632 if (!sock_flag(sk, SOCK_TXTIME))
2633 return -EINVAL;
2634 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2635 return -EINVAL;
2636 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2637 break;
2638 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2639 case SCM_RIGHTS:
2640 case SCM_CREDENTIALS:
2641 break;
2642 default:
2643 return -EINVAL;
2644 }
2645 return 0;
2646}
2647EXPORT_SYMBOL(__sock_cmsg_send);
2648
2649int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2650 struct sockcm_cookie *sockc)
2651{
2652 struct cmsghdr *cmsg;
2653 int ret;
2654
2655 for_each_cmsghdr(cmsg, msg) {
2656 if (!CMSG_OK(msg, cmsg))
2657 return -EINVAL;
2658 if (cmsg->cmsg_level != SOL_SOCKET)
2659 continue;
2660 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2661 if (ret)
2662 return ret;
2663 }
2664 return 0;
2665}
2666EXPORT_SYMBOL(sock_cmsg_send);
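
/*
 * Sketch of how a sendmsg() implementation typically consumes these
 * cookies (assuming the sockcm_init() helper from include/net/sock.h;
 * the surrounding sendmsg body is omitted):
 *
 *	struct sockcm_cookie sockc;
 *	int err;
 *
 *	sockcm_init(&sockc, sk);
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (err)
 *			return err;
 *	}
 *	(sockc.mark, sockc.tsflags and sockc.transmit_time are then used
 *	 while building the outgoing packet)
 */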
2667
2668static void sk_enter_memory_pressure(struct sock *sk)
2669{
2670 if (!sk->sk_prot->enter_memory_pressure)
2671 return;
2672
2673 sk->sk_prot->enter_memory_pressure(sk);
2674}
2675
2676static void sk_leave_memory_pressure(struct sock *sk)
2677{
2678 if (sk->sk_prot->leave_memory_pressure) {
2679 sk->sk_prot->leave_memory_pressure(sk);
2680 } else {
2681 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2682
2683 if (memory_pressure && READ_ONCE(*memory_pressure))
2684 WRITE_ONCE(*memory_pressure, 0);
2685 }
2686}
2687
2688DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2689
2690/**
2691 * skb_page_frag_refill - check that a page_frag contains enough room
2692 * @sz: minimum size of the fragment we want to get
2693 * @pfrag: pointer to page_frag
2694 * @gfp: priority for memory allocation
2695 *
2696 * Note: While this allocator tries to use high order pages, there is
2697 * no guarantee that allocations succeed. Therefore, @sz MUST be
2698 * less than or equal to PAGE_SIZE.
2699 */
2700bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2701{
2702 if (pfrag->page) {
2703 if (page_ref_count(pfrag->page) == 1) {
2704 pfrag->offset = 0;
2705 return true;
2706 }
2707 if (pfrag->offset + sz <= pfrag->size)
2708 return true;
2709 put_page(pfrag->page);
2710 }
2711
2712 pfrag->offset = 0;
2713 if (SKB_FRAG_PAGE_ORDER &&
2714 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2715 /* Avoid direct reclaim but allow kswapd to wake */
2716 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2717 __GFP_COMP | __GFP_NOWARN |
2718 __GFP_NORETRY,
2719 SKB_FRAG_PAGE_ORDER);
2720 if (likely(pfrag->page)) {
2721 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2722 return true;
2723 }
2724 }
2725 pfrag->page = alloc_page(gfp);
2726 if (likely(pfrag->page)) {
2727 pfrag->size = PAGE_SIZE;
2728 return true;
2729 }
2730 return false;
2731}
2732EXPORT_SYMBOL(skb_page_frag_refill);
2733
2734bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2735{
2736 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2737 return true;
2738
2739 sk_enter_memory_pressure(sk);
2740 sk_stream_moderate_sndbuf(sk);
2741 return false;
2742}
2743EXPORT_SYMBOL(sk_page_frag_refill);
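
/*
 * Rough shape of a sendmsg fast path built on these helpers (a sketch;
 * the copy step is elided and sk_page_frag() comes from
 * include/net/sock.h):
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *	int copy;
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		return -ENOBUFS;	(or wait for memory instead)
 *	copy = min_t(int, len, pfrag->size - pfrag->offset);
 *	(copy "copy" bytes into pfrag->page at pfrag->offset, attach the
 *	 page to the skb, then advance pfrag->offset += copy)
 */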
2744
2745void __lock_sock(struct sock *sk)
2746 __releases(&sk->sk_lock.slock)
2747 __acquires(&sk->sk_lock.slock)
2748{
2749 DEFINE_WAIT(wait);
2750
2751 for (;;) {
2752 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2753 TASK_UNINTERRUPTIBLE);
2754 spin_unlock_bh(&sk->sk_lock.slock);
2755 schedule();
2756 spin_lock_bh(&sk->sk_lock.slock);
2757 if (!sock_owned_by_user(sk))
2758 break;
2759 }
2760 finish_wait(&sk->sk_lock.wq, &wait);
2761}
2762
2763void __release_sock(struct sock *sk)
2764 __releases(&sk->sk_lock.slock)
2765 __acquires(&sk->sk_lock.slock)
2766{
2767 struct sk_buff *skb, *next;
2768
2769 while ((skb = sk->sk_backlog.head) != NULL) {
2770 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2771
2772 spin_unlock_bh(&sk->sk_lock.slock);
2773
2774 do {
2775 next = skb->next;
2776 prefetch(next);
2777 WARN_ON_ONCE(skb_dst_is_noref(skb));
2778 skb_mark_not_on_list(skb);
2779 sk_backlog_rcv(sk, skb);
2780
2781 cond_resched();
2782
2783 skb = next;
2784 } while (skb != NULL);
2785
2786 spin_lock_bh(&sk->sk_lock.slock);
2787 }
2788
2789 /*
2790 * Doing the zeroing here guarantees we cannot loop forever
2791 * while a wild producer attempts to flood us.
2792 */
2793 sk->sk_backlog.len = 0;
2794}
2795
2796void __sk_flush_backlog(struct sock *sk)
2797{
2798 spin_lock_bh(&sk->sk_lock.slock);
2799 __release_sock(sk);
2800 spin_unlock_bh(&sk->sk_lock.slock);
2801}
2802
2803/**
2804 * sk_wait_data - wait for data to arrive at sk_receive_queue
2805 * @sk: sock to wait on
2806 * @timeo: for how long
2807 * @skb: last skb seen on sk_receive_queue
2808 *
2809 * Socket state, including sk->sk_err, is changed only under the socket
2810 * lock, hence we may omit checks after joining the wait queue.
2811 * We check the receive queue before schedule() only as an optimization;
2812 * it is very likely that release_sock() added new data.
2813 */
2814int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2815{
2816 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2817 int rc;
2818
2819 add_wait_queue(sk_sleep(sk), &wait);
2820 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2821 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2822 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2823 remove_wait_queue(sk_sleep(sk), &wait);
2824 return rc;
2825}
2826EXPORT_SYMBOL(sk_wait_data);
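
/*
 * Canonical receive-side wait loop, sketched with error handling
 * trimmed ("flags" is the recvmsg flags argument; the socket lock is
 * held, and sk_wait_data() drops and retakes it while sleeping):
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	struct sk_buff *skb;
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */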
2827
2828/**
2829 * __sk_mem_raise_allocated - increase memory_allocated
2830 * @sk: socket
2831 * @size: memory size to allocate
2832 * @amt: pages to allocate
2833 * @kind: allocation type
2834 *
2835 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2836 */
2837int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2838{
2839 struct proto *prot = sk->sk_prot;
2840 long allocated = sk_memory_allocated_add(sk, amt);
2841 bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2842 bool charged = true;
2843
2844 if (memcg_charge &&
2845 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2846 gfp_memcg_charge())))
2847 goto suppress_allocation;
2848
2849 /* Under limit. */
2850 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2851 sk_leave_memory_pressure(sk);
2852 return 1;
2853 }
2854
2855 /* Under pressure. */
2856 if (allocated > sk_prot_mem_limits(sk, 1))
2857 sk_enter_memory_pressure(sk);
2858
2859 /* Over hard limit. */
2860 if (allocated > sk_prot_mem_limits(sk, 2))
2861 goto suppress_allocation;
2862
2863 /* guarantee minimum buffer size under pressure */
2864 if (kind == SK_MEM_RECV) {
2865 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2866 return 1;
2867
2868 } else { /* SK_MEM_SEND */
2869 int wmem0 = sk_get_wmem0(sk, prot);
2870
2871 if (sk->sk_type == SOCK_STREAM) {
2872 if (sk->sk_wmem_queued < wmem0)
2873 return 1;
2874 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2875 return 1;
2876 }
2877 }
2878
2879 if (sk_has_memory_pressure(sk)) {
2880 u64 alloc;
2881
2882 if (!sk_under_memory_pressure(sk))
2883 return 1;
2884 alloc = sk_sockets_allocated_read_positive(sk);
2885 if (sk_prot_mem_limits(sk, 2) > alloc *
2886 sk_mem_pages(sk->sk_wmem_queued +
2887 atomic_read(&sk->sk_rmem_alloc) +
2888 sk->sk_forward_alloc))
2889 return 1;
2890 }
2891
2892suppress_allocation:
2893
2894 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2895 sk_stream_moderate_sndbuf(sk);
2896
2897 /* Fail only if the socket is _under_ its sndbuf.
2898 * In this case we cannot block, so we have to fail.
2899 */
2900 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2901 /* Force charge with __GFP_NOFAIL */
2902 if (memcg_charge && !charged) {
2903 mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2904 gfp_memcg_charge() | __GFP_NOFAIL);
2905 }
2906 return 1;
2907 }
2908 }
2909
2910 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2911 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2912
2913 sk_memory_allocated_sub(sk, amt);
2914
2915 if (memcg_charge && charged)
2916 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2917
2918 return 0;
2919}
2920EXPORT_SYMBOL(__sk_mem_raise_allocated);
2921
2922/**
2923 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2924 * @sk: socket
2925 * @size: memory size to allocate
2926 * @kind: allocation type
2927 *
2928 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2929 * rmem allocation. This function assumes that protocols which have
2930 * memory_pressure use sk_wmem_queued as write buffer accounting.
2931 */
2932int __sk_mem_schedule(struct sock *sk, int size, int kind)
2933{
2934 int ret, amt = sk_mem_pages(size);
2935
2936 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2937 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2938 if (!ret)
2939 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2940 return ret;
2941}
2942EXPORT_SYMBOL(__sk_mem_schedule);
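
/*
 * How a protocol typically drives this accounting, in sketch form
 * (most callers use the sk_rmem_schedule()/sk_mem_charge() wrappers
 * from include/net/sock.h rather than calling this directly):
 *
 *	if (!__sk_mem_schedule(sk, skb->truesize, SK_MEM_RECV))
 *		return -ENOBUFS;	(global/memcg limits refused it)
 *	sk_mem_charge(sk, skb->truesize);
 *	(later, when freeing, sk_mem_uncharge() gives the bytes back to
 *	 sk_forward_alloc, and sk_mem_reclaim() eventually returns whole
 *	 quanta via __sk_mem_reduce_allocated())
 */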
2943
2944/**
2945 * __sk_mem_reduce_allocated - reclaim memory_allocated
2946 * @sk: socket
2947 * @amount: number of quanta
2948 *
2949 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2950 */
2951void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2952{
2953 sk_memory_allocated_sub(sk, amount);
2954
2955 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2956 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2957
2958 if (sk_under_memory_pressure(sk) &&
2959 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2960 sk_leave_memory_pressure(sk);
2961}
2962EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2963
2964/**
2965 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2966 * @sk: socket
2967 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2968 */
2969void __sk_mem_reclaim(struct sock *sk, int amount)
2970{
2971 amount >>= SK_MEM_QUANTUM_SHIFT;
2972 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2973 __sk_mem_reduce_allocated(sk, amount);
2974}
2975EXPORT_SYMBOL(__sk_mem_reclaim);
2976
2977int sk_set_peek_off(struct sock *sk, int val)
2978{
2979 sk->sk_peek_off = val;
2980 return 0;
2981}
2982EXPORT_SYMBOL_GPL(sk_set_peek_off);
2983
2984/*
2985 * Set of default routines for initialising struct proto_ops when
2986 * the protocol does not support a particular function. In certain
2987 * cases where it makes no sense for a protocol to have a "do nothing"
2988 * function, some default processing is provided.
2989 */
2990
2991int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2992{
2993 return -EOPNOTSUPP;
2994}
2995EXPORT_SYMBOL(sock_no_bind);
2996
2997int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2998 int len, int flags)
2999{
3000 return -EOPNOTSUPP;
3001}
3002EXPORT_SYMBOL(sock_no_connect);
3003
3004int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3005{
3006 return -EOPNOTSUPP;
3007}
3008EXPORT_SYMBOL(sock_no_socketpair);
3009
3010int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3011 bool kern)
3012{
3013 return -EOPNOTSUPP;
3014}
3015EXPORT_SYMBOL(sock_no_accept);
3016
3017int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3018 int peer)
3019{
3020 return -EOPNOTSUPP;
3021}
3022EXPORT_SYMBOL(sock_no_getname);
3023
3024int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3025{
3026 return -EOPNOTSUPP;
3027}
3028EXPORT_SYMBOL(sock_no_ioctl);
3029
3030int sock_no_listen(struct socket *sock, int backlog)
3031{
3032 return -EOPNOTSUPP;
3033}
3034EXPORT_SYMBOL(sock_no_listen);
3035
3036int sock_no_shutdown(struct socket *sock, int how)
3037{
3038 return -EOPNOTSUPP;
3039}
3040EXPORT_SYMBOL(sock_no_shutdown);
3041
3042int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3043{
3044 return -EOPNOTSUPP;
3045}
3046EXPORT_SYMBOL(sock_no_sendmsg);
3047
3048int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3049{
3050 return -EOPNOTSUPP;
3051}
3052EXPORT_SYMBOL(sock_no_sendmsg_locked);
3053
3054int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3055 int flags)
3056{
3057 return -EOPNOTSUPP;
3058}
3059EXPORT_SYMBOL(sock_no_recvmsg);
3060
3061int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3062{
3063 /* Mirror missing mmap method error code */
3064 return -ENODEV;
3065}
3066EXPORT_SYMBOL(sock_no_mmap);
3067
3068/*
3069 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3070 * various sock-based usage counts.
3071 */
3072void __receive_sock(struct file *file)
3073{
3074 struct socket *sock;
3075
3076 sock = sock_from_file(file);
3077 if (sock) {
3078 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3079 sock_update_classid(&sock->sk->sk_cgrp_data);
3080 }
3081}
3082
3083ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3084{
3085 ssize_t res;
3086 struct msghdr msg = {.msg_flags = flags};
3087 struct kvec iov;
3088 char *kaddr = kmap(page);
3089 iov.iov_base = kaddr + offset;
3090 iov.iov_len = size;
3091 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3092 kunmap(page);
3093 return res;
3094}
3095EXPORT_SYMBOL(sock_no_sendpage);
3096
3097ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3098 int offset, size_t size, int flags)
3099{
3100 ssize_t res;
3101 struct msghdr msg = {.msg_flags = flags};
3102 struct kvec iov;
3103 char *kaddr = kmap(page);
3104
3105 iov.iov_base = kaddr + offset;
3106 iov.iov_len = size;
3107 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3108 kunmap(page);
3109 return res;
3110}
3111EXPORT_SYMBOL(sock_no_sendpage_locked);
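
/*
 * Sketch of how a protocol wires these stubs into its proto_ops (the
 * "example" family and handlers are made up for illustration):
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.bind		= example_bind,
 *		.sendmsg	= example_sendmsg,
 *		.recvmsg	= example_recvmsg,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */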
3112
3113/*
3114 * Default Socket Callbacks
3115 */
3116
3117static void sock_def_wakeup(struct sock *sk)
3118{
3119 struct socket_wq *wq;
3120
3121 rcu_read_lock();
3122 wq = rcu_dereference(sk->sk_wq);
3123 if (skwq_has_sleeper(wq))
3124 wake_up_interruptible_all(&wq->wait);
3125 rcu_read_unlock();
3126}
3127
3128static void sock_def_error_report(struct sock *sk)
3129{
3130 struct socket_wq *wq;
3131
3132 rcu_read_lock();
3133 wq = rcu_dereference(sk->sk_wq);
3134 if (skwq_has_sleeper(wq))
3135 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3136 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3137 rcu_read_unlock();
3138}
3139
3140void sock_def_readable(struct sock *sk)
3141{
3142 struct socket_wq *wq;
3143
3144 rcu_read_lock();
3145 wq = rcu_dereference(sk->sk_wq);
3146 if (skwq_has_sleeper(wq))
3147 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3148 EPOLLRDNORM | EPOLLRDBAND);
3149 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3150 rcu_read_unlock();
3151}
3152
3153static void sock_def_write_space(struct sock *sk)
3154{
3155 struct socket_wq *wq;
3156
3157 rcu_read_lock();
3158
3159 /* Do not wake up a writer until he can make "significant"
3160 * progress. --DaveM
3161 */
3162 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3163 wq = rcu_dereference(sk->sk_wq);
3164 if (skwq_has_sleeper(wq))
3165 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3166 EPOLLWRNORM | EPOLLWRBAND);
3167
3168 /* Should agree with poll, otherwise some programs break */
3169 if (sock_writeable(sk))
3170 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3171 }
3172
3173 rcu_read_unlock();
3174}
3175
3176static void sock_def_destruct(struct sock *sk)
3177{
3178}
3179
3180void sk_send_sigurg(struct sock *sk)
3181{
3182 if (sk->sk_socket && sk->sk_socket->file)
3183 if (send_sigurg(&sk->sk_socket->file->f_owner))
3184 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3185}
3186EXPORT_SYMBOL(sk_send_sigurg);
3187
3188void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3189 unsigned long expires)
3190{
3191 if (!mod_timer(timer, expires))
3192 sock_hold(sk);
3193}
3194EXPORT_SYMBOL(sk_reset_timer);
3195
3196void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3197{
3198 if (del_timer(timer))
3199 __sock_put(sk);
3200}
3201EXPORT_SYMBOL(sk_stop_timer);
3202
3203void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3204{
3205 if (del_timer_sync(timer))
3206 __sock_put(sk);
3207}
3208EXPORT_SYMBOL(sk_stop_timer_sync);
3209
3210void sock_init_data(struct socket *sock, struct sock *sk)
3211{
3212 sk_init_common(sk);
3213 sk->sk_send_head = NULL;
3214
3215 timer_setup(&sk->sk_timer, NULL, 0);
3216
3217 sk->sk_allocation = GFP_KERNEL;
3218 sk->sk_rcvbuf = sysctl_rmem_default;
3219 sk->sk_sndbuf = sysctl_wmem_default;
3220 sk->sk_state = TCP_CLOSE;
3221 sk_set_socket(sk, sock);
3222
3223 sock_set_flag(sk, SOCK_ZAPPED);
3224
3225 if (sock) {
3226 sk->sk_type = sock->type;
3227 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3228 sock->sk = sk;
3229 sk->sk_uid = SOCK_INODE(sock)->i_uid;
3230 } else {
3231 RCU_INIT_POINTER(sk->sk_wq, NULL);
3232 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
3233 }
3234
3235 rwlock_init(&sk->sk_callback_lock);
3236 if (sk->sk_kern_sock)
3237 lockdep_set_class_and_name(
3238 &sk->sk_callback_lock,
3239 af_kern_callback_keys + sk->sk_family,
3240 af_family_kern_clock_key_strings[sk->sk_family]);
3241 else
3242 lockdep_set_class_and_name(
3243 &sk->sk_callback_lock,
3244 af_callback_keys + sk->sk_family,
3245 af_family_clock_key_strings[sk->sk_family]);
3246
3247 sk->sk_state_change = sock_def_wakeup;
3248 sk->sk_data_ready = sock_def_readable;
3249 sk->sk_write_space = sock_def_write_space;
3250 sk->sk_error_report = sock_def_error_report;
3251 sk->sk_destruct = sock_def_destruct;
3252
3253 sk->sk_frag.page = NULL;
3254 sk->sk_frag.offset = 0;
3255 sk->sk_peek_off = -1;
3256
3257 sk->sk_peer_pid = NULL;
3258 sk->sk_peer_cred = NULL;
3259 spin_lock_init(&sk->sk_peer_lock);
3260
3261 sk->sk_write_pending = 0;
3262 sk->sk_rcvlowat = 1;
3263 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3264 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3265
3266 sk->sk_stamp = SK_DEFAULT_STAMP;
3267#if BITS_PER_LONG==32
3268 seqlock_init(&sk->sk_stamp_seq);
3269#endif
3270 atomic_set(&sk->sk_zckey, 0);
3271
3272#ifdef CONFIG_NET_RX_BUSY_POLL
3273 sk->sk_napi_id = 0;
3274 sk->sk_ll_usec = sysctl_net_busy_read;
3275#endif
3276
3277 sk->sk_max_pacing_rate = ~0UL;
3278 sk->sk_pacing_rate = ~0UL;
3279 WRITE_ONCE(sk->sk_pacing_shift, 10);
3280 sk->sk_incoming_cpu = -1;
3281
3282 sk_rx_queue_clear(sk);
3283 /*
3284 * Before updating sk_refcnt, we must commit prior changes to memory
3285 * (Documentation/RCU/rculist_nulls.rst for details)
3286 */
3287 smp_wmb();
3288 refcount_set(&sk->sk_refcnt, 1);
3289 atomic_set(&sk->sk_drops, 0);
3290}
3291EXPORT_SYMBOL(sock_init_data);
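
/*
 * A protocol's init path usually calls sock_init_data() and then
 * overrides the few callbacks it cares about, e.g. (the "example_*"
 * handlers are placeholders):
 *
 *	sock_init_data(sock, sk);
 *	sk->sk_data_ready = example_data_ready;
 *	sk->sk_write_space = example_write_space;
 *	sk->sk_destruct = example_destruct;
 *
 * Anything not overridden keeps the sock_def_*() behaviour set above.
 */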
3292
3293void lock_sock_nested(struct sock *sk, int subclass)
3294{
3295 /* The sk_lock has mutex_lock() semantics here. */
3296 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3297
3298 might_sleep();
3299 spin_lock_bh(&sk->sk_lock.slock);
3300 if (sock_owned_by_user_nocheck(sk))
3301 __lock_sock(sk);
3302 sk->sk_lock.owned = 1;
3303 spin_unlock_bh(&sk->sk_lock.slock);
3304}
3305EXPORT_SYMBOL(lock_sock_nested);
3306
3307void release_sock(struct sock *sk)
3308{
3309 spin_lock_bh(&sk->sk_lock.slock);
3310 if (sk->sk_backlog.tail)
3311 __release_sock(sk);
3312
3313 /* Warning: release_cb() might need to release sk ownership,
3314 * i.e. call sock_release_ownership(sk) before us.
3315 */
3316 if (sk->sk_prot->release_cb)
3317 sk->sk_prot->release_cb(sk);
3318
3319 sock_release_ownership(sk);
3320 if (waitqueue_active(&sk->sk_lock.wq))
3321 wake_up(&sk->sk_lock.wq);
3322 spin_unlock_bh(&sk->sk_lock.slock);
3323}
3324EXPORT_SYMBOL(release_sock);
3325
3326bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3327{
3328 might_sleep();
3329 spin_lock_bh(&sk->sk_lock.slock);
3330
3331 if (!sock_owned_by_user_nocheck(sk)) {
3332 /*
3333 * Fast path return with bottom halves disabled and
3334 * sock::sk_lock.slock held.
3335 *
3336 * The 'mutex' is not contended and holding
3337 * sock::sk_lock.slock prevents all other lockers from
3338 * proceeding, so the corresponding unlock_sock_fast() can
3339 * avoid the slow path of release_sock() completely and
3340 * just release slock.
3341 *
3342 * From a semantic POV this is equivalent to 'acquiring'
3343 * the 'mutex', hence the corresponding lockdep
3344 * mutex_release() has to happen in the fast path of
3345 * unlock_sock_fast().
3346 */
3347 return false;
3348 }
3349
3350 __lock_sock(sk);
3351 sk->sk_lock.owned = 1;
3352 __acquire(&sk->sk_lock.slock);
3353 spin_unlock_bh(&sk->sk_lock.slock);
3354 return true;
3355}
3356EXPORT_SYMBOL(__lock_sock_fast);
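
/*
 * Callers use the lock_sock_fast()/unlock_sock_fast() wrappers from
 * include/net/sock.h, roughly like this (sketch; the critical section
 * is a placeholder):
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	(short critical section touching socket state)
 *	unlock_sock_fast(sk, slow);
 *
 * When the fast path is taken, only sk_lock.slock was held and the
 * unlock is just a spin_unlock_bh(); otherwise it behaves like
 * release_sock().
 */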
3357
3358int sock_gettstamp(struct socket *sock, void __user *userstamp,
3359 bool timeval, bool time32)
3360{
3361 struct sock *sk = sock->sk;
3362 struct timespec64 ts;
3363
3364 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3365 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3366 if (ts.tv_sec == -1)
3367 return -ENOENT;
3368 if (ts.tv_sec == 0) {
3369 ktime_t kt = ktime_get_real();
3370 sock_write_timestamp(sk, kt);
3371 ts = ktime_to_timespec64(kt);
3372 }
3373
3374 if (timeval)
3375 ts.tv_nsec /= 1000;
3376
3377#ifdef CONFIG_COMPAT_32BIT_TIME
3378 if (time32)
3379 return put_old_timespec32(&ts, userstamp);
3380#endif
3381#ifdef CONFIG_SPARC64
3382 /* beware of padding in sparc64 timeval */
3383 if (timeval && !in_compat_syscall()) {
3384 struct __kernel_old_timeval __user tv = {
3385 .tv_sec = ts.tv_sec,
3386 .tv_usec = ts.tv_nsec,
3387 };
3388 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3389 return -EFAULT;
3390 return 0;
3391 }
3392#endif
3393 return put_timespec64(&ts, userstamp);
3394}
3395EXPORT_SYMBOL(sock_gettstamp);
3396
3397void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3398{
3399 if (!sock_flag(sk, flag)) {
3400 unsigned long previous_flags = sk->sk_flags;
3401
3402 sock_set_flag(sk, flag);
3403 /*
3404 * we just set one of the two flags which require net
3405 * time stamping, but time stamping might have been on
3406 * already because of the other one
3407 */
3408 if (sock_needs_netstamp(sk) &&
3409 !(previous_flags & SK_FLAGS_TIMESTAMP))
3410 net_enable_timestamp();
3411 }
3412}
3413
3414int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3415 int level, int type)
3416{
3417 struct sock_exterr_skb *serr;
3418 struct sk_buff *skb;
3419 int copied, err;
3420
3421 err = -EAGAIN;
3422 skb = sock_dequeue_err_skb(sk);
3423 if (skb == NULL)
3424 goto out;
3425
3426 copied = skb->len;
3427 if (copied > len) {
3428 msg->msg_flags |= MSG_TRUNC;
3429 copied = len;
3430 }
3431 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3432 if (err)
3433 goto out_free_skb;
3434
3435 sock_recv_timestamp(msg, sk, skb);
3436
3437 serr = SKB_EXT_ERR(skb);
3438 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3439
3440 msg->msg_flags |= MSG_ERRQUEUE;
3441 err = copied;
3442
3443out_free_skb:
3444 kfree_skb(skb);
3445out:
3446 return err;
3447}
3448EXPORT_SYMBOL(sock_recv_errqueue);
3449
3450/*
3451 * Get a socket option on a socket.
3452 *
3453 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3454 * asynchronous errors should be reported by getsockopt. We assume
3455 * this means if you specify SO_ERROR (otherwise what's the point of it).
3456 */
3457int sock_common_getsockopt(struct socket *sock, int level, int optname,
3458 char __user *optval, int __user *optlen)
3459{
3460 struct sock *sk = sock->sk;
3461
3462 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3463}
3464EXPORT_SYMBOL(sock_common_getsockopt);
3465
3466int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3467 int flags)
3468{
3469 struct sock *sk = sock->sk;
3470 int addr_len = 0;
3471 int err;
3472
3473 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3474 flags & ~MSG_DONTWAIT, &addr_len);
3475 if (err >= 0)
3476 msg->msg_namelen = addr_len;
3477 return err;
3478}
3479EXPORT_SYMBOL(sock_common_recvmsg);
3480
3481/*
3482 * Set socket options on an inet socket.
3483 */
3484int sock_common_setsockopt(struct socket *sock, int level, int optname,
3485 sockptr_t optval, unsigned int optlen)
3486{
3487 struct sock *sk = sock->sk;
3488
3489 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3490}
3491EXPORT_SYMBOL(sock_common_setsockopt);
3492
3493void sk_common_release(struct sock *sk)
3494{
3495 if (sk->sk_prot->destroy)
3496 sk->sk_prot->destroy(sk);
3497
3498 /*
3499 * Observation: when sk_common_release is called, processes have
3500 * no access to the socket, but the network stack still does.
3501 * Step one: detach it from networking:
3502 *
3503 * A. Remove it from the hash tables.
3504 */
3505
3506 sk->sk_prot->unhash(sk);
3507
3508 /*
3509 * At this point the socket cannot receive new packets, but it is possible
3510 * that some packets are still in flight because some CPU is running the
3511 * receiver and did the hash table lookup before we unhashed the socket.
3512 * They will reach the receive queue and be purged by the socket destructor.
3513 *
3514 * Also, we still have packets pending on the receive queue and probably
3515 * our own packets waiting in device queues. sock_destroy will drain the
3516 * receive queue, but transmitted packets will delay socket destruction
3517 * until the last reference is released.
3518 */
3519
3520 sock_orphan(sk);
3521
3522 xfrm_sk_free_policy(sk);
3523
3524 sk_refcnt_debug_release(sk);
3525
3526 sock_put(sk);
3527}
3528EXPORT_SYMBOL(sk_common_release);
3529
3530void sk_get_meminfo(const struct sock *sk, u32 *mem)
3531{
3532 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3533
3534 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3535 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3536 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3537 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3538 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3539 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3540 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3541 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3542 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3543}
3544
3545#ifdef CONFIG_PROC_FS
3546static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3547
3548int sock_prot_inuse_get(struct net *net, struct proto *prot)
3549{
3550 int cpu, idx = prot->inuse_idx;
3551 int res = 0;
3552
3553 for_each_possible_cpu(cpu)
3554 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3555
3556 return res >= 0 ? res : 0;
3557}
3558EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3559
3560int sock_inuse_get(struct net *net)
3561{
3562 int cpu, res = 0;
3563
3564 for_each_possible_cpu(cpu)
3565 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3566
3567 return res;
3568}
3569
3570EXPORT_SYMBOL_GPL(sock_inuse_get);
3571
3572static int __net_init sock_inuse_init_net(struct net *net)
3573{
3574 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3575 if (net->core.prot_inuse == NULL)
3576 return -ENOMEM;
3577 return 0;
3578}
3579
3580static void __net_exit sock_inuse_exit_net(struct net *net)
3581{
3582 free_percpu(net->core.prot_inuse);
3583}
3584
3585static struct pernet_operations net_inuse_ops = {
3586 .init = sock_inuse_init_net,
3587 .exit = sock_inuse_exit_net,
3588};
3589
3590static __init int net_inuse_init(void)
3591{
3592 if (register_pernet_subsys(&net_inuse_ops))
3593 panic("Cannot initialize net inuse counters");
3594
3595 return 0;
3596}
3597
3598core_initcall(net_inuse_init);
3599
3600static int assign_proto_idx(struct proto *prot)
3601{
3602 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3603
3604 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3605 pr_err("PROTO_INUSE_NR exhausted\n");
3606 return -ENOSPC;
3607 }
3608
3609 set_bit(prot->inuse_idx, proto_inuse_idx);
3610 return 0;
3611}
3612
3613static void release_proto_idx(struct proto *prot)
3614{
3615 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3616 clear_bit(prot->inuse_idx, proto_inuse_idx);
3617}
3618#else
3619static inline int assign_proto_idx(struct proto *prot)
3620{
3621 return 0;
3622}
3623
3624static inline void release_proto_idx(struct proto *prot)
3625{
3626}
3627
3628#endif
3629
3630static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3631{
3632 if (!twsk_prot)
3633 return;
3634 kfree(twsk_prot->twsk_slab_name);
3635 twsk_prot->twsk_slab_name = NULL;
3636 kmem_cache_destroy(twsk_prot->twsk_slab);
3637 twsk_prot->twsk_slab = NULL;
3638}
3639
3640static int tw_prot_init(const struct proto *prot)
3641{
3642 struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3643
3644 if (!twsk_prot)
3645 return 0;
3646
3647 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3648 prot->name);
3649 if (!twsk_prot->twsk_slab_name)
3650 return -ENOMEM;
3651
3652 twsk_prot->twsk_slab =
3653 kmem_cache_create(twsk_prot->twsk_slab_name,
3654 twsk_prot->twsk_obj_size, 0,
3655 SLAB_ACCOUNT | prot->slab_flags,
3656 NULL);
3657 if (!twsk_prot->twsk_slab) {
3658 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3659 prot->name);
3660 return -ENOMEM;
3661 }
3662
3663 return 0;
3664}
3665
3666static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3667{
3668 if (!rsk_prot)
3669 return;
3670 kfree(rsk_prot->slab_name);
3671 rsk_prot->slab_name = NULL;
3672 kmem_cache_destroy(rsk_prot->slab);
3673 rsk_prot->slab = NULL;
3674}
3675
3676static int req_prot_init(const struct proto *prot)
3677{
3678 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3679
3680 if (!rsk_prot)
3681 return 0;
3682
3683 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3684 prot->name);
3685 if (!rsk_prot->slab_name)
3686 return -ENOMEM;
3687
3688 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3689 rsk_prot->obj_size, 0,
3690 SLAB_ACCOUNT | prot->slab_flags,
3691 NULL);
3692
3693 if (!rsk_prot->slab) {
3694 pr_crit("%s: Can't create request sock SLAB cache!\n",
3695 prot->name);
3696 return -ENOMEM;
3697 }
3698 return 0;
3699}
3700
3701int proto_register(struct proto *prot, int alloc_slab)
3702{
3703 int ret = -ENOBUFS;
3704
3705 if (alloc_slab) {
3706 prot->slab = kmem_cache_create_usercopy(prot->name,
3707 prot->obj_size, 0,
3708 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3709 prot->slab_flags,
3710 prot->useroffset, prot->usersize,
3711 NULL);
3712
3713 if (prot->slab == NULL) {
3714 pr_crit("%s: Can't create sock SLAB cache!\n",
3715 prot->name);
3716 goto out;
3717 }
3718
3719 if (req_prot_init(prot))
3720 goto out_free_request_sock_slab;
3721
3722 if (tw_prot_init(prot))
3723 goto out_free_timewait_sock_slab;
3724 }
3725
3726 mutex_lock(&proto_list_mutex);
3727 ret = assign_proto_idx(prot);
3728 if (ret) {
3729 mutex_unlock(&proto_list_mutex);
3730 goto out_free_timewait_sock_slab;
3731 }
3732 list_add(&prot->node, &proto_list);
3733 mutex_unlock(&proto_list_mutex);
3734 return ret;
3735
3736out_free_timewait_sock_slab:
3737 if (alloc_slab)
3738 tw_prot_cleanup(prot->twsk_prot);
3739out_free_request_sock_slab:
3740 if (alloc_slab) {
3741 req_prot_cleanup(prot->rsk_prot);
3742
3743 kmem_cache_destroy(prot->slab);
3744 prot->slab = NULL;
3745 }
3746out:
3747 return ret;
3748}
3749EXPORT_SYMBOL(proto_register);
3750
3751void proto_unregister(struct proto *prot)
3752{
3753 mutex_lock(&proto_list_mutex);
3754 release_proto_idx(prot);
3755 list_del(&prot->node);
3756 mutex_unlock(&proto_list_mutex);
3757
3758 kmem_cache_destroy(prot->slab);
3759 prot->slab = NULL;
3760
3761 req_prot_cleanup(prot->rsk_prot);
3762 tw_prot_cleanup(prot->twsk_prot);
3763}
3764EXPORT_SYMBOL(proto_unregister);
3765
3766int sock_load_diag_module(int family, int protocol)
3767{
3768 if (!protocol) {
3769 if (!sock_is_registered(family))
3770 return -ENOENT;
3771
3772 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3773 NETLINK_SOCK_DIAG, family);
3774 }
3775
3776#ifdef CONFIG_INET
3777 if (family == AF_INET &&
3778 protocol != IPPROTO_RAW &&
3779 protocol < MAX_INET_PROTOS &&
3780 !rcu_access_pointer(inet_protos[protocol]))
3781 return -ENOENT;
3782#endif
3783
3784 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3785 NETLINK_SOCK_DIAG, family, protocol);
3786}
3787EXPORT_SYMBOL(sock_load_diag_module);
3788
3789#ifdef CONFIG_PROC_FS
3790static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3791 __acquires(proto_list_mutex)
3792{
3793 mutex_lock(&proto_list_mutex);
3794 return seq_list_start_head(&proto_list, *pos);
3795}
3796
3797static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3798{
3799 return seq_list_next(v, &proto_list, pos);
3800}
3801
3802static void proto_seq_stop(struct seq_file *seq, void *v)
3803 __releases(proto_list_mutex)
3804{
3805 mutex_unlock(&proto_list_mutex);
3806}
3807
3808static char proto_method_implemented(const void *method)
3809{
3810 return method == NULL ? 'n' : 'y';
3811}
3812static long sock_prot_memory_allocated(struct proto *proto)
3813{
3814 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3815}
3816
3817static const char *sock_prot_memory_pressure(struct proto *proto)
3818{
3819 return proto->memory_pressure != NULL ?
3820 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3821}
3822
3823static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3824{
3825
3826 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3827 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3828 proto->name,
3829 proto->obj_size,
3830 sock_prot_inuse_get(seq_file_net(seq), proto),
3831 sock_prot_memory_allocated(proto),
3832 sock_prot_memory_pressure(proto),
3833 proto->max_header,
3834 proto->slab == NULL ? "no" : "yes",
3835 module_name(proto->owner),
3836 proto_method_implemented(proto->close),
3837 proto_method_implemented(proto->connect),
3838 proto_method_implemented(proto->disconnect),
3839 proto_method_implemented(proto->accept),
3840 proto_method_implemented(proto->ioctl),
3841 proto_method_implemented(proto->init),
3842 proto_method_implemented(proto->destroy),
3843 proto_method_implemented(proto->shutdown),
3844 proto_method_implemented(proto->setsockopt),
3845 proto_method_implemented(proto->getsockopt),
3846 proto_method_implemented(proto->sendmsg),
3847 proto_method_implemented(proto->recvmsg),
3848 proto_method_implemented(proto->sendpage),
3849 proto_method_implemented(proto->bind),
3850 proto_method_implemented(proto->backlog_rcv),
3851 proto_method_implemented(proto->hash),
3852 proto_method_implemented(proto->unhash),
3853 proto_method_implemented(proto->get_port),
3854 proto_method_implemented(proto->enter_memory_pressure));
3855}
3856
3857static int proto_seq_show(struct seq_file *seq, void *v)
3858{
3859 if (v == &proto_list)
3860 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3861 "protocol",
3862 "size",
3863 "sockets",
3864 "memory",
3865 "press",
3866 "maxhdr",
3867 "slab",
3868 "module",
3869 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3870 else
3871 proto_seq_printf(seq, list_entry(v, struct proto, node));
3872 return 0;
3873}
3874
3875static const struct seq_operations proto_seq_ops = {
3876 .start = proto_seq_start,
3877 .next = proto_seq_next,
3878 .stop = proto_seq_stop,
3879 .show = proto_seq_show,
3880};
3881
3882static __net_init int proto_init_net(struct net *net)
3883{
3884 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3885 sizeof(struct seq_net_private)))
3886 return -ENOMEM;
3887
3888 return 0;
3889}
3890
3891static __net_exit void proto_exit_net(struct net *net)
3892{
3893 remove_proc_entry("protocols", net->proc_net);
3894}
3895
3896
3897static __net_initdata struct pernet_operations proto_net_ops = {
3898 .init = proto_init_net,
3899 .exit = proto_exit_net,
3900};
3901
3902static int __init proto_init(void)
3903{
3904 return register_pernet_subsys(&proto_net_ops);
3905}
3906
3907subsys_initcall(proto_init);
3908
3909#endif /* PROC_FS */
3910
3911#ifdef CONFIG_NET_RX_BUSY_POLL
3912bool sk_busy_loop_end(void *p, unsigned long start_time)
3913{
3914 struct sock *sk = p;
3915
3916 return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3917 sk_busy_loop_timeout(sk, start_time);
3918}
3919EXPORT_SYMBOL(sk_busy_loop_end);
3920#endif /* CONFIG_NET_RX_BUSY_POLL */
3921
3922int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3923{
3924 if (!sk->sk_prot->bind_add)
3925 return -EOPNOTSUPP;
3926 return sk->sk_prot->bind_add(sk, addr, addr_len);
3927}
3928EXPORT_SYMBOL(sock_bind_add);