net/core/sock.c at v5.12-rc3 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / sock.c
at v5.12-rc3 3719 lines 91 kB view raw
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   4 *		operating system.  INET is implemented using the  BSD Socket
   5 *		interface as the means of communication with the user level.
   6 *
   7 *		Generic socket support routines. Memory allocators, socket lock/release
   8 *		handler for protocols to use and generic option handler.
   9 *
  10 * Authors:	Ross Biro
  11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *		Florian La Roche, <flla@stud.uni-sb.de>
  13 *		Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *		Alan Cox	: 	Numerous verify_area() problems
  17 *		Alan Cox	:	Connecting on a connecting socket
  18 *					now returns an error for tcp.
  19 *		Alan Cox	:	sock->protocol is set correctly.
  20 *					and is not sometimes left as 0.
  21 *		Alan Cox	:	connect handles icmp errors on a
  22 *					connect properly. Unfortunately there
  23 *					is a restart syscall nasty there. I
  24 *					can't match BSD without hacking the C
  25 *					library. Ideas urgently sought!
  26 *		Alan Cox	:	Disallow bind() to addresses that are
  27 *					not ours - especially broadcast ones!!
  28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
  29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
  30 *					instead they leave that for the DESTROY timer.
  31 *		Alan Cox	:	Clean up error flag in accept
  32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
  33 *					was buggy. Put a remove_sock() in the handler
  34 *					for memory when we hit 0. Also altered the timer
  35 *					code. The ACK stuff can wait and needs major
  36 *					TCP layer surgery.
  37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
  38 *					and fixed timer/inet_bh race.
  39 *		Alan Cox	:	Added zapped flag for TCP
  40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
  41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
  46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
  47 *	Pauline Middelink	:	identd support
  48 *		Alan Cox	:	Fixed connect() taking signals I think.
  49 *		Alan Cox	:	SO_LINGER supported
  50 *		Alan Cox	:	Error reporting fixes
  51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
  52 *		Alan Cox	:	inet sockets don't set sk->type!
  53 *		Alan Cox	:	Split socket option code
  54 *		Alan Cox	:	Callbacks
  55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
  56 *		Alex		:	Removed restriction on inet fioctl
  57 *		Alan Cox	:	Splitting INET from NET core
  58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
  59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *		Alan Cox	:	Split IP from generic code
  61 *		Alan Cox	:	New kfree_skbmem()
  62 *		Alan Cox	:	Make SO_DEBUG superuser only.
  63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
  64 *					(compatibility fix)
  65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
  66 *		Alan Cox	:	Allocator for a socket is settable.
  67 *		Alan Cox	:	SO_ERROR includes soft errors.
  68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
  69 *		Alan Cox	: 	Generic socket allocation to make hooks
  70 *					easier (suggested by Craig Metz).
  71 *		Michael Pall	:	SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
  79 *		Andi Kleen	:	Fix write_space callback
  80 *		Chris Evans	:	Security fixes - signedness again
  81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 */
  85
  86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88#include <asm/unaligned.h>
  89#include <linux/capability.h>
  90#include <linux/errno.h>
  91#include <linux/errqueue.h>
  92#include <linux/types.h>
  93#include <linux/socket.h>
  94#include <linux/in.h>
  95#include <linux/kernel.h>
  96#include <linux/module.h>
  97#include <linux/proc_fs.h>
  98#include <linux/seq_file.h>
  99#include <linux/sched.h>
 100#include <linux/sched/mm.h>
 101#include <linux/timer.h>
 102#include <linux/string.h>
 103#include <linux/sockios.h>
 104#include <linux/net.h>
 105#include <linux/mm.h>
 106#include <linux/slab.h>
 107#include <linux/interrupt.h>
 108#include <linux/poll.h>
 109#include <linux/tcp.h>
 110#include <linux/init.h>
 111#include <linux/highmem.h>
 112#include <linux/user_namespace.h>
 113#include <linux/static_key.h>
 114#include <linux/memcontrol.h>
 115#include <linux/prefetch.h>
 116#include <linux/compat.h>
 117
 118#include <linux/uaccess.h>
 119
 120#include <linux/netdevice.h>
 121#include <net/protocol.h>
 122#include <linux/skbuff.h>
 123#include <net/net_namespace.h>
 124#include <net/request_sock.h>
 125#include <net/sock.h>
 126#include <linux/net_tstamp.h>
 127#include <net/xfrm.h>
 128#include <linux/ipsec.h>
 129#include <net/cls_cgroup.h>
 130#include <net/netprio_cgroup.h>
 131#include <linux/sock_diag.h>
 132
 133#include <linux/filter.h>
 134#include <net/sock_reuseport.h>
 135#include <net/bpf_sk_storage.h>
 136
 137#include <trace/events/sock.h>
 138
 139#include <net/tcp.h>
 140#include <net/busy_poll.h>
 141
 142static DEFINE_MUTEX(proto_list_mutex);
 143static LIST_HEAD(proto_list);
 144
 145static void sock_inuse_add(struct net *net, int val);
 146
 147/**
 148 * sk_ns_capable - General socket capability test
 149 * @sk: Socket to use a capability on or through
 150 * @user_ns: The user namespace of the capability to use
 151 * @cap: The capability to use
 152 *
 153 * Test to see if the opener of the socket had when the socket was
 154 * created and the current process has the capability @cap in the user
 155 * namespace @user_ns.
 156 */
 157bool sk_ns_capable(const struct sock *sk,
 158		   struct user_namespace *user_ns, int cap)
 159{
 160	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 161		ns_capable(user_ns, cap);
 162}
 163EXPORT_SYMBOL(sk_ns_capable);
 164
 165/**
 166 * sk_capable - Socket global capability test
 167 * @sk: Socket to use a capability on or through
 168 * @cap: The global capability to use
 169 *
 170 * Test to see if the opener of the socket had when the socket was
 171 * created and the current process has the capability @cap in all user
 172 * namespaces.
 173 */
 174bool sk_capable(const struct sock *sk, int cap)
 175{
 176	return sk_ns_capable(sk, &init_user_ns, cap);
 177}
 178EXPORT_SYMBOL(sk_capable);
 179
 180/**
 181 * sk_net_capable - Network namespace socket capability test
 182 * @sk: Socket to use a capability on or through
 183 * @cap: The capability to use
 184 *
 185 * Test to see if the opener of the socket had when the socket was created
 186 * and the current process has the capability @cap over the network namespace
 187 * the socket is a member of.
 188 */
 189bool sk_net_capable(const struct sock *sk, int cap)
 190{
 191	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 192}
 193EXPORT_SYMBOL(sk_net_capable);
 194
 195/*
 196 * Each address family might have different locking rules, so we have
 197 * one slock key per address family and separate keys for internal and
 198 * userspace sockets.
 199 */
 200static struct lock_class_key af_family_keys[AF_MAX];
 201static struct lock_class_key af_family_kern_keys[AF_MAX];
 202static struct lock_class_key af_family_slock_keys[AF_MAX];
 203static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 204
 205/*
 206 * Make lock validator output more readable. (we pre-construct these
 207 * strings build-time, so that runtime initialization of socket
 208 * locks is fast):
 209 */
 210
 211#define _sock_locks(x)						  \
 212  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
 213  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
 214  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
 215  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
 216  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
 217  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
 218  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
 219  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
 220  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
 221  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
 222  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
 223  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
 224  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
 225  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
 226  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
 227  x "AF_MAX"
 228
 229static const char *const af_family_key_strings[AF_MAX+1] = {
 230	_sock_locks("sk_lock-")
 231};
 232static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 233	_sock_locks("slock-")
 234};
 235static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 236	_sock_locks("clock-")
 237};
 238
 239static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 240	_sock_locks("k-sk_lock-")
 241};
 242static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 243	_sock_locks("k-slock-")
 244};
 245static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 246	_sock_locks("k-clock-")
 247};
 248static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 249	_sock_locks("rlock-")
 250};
 251static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 252	_sock_locks("wlock-")
 253};
 254static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 255	_sock_locks("elock-")
 256};
 257
 258/*
 259 * sk_callback_lock and sk queues locking rules are per-address-family,
 260 * so split the lock classes by using a per-AF key:
 261 */
 262static struct lock_class_key af_callback_keys[AF_MAX];
 263static struct lock_class_key af_rlock_keys[AF_MAX];
 264static struct lock_class_key af_wlock_keys[AF_MAX];
 265static struct lock_class_key af_elock_keys[AF_MAX];
 266static struct lock_class_key af_kern_callback_keys[AF_MAX];
 267
 268/* Run time adjustable parameters. */
 269__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 270EXPORT_SYMBOL(sysctl_wmem_max);
 271__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 272EXPORT_SYMBOL(sysctl_rmem_max);
 273__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 274__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 275
 276/* Maximal space eaten by iovec or ancillary data plus some space */
 277int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 278EXPORT_SYMBOL(sysctl_optmem_max);
 279
 280int sysctl_tstamp_allow_data __read_mostly = 1;
 281
 282DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 283EXPORT_SYMBOL_GPL(memalloc_socks_key);
 284
 285/**
 286 * sk_set_memalloc - sets %SOCK_MEMALLOC
 287 * @sk: socket to set it on
 288 *
 289 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 290 * It's the responsibility of the admin to adjust min_free_kbytes
 291 * to meet the requirements
 292 */
 293void sk_set_memalloc(struct sock *sk)
 294{
 295	sock_set_flag(sk, SOCK_MEMALLOC);
 296	sk->sk_allocation |= __GFP_MEMALLOC;
 297	static_branch_inc(&memalloc_socks_key);
 298}
 299EXPORT_SYMBOL_GPL(sk_set_memalloc);
 300
 301void sk_clear_memalloc(struct sock *sk)
 302{
 303	sock_reset_flag(sk, SOCK_MEMALLOC);
 304	sk->sk_allocation &= ~__GFP_MEMALLOC;
 305	static_branch_dec(&memalloc_socks_key);
 306
 307	/*
 308	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 309	 * progress of swapping. SOCK_MEMALLOC may be cleared while
 310	 * it has rmem allocations due to the last swapfile being deactivated
 311	 * but there is a risk that the socket is unusable due to exceeding
 312	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
 313	 */
 314	sk_mem_reclaim(sk);
 315}
 316EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 317
 318int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 319{
 320	int ret;
 321	unsigned int noreclaim_flag;
 322
 323	/* these should have been dropped before queueing */
 324	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 325
 326	noreclaim_flag = memalloc_noreclaim_save();
 327	ret = sk->sk_backlog_rcv(sk, skb);
 328	memalloc_noreclaim_restore(noreclaim_flag);
 329
 330	return ret;
 331}
 332EXPORT_SYMBOL(__sk_backlog_rcv);
 333
 334static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 335{
 336	struct __kernel_sock_timeval tv;
 337
 338	if (timeo == MAX_SCHEDULE_TIMEOUT) {
 339		tv.tv_sec = 0;
 340		tv.tv_usec = 0;
 341	} else {
 342		tv.tv_sec = timeo / HZ;
 343		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 344	}
 345
 346	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 347		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 348		*(struct old_timeval32 *)optval = tv32;
 349		return sizeof(tv32);
 350	}
 351
 352	if (old_timeval) {
 353		struct __kernel_old_timeval old_tv;
 354		old_tv.tv_sec = tv.tv_sec;
 355		old_tv.tv_usec = tv.tv_usec;
 356		*(struct __kernel_old_timeval *)optval = old_tv;
 357		return sizeof(old_tv);
 358	}
 359
 360	*(struct __kernel_sock_timeval *)optval = tv;
 361	return sizeof(tv);
 362}
 363
 364static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 365			    bool old_timeval)
 366{
 367	struct __kernel_sock_timeval tv;
 368
 369	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 370		struct old_timeval32 tv32;
 371
 372		if (optlen < sizeof(tv32))
 373			return -EINVAL;
 374
 375		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 376			return -EFAULT;
 377		tv.tv_sec = tv32.tv_sec;
 378		tv.tv_usec = tv32.tv_usec;
 379	} else if (old_timeval) {
 380		struct __kernel_old_timeval old_tv;
 381
 382		if (optlen < sizeof(old_tv))
 383			return -EINVAL;
 384		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 385			return -EFAULT;
 386		tv.tv_sec = old_tv.tv_sec;
 387		tv.tv_usec = old_tv.tv_usec;
 388	} else {
 389		if (optlen < sizeof(tv))
 390			return -EINVAL;
 391		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
 392			return -EFAULT;
 393	}
 394	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 395		return -EDOM;
 396
 397	if (tv.tv_sec < 0) {
 398		static int warned __read_mostly;
 399
 400		*timeo_p = 0;
 401		if (warned < 10 && net_ratelimit()) {
 402			warned++;
 403			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 404				__func__, current->comm, task_pid_nr(current));
 405		}
 406		return 0;
 407	}
 408	*timeo_p = MAX_SCHEDULE_TIMEOUT;
 409	if (tv.tv_sec == 0 && tv.tv_usec == 0)
 410		return 0;
 411	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 412		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 413	return 0;
 414}
 415
 416static bool sock_needs_netstamp(const struct sock *sk)
 417{
 418	switch (sk->sk_family) {
 419	case AF_UNSPEC:
 420	case AF_UNIX:
 421		return false;
 422	default:
 423		return true;
 424	}
 425}
 426
 427static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 428{
 429	if (sk->sk_flags & flags) {
 430		sk->sk_flags &= ~flags;
 431		if (sock_needs_netstamp(sk) &&
 432		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 433			net_disable_timestamp();
 434	}
 435}
 436
 437
 438int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 439{
 440	unsigned long flags;
 441	struct sk_buff_head *list = &sk->sk_receive_queue;
 442
 443	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 444		atomic_inc(&sk->sk_drops);
 445		trace_sock_rcvqueue_full(sk, skb);
 446		return -ENOMEM;
 447	}
 448
 449	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 450		atomic_inc(&sk->sk_drops);
 451		return -ENOBUFS;
 452	}
 453
 454	skb->dev = NULL;
 455	skb_set_owner_r(skb, sk);
 456
 457	/* we escape from rcu protected region, make sure we dont leak
 458	 * a norefcounted dst
 459	 */
 460	skb_dst_force(skb);
 461
 462	spin_lock_irqsave(&list->lock, flags);
 463	sock_skb_set_dropcount(sk, skb);
 464	__skb_queue_tail(list, skb);
 465	spin_unlock_irqrestore(&list->lock, flags);
 466
 467	if (!sock_flag(sk, SOCK_DEAD))
 468		sk->sk_data_ready(sk);
 469	return 0;
 470}
 471EXPORT_SYMBOL(__sock_queue_rcv_skb);
 472
 473int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 474{
 475	int err;
 476
 477	err = sk_filter(sk, skb);
 478	if (err)
 479		return err;
 480
 481	return __sock_queue_rcv_skb(sk, skb);
 482}
 483EXPORT_SYMBOL(sock_queue_rcv_skb);
 484
 485int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 486		     const int nested, unsigned int trim_cap, bool refcounted)
 487{
 488	int rc = NET_RX_SUCCESS;
 489
 490	if (sk_filter_trim_cap(sk, skb, trim_cap))
 491		goto discard_and_relse;
 492
 493	skb->dev = NULL;
 494
 495	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 496		atomic_inc(&sk->sk_drops);
 497		goto discard_and_relse;
 498	}
 499	if (nested)
 500		bh_lock_sock_nested(sk);
 501	else
 502		bh_lock_sock(sk);
 503	if (!sock_owned_by_user(sk)) {
 504		/*
 505		 * trylock + unlock semantics:
 506		 */
 507		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 508
 509		rc = sk_backlog_rcv(sk, skb);
 510
 511		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 512	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 513		bh_unlock_sock(sk);
 514		atomic_inc(&sk->sk_drops);
 515		goto discard_and_relse;
 516	}
 517
 518	bh_unlock_sock(sk);
 519out:
 520	if (refcounted)
 521		sock_put(sk);
 522	return rc;
 523discard_and_relse:
 524	kfree_skb(skb);
 525	goto out;
 526}
 527EXPORT_SYMBOL(__sk_receive_skb);
 528
 529INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 530							  u32));
 531INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 532							   u32));
 533struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 534{
 535	struct dst_entry *dst = __sk_dst_get(sk);
 536
 537	if (dst && dst->obsolete &&
 538	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 539			       dst, cookie) == NULL) {
 540		sk_tx_queue_clear(sk);
 541		sk->sk_dst_pending_confirm = 0;
 542		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 543		dst_release(dst);
 544		return NULL;
 545	}
 546
 547	return dst;
 548}
 549EXPORT_SYMBOL(__sk_dst_check);
 550
 551struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 552{
 553	struct dst_entry *dst = sk_dst_get(sk);
 554
 555	if (dst && dst->obsolete &&
 556	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 557			       dst, cookie) == NULL) {
 558		sk_dst_reset(sk);
 559		dst_release(dst);
 560		return NULL;
 561	}
 562
 563	return dst;
 564}
 565EXPORT_SYMBOL(sk_dst_check);
 566
 567static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 568{
 569	int ret = -ENOPROTOOPT;
 570#ifdef CONFIG_NETDEVICES
 571	struct net *net = sock_net(sk);
 572
 573	/* Sorry... */
 574	ret = -EPERM;
 575	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 576		goto out;
 577
 578	ret = -EINVAL;
 579	if (ifindex < 0)
 580		goto out;
 581
 582	sk->sk_bound_dev_if = ifindex;
 583	if (sk->sk_prot->rehash)
 584		sk->sk_prot->rehash(sk);
 585	sk_dst_reset(sk);
 586
 587	ret = 0;
 588
 589out:
 590#endif
 591
 592	return ret;
 593}
 594
 595int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 596{
 597	int ret;
 598
 599	if (lock_sk)
 600		lock_sock(sk);
 601	ret = sock_bindtoindex_locked(sk, ifindex);
 602	if (lock_sk)
 603		release_sock(sk);
 604
 605	return ret;
 606}
 607EXPORT_SYMBOL(sock_bindtoindex);
 608
 609static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 610{
 611	int ret = -ENOPROTOOPT;
 612#ifdef CONFIG_NETDEVICES
 613	struct net *net = sock_net(sk);
 614	char devname[IFNAMSIZ];
 615	int index;
 616
 617	ret = -EINVAL;
 618	if (optlen < 0)
 619		goto out;
 620
 621	/* Bind this socket to a particular device like "eth0",
 622	 * as specified in the passed interface name. If the
 623	 * name is "" or the option length is zero the socket
 624	 * is not bound.
 625	 */
 626	if (optlen > IFNAMSIZ - 1)
 627		optlen = IFNAMSIZ - 1;
 628	memset(devname, 0, sizeof(devname));
 629
 630	ret = -EFAULT;
 631	if (copy_from_sockptr(devname, optval, optlen))
 632		goto out;
 633
 634	index = 0;
 635	if (devname[0] != '\0') {
 636		struct net_device *dev;
 637
 638		rcu_read_lock();
 639		dev = dev_get_by_name_rcu(net, devname);
 640		if (dev)
 641			index = dev->ifindex;
 642		rcu_read_unlock();
 643		ret = -ENODEV;
 644		if (!dev)
 645			goto out;
 646	}
 647
 648	return sock_bindtoindex(sk, index, true);
 649out:
 650#endif
 651
 652	return ret;
 653}
 654
 655static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 656				int __user *optlen, int len)
 657{
 658	int ret = -ENOPROTOOPT;
 659#ifdef CONFIG_NETDEVICES
 660	struct net *net = sock_net(sk);
 661	char devname[IFNAMSIZ];
 662
 663	if (sk->sk_bound_dev_if == 0) {
 664		len = 0;
 665		goto zero;
 666	}
 667
 668	ret = -EINVAL;
 669	if (len < IFNAMSIZ)
 670		goto out;
 671
 672	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 673	if (ret)
 674		goto out;
 675
 676	len = strlen(devname) + 1;
 677
 678	ret = -EFAULT;
 679	if (copy_to_user(optval, devname, len))
 680		goto out;
 681
 682zero:
 683	ret = -EFAULT;
 684	if (put_user(len, optlen))
 685		goto out;
 686
 687	ret = 0;
 688
 689out:
 690#endif
 691
 692	return ret;
 693}
 694
 695bool sk_mc_loop(struct sock *sk)
 696{
 697	if (dev_recursion_level())
 698		return false;
 699	if (!sk)
 700		return true;
 701	switch (sk->sk_family) {
 702	case AF_INET:
 703		return inet_sk(sk)->mc_loop;
 704#if IS_ENABLED(CONFIG_IPV6)
 705	case AF_INET6:
 706		return inet6_sk(sk)->mc_loop;
 707#endif
 708	}
 709	WARN_ON_ONCE(1);
 710	return true;
 711}
 712EXPORT_SYMBOL(sk_mc_loop);
 713
 714void sock_set_reuseaddr(struct sock *sk)
 715{
 716	lock_sock(sk);
 717	sk->sk_reuse = SK_CAN_REUSE;
 718	release_sock(sk);
 719}
 720EXPORT_SYMBOL(sock_set_reuseaddr);
 721
 722void sock_set_reuseport(struct sock *sk)
 723{
 724	lock_sock(sk);
 725	sk->sk_reuseport = true;
 726	release_sock(sk);
 727}
 728EXPORT_SYMBOL(sock_set_reuseport);
 729
 730void sock_no_linger(struct sock *sk)
 731{
 732	lock_sock(sk);
 733	sk->sk_lingertime = 0;
 734	sock_set_flag(sk, SOCK_LINGER);
 735	release_sock(sk);
 736}
 737EXPORT_SYMBOL(sock_no_linger);
 738
 739void sock_set_priority(struct sock *sk, u32 priority)
 740{
 741	lock_sock(sk);
 742	sk->sk_priority = priority;
 743	release_sock(sk);
 744}
 745EXPORT_SYMBOL(sock_set_priority);
 746
 747void sock_set_sndtimeo(struct sock *sk, s64 secs)
 748{
 749	lock_sock(sk);
 750	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 751		sk->sk_sndtimeo = secs * HZ;
 752	else
 753		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 754	release_sock(sk);
 755}
 756EXPORT_SYMBOL(sock_set_sndtimeo);
 757
 758static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 759{
 760	if (val)  {
 761		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 762		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 763		sock_set_flag(sk, SOCK_RCVTSTAMP);
 764		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 765	} else {
 766		sock_reset_flag(sk, SOCK_RCVTSTAMP);
 767		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 768	}
 769}
 770
 771void sock_enable_timestamps(struct sock *sk)
 772{
 773	lock_sock(sk);
 774	__sock_set_timestamps(sk, true, false, true);
 775	release_sock(sk);
 776}
 777EXPORT_SYMBOL(sock_enable_timestamps);
 778
 779void sock_set_keepalive(struct sock *sk)
 780{
 781	lock_sock(sk);
 782	if (sk->sk_prot->keepalive)
 783		sk->sk_prot->keepalive(sk, true);
 784	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 785	release_sock(sk);
 786}
 787EXPORT_SYMBOL(sock_set_keepalive);
 788
 789static void __sock_set_rcvbuf(struct sock *sk, int val)
 790{
 791	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 792	 * as a negative value.
 793	 */
 794	val = min_t(int, val, INT_MAX / 2);
 795	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 796
 797	/* We double it on the way in to account for "struct sk_buff" etc.
 798	 * overhead.   Applications assume that the SO_RCVBUF setting they make
 799	 * will allow that much actual data to be received on that socket.
 800	 *
 801	 * Applications are unaware that "struct sk_buff" and other overheads
 802	 * allocate from the receive buffer during socket buffer allocation.
 803	 *
 804	 * And after considering the possible alternatives, returning the value
 805	 * we actually used in getsockopt is the most desirable behavior.
 806	 */
 807	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 808}
 809
 810void sock_set_rcvbuf(struct sock *sk, int val)
 811{
 812	lock_sock(sk);
 813	__sock_set_rcvbuf(sk, val);
 814	release_sock(sk);
 815}
 816EXPORT_SYMBOL(sock_set_rcvbuf);
 817
 818void sock_set_mark(struct sock *sk, u32 val)
 819{
 820	lock_sock(sk);
 821	sk->sk_mark = val;
 822	release_sock(sk);
 823}
 824EXPORT_SYMBOL(sock_set_mark);
 825
 826/*
 827 *	This is meant for all protocols to use and covers goings on
 828 *	at the socket level. Everything here is generic.
 829 */
 830
 831int sock_setsockopt(struct socket *sock, int level, int optname,
 832		    sockptr_t optval, unsigned int optlen)
 833{
 834	struct sock_txtime sk_txtime;
 835	struct sock *sk = sock->sk;
 836	int val;
 837	int valbool;
 838	struct linger ling;
 839	int ret = 0;
 840
 841	/*
 842	 *	Options without arguments
 843	 */
 844
 845	if (optname == SO_BINDTODEVICE)
 846		return sock_setbindtodevice(sk, optval, optlen);
 847
 848	if (optlen < sizeof(int))
 849		return -EINVAL;
 850
 851	if (copy_from_sockptr(&val, optval, sizeof(val)))
 852		return -EFAULT;
 853
 854	valbool = val ? 1 : 0;
 855
 856	lock_sock(sk);
 857
 858	switch (optname) {
 859	case SO_DEBUG:
 860		if (val && !capable(CAP_NET_ADMIN))
 861			ret = -EACCES;
 862		else
 863			sock_valbool_flag(sk, SOCK_DBG, valbool);
 864		break;
 865	case SO_REUSEADDR:
 866		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 867		break;
 868	case SO_REUSEPORT:
 869		sk->sk_reuseport = valbool;
 870		break;
 871	case SO_TYPE:
 872	case SO_PROTOCOL:
 873	case SO_DOMAIN:
 874	case SO_ERROR:
 875		ret = -ENOPROTOOPT;
 876		break;
 877	case SO_DONTROUTE:
 878		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 879		sk_dst_reset(sk);
 880		break;
 881	case SO_BROADCAST:
 882		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 883		break;
 884	case SO_SNDBUF:
 885		/* Don't error on this BSD doesn't and if you think
 886		 * about it this is right. Otherwise apps have to
 887		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 888		 * are treated in BSD as hints
 889		 */
 890		val = min_t(u32, val, sysctl_wmem_max);
 891set_sndbuf:
 892		/* Ensure val * 2 fits into an int, to prevent max_t()
 893		 * from treating it as a negative value.
 894		 */
 895		val = min_t(int, val, INT_MAX / 2);
 896		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 897		WRITE_ONCE(sk->sk_sndbuf,
 898			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
 899		/* Wake up sending tasks if we upped the value. */
 900		sk->sk_write_space(sk);
 901		break;
 902
 903	case SO_SNDBUFFORCE:
 904		if (!capable(CAP_NET_ADMIN)) {
 905			ret = -EPERM;
 906			break;
 907		}
 908
 909		/* No negative values (to prevent underflow, as val will be
 910		 * multiplied by 2).
 911		 */
 912		if (val < 0)
 913			val = 0;
 914		goto set_sndbuf;
 915
 916	case SO_RCVBUF:
 917		/* Don't error on this BSD doesn't and if you think
 918		 * about it this is right. Otherwise apps have to
 919		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 920		 * are treated in BSD as hints
 921		 */
 922		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
 923		break;
 924
 925	case SO_RCVBUFFORCE:
 926		if (!capable(CAP_NET_ADMIN)) {
 927			ret = -EPERM;
 928			break;
 929		}
 930
 931		/* No negative values (to prevent underflow, as val will be
 932		 * multiplied by 2).
 933		 */
 934		__sock_set_rcvbuf(sk, max(val, 0));
 935		break;
 936
 937	case SO_KEEPALIVE:
 938		if (sk->sk_prot->keepalive)
 939			sk->sk_prot->keepalive(sk, valbool);
 940		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 941		break;
 942
 943	case SO_OOBINLINE:
 944		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 945		break;
 946
 947	case SO_NO_CHECK:
 948		sk->sk_no_check_tx = valbool;
 949		break;
 950
 951	case SO_PRIORITY:
 952		if ((val >= 0 && val <= 6) ||
 953		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 954			sk->sk_priority = val;
 955		else
 956			ret = -EPERM;
 957		break;
 958
 959	case SO_LINGER:
 960		if (optlen < sizeof(ling)) {
 961			ret = -EINVAL;	/* 1003.1g */
 962			break;
 963		}
 964		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
 965			ret = -EFAULT;
 966			break;
 967		}
 968		if (!ling.l_onoff)
 969			sock_reset_flag(sk, SOCK_LINGER);
 970		else {
 971#if (BITS_PER_LONG == 32)
 972			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 973				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 974			else
 975#endif
 976				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 977			sock_set_flag(sk, SOCK_LINGER);
 978		}
 979		break;
 980
 981	case SO_BSDCOMPAT:
 982		break;
 983
 984	case SO_PASSCRED:
 985		if (valbool)
 986			set_bit(SOCK_PASSCRED, &sock->flags);
 987		else
 988			clear_bit(SOCK_PASSCRED, &sock->flags);
 989		break;
 990
 991	case SO_TIMESTAMP_OLD:
 992		__sock_set_timestamps(sk, valbool, false, false);
 993		break;
 994	case SO_TIMESTAMP_NEW:
 995		__sock_set_timestamps(sk, valbool, true, false);
 996		break;
 997	case SO_TIMESTAMPNS_OLD:
 998		__sock_set_timestamps(sk, valbool, false, true);
 999		break;
1000	case SO_TIMESTAMPNS_NEW:
1001		__sock_set_timestamps(sk, valbool, true, true);
1002		break;
1003	case SO_TIMESTAMPING_NEW:
1004	case SO_TIMESTAMPING_OLD:
1005		if (val & ~SOF_TIMESTAMPING_MASK) {
1006			ret = -EINVAL;
1007			break;
1008		}
1009
1010		if (val & SOF_TIMESTAMPING_OPT_ID &&
1011		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
1012			if (sk->sk_protocol == IPPROTO_TCP &&
1013			    sk->sk_type == SOCK_STREAM) {
1014				if ((1 << sk->sk_state) &
1015				    (TCPF_CLOSE | TCPF_LISTEN)) {
1016					ret = -EINVAL;
1017					break;
1018				}
1019				sk->sk_tskey = tcp_sk(sk)->snd_una;
1020			} else {
1021				sk->sk_tskey = 0;
1022			}
1023		}
1024
1025		if (val & SOF_TIMESTAMPING_OPT_STATS &&
1026		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
1027			ret = -EINVAL;
1028			break;
1029		}
1030
1031		sk->sk_tsflags = val;
1032		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
1033
1034		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
1035			sock_enable_timestamp(sk,
1036					      SOCK_TIMESTAMPING_RX_SOFTWARE);
1037		else
1038			sock_disable_timestamp(sk,
1039					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
1040		break;
1041
1042	case SO_RCVLOWAT:
1043		if (val < 0)
1044			val = INT_MAX;
1045		if (sock->ops->set_rcvlowat)
1046			ret = sock->ops->set_rcvlowat(sk, val);
1047		else
1048			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1049		break;
1050
1051	case SO_RCVTIMEO_OLD:
1052	case SO_RCVTIMEO_NEW:
1053		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1054				       optlen, optname == SO_RCVTIMEO_OLD);
1055		break;
1056
1057	case SO_SNDTIMEO_OLD:
1058	case SO_SNDTIMEO_NEW:
1059		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1060				       optlen, optname == SO_SNDTIMEO_OLD);
1061		break;
1062
1063	case SO_ATTACH_FILTER: {
1064		struct sock_fprog fprog;
1065
1066		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1067		if (!ret)
1068			ret = sk_attach_filter(&fprog, sk);
1069		break;
1070	}
1071	case SO_ATTACH_BPF:
1072		ret = -EINVAL;
1073		if (optlen == sizeof(u32)) {
1074			u32 ufd;
1075
1076			ret = -EFAULT;
1077			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1078				break;
1079
1080			ret = sk_attach_bpf(ufd, sk);
1081		}
1082		break;
1083
1084	case SO_ATTACH_REUSEPORT_CBPF: {
1085		struct sock_fprog fprog;
1086
1087		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1088		if (!ret)
1089			ret = sk_reuseport_attach_filter(&fprog, sk);
1090		break;
1091	}
1092	case SO_ATTACH_REUSEPORT_EBPF:
1093		ret = -EINVAL;
1094		if (optlen == sizeof(u32)) {
1095			u32 ufd;
1096
1097			ret = -EFAULT;
1098			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1099				break;
1100
1101			ret = sk_reuseport_attach_bpf(ufd, sk);
1102		}
1103		break;
1104
1105	case SO_DETACH_REUSEPORT_BPF:
1106		ret = reuseport_detach_prog(sk);
1107		break;
1108
1109	case SO_DETACH_FILTER:
1110		ret = sk_detach_filter(sk);
1111		break;
1112
1113	case SO_LOCK_FILTER:
1114		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1115			ret = -EPERM;
1116		else
1117			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1118		break;
1119
1120	case SO_PASSSEC:
1121		if (valbool)
1122			set_bit(SOCK_PASSSEC, &sock->flags);
1123		else
1124			clear_bit(SOCK_PASSSEC, &sock->flags);
1125		break;
1126	case SO_MARK:
1127		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1128			ret = -EPERM;
1129		} else if (val != sk->sk_mark) {
1130			sk->sk_mark = val;
1131			sk_dst_reset(sk);
1132		}
1133		break;
1134
1135	case SO_RXQ_OVFL:
1136		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1137		break;
1138
1139	case SO_WIFI_STATUS:
1140		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1141		break;
1142
1143	case SO_PEEK_OFF:
1144		if (sock->ops->set_peek_off)
1145			ret = sock->ops->set_peek_off(sk, val);
1146		else
1147			ret = -EOPNOTSUPP;
1148		break;
1149
1150	case SO_NOFCS:
1151		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1152		break;
1153
1154	case SO_SELECT_ERR_QUEUE:
1155		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1156		break;
1157
1158#ifdef CONFIG_NET_RX_BUSY_POLL
1159	case SO_BUSY_POLL:
1160		/* allow unprivileged users to decrease the value */
1161		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1162			ret = -EPERM;
1163		else {
1164			if (val < 0)
1165				ret = -EINVAL;
1166			else
1167				sk->sk_ll_usec = val;
1168		}
1169		break;
1170	case SO_PREFER_BUSY_POLL:
1171		if (valbool && !capable(CAP_NET_ADMIN))
1172			ret = -EPERM;
1173		else
1174			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1175		break;
1176	case SO_BUSY_POLL_BUDGET:
1177		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1178			ret = -EPERM;
1179		} else {
1180			if (val < 0 || val > U16_MAX)
1181				ret = -EINVAL;
1182			else
1183				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1184		}
1185		break;
1186#endif
1187
1188	case SO_MAX_PACING_RATE:
1189		{
1190		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1191
1192		if (sizeof(ulval) != sizeof(val) &&
1193		    optlen >= sizeof(ulval) &&
1194		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1195			ret = -EFAULT;
1196			break;
1197		}
1198		if (ulval != ~0UL)
1199			cmpxchg(&sk->sk_pacing_status,
1200				SK_PACING_NONE,
1201				SK_PACING_NEEDED);
1202		sk->sk_max_pacing_rate = ulval;
1203		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1204		break;
1205		}
1206	case SO_INCOMING_CPU:
1207		WRITE_ONCE(sk->sk_incoming_cpu, val);
1208		break;
1209
1210	case SO_CNX_ADVICE:
1211		if (val == 1)
1212			dst_negative_advice(sk);
1213		break;
1214
1215	case SO_ZEROCOPY:
1216		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1217			if (!((sk->sk_type == SOCK_STREAM &&
1218			       sk->sk_protocol == IPPROTO_TCP) ||
1219			      (sk->sk_type == SOCK_DGRAM &&
1220			       sk->sk_protocol == IPPROTO_UDP)))
1221				ret = -ENOTSUPP;
1222		} else if (sk->sk_family != PF_RDS) {
1223			ret = -ENOTSUPP;
1224		}
1225		if (!ret) {
1226			if (val < 0 || val > 1)
1227				ret = -EINVAL;
1228			else
1229				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1230		}
1231		break;
1232
1233	case SO_TXTIME:
1234		if (optlen != sizeof(struct sock_txtime)) {
1235			ret = -EINVAL;
1236			break;
1237		} else if (copy_from_sockptr(&sk_txtime, optval,
1238			   sizeof(struct sock_txtime))) {
1239			ret = -EFAULT;
1240			break;
1241		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1242			ret = -EINVAL;
1243			break;
1244		}
1245		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1246		 * scheduler has enough safe guards.
1247		 */
1248		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1249		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1250			ret = -EPERM;
1251			break;
1252		}
1253		sock_valbool_flag(sk, SOCK_TXTIME, true);
1254		sk->sk_clockid = sk_txtime.clockid;
1255		sk->sk_txtime_deadline_mode =
1256			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1257		sk->sk_txtime_report_errors =
1258			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1259		break;
1260
1261	case SO_BINDTOIFINDEX:
1262		ret = sock_bindtoindex_locked(sk, val);
1263		break;
1264
1265	default:
1266		ret = -ENOPROTOOPT;
1267		break;
1268	}
1269	release_sock(sk);
1270	return ret;
1271}
1272EXPORT_SYMBOL(sock_setsockopt);
1273
1274
1275static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1276			  struct ucred *ucred)
1277{
1278	ucred->pid = pid_vnr(pid);
1279	ucred->uid = ucred->gid = -1;
1280	if (cred) {
1281		struct user_namespace *current_ns = current_user_ns();
1282
1283		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1284		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1285	}
1286}
1287
1288static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1289{
1290	struct user_namespace *user_ns = current_user_ns();
1291	int i;
1292
1293	for (i = 0; i < src->ngroups; i++)
1294		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1295			return -EFAULT;
1296
1297	return 0;
1298}
1299
1300int sock_getsockopt(struct socket *sock, int level, int optname,
1301		    char __user *optval, int __user *optlen)
1302{
1303	struct sock *sk = sock->sk;
1304
1305	union {
1306		int val;
1307		u64 val64;
1308		unsigned long ulval;
1309		struct linger ling;
1310		struct old_timeval32 tm32;
1311		struct __kernel_old_timeval tm;
1312		struct  __kernel_sock_timeval stm;
1313		struct sock_txtime txtime;
1314	} v;
1315
1316	int lv = sizeof(int);
1317	int len;
1318
1319	if (get_user(len, optlen))
1320		return -EFAULT;
1321	if (len < 0)
1322		return -EINVAL;
1323
1324	memset(&v, 0, sizeof(v));
1325
1326	switch (optname) {
1327	case SO_DEBUG:
1328		v.val = sock_flag(sk, SOCK_DBG);
1329		break;
1330
1331	case SO_DONTROUTE:
1332		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1333		break;
1334
1335	case SO_BROADCAST:
1336		v.val = sock_flag(sk, SOCK_BROADCAST);
1337		break;
1338
1339	case SO_SNDBUF:
1340		v.val = sk->sk_sndbuf;
1341		break;
1342
1343	case SO_RCVBUF:
1344		v.val = sk->sk_rcvbuf;
1345		break;
1346
1347	case SO_REUSEADDR:
1348		v.val = sk->sk_reuse;
1349		break;
1350
1351	case SO_REUSEPORT:
1352		v.val = sk->sk_reuseport;
1353		break;
1354
1355	case SO_KEEPALIVE:
1356		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1357		break;
1358
1359	case SO_TYPE:
1360		v.val = sk->sk_type;
1361		break;
1362
1363	case SO_PROTOCOL:
1364		v.val = sk->sk_protocol;
1365		break;
1366
1367	case SO_DOMAIN:
1368		v.val = sk->sk_family;
1369		break;
1370
1371	case SO_ERROR:
1372		v.val = -sock_error(sk);
1373		if (v.val == 0)
1374			v.val = xchg(&sk->sk_err_soft, 0);
1375		break;
1376
1377	case SO_OOBINLINE:
1378		v.val = sock_flag(sk, SOCK_URGINLINE);
1379		break;
1380
1381	case SO_NO_CHECK:
1382		v.val = sk->sk_no_check_tx;
1383		break;
1384
1385	case SO_PRIORITY:
1386		v.val = sk->sk_priority;
1387		break;
1388
1389	case SO_LINGER:
1390		lv		= sizeof(v.ling);
1391		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1392		v.ling.l_linger	= sk->sk_lingertime / HZ;
1393		break;
1394
1395	case SO_BSDCOMPAT:
1396		break;
1397
1398	case SO_TIMESTAMP_OLD:
1399		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1400				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1401				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1402		break;
1403
1404	case SO_TIMESTAMPNS_OLD:
1405		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1406		break;
1407
1408	case SO_TIMESTAMP_NEW:
1409		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1410		break;
1411
1412	case SO_TIMESTAMPNS_NEW:
1413		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1414		break;
1415
1416	case SO_TIMESTAMPING_OLD:
1417		v.val = sk->sk_tsflags;
1418		break;
1419
1420	case SO_RCVTIMEO_OLD:
1421	case SO_RCVTIMEO_NEW:
1422		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1423		break;
1424
1425	case SO_SNDTIMEO_OLD:
1426	case SO_SNDTIMEO_NEW:
1427		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1428		break;
1429
1430	case SO_RCVLOWAT:
1431		v.val = sk->sk_rcvlowat;
1432		break;
1433
1434	case SO_SNDLOWAT:
1435		v.val = 1;
1436		break;
1437
1438	case SO_PASSCRED:
1439		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1440		break;
1441
1442	case SO_PEERCRED:
1443	{
1444		struct ucred peercred;
1445		if (len > sizeof(peercred))
1446			len = sizeof(peercred);
1447		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1448		if (copy_to_user(optval, &peercred, len))
1449			return -EFAULT;
1450		goto lenout;
1451	}
1452
1453	case SO_PEERGROUPS:
1454	{
1455		int ret, n;
1456
1457		if (!sk->sk_peer_cred)
1458			return -ENODATA;
1459
1460		n = sk->sk_peer_cred->group_info->ngroups;
1461		if (len < n * sizeof(gid_t)) {
1462			len = n * sizeof(gid_t);
1463			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1464		}
1465		len = n * sizeof(gid_t);
1466
1467		ret = groups_to_user((gid_t __user *)optval,
1468				     sk->sk_peer_cred->group_info);
1469		if (ret)
1470			return ret;
1471		goto lenout;
1472	}
1473
1474	case SO_PEERNAME:
1475	{
1476		char address[128];
1477
1478		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1479		if (lv < 0)
1480			return -ENOTCONN;
1481		if (lv < len)
1482			return -EINVAL;
1483		if (copy_to_user(optval, address, len))
1484			return -EFAULT;
1485		goto lenout;
1486	}
1487
1488	/* Dubious BSD thing... Probably nobody even uses it, but
1489	 * the UNIX standard wants it for whatever reason... -DaveM
1490	 */
1491	case SO_ACCEPTCONN:
1492		v.val = sk->sk_state == TCP_LISTEN;
1493		break;
1494
1495	case SO_PASSSEC:
1496		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1497		break;
1498
1499	case SO_PEERSEC:
1500		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1501
1502	case SO_MARK:
1503		v.val = sk->sk_mark;
1504		break;
1505
1506	case SO_RXQ_OVFL:
1507		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1508		break;
1509
1510	case SO_WIFI_STATUS:
1511		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1512		break;
1513
1514	case SO_PEEK_OFF:
1515		if (!sock->ops->set_peek_off)
1516			return -EOPNOTSUPP;
1517
1518		v.val = sk->sk_peek_off;
1519		break;
1520	case SO_NOFCS:
1521		v.val = sock_flag(sk, SOCK_NOFCS);
1522		break;
1523
1524	case SO_BINDTODEVICE:
1525		return sock_getbindtodevice(sk, optval, optlen, len);
1526
1527	case SO_GET_FILTER:
1528		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1529		if (len < 0)
1530			return len;
1531
1532		goto lenout;
1533
1534	case SO_LOCK_FILTER:
1535		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1536		break;
1537
1538	case SO_BPF_EXTENSIONS:
1539		v.val = bpf_tell_extensions();
1540		break;
1541
1542	case SO_SELECT_ERR_QUEUE:
1543		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1544		break;
1545
1546#ifdef CONFIG_NET_RX_BUSY_POLL
1547	case SO_BUSY_POLL:
1548		v.val = sk->sk_ll_usec;
1549		break;
1550	case SO_PREFER_BUSY_POLL:
1551		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1552		break;
1553#endif
1554
1555	case SO_MAX_PACING_RATE:
1556		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1557			lv = sizeof(v.ulval);
1558			v.ulval = sk->sk_max_pacing_rate;
1559		} else {
1560			/* 32bit version */
1561			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1562		}
1563		break;
1564
1565	case SO_INCOMING_CPU:
1566		v.val = READ_ONCE(sk->sk_incoming_cpu);
1567		break;
1568
1569	case SO_MEMINFO:
1570	{
1571		u32 meminfo[SK_MEMINFO_VARS];
1572
1573		sk_get_meminfo(sk, meminfo);
1574
1575		len = min_t(unsigned int, len, sizeof(meminfo));
1576		if (copy_to_user(optval, &meminfo, len))
1577			return -EFAULT;
1578
1579		goto lenout;
1580	}
1581
1582#ifdef CONFIG_NET_RX_BUSY_POLL
1583	case SO_INCOMING_NAPI_ID:
1584		v.val = READ_ONCE(sk->sk_napi_id);
1585
1586		/* aggregate non-NAPI IDs down to 0 */
1587		if (v.val < MIN_NAPI_ID)
1588			v.val = 0;
1589
1590		break;
1591#endif
1592
1593	case SO_COOKIE:
1594		lv = sizeof(u64);
1595		if (len < lv)
1596			return -EINVAL;
1597		v.val64 = sock_gen_cookie(sk);
1598		break;
1599
1600	case SO_ZEROCOPY:
1601		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1602		break;
1603
1604	case SO_TXTIME:
1605		lv = sizeof(v.txtime);
1606		v.txtime.clockid = sk->sk_clockid;
1607		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1608				  SOF_TXTIME_DEADLINE_MODE : 0;
1609		v.txtime.flags |= sk->sk_txtime_report_errors ?
1610				  SOF_TXTIME_REPORT_ERRORS : 0;
1611		break;
1612
1613	case SO_BINDTOIFINDEX:
1614		v.val = sk->sk_bound_dev_if;
1615		break;
1616
1617	default:
1618		/* We implement the SO_SNDLOWAT etc to not be settable
1619		 * (1003.1g 7).
1620		 */
1621		return -ENOPROTOOPT;
1622	}
1623
1624	if (len > lv)
1625		len = lv;
1626	if (copy_to_user(optval, &v, len))
1627		return -EFAULT;
1628lenout:
1629	if (put_user(len, optlen))
1630		return -EFAULT;
1631	return 0;
1632}
1633
1634/*
1635 * Initialize an sk_lock.
1636 *
1637 * (We also register the sk_lock with the lock validator.)
1638 */
1639static inline void sock_lock_init(struct sock *sk)
1640{
1641	if (sk->sk_kern_sock)
1642		sock_lock_init_class_and_name(
1643			sk,
1644			af_family_kern_slock_key_strings[sk->sk_family],
1645			af_family_kern_slock_keys + sk->sk_family,
1646			af_family_kern_key_strings[sk->sk_family],
1647			af_family_kern_keys + sk->sk_family);
1648	else
1649		sock_lock_init_class_and_name(
1650			sk,
1651			af_family_slock_key_strings[sk->sk_family],
1652			af_family_slock_keys + sk->sk_family,
1653			af_family_key_strings[sk->sk_family],
1654			af_family_keys + sk->sk_family);
1655}
1656
1657/*
1658 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1659 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1660 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1661 */
1662static void sock_copy(struct sock *nsk, const struct sock *osk)
1663{
1664	const struct proto *prot = READ_ONCE(osk->sk_prot);
1665#ifdef CONFIG_SECURITY_NETWORK
1666	void *sptr = nsk->sk_security;
1667#endif
1668
1669	/* If we move sk_tx_queue_mapping out of the private section,
1670	 * we must check if sk_tx_queue_clear() is called after
1671	 * sock_copy() in sk_clone_lock().
1672	 */
1673	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1674		     offsetof(struct sock, sk_dontcopy_begin) ||
1675		     offsetof(struct sock, sk_tx_queue_mapping) >=
1676		     offsetof(struct sock, sk_dontcopy_end));
1677
1678	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1679
1680	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1681	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1682
1683#ifdef CONFIG_SECURITY_NETWORK
1684	nsk->sk_security = sptr;
1685	security_sk_clone(osk, nsk);
1686#endif
1687}
1688
1689static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1690		int family)
1691{
1692	struct sock *sk;
1693	struct kmem_cache *slab;
1694
1695	slab = prot->slab;
1696	if (slab != NULL) {
1697		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1698		if (!sk)
1699			return sk;
1700		if (want_init_on_alloc(priority))
1701			sk_prot_clear_nulls(sk, prot->obj_size);
1702	} else
1703		sk = kmalloc(prot->obj_size, priority);
1704
1705	if (sk != NULL) {
1706		if (security_sk_alloc(sk, family, priority))
1707			goto out_free;
1708
1709		if (!try_module_get(prot->owner))
1710			goto out_free_sec;
1711	}
1712
1713	return sk;
1714
1715out_free_sec:
1716	security_sk_free(sk);
1717out_free:
1718	if (slab != NULL)
1719		kmem_cache_free(slab, sk);
1720	else
1721		kfree(sk);
1722	return NULL;
1723}
1724
1725static void sk_prot_free(struct proto *prot, struct sock *sk)
1726{
1727	struct kmem_cache *slab;
1728	struct module *owner;
1729
1730	owner = prot->owner;
1731	slab = prot->slab;
1732
1733	cgroup_sk_free(&sk->sk_cgrp_data);
1734	mem_cgroup_sk_free(sk);
1735	security_sk_free(sk);
1736	if (slab != NULL)
1737		kmem_cache_free(slab, sk);
1738	else
1739		kfree(sk);
1740	module_put(owner);
1741}
1742
1743/**
1744 *	sk_alloc - All socket objects are allocated here
1745 *	@net: the applicable net namespace
1746 *	@family: protocol family
1747 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1748 *	@prot: struct proto associated with this new sock instance
1749 *	@kern: is this to be a kernel socket?
1750 */
1751struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1752		      struct proto *prot, int kern)
1753{
1754	struct sock *sk;
1755
1756	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1757	if (sk) {
1758		sk->sk_family = family;
1759		/*
1760		 * See comment in struct sock definition to understand
1761		 * why we need sk_prot_creator -acme
1762		 */
1763		sk->sk_prot = sk->sk_prot_creator = prot;
1764		sk->sk_kern_sock = kern;
1765		sock_lock_init(sk);
1766		sk->sk_net_refcnt = kern ? 0 : 1;
1767		if (likely(sk->sk_net_refcnt)) {
1768			get_net(net);
1769			sock_inuse_add(net, 1);
1770		}
1771
1772		sock_net_set(sk, net);
1773		refcount_set(&sk->sk_wmem_alloc, 1);
1774
1775		mem_cgroup_sk_alloc(sk);
1776		cgroup_sk_alloc(&sk->sk_cgrp_data);
1777		sock_update_classid(&sk->sk_cgrp_data);
1778		sock_update_netprioidx(&sk->sk_cgrp_data);
1779		sk_tx_queue_clear(sk);
1780	}
1781
1782	return sk;
1783}
1784EXPORT_SYMBOL(sk_alloc);
1785
1786/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1787 * grace period. This is the case for UDP sockets and TCP listeners.
1788 */
1789static void __sk_destruct(struct rcu_head *head)
1790{
1791	struct sock *sk = container_of(head, struct sock, sk_rcu);
1792	struct sk_filter *filter;
1793
1794	if (sk->sk_destruct)
1795		sk->sk_destruct(sk);
1796
1797	filter = rcu_dereference_check(sk->sk_filter,
1798				       refcount_read(&sk->sk_wmem_alloc) == 0);
1799	if (filter) {
1800		sk_filter_uncharge(sk, filter);
1801		RCU_INIT_POINTER(sk->sk_filter, NULL);
1802	}
1803
1804	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1805
1806#ifdef CONFIG_BPF_SYSCALL
1807	bpf_sk_storage_free(sk);
1808#endif
1809
1810	if (atomic_read(&sk->sk_omem_alloc))
1811		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1812			 __func__, atomic_read(&sk->sk_omem_alloc));
1813
1814	if (sk->sk_frag.page) {
1815		put_page(sk->sk_frag.page);
1816		sk->sk_frag.page = NULL;
1817	}
1818
1819	if (sk->sk_peer_cred)
1820		put_cred(sk->sk_peer_cred);
1821	put_pid(sk->sk_peer_pid);
1822	if (likely(sk->sk_net_refcnt))
1823		put_net(sock_net(sk));
1824	sk_prot_free(sk->sk_prot_creator, sk);
1825}
1826
1827void sk_destruct(struct sock *sk)
1828{
1829	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1830
1831	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1832		reuseport_detach_sock(sk);
1833		use_call_rcu = true;
1834	}
1835
1836	if (use_call_rcu)
1837		call_rcu(&sk->sk_rcu, __sk_destruct);
1838	else
1839		__sk_destruct(&sk->sk_rcu);
1840}
1841
1842static void __sk_free(struct sock *sk)
1843{
1844	if (likely(sk->sk_net_refcnt))
1845		sock_inuse_add(sock_net(sk), -1);
1846
1847	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1848		sock_diag_broadcast_destroy(sk);
1849	else
1850		sk_destruct(sk);
1851}
1852
1853void sk_free(struct sock *sk)
1854{
1855	/*
1856	 * We subtract one from sk_wmem_alloc and can know if
1857	 * some packets are still in some tx queue.
1858	 * If not null, sock_wfree() will call __sk_free(sk) later
1859	 */
1860	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1861		__sk_free(sk);
1862}
1863EXPORT_SYMBOL(sk_free);
1864
1865static void sk_init_common(struct sock *sk)
1866{
1867	skb_queue_head_init(&sk->sk_receive_queue);
1868	skb_queue_head_init(&sk->sk_write_queue);
1869	skb_queue_head_init(&sk->sk_error_queue);
1870
1871	rwlock_init(&sk->sk_callback_lock);
1872	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1873			af_rlock_keys + sk->sk_family,
1874			af_family_rlock_key_strings[sk->sk_family]);
1875	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1876			af_wlock_keys + sk->sk_family,
1877			af_family_wlock_key_strings[sk->sk_family]);
1878	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1879			af_elock_keys + sk->sk_family,
1880			af_family_elock_key_strings[sk->sk_family]);
1881	lockdep_set_class_and_name(&sk->sk_callback_lock,
1882			af_callback_keys + sk->sk_family,
1883			af_family_clock_key_strings[sk->sk_family]);
1884}
1885
1886/**
1887 *	sk_clone_lock - clone a socket, and lock its clone
1888 *	@sk: the socket to clone
1889 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1890 *
1891 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1892 */
1893struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1894{
1895	struct proto *prot = READ_ONCE(sk->sk_prot);
1896	struct sk_filter *filter;
1897	bool is_charged = true;
1898	struct sock *newsk;
1899
1900	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
1901	if (!newsk)
1902		goto out;
1903
1904	sock_copy(newsk, sk);
1905
1906	newsk->sk_prot_creator = prot;
1907
1908	/* SANITY */
1909	if (likely(newsk->sk_net_refcnt))
1910		get_net(sock_net(newsk));
1911	sk_node_init(&newsk->sk_node);
1912	sock_lock_init(newsk);
1913	bh_lock_sock(newsk);
1914	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1915	newsk->sk_backlog.len = 0;
1916
1917	atomic_set(&newsk->sk_rmem_alloc, 0);
1918
1919	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
1920	refcount_set(&newsk->sk_wmem_alloc, 1);
1921
1922	atomic_set(&newsk->sk_omem_alloc, 0);
1923	sk_init_common(newsk);
1924
1925	newsk->sk_dst_cache	= NULL;
1926	newsk->sk_dst_pending_confirm = 0;
1927	newsk->sk_wmem_queued	= 0;
1928	newsk->sk_forward_alloc = 0;
1929	atomic_set(&newsk->sk_drops, 0);
1930	newsk->sk_send_head	= NULL;
1931	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1932	atomic_set(&newsk->sk_zckey, 0);
1933
1934	sock_reset_flag(newsk, SOCK_DONE);
1935
1936	/* sk->sk_memcg will be populated at accept() time */
1937	newsk->sk_memcg = NULL;
1938
1939	cgroup_sk_clone(&newsk->sk_cgrp_data);
1940
1941	rcu_read_lock();
1942	filter = rcu_dereference(sk->sk_filter);
1943	if (filter != NULL)
1944		/* though it's an empty new sock, the charging may fail
1945		 * if sysctl_optmem_max was changed between creation of
1946		 * original socket and cloning
1947		 */
1948		is_charged = sk_filter_charge(newsk, filter);
1949	RCU_INIT_POINTER(newsk->sk_filter, filter);
1950	rcu_read_unlock();
1951
1952	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1953		/* We need to make sure that we don't uncharge the new
1954		 * socket if we couldn't charge it in the first place
1955		 * as otherwise we uncharge the parent's filter.
1956		 */
1957		if (!is_charged)
1958			RCU_INIT_POINTER(newsk->sk_filter, NULL);
1959		sk_free_unlock_clone(newsk);
1960		newsk = NULL;
1961		goto out;
1962	}
1963	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1964
1965	if (bpf_sk_storage_clone(sk, newsk)) {
1966		sk_free_unlock_clone(newsk);
1967		newsk = NULL;
1968		goto out;
1969	}
1970
1971	/* Clear sk_user_data if parent had the pointer tagged
1972	 * as not suitable for copying when cloning.
1973	 */
1974	if (sk_user_data_is_nocopy(newsk))
1975		newsk->sk_user_data = NULL;
1976
1977	newsk->sk_err	   = 0;
1978	newsk->sk_err_soft = 0;
1979	newsk->sk_priority = 0;
1980	newsk->sk_incoming_cpu = raw_smp_processor_id();
1981	if (likely(newsk->sk_net_refcnt))
1982		sock_inuse_add(sock_net(newsk), 1);
1983
1984	/* Before updating sk_refcnt, we must commit prior changes to memory
1985	 * (Documentation/RCU/rculist_nulls.rst for details)
1986	 */
1987	smp_wmb();
1988	refcount_set(&newsk->sk_refcnt, 2);
1989
1990	/* Increment the counter in the same struct proto as the master
1991	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1992	 * is the same as sk->sk_prot->socks, as this field was copied
1993	 * with memcpy).
1994	 *
1995	 * This _changes_ the previous behaviour, where
1996	 * tcp_create_openreq_child always was incrementing the
1997	 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1998	 * to be taken into account in all callers. -acme
1999	 */
2000	sk_refcnt_debug_inc(newsk);
2001	sk_set_socket(newsk, NULL);
2002	sk_tx_queue_clear(newsk);
2003	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2004
2005	if (newsk->sk_prot->sockets_allocated)
2006		sk_sockets_allocated_inc(newsk);
2007
2008	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2009		net_enable_timestamp();
2010out:
2011	return newsk;
2012}
2013EXPORT_SYMBOL_GPL(sk_clone_lock);
2014
2015void sk_free_unlock_clone(struct sock *sk)
2016{
2017	/* It is still raw copy of parent, so invalidate
2018	 * destructor and make plain sk_free() */
2019	sk->sk_destruct = NULL;
2020	bh_unlock_sock(sk);
2021	sk_free(sk);
2022}
2023EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2024
2025void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2026{
2027	u32 max_segs = 1;
2028
2029	sk_dst_set(sk, dst);
2030	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2031	if (sk->sk_route_caps & NETIF_F_GSO)
2032		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2033	sk->sk_route_caps &= ~sk->sk_route_nocaps;
2034	if (sk_can_gso(sk)) {
2035		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2036			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2037		} else {
2038			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2039			sk->sk_gso_max_size = dst->dev->gso_max_size;
2040			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2041		}
2042	}
2043	sk->sk_gso_max_segs = max_segs;
2044}
2045EXPORT_SYMBOL_GPL(sk_setup_caps);
2046
2047/*
2048 *	Simple resource managers for sockets.
2049 */
2050
2051
2052/*
2053 * Write buffer destructor automatically called from kfree_skb.
2054 */
2055void sock_wfree(struct sk_buff *skb)
2056{
2057	struct sock *sk = skb->sk;
2058	unsigned int len = skb->truesize;
2059
2060	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2061		/*
2062		 * Keep a reference on sk_wmem_alloc, this will be released
2063		 * after sk_write_space() call
2064		 */
2065		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2066		sk->sk_write_space(sk);
2067		len = 1;
2068	}
2069	/*
2070	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2071	 * could not do because of in-flight packets
2072	 */
2073	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2074		__sk_free(sk);
2075}
2076EXPORT_SYMBOL(sock_wfree);
2077
2078/* This variant of sock_wfree() is used by TCP,
2079 * since it sets SOCK_USE_WRITE_QUEUE.
2080 */
2081void __sock_wfree(struct sk_buff *skb)
2082{
2083	struct sock *sk = skb->sk;
2084
2085	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2086		__sk_free(sk);
2087}
2088
2089void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2090{
2091	skb_orphan(skb);
2092	skb->sk = sk;
2093#ifdef CONFIG_INET
2094	if (unlikely(!sk_fullsock(sk))) {
2095		skb->destructor = sock_edemux;
2096		sock_hold(sk);
2097		return;
2098	}
2099#endif
2100	skb->destructor = sock_wfree;
2101	skb_set_hash_from_sk(skb, sk);
2102	/*
2103	 * We used to take a refcount on sk, but following operation
2104	 * is enough to guarantee sk_free() wont free this sock until
2105	 * all in-flight packets are completed
2106	 */
2107	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2108}
2109EXPORT_SYMBOL(skb_set_owner_w);
2110
2111static bool can_skb_orphan_partial(const struct sk_buff *skb)
2112{
2113#ifdef CONFIG_TLS_DEVICE
2114	/* Drivers depend on in-order delivery for crypto offload,
2115	 * partial orphan breaks out-of-order-OK logic.
2116	 */
2117	if (skb->decrypted)
2118		return false;
2119#endif
2120	return (skb->destructor == sock_wfree ||
2121		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2122}
2123
2124/* This helper is used by netem, as it can hold packets in its
2125 * delay queue. We want to allow the owner socket to send more
2126 * packets, as if they were already TX completed by a typical driver.
2127 * But we also want to keep skb->sk set because some packet schedulers
2128 * rely on it (sch_fq for example).
2129 */
2130void skb_orphan_partial(struct sk_buff *skb)
2131{
2132	if (skb_is_tcp_pure_ack(skb))
2133		return;
2134
2135	if (can_skb_orphan_partial(skb)) {
2136		struct sock *sk = skb->sk;
2137
2138		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
2139			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
2140			skb->destructor = sock_efree;
2141		}
2142	} else {
2143		skb_orphan(skb);
2144	}
2145}
2146EXPORT_SYMBOL(skb_orphan_partial);
2147
2148/*
2149 * Read buffer destructor automatically called from kfree_skb.
2150 */
2151void sock_rfree(struct sk_buff *skb)
2152{
2153	struct sock *sk = skb->sk;
2154	unsigned int len = skb->truesize;
2155
2156	atomic_sub(len, &sk->sk_rmem_alloc);
2157	sk_mem_uncharge(sk, len);
2158}
2159EXPORT_SYMBOL(sock_rfree);
2160
2161/*
2162 * Buffer destructor for skbs that are not used directly in read or write
2163 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2164 */
2165void sock_efree(struct sk_buff *skb)
2166{
2167	sock_put(skb->sk);
2168}
2169EXPORT_SYMBOL(sock_efree);
2170
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 *
 * The reference is dropped only when sk_is_refcounted() reports that the
 * socket is reference counted.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *owner = skb->sk;

	if (!sk_is_refcounted(owner))
		return;

	sock_gen_put(owner);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2182
2183kuid_t sock_i_uid(struct sock *sk)
2184{
2185	kuid_t uid;
2186
2187	read_lock_bh(&sk->sk_callback_lock);
2188	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2189	read_unlock_bh(&sk->sk_callback_lock);
2190	return uid;
2191}
2192EXPORT_SYMBOL(sock_i_uid);
2193
2194unsigned long sock_i_ino(struct sock *sk)
2195{
2196	unsigned long ino;
2197
2198	read_lock_bh(&sk->sk_callback_lock);
2199	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2200	read_unlock_bh(&sk->sk_callback_lock);
2201	return ino;
2202}
2203EXPORT_SYMBOL(sock_i_ino);
2204
2205/*
2206 * Allocate a skb from the socket's send buffer.
2207 */
2208struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2209			     gfp_t priority)
2210{
2211	if (force ||
2212	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2213		struct sk_buff *skb = alloc_skb(size, priority);
2214
2215		if (skb) {
2216			skb_set_owner_w(skb, sk);
2217			return skb;
2218		}
2219	}
2220	return NULL;
2221}
2222EXPORT_SYMBOL(sock_wmalloc);
2223
2224static void sock_ofree(struct sk_buff *skb)
2225{
2226	struct sock *sk = skb->sk;
2227
2228	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2229}
2230
2231struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2232			     gfp_t priority)
2233{
2234	struct sk_buff *skb;
2235
2236	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2237	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2238	    sysctl_optmem_max)
2239		return NULL;
2240
2241	skb = alloc_skb(size, priority);
2242	if (!skb)
2243		return NULL;
2244
2245	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2246	skb->sk = sk;
2247	skb->destructor = sock_ofree;
2248	return skb;
2249}
2250
2251/*
2252 * Allocate a memory block from the socket's option memory buffer.
2253 */
2254void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2255{
2256	if ((unsigned int)size <= sysctl_optmem_max &&
2257	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2258		void *mem;
2259		/* First do the add, to avoid the race if kmalloc
2260		 * might sleep.
2261		 */
2262		atomic_add(size, &sk->sk_omem_alloc);
2263		mem = kmalloc(size, priority);
2264		if (mem)
2265			return mem;
2266		atomic_sub(size, &sk->sk_omem_alloc);
2267	}
2268	return NULL;
2269}
2270EXPORT_SYMBOL(sock_kmalloc);
2271
2272/* Free an option memory block. Note, we actually want the inline
2273 * here as this allows gcc to detect the nullify and fold away the
2274 * condition entirely.
2275 */
2276static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2277				  const bool nullify)
2278{
2279	if (WARN_ON_ONCE(!mem))
2280		return;
2281	if (nullify)
2282		kfree_sensitive(mem);
2283	else
2284		kfree(mem);
2285	atomic_sub(size, &sk->sk_omem_alloc);
2286}
2287
/* Free an option memory block with plain kfree() and uncharge @size
 * from @sk's option memory accounting.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);
2293
/* Like sock_kfree_s(), but the block is released with kfree_sensitive()
 * instead of kfree().
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
2299
2300/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2301   I think, these locks should be removed for datagram sockets.
2302 */
2303static long sock_wait_for_wmem(struct sock *sk, long timeo)
2304{
2305	DEFINE_WAIT(wait);
2306
2307	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2308	for (;;) {
2309		if (!timeo)
2310			break;
2311		if (signal_pending(current))
2312			break;
2313		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2314		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2315		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2316			break;
2317		if (sk->sk_shutdown & SEND_SHUTDOWN)
2318			break;
2319		if (sk->sk_err)
2320			break;
2321		timeo = schedule_timeout(timeo);
2322	}
2323	finish_wait(sk_sleep(sk), &wait);
2324	return timeo;
2325}
2326
2327
2328/*
2329 *	Generic send/receive buffer handlers
2330 */
2331
2332struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2333				     unsigned long data_len, int noblock,
2334				     int *errcode, int max_page_order)
2335{
2336	struct sk_buff *skb;
2337	long timeo;
2338	int err;
2339
2340	timeo = sock_sndtimeo(sk, noblock);
2341	for (;;) {
2342		err = sock_error(sk);
2343		if (err != 0)
2344			goto failure;
2345
2346		err = -EPIPE;
2347		if (sk->sk_shutdown & SEND_SHUTDOWN)
2348			goto failure;
2349
2350		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2351			break;
2352
2353		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2354		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2355		err = -EAGAIN;
2356		if (!timeo)
2357			goto failure;
2358		if (signal_pending(current))
2359			goto interrupted;
2360		timeo = sock_wait_for_wmem(sk, timeo);
2361	}
2362	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2363				   errcode, sk->sk_allocation);
2364	if (skb)
2365		skb_set_owner_w(skb, sk);
2366	return skb;
2367
2368interrupted:
2369	err = sock_intr_errno(timeo);
2370failure:
2371	*errcode = err;
2372	return NULL;
2373}
2374EXPORT_SYMBOL(sock_alloc_send_pskb);
2375
/* Convenience wrapper around sock_alloc_send_pskb(): @size is passed as
 * the header length, with a data length of 0 and a max page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
2382
2383int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2384		     struct sockcm_cookie *sockc)
2385{
2386	u32 tsflags;
2387
2388	switch (cmsg->cmsg_type) {
2389	case SO_MARK:
2390		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2391			return -EPERM;
2392		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2393			return -EINVAL;
2394		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2395		break;
2396	case SO_TIMESTAMPING_OLD:
2397		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2398			return -EINVAL;
2399
2400		tsflags = *(u32 *)CMSG_DATA(cmsg);
2401		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2402			return -EINVAL;
2403
2404		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2405		sockc->tsflags |= tsflags;
2406		break;
2407	case SCM_TXTIME:
2408		if (!sock_flag(sk, SOCK_TXTIME))
2409			return -EINVAL;
2410		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2411			return -EINVAL;
2412		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2413		break;
2414	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2415	case SCM_RIGHTS:
2416	case SCM_CREDENTIALS:
2417		break;
2418	default:
2419		return -EINVAL;
2420	}
2421	return 0;
2422}
2423EXPORT_SYMBOL(__sock_cmsg_send);
2424
2425int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2426		   struct sockcm_cookie *sockc)
2427{
2428	struct cmsghdr *cmsg;
2429	int ret;
2430
2431	for_each_cmsghdr(cmsg, msg) {
2432		if (!CMSG_OK(msg, cmsg))
2433			return -EINVAL;
2434		if (cmsg->cmsg_level != SOL_SOCKET)
2435			continue;
2436		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2437		if (ret)
2438			return ret;
2439	}
2440	return 0;
2441}
2442EXPORT_SYMBOL(sock_cmsg_send);
2443
2444static void sk_enter_memory_pressure(struct sock *sk)
2445{
2446	if (!sk->sk_prot->enter_memory_pressure)
2447		return;
2448
2449	sk->sk_prot->enter_memory_pressure(sk);
2450}
2451
2452static void sk_leave_memory_pressure(struct sock *sk)
2453{
2454	if (sk->sk_prot->leave_memory_pressure) {
2455		sk->sk_prot->leave_memory_pressure(sk);
2456	} else {
2457		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2458
2459		if (memory_pressure && READ_ONCE(*memory_pressure))
2460			WRITE_ONCE(*memory_pressure, 0);
2461	}
2462}
2463
2464#define SKB_FRAG_PAGE_ORDER	get_order(32768)
2465DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2466
2467/**
2468 * skb_page_frag_refill - check that a page_frag contains enough room
2469 * @sz: minimum size of the fragment we want to get
2470 * @pfrag: pointer to page_frag
2471 * @gfp: priority for memory allocation
2472 *
2473 * Note: While this allocator tries to use high order pages, there is
2474 * no guarantee that allocations succeed. Therefore, @sz MUST be
2475 * less or equal than PAGE_SIZE.
2476 */
2477bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2478{
2479	if (pfrag->page) {
2480		if (page_ref_count(pfrag->page) == 1) {
2481			pfrag->offset = 0;
2482			return true;
2483		}
2484		if (pfrag->offset + sz <= pfrag->size)
2485			return true;
2486		put_page(pfrag->page);
2487	}
2488
2489	pfrag->offset = 0;
2490	if (SKB_FRAG_PAGE_ORDER &&
2491	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2492		/* Avoid direct reclaim but allow kswapd to wake */
2493		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2494					  __GFP_COMP | __GFP_NOWARN |
2495					  __GFP_NORETRY,
2496					  SKB_FRAG_PAGE_ORDER);
2497		if (likely(pfrag->page)) {
2498			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2499			return true;
2500		}
2501	}
2502	pfrag->page = alloc_page(gfp);
2503	if (likely(pfrag->page)) {
2504		pfrag->size = PAGE_SIZE;
2505		return true;
2506	}
2507	return false;
2508}
2509EXPORT_SYMBOL(skb_page_frag_refill);
2510
2511bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2512{
2513	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2514		return true;
2515
2516	sk_enter_memory_pressure(sk);
2517	sk_stream_moderate_sndbuf(sk);
2518	return false;
2519}
2520EXPORT_SYMBOL(sk_page_frag_refill);
2521
/* Sleep until the socket is no longer owned by a user context.
 *
 * Per the sparse annotations, sk_lock.slock is released and re-acquired
 * inside: it is dropped around each schedule() and held again when the
 * function returns. The caller waits exclusively on sk_lock.wq in
 * TASK_UNINTERRUPTIBLE state.
 */
void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		/* Queue ourselves before dropping the spinlock. */
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		/* Ownership is re-checked with the spinlock held. */
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}
2539
/* Process every skb queued on the socket backlog.
 *
 * Per the sparse annotations, sk_lock.slock is released and re-acquired
 * inside: each batch is detached with the spinlock held, then processed
 * through sk_backlog_rcv() with the spinlock dropped. The outer loop
 * repeats for as long as the backlog has been refilled in the meantime.
 */
void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		/* Detach the whole list; new arrivals start a fresh one. */
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantees we can not loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}
2572
/* Drain the socket backlog: take sk_lock.slock (BHs disabled), run
 * __release_sock(), and drop the spinlock again.
 */
void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}
2579
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 *
 * Return: the result of sk_wait_event() for the condition "the tail of
 * sk_receive_queue is no longer @skb".
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	/* SOCKWQ_ASYNC_WAITDATA is set only for the duration of the wait. */
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
2604
/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 *
 *	Return: 1 when the allocation is allowed (the charge of @amt stays
 *	in place), 0 when it is suppressed (the charge is rolled back).
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct proto *prot = sk->sk_prot;
	/* Charge first; every failure path below undoes it. */
	long allocated = sk_memory_allocated_add(sk, amt);
	bool charged = true;

	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		/* Stream sockets are judged on sk_wmem_queued, all others
		 * on sk_wmem_alloc.
		 */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
				return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		/* Allow the allocation while the hard limit exceeds the
		 * socket count times this socket's own page usage.
		 */
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	/* Roll back the charge taken at the top of the function. */
	sk_memory_allocated_sub(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_raise_allocated);
2690
2691/**
2692 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2693 *	@sk: socket
2694 *	@size: memory size to allocate
2695 *	@kind: allocation type
2696 *
2697 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2698 *	rmem allocation. This function assumes that protocols which have
2699 *	memory_pressure use sk_wmem_queued as write buffer accounting.
2700 */
2701int __sk_mem_schedule(struct sock *sk, int size, int kind)
2702{
2703	int ret, amt = sk_mem_pages(size);
2704
2705	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2706	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2707	if (!ret)
2708		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2709	return ret;
2710}
2711EXPORT_SYMBOL(__sk_mem_schedule);
2712
2713/**
2714 *	__sk_mem_reduce_allocated - reclaim memory_allocated
2715 *	@sk: socket
2716 *	@amount: number of quanta
2717 *
2718 *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2719 */
2720void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2721{
2722	sk_memory_allocated_sub(sk, amount);
2723
2724	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2725		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2726
2727	if (sk_under_memory_pressure(sk) &&
2728	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2729		sk_leave_memory_pressure(sk);
2730}
2731EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2732
2733/**
2734 *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2735 *	@sk: socket
2736 *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2737 */
2738void __sk_mem_reclaim(struct sock *sk, int amount)
2739{
2740	amount >>= SK_MEM_QUANTUM_SHIFT;
2741	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2742	__sk_mem_reduce_allocated(sk, amount);
2743}
2744EXPORT_SYMBOL(__sk_mem_reclaim);
2745
/* Store @val in sk->sk_peek_off. Always returns 0. */
int sk_set_peek_off(struct sock *sk, int val)
{
	sk->sk_peek_off = val;
	return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);
2752
2753/*
2754 * Set of default routines for initialising struct proto_ops when
2755 * the protocol does not support a particular function. In certain
2756 * cases where it makes no sense for a protocol to have a "do nothing"
2757 * function, some default processing is provided.
2758 */
2759
/* Default bind handler for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);
2765
/* Default connect handler for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);
2772
/* Default socketpair handler for protocols without socketpair support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);
2778
/* Default accept handler for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);
2785
/* Default getname handler for protocols without getname support: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);
2792
/* Default ioctl handler for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);
2798
/* Default listen handler for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);
2804
/* Default shutdown handler for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);
2810
/* Default sendmsg handler for protocols without sendmsg support: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);
2816
/* Variant of sock_no_sendmsg() that takes a struct sock: always -EOPNOTSUPP. */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);
2822
/* Default recvmsg handler for protocols without recvmsg support: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);
2829
/* Default mmap handler for protocols without mmap support. Unlike the
 * other sock_no_*() stubs it returns -ENODEV rather than -EOPNOTSUPP.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);
2836
2837/*
2838 * When a file is received (via SCM_RIGHTS, etc), we must bump the
2839 * various sock-based usage counts.
2840 */
2841void __receive_sock(struct file *file)
2842{
2843	struct socket *sock;
2844
2845	sock = sock_from_file(file);
2846	if (sock) {
2847		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2848		sock_update_classid(&sock->sk->sk_cgrp_data);
2849	}
2850}
2851
2852ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2853{
2854	ssize_t res;
2855	struct msghdr msg = {.msg_flags = flags};
2856	struct kvec iov;
2857	char *kaddr = kmap(page);
2858	iov.iov_base = kaddr + offset;
2859	iov.iov_len = size;
2860	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2861	kunmap(page);
2862	return res;
2863}
2864EXPORT_SYMBOL(sock_no_sendpage);
2865
2866ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2867				int offset, size_t size, int flags)
2868{
2869	ssize_t res;
2870	struct msghdr msg = {.msg_flags = flags};
2871	struct kvec iov;
2872	char *kaddr = kmap(page);
2873
2874	iov.iov_base = kaddr + offset;
2875	iov.iov_len = size;
2876	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2877	kunmap(page);
2878	return res;
2879}
2880EXPORT_SYMBOL(sock_no_sendpage_locked);
2881
2882/*
2883 *	Default Socket Callbacks
2884 */
2885
/* Default sk_state_change callback (installed by sock_init_data()):
 * wake every interruptible sleeper on the socket wait queue.
 */
static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	/* sk_wq is RCU-protected. */
	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}
2896
/* Default sk_error_report callback (installed by sock_init_data()):
 * wake pollers with EPOLLERR and send the async POLL_ERR notification.
 */
static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	/* sk_wq is RCU-protected. */
	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}
2908
/* Default sk_data_ready callback (installed by sock_init_data()):
 * wake pollers waiting for input and send the async POLL_IN
 * notification.
 */
void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	/* sk_wq is RCU-protected. */
	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}
2921
2922static void sock_def_write_space(struct sock *sk)
2923{
2924	struct socket_wq *wq;
2925
2926	rcu_read_lock();
2927
2928	/* Do not wake up a writer until he can make "significant"
2929	 * progress.  --DaveM
2930	 */
2931	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2932		wq = rcu_dereference(sk->sk_wq);
2933		if (skwq_has_sleeper(wq))
2934			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2935						EPOLLWRNORM | EPOLLWRBAND);
2936
2937		/* Should agree with poll, otherwise some programs break */
2938		if (sock_writeable(sk))
2939			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2940	}
2941
2942	rcu_read_unlock();
2943}
2944
/* Default sk_destruct callback (installed by sock_init_data()): does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
2948
2949void sk_send_sigurg(struct sock *sk)
2950{
2951	if (sk->sk_socket && sk->sk_socket->file)
2952		if (send_sigurg(&sk->sk_socket->file->f_owner))
2953			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2954}
2955EXPORT_SYMBOL(sk_send_sigurg);
2956
/* (Re)arm @timer to fire at @expires. A socket reference is taken when
 * mod_timer() returns 0 (per the timer API that means the timer was not
 * already pending — confirm against the mod_timer() documentation).
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	int ret = mod_timer(timer, expires);

	if (!ret)
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2964
/* Cancel @timer with del_timer(). When del_timer() returns non-zero,
 * the socket reference that went with the timer is dropped through
 * __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	int ret = del_timer(timer);

	if (ret)
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2971
/* Same as sk_stop_timer(), but cancels @timer with del_timer_sync(). */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	int ret = del_timer_sync(timer);

	if (ret)
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
2978
/* Initialise the generic part of @sk and, when @sock is non-NULL, bind
 * the two together.
 *
 * Sets default buffer sizes, timeouts, callbacks and pacing values,
 * initialises sk_callback_lock with a lockdep class keyed on the socket
 * family, and finally publishes the sock with a reference count of 1.
 * @sock may be NULL, in which case sk_wq is NULL and sk_uid is uid 0 of
 * the netns user namespace.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	timer_setup(&sk->sk_timer, NULL, 0);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
		sock->sk	=	sk;
		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
	} else {
		RCU_INIT_POINTER(sk->sk_wq, NULL);
		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
	}

	rwlock_init(&sk->sk_callback_lock);
	/* Kernel sockets get their own lockdep class per family. */
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	/* Default callbacks; protocols may override them afterwards. */
	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	WRITE_ONCE(sk->sk_pacing_shift, 10);
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
3059
/* Take ownership of the socket lock, sleeping if it is already owned.
 * @subclass is the lockdep subclass handed to mutex_acquire(). May
 * sleep (see might_sleep()). On return sk_lock.owned is 1, the spinlock
 * is released and BHs are enabled again.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	/* Plain spin_unlock(): BHs stay disabled until local_bh_enable(). */
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);
3075
/* Release ownership of the socket lock.
 *
 * With sk_lock.slock held: process any queued backlog, run the
 * protocol's release_cb hook if present, give up ownership, and wake
 * waiters on sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
3094
/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Return: false if the fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * true if the slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note : We must disable BH
		 */
		return false;

	/* Slow path: wait for the current owner, then own the lock. */
	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	/* Sparse-only annotation balancing the __acquires() above. */
	__acquire(&sk->sk_lock.slock);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);
3131
3132int sock_gettstamp(struct socket *sock, void __user *userstamp,
3133		   bool timeval, bool time32)
3134{
3135	struct sock *sk = sock->sk;
3136	struct timespec64 ts;
3137
3138	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3139	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3140	if (ts.tv_sec == -1)
3141		return -ENOENT;
3142	if (ts.tv_sec == 0) {
3143		ktime_t kt = ktime_get_real();
3144		sock_write_timestamp(sk, kt);
3145		ts = ktime_to_timespec64(kt);
3146	}
3147
3148	if (timeval)
3149		ts.tv_nsec /= 1000;
3150
3151#ifdef CONFIG_COMPAT_32BIT_TIME
3152	if (time32)
3153		return put_old_timespec32(&ts, userstamp);
3154#endif
3155#ifdef CONFIG_SPARC64
3156	/* beware of padding in sparc64 timeval */
3157	if (timeval && !in_compat_syscall()) {
3158		struct __kernel_old_timeval __user tv = {
3159			.tv_sec = ts.tv_sec,
3160			.tv_usec = ts.tv_nsec,
3161		};
3162		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3163			return -EFAULT;
3164		return 0;
3165	}
3166#endif
3167	return put_timespec64(&ts, userstamp);
3168}
3169EXPORT_SYMBOL(sock_gettstamp);
3170
3171void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3172{
3173	if (!sock_flag(sk, flag)) {
3174		unsigned long previous_flags = sk->sk_flags;
3175
3176		sock_set_flag(sk, flag);
3177		/*
3178		 * we just set one of the two flags which require net
3179		 * time stamping, but time stamping might have been on
3180		 * already because of the other one
3181		 */
3182		if (sock_needs_netstamp(sk) &&
3183		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3184			net_enable_timestamp();
3185	}
3186}
3187
3188int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3189		       int level, int type)
3190{
3191	struct sock_exterr_skb *serr;
3192	struct sk_buff *skb;
3193	int copied, err;
3194
3195	err = -EAGAIN;
3196	skb = sock_dequeue_err_skb(sk);
3197	if (skb == NULL)
3198		goto out;
3199
3200	copied = skb->len;
3201	if (copied > len) {
3202		msg->msg_flags |= MSG_TRUNC;
3203		copied = len;
3204	}
3205	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3206	if (err)
3207		goto out_free_skb;
3208
3209	sock_recv_timestamp(msg, sk, skb);
3210
3211	serr = SKB_EXT_ERR(skb);
3212	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3213
3214	msg->msg_flags |= MSG_ERRQUEUE;
3215	err = copied;
3216
3217out_free_skb:
3218	kfree_skb(skb);
3219out:
3220	return err;
3221}
3222EXPORT_SYMBOL(sock_recv_errqueue);
3223
/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 *
 *	The request is forwarded unchanged to the protocol's getsockopt
 *	handler, whose return value is passed back.
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);
3239
3240int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3241			int flags)
3242{
3243	struct sock *sk = sock->sk;
3244	int addr_len = 0;
3245	int err;
3246
3247	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3248				   flags & ~MSG_DONTWAIT, &addr_len);
3249	if (err >= 0)
3250		msg->msg_namelen = addr_len;
3251	return err;
3252}
3253EXPORT_SYMBOL(sock_common_recvmsg);
3254
/*
 *	Set socket options on an inet socket.
 *
 *	The request is forwarded unchanged to the protocol's setsockopt
 *	handler, whose return value is passed back.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);
3266
/* Common release path for a sock: run the protocol's destroy hook (if
 * any), unhash the sock, orphan it, free its xfrm policy and drop a
 * reference.
 */
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to socket. But net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did the hash table lookup before we unhashed
	 * the socket. They will reach the receive queue and will be purged
	 * by the socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
3303
3304void sk_get_meminfo(const struct sock *sk, u32 *mem)
3305{
3306	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3307
3308	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3309	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3310	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3311	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3312	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3313	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3314	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3315	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3316	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3317}
3318
3319#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
/* Per-cpu in-use counters, one slot per protocol index. */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

/* Bitmap of protocol indexes that have been handed out. */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3326
/* Add @val to this cpu's in-use counter for @prot in @net. */
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3332
3333int sock_prot_inuse_get(struct net *net, struct proto *prot)
3334{
3335	int cpu, idx = prot->inuse_idx;
3336	int res = 0;
3337
3338	for_each_possible_cpu(cpu)
3339		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3340
3341	return res >= 0 ? res : 0;
3342}
3343EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3344
/* Add @val to this cpu's count of sockets in use in @net. */
static void sock_inuse_add(struct net *net, int val)
{
	this_cpu_add(*net->core.sock_inuse, val);
}
3349
3350int sock_inuse_get(struct net *net)
3351{
3352	int cpu, res = 0;
3353
3354	for_each_possible_cpu(cpu)
3355		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3356
3357	return res;
3358}
3359
3360EXPORT_SYMBOL_GPL(sock_inuse_get);
3361
3362static int __net_init sock_inuse_init_net(struct net *net)
3363{
3364	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3365	if (net->core.prot_inuse == NULL)
3366		return -ENOMEM;
3367
3368	net->core.sock_inuse = alloc_percpu(int);
3369	if (net->core.sock_inuse == NULL)
3370		goto out;
3371
3372	return 0;
3373
3374out:
3375	free_percpu(net->core.prot_inuse);
3376	return -ENOMEM;
3377}
3378
3379static void __net_exit sock_inuse_exit_net(struct net *net)
3380{
3381	free_percpu(net->core.prot_inuse);
3382	free_percpu(net->core.sock_inuse);
3383}
3384
3385static struct pernet_operations net_inuse_ops = {
3386	.init = sock_inuse_init_net,
3387	.exit = sock_inuse_exit_net,
3388};
3389
3390static __init int net_inuse_init(void)
3391{
3392	if (register_pernet_subsys(&net_inuse_ops))
3393		panic("Cannot initialize net inuse counters");
3394
3395	return 0;
3396}
3397
3398core_initcall(net_inuse_init);
3399
3400static int assign_proto_idx(struct proto *prot)
3401{
3402	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3403
3404	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3405		pr_err("PROTO_INUSE_NR exhausted\n");
3406		return -ENOSPC;
3407	}
3408
3409	set_bit(prot->inuse_idx, proto_inuse_idx);
3410	return 0;
3411}
3412
3413static void release_proto_idx(struct proto *prot)
3414{
3415	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3416		clear_bit(prot->inuse_idx, proto_inuse_idx);
3417}
3418#else
/* !CONFIG_PROC_FS: no inuse slots are tracked, so assignment always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}

/* !CONFIG_PROC_FS: nothing was assigned, nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}

/* !CONFIG_PROC_FS: socket counting is compiled out. */
static void sock_inuse_add(struct net *net, int val)
{
}
3431#endif
3432
3433static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3434{
3435	if (!twsk_prot)
3436		return;
3437	kfree(twsk_prot->twsk_slab_name);
3438	twsk_prot->twsk_slab_name = NULL;
3439	kmem_cache_destroy(twsk_prot->twsk_slab);
3440	twsk_prot->twsk_slab = NULL;
3441}
3442
3443static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3444{
3445	if (!rsk_prot)
3446		return;
3447	kfree(rsk_prot->slab_name);
3448	rsk_prot->slab_name = NULL;
3449	kmem_cache_destroy(rsk_prot->slab);
3450	rsk_prot->slab = NULL;
3451}
3452
3453static int req_prot_init(const struct proto *prot)
3454{
3455	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3456
3457	if (!rsk_prot)
3458		return 0;
3459
3460	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3461					prot->name);
3462	if (!rsk_prot->slab_name)
3463		return -ENOMEM;
3464
3465	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3466					   rsk_prot->obj_size, 0,
3467					   SLAB_ACCOUNT | prot->slab_flags,
3468					   NULL);
3469
3470	if (!rsk_prot->slab) {
3471		pr_crit("%s: Can't create request sock SLAB cache!\n",
3472			prot->name);
3473		return -ENOMEM;
3474	}
3475	return 0;
3476}
3477
3478int proto_register(struct proto *prot, int alloc_slab)
3479{
3480	int ret = -ENOBUFS;
3481
3482	if (alloc_slab) {
3483		prot->slab = kmem_cache_create_usercopy(prot->name,
3484					prot->obj_size, 0,
3485					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3486					prot->slab_flags,
3487					prot->useroffset, prot->usersize,
3488					NULL);
3489
3490		if (prot->slab == NULL) {
3491			pr_crit("%s: Can't create sock SLAB cache!\n",
3492				prot->name);
3493			goto out;
3494		}
3495
3496		if (req_prot_init(prot))
3497			goto out_free_request_sock_slab;
3498
3499		if (prot->twsk_prot != NULL) {
3500			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3501
3502			if (prot->twsk_prot->twsk_slab_name == NULL)
3503				goto out_free_request_sock_slab;
3504
3505			prot->twsk_prot->twsk_slab =
3506				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3507						  prot->twsk_prot->twsk_obj_size,
3508						  0,
3509						  SLAB_ACCOUNT |
3510						  prot->slab_flags,
3511						  NULL);
3512			if (prot->twsk_prot->twsk_slab == NULL)
3513				goto out_free_timewait_sock_slab;
3514		}
3515	}
3516
3517	mutex_lock(&proto_list_mutex);
3518	ret = assign_proto_idx(prot);
3519	if (ret) {
3520		mutex_unlock(&proto_list_mutex);
3521		goto out_free_timewait_sock_slab;
3522	}
3523	list_add(&prot->node, &proto_list);
3524	mutex_unlock(&proto_list_mutex);
3525	return ret;
3526
3527out_free_timewait_sock_slab:
3528	if (alloc_slab && prot->twsk_prot)
3529		tw_prot_cleanup(prot->twsk_prot);
3530out_free_request_sock_slab:
3531	if (alloc_slab) {
3532		req_prot_cleanup(prot->rsk_prot);
3533
3534		kmem_cache_destroy(prot->slab);
3535		prot->slab = NULL;
3536	}
3537out:
3538	return ret;
3539}
3540EXPORT_SYMBOL(proto_register);
3541
3542void proto_unregister(struct proto *prot)
3543{
3544	mutex_lock(&proto_list_mutex);
3545	release_proto_idx(prot);
3546	list_del(&prot->node);
3547	mutex_unlock(&proto_list_mutex);
3548
3549	kmem_cache_destroy(prot->slab);
3550	prot->slab = NULL;
3551
3552	req_prot_cleanup(prot->rsk_prot);
3553	tw_prot_cleanup(prot->twsk_prot);
3554}
3555EXPORT_SYMBOL(proto_unregister);
3556
3557int sock_load_diag_module(int family, int protocol)
3558{
3559	if (!protocol) {
3560		if (!sock_is_registered(family))
3561			return -ENOENT;
3562
3563		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3564				      NETLINK_SOCK_DIAG, family);
3565	}
3566
3567#ifdef CONFIG_INET
3568	if (family == AF_INET &&
3569	    protocol != IPPROTO_RAW &&
3570	    protocol < MAX_INET_PROTOS &&
3571	    !rcu_access_pointer(inet_protos[protocol]))
3572		return -ENOENT;
3573#endif
3574
3575	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3576			      NETLINK_SOCK_DIAG, family, protocol);
3577}
3578EXPORT_SYMBOL(sock_load_diag_module);
3579
3580#ifdef CONFIG_PROC_FS
3581static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3582	__acquires(proto_list_mutex)
3583{
3584	mutex_lock(&proto_list_mutex);
3585	return seq_list_start_head(&proto_list, *pos);
3586}
3587
3588static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3589{
3590	return seq_list_next(v, &proto_list, pos);
3591}
3592
3593static void proto_seq_stop(struct seq_file *seq, void *v)
3594	__releases(proto_list_mutex)
3595{
3596	mutex_unlock(&proto_list_mutex);
3597}
3598
/* Map a method pointer to 'y' (set) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
3603static long sock_prot_memory_allocated(struct proto *proto)
3604{
3605	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3606}
3607
3608static const char *sock_prot_memory_pressure(struct proto *proto)
3609{
3610	return proto->memory_pressure != NULL ?
3611	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3612}
3613
3614static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3615{
3616
3617	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3618			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3619		   proto->name,
3620		   proto->obj_size,
3621		   sock_prot_inuse_get(seq_file_net(seq), proto),
3622		   sock_prot_memory_allocated(proto),
3623		   sock_prot_memory_pressure(proto),
3624		   proto->max_header,
3625		   proto->slab == NULL ? "no" : "yes",
3626		   module_name(proto->owner),
3627		   proto_method_implemented(proto->close),
3628		   proto_method_implemented(proto->connect),
3629		   proto_method_implemented(proto->disconnect),
3630		   proto_method_implemented(proto->accept),
3631		   proto_method_implemented(proto->ioctl),
3632		   proto_method_implemented(proto->init),
3633		   proto_method_implemented(proto->destroy),
3634		   proto_method_implemented(proto->shutdown),
3635		   proto_method_implemented(proto->setsockopt),
3636		   proto_method_implemented(proto->getsockopt),
3637		   proto_method_implemented(proto->sendmsg),
3638		   proto_method_implemented(proto->recvmsg),
3639		   proto_method_implemented(proto->sendpage),
3640		   proto_method_implemented(proto->bind),
3641		   proto_method_implemented(proto->backlog_rcv),
3642		   proto_method_implemented(proto->hash),
3643		   proto_method_implemented(proto->unhash),
3644		   proto_method_implemented(proto->get_port),
3645		   proto_method_implemented(proto->enter_memory_pressure));
3646}
3647
3648static int proto_seq_show(struct seq_file *seq, void *v)
3649{
3650	if (v == &proto_list)
3651		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3652			   "protocol",
3653			   "size",
3654			   "sockets",
3655			   "memory",
3656			   "press",
3657			   "maxhdr",
3658			   "slab",
3659			   "module",
3660			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3661	else
3662		proto_seq_printf(seq, list_entry(v, struct proto, node));
3663	return 0;
3664}
3665
3666static const struct seq_operations proto_seq_ops = {
3667	.start  = proto_seq_start,
3668	.next   = proto_seq_next,
3669	.stop   = proto_seq_stop,
3670	.show   = proto_seq_show,
3671};
3672
3673static __net_init int proto_init_net(struct net *net)
3674{
3675	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3676			sizeof(struct seq_net_private)))
3677		return -ENOMEM;
3678
3679	return 0;
3680}
3681
3682static __net_exit void proto_exit_net(struct net *net)
3683{
3684	remove_proc_entry("protocols", net->proc_net);
3685}
3686
3687
3688static __net_initdata struct pernet_operations proto_net_ops = {
3689	.init = proto_init_net,
3690	.exit = proto_exit_net,
3691};
3692
3693static int __init proto_init(void)
3694{
3695	return register_pernet_subsys(&proto_net_ops);
3696}
3697
3698subsys_initcall(proto_init);
3699
3700#endif /* PROC_FS */
3701
3702#ifdef CONFIG_NET_RX_BUSY_POLL
3703bool sk_busy_loop_end(void *p, unsigned long start_time)
3704{
3705	struct sock *sk = p;
3706
3707	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3708	       sk_busy_loop_timeout(sk, start_time);
3709}
3710EXPORT_SYMBOL(sk_busy_loop_end);
3711#endif /* CONFIG_NET_RX_BUSY_POLL */
3712
3713int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3714{
3715	if (!sk->sk_prot->bind_add)
3716		return -EOPNOTSUPP;
3717	return sk->sk_prot->bind_add(sk, addr, addr_len);
3718}
3719EXPORT_SYMBOL(sock_bind_add);