Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88#include <asm/unaligned.h>
89#include <linux/capability.h>
90#include <linux/errno.h>
91#include <linux/errqueue.h>
92#include <linux/types.h>
93#include <linux/socket.h>
94#include <linux/in.h>
95#include <linux/kernel.h>
96#include <linux/module.h>
97#include <linux/proc_fs.h>
98#include <linux/seq_file.h>
99#include <linux/sched.h>
100#include <linux/sched/mm.h>
101#include <linux/timer.h>
102#include <linux/string.h>
103#include <linux/sockios.h>
104#include <linux/net.h>
105#include <linux/mm.h>
106#include <linux/slab.h>
107#include <linux/interrupt.h>
108#include <linux/poll.h>
109#include <linux/tcp.h>
110#include <linux/init.h>
111#include <linux/highmem.h>
112#include <linux/user_namespace.h>
113#include <linux/static_key.h>
114#include <linux/memcontrol.h>
115#include <linux/prefetch.h>
116#include <linux/compat.h>
117
118#include <linux/uaccess.h>
119
120#include <linux/netdevice.h>
121#include <net/protocol.h>
122#include <linux/skbuff.h>
123#include <net/net_namespace.h>
124#include <net/request_sock.h>
125#include <net/sock.h>
126#include <linux/net_tstamp.h>
127#include <net/xfrm.h>
128#include <linux/ipsec.h>
129#include <net/cls_cgroup.h>
130#include <net/netprio_cgroup.h>
131#include <linux/sock_diag.h>
132
133#include <linux/filter.h>
134#include <net/sock_reuseport.h>
135#include <net/bpf_sk_storage.h>
136
137#include <trace/events/sock.h>
138
139#include <net/tcp.h>
140#include <net/busy_poll.h>
141
142static DEFINE_MUTEX(proto_list_mutex);
143static LIST_HEAD(proto_list);
144
145static void sock_inuse_add(struct net *net, int val);
146
147/**
148 * sk_ns_capable - General socket capability test
149 * @sk: Socket to use a capability on or through
150 * @user_ns: The user namespace of the capability to use
151 * @cap: The capability to use
152 *
153 * Test to see if the opener of the socket had the capability @cap when
154 * the socket was created and if the current process has the capability
155 * @cap in the user namespace @user_ns.
156 */
157bool sk_ns_capable(const struct sock *sk,
158 struct user_namespace *user_ns, int cap)
159{
160 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 ns_capable(user_ns, cap);
162}
163EXPORT_SYMBOL(sk_ns_capable);
164
165/**
166 * sk_capable - Socket global capability test
167 * @sk: Socket to use a capability on or through
168 * @cap: The global capability to use
169 *
170 * Test to see if the opener of the socket had the capability @cap when
171 * the socket was created and if the current process has the capability
172 * @cap in all user namespaces.
173 */
174bool sk_capable(const struct sock *sk, int cap)
175{
176 return sk_ns_capable(sk, &init_user_ns, cap);
177}
178EXPORT_SYMBOL(sk_capable);
179
180/**
181 * sk_net_capable - Network namespace socket capability test
182 * @sk: Socket to use a capability on or through
183 * @cap: The capability to use
184 *
185 * Test to see if the opener of the socket had the capability @cap when the
186 * socket was created and if the current process has the capability @cap over
187 * the network namespace the socket is a member of.
188 */
189bool sk_net_capable(const struct sock *sk, int cap)
190{
191 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
192}
193EXPORT_SYMBOL(sk_net_capable);
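
/* A minimal usage sketch: a protocol's setsockopt path might gate a
 * privileged option on the socket opener's capabilities rather than on the
 * current process alone, e.g.:
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * sk_capable() is the same test against &init_user_ns, and sk_ns_capable()
 * lets the caller pick the user namespace explicitly.
 */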
194
195/*
196 * Each address family might have different locking rules, so we have
197 * one slock key per address family and separate keys for internal and
198 * userspace sockets.
199 */
200static struct lock_class_key af_family_keys[AF_MAX];
201static struct lock_class_key af_family_kern_keys[AF_MAX];
202static struct lock_class_key af_family_slock_keys[AF_MAX];
203static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
204
205/*
206 * Make lock validator output more readable. (We pre-construct these
207 * strings at build time, so that runtime initialization of socket
208 * locks is fast.)
209 */
210
211#define _sock_locks(x) \
212 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
213 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
214 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
215 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
216 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
217 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
218 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
219 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
220 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
221 x "27" , x "28" , x "AF_CAN" , \
222 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
223 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
224 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
225 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
226 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
227 x "AF_MAX"
228
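/* For example, _sock_locks("sk_lock-") expands to the literal list
 * "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX", "sk_lock-AF_INET", ...,
 * "sk_lock-AF_MAX", so each address family gets its own lockdep class name.
 */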
229static const char *const af_family_key_strings[AF_MAX+1] = {
230 _sock_locks("sk_lock-")
231};
232static const char *const af_family_slock_key_strings[AF_MAX+1] = {
233 _sock_locks("slock-")
234};
235static const char *const af_family_clock_key_strings[AF_MAX+1] = {
236 _sock_locks("clock-")
237};
238
239static const char *const af_family_kern_key_strings[AF_MAX+1] = {
240 _sock_locks("k-sk_lock-")
241};
242static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
243 _sock_locks("k-slock-")
244};
245static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
246 _sock_locks("k-clock-")
247};
248static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
249 _sock_locks("rlock-")
250};
251static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
252 _sock_locks("wlock-")
253};
254static const char *const af_family_elock_key_strings[AF_MAX+1] = {
255 _sock_locks("elock-")
256};
257
258/*
259 * sk_callback_lock and sk queues locking rules are per-address-family,
260 * so split the lock classes by using a per-AF key:
261 */
262static struct lock_class_key af_callback_keys[AF_MAX];
263static struct lock_class_key af_rlock_keys[AF_MAX];
264static struct lock_class_key af_wlock_keys[AF_MAX];
265static struct lock_class_key af_elock_keys[AF_MAX];
266static struct lock_class_key af_kern_callback_keys[AF_MAX];
267
268/* Run time adjustable parameters. */
269__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
270EXPORT_SYMBOL(sysctl_wmem_max);
271__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
272EXPORT_SYMBOL(sysctl_rmem_max);
273__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
274__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
275
276/* Maximal space eaten by iovec or ancillary data plus some space */
277int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
278EXPORT_SYMBOL(sysctl_optmem_max);
279
280int sysctl_tstamp_allow_data __read_mostly = 1;
281
282DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
283EXPORT_SYMBOL_GPL(memalloc_socks_key);
284
285/**
286 * sk_set_memalloc - sets %SOCK_MEMALLOC
287 * @sk: socket to set it on
288 *
289 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
290 * It's the responsibility of the admin to adjust min_free_kbytes
291 * to meet the requirements.
292 */
293void sk_set_memalloc(struct sock *sk)
294{
295 sock_set_flag(sk, SOCK_MEMALLOC);
296 sk->sk_allocation |= __GFP_MEMALLOC;
297 static_branch_inc(&memalloc_socks_key);
298}
299EXPORT_SYMBOL_GPL(sk_set_memalloc);
300
301void sk_clear_memalloc(struct sock *sk)
302{
303 sock_reset_flag(sk, SOCK_MEMALLOC);
304 sk->sk_allocation &= ~__GFP_MEMALLOC;
305 static_branch_dec(&memalloc_socks_key);
306
307 /*
308 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
309 * progress of swapping. SOCK_MEMALLOC may be cleared while
310 * it has rmem allocations due to the last swapfile being deactivated
311 * but there is a risk that the socket is unusable due to exceeding
312 * the rmem limits. Reclaim the reserves and obey rmem limits again.
313 */
314 sk_mem_reclaim(sk);
315}
316EXPORT_SYMBOL_GPL(sk_clear_memalloc);
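
/* A minimal usage sketch (as done by swap-over-network transports): a
 * socket that carries memory-reclaim traffic flags itself so that its
 * allocations may dip into the emergency reserves:
 *
 *	sk_set_memalloc(sock->sk);
 *	...
 *	sk_clear_memalloc(sock->sk);
 *
 * sk_clear_memalloc() is called once the socket no longer backs swap I/O.
 */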
317
318int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
319{
320 int ret;
321 unsigned int noreclaim_flag;
322
323 /* these should have been dropped before queueing */
324 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
325
326 noreclaim_flag = memalloc_noreclaim_save();
327 ret = sk->sk_backlog_rcv(sk, skb);
328 memalloc_noreclaim_restore(noreclaim_flag);
329
330 return ret;
331}
332EXPORT_SYMBOL(__sk_backlog_rcv);
333
334static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
335{
336 struct __kernel_sock_timeval tv;
337
338 if (timeo == MAX_SCHEDULE_TIMEOUT) {
339 tv.tv_sec = 0;
340 tv.tv_usec = 0;
341 } else {
342 tv.tv_sec = timeo / HZ;
343 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
344 }
345
346 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
347 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
348 *(struct old_timeval32 *)optval = tv32;
349 return sizeof(tv32);
350 }
351
352 if (old_timeval) {
353 struct __kernel_old_timeval old_tv;
354 old_tv.tv_sec = tv.tv_sec;
355 old_tv.tv_usec = tv.tv_usec;
356 *(struct __kernel_old_timeval *)optval = old_tv;
357 return sizeof(old_tv);
358 }
359
360 *(struct __kernel_sock_timeval *)optval = tv;
361 return sizeof(tv);
362}
363
364static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
365 bool old_timeval)
366{
367 struct __kernel_sock_timeval tv;
368
369 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
370 struct old_timeval32 tv32;
371
372 if (optlen < sizeof(tv32))
373 return -EINVAL;
374
375 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
376 return -EFAULT;
377 tv.tv_sec = tv32.tv_sec;
378 tv.tv_usec = tv32.tv_usec;
379 } else if (old_timeval) {
380 struct __kernel_old_timeval old_tv;
381
382 if (optlen < sizeof(old_tv))
383 return -EINVAL;
384 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
385 return -EFAULT;
386 tv.tv_sec = old_tv.tv_sec;
387 tv.tv_usec = old_tv.tv_usec;
388 } else {
389 if (optlen < sizeof(tv))
390 return -EINVAL;
391 if (copy_from_sockptr(&tv, optval, sizeof(tv)))
392 return -EFAULT;
393 }
394 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
395 return -EDOM;
396
397 if (tv.tv_sec < 0) {
398 static int warned __read_mostly;
399
400 *timeo_p = 0;
401 if (warned < 10 && net_ratelimit()) {
402 warned++;
403 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
404 __func__, current->comm, task_pid_nr(current));
405 }
406 return 0;
407 }
408 *timeo_p = MAX_SCHEDULE_TIMEOUT;
409 if (tv.tv_sec == 0 && tv.tv_usec == 0)
410 return 0;
411 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
412 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
413 return 0;
414}
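
/* Worked example, assuming HZ == 250: a {2, 500000} timeval maps to
 * 2 * 250 + DIV_ROUND_UP(500000, 1000000 / 250) = 625 jiffies in
 * sock_set_timeout(), and sock_get_timeout() maps 625 jiffies back to
 * tv_sec = 2, tv_usec = 500000. A zero timeval means "no timeout" and is
 * stored as MAX_SCHEDULE_TIMEOUT.
 */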
415
416static bool sock_needs_netstamp(const struct sock *sk)
417{
418 switch (sk->sk_family) {
419 case AF_UNSPEC:
420 case AF_UNIX:
421 return false;
422 default:
423 return true;
424 }
425}
426
427static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
428{
429 if (sk->sk_flags & flags) {
430 sk->sk_flags &= ~flags;
431 if (sock_needs_netstamp(sk) &&
432 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
433 net_disable_timestamp();
434 }
435}
436
437
438int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
439{
440 unsigned long flags;
441 struct sk_buff_head *list = &sk->sk_receive_queue;
442
443 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
444 atomic_inc(&sk->sk_drops);
445 trace_sock_rcvqueue_full(sk, skb);
446 return -ENOMEM;
447 }
448
449 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
450 atomic_inc(&sk->sk_drops);
451 return -ENOBUFS;
452 }
453
454 skb->dev = NULL;
455 skb_set_owner_r(skb, sk);
456
457 /* We escape from the RCU protected region here, so make sure we
458 * don't leak a non-refcounted dst.
459 */
460 skb_dst_force(skb);
461
462 spin_lock_irqsave(&list->lock, flags);
463 sock_skb_set_dropcount(sk, skb);
464 __skb_queue_tail(list, skb);
465 spin_unlock_irqrestore(&list->lock, flags);
466
467 if (!sock_flag(sk, SOCK_DEAD))
468 sk->sk_data_ready(sk);
469 return 0;
470}
471EXPORT_SYMBOL(__sock_queue_rcv_skb);
472
473int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
474{
475 int err;
476
477 err = sk_filter(sk, skb);
478 if (err)
479 return err;
480
481 return __sock_queue_rcv_skb(sk, skb);
482}
483EXPORT_SYMBOL(sock_queue_rcv_skb);
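
/* A minimal receive-path sketch: a protocol's rcv handler typically hands
 * each skb to the matched socket and frees it itself on failure, e.g.:
 *
 *	if (sock_queue_rcv_skb(sk, skb) < 0) {
 *		kfree_skb(skb);
 *		return NET_RX_DROP;
 *	}
 *	return NET_RX_SUCCESS;
 */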
484
485int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
486 const int nested, unsigned int trim_cap, bool refcounted)
487{
488 int rc = NET_RX_SUCCESS;
489
490 if (sk_filter_trim_cap(sk, skb, trim_cap))
491 goto discard_and_relse;
492
493 skb->dev = NULL;
494
495 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
496 atomic_inc(&sk->sk_drops);
497 goto discard_and_relse;
498 }
499 if (nested)
500 bh_lock_sock_nested(sk);
501 else
502 bh_lock_sock(sk);
503 if (!sock_owned_by_user(sk)) {
504 /*
505 * trylock + unlock semantics:
506 */
507 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
508
509 rc = sk_backlog_rcv(sk, skb);
510
511 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
512 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
513 bh_unlock_sock(sk);
514 atomic_inc(&sk->sk_drops);
515 goto discard_and_relse;
516 }
517
518 bh_unlock_sock(sk);
519out:
520 if (refcounted)
521 sock_put(sk);
522 return rc;
523discard_and_relse:
524 kfree_skb(skb);
525 goto out;
526}
527EXPORT_SYMBOL(__sk_receive_skb);
528
529INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
530 u32));
531INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
532 u32));
533struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
534{
535 struct dst_entry *dst = __sk_dst_get(sk);
536
537 if (dst && dst->obsolete &&
538 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
539 dst, cookie) == NULL) {
540 sk_tx_queue_clear(sk);
541 sk->sk_dst_pending_confirm = 0;
542 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
543 dst_release(dst);
544 return NULL;
545 }
546
547 return dst;
548}
549EXPORT_SYMBOL(__sk_dst_check);
550
551struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
552{
553 struct dst_entry *dst = sk_dst_get(sk);
554
555 if (dst && dst->obsolete &&
556 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
557 dst, cookie) == NULL) {
558 sk_dst_reset(sk);
559 dst_release(dst);
560 return NULL;
561 }
562
563 return dst;
564}
565EXPORT_SYMBOL(sk_dst_check);
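
/* A minimal output-path sketch: callers revalidate the cached route before
 * use and fall back to a fresh lookup when the cache has gone stale, e.g.:
 *
 *	dst = sk_dst_check(sk, 0);
 *	if (!dst)
 *		dst = do_route_lookup(sk);
 *
 * where do_route_lookup() stands in for a protocol-specific lookup.
 * sk_dst_check() returns a referenced entry, so the caller must eventually
 * dst_release() it; __sk_dst_check() is the variant for callers that hold
 * the socket lock and work on the uncounted cache entry.
 */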
566
567static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
568{
569 int ret = -ENOPROTOOPT;
570#ifdef CONFIG_NETDEVICES
571 struct net *net = sock_net(sk);
572
573 /* Sorry... */
574 ret = -EPERM;
575 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
576 goto out;
577
578 ret = -EINVAL;
579 if (ifindex < 0)
580 goto out;
581
582 sk->sk_bound_dev_if = ifindex;
583 if (sk->sk_prot->rehash)
584 sk->sk_prot->rehash(sk);
585 sk_dst_reset(sk);
586
587 ret = 0;
588
589out:
590#endif
591
592 return ret;
593}
594
595int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
596{
597 int ret;
598
599 if (lock_sk)
600 lock_sock(sk);
601 ret = sock_bindtoindex_locked(sk, ifindex);
602 if (lock_sk)
603 release_sock(sk);
604
605 return ret;
606}
607EXPORT_SYMBOL(sock_bindtoindex);
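
/* A minimal sketch for in-kernel users: callers that already know the
 * ifindex can bind without going through the string-based SO_BINDTODEVICE
 * path, e.g.:
 *
 *	err = sock_bindtoindex(sock->sk, ifindex, true);
 *
 * Passing lock_sk == false is for callers that already hold the socket lock.
 */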
608
609static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
610{
611 int ret = -ENOPROTOOPT;
612#ifdef CONFIG_NETDEVICES
613 struct net *net = sock_net(sk);
614 char devname[IFNAMSIZ];
615 int index;
616
617 ret = -EINVAL;
618 if (optlen < 0)
619 goto out;
620
621 /* Bind this socket to a particular device like "eth0",
622 * as specified in the passed interface name. If the
623 * name is "" or the option length is zero the socket
624 * is not bound.
625 */
626 if (optlen > IFNAMSIZ - 1)
627 optlen = IFNAMSIZ - 1;
628 memset(devname, 0, sizeof(devname));
629
630 ret = -EFAULT;
631 if (copy_from_sockptr(devname, optval, optlen))
632 goto out;
633
634 index = 0;
635 if (devname[0] != '\0') {
636 struct net_device *dev;
637
638 rcu_read_lock();
639 dev = dev_get_by_name_rcu(net, devname);
640 if (dev)
641 index = dev->ifindex;
642 rcu_read_unlock();
643 ret = -ENODEV;
644 if (!dev)
645 goto out;
646 }
647
648 return sock_bindtoindex(sk, index, true);
649out:
650#endif
651
652 return ret;
653}
654
655static int sock_getbindtodevice(struct sock *sk, char __user *optval,
656 int __user *optlen, int len)
657{
658 int ret = -ENOPROTOOPT;
659#ifdef CONFIG_NETDEVICES
660 struct net *net = sock_net(sk);
661 char devname[IFNAMSIZ];
662
663 if (sk->sk_bound_dev_if == 0) {
664 len = 0;
665 goto zero;
666 }
667
668 ret = -EINVAL;
669 if (len < IFNAMSIZ)
670 goto out;
671
672 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
673 if (ret)
674 goto out;
675
676 len = strlen(devname) + 1;
677
678 ret = -EFAULT;
679 if (copy_to_user(optval, devname, len))
680 goto out;
681
682zero:
683 ret = -EFAULT;
684 if (put_user(len, optlen))
685 goto out;
686
687 ret = 0;
688
689out:
690#endif
691
692 return ret;
693}
694
695bool sk_mc_loop(struct sock *sk)
696{
697 if (dev_recursion_level())
698 return false;
699 if (!sk)
700 return true;
701 switch (sk->sk_family) {
702 case AF_INET:
703 return inet_sk(sk)->mc_loop;
704#if IS_ENABLED(CONFIG_IPV6)
705 case AF_INET6:
706 return inet6_sk(sk)->mc_loop;
707#endif
708 }
709 WARN_ON_ONCE(1);
710 return true;
711}
712EXPORT_SYMBOL(sk_mc_loop);
713
714void sock_set_reuseaddr(struct sock *sk)
715{
716 lock_sock(sk);
717 sk->sk_reuse = SK_CAN_REUSE;
718 release_sock(sk);
719}
720EXPORT_SYMBOL(sock_set_reuseaddr);
721
722void sock_set_reuseport(struct sock *sk)
723{
724 lock_sock(sk);
725 sk->sk_reuseport = true;
726 release_sock(sk);
727}
728EXPORT_SYMBOL(sock_set_reuseport);
729
730void sock_no_linger(struct sock *sk)
731{
732 lock_sock(sk);
733 sk->sk_lingertime = 0;
734 sock_set_flag(sk, SOCK_LINGER);
735 release_sock(sk);
736}
737EXPORT_SYMBOL(sock_no_linger);
738
739void sock_set_priority(struct sock *sk, u32 priority)
740{
741 lock_sock(sk);
742 sk->sk_priority = priority;
743 release_sock(sk);
744}
745EXPORT_SYMBOL(sock_set_priority);
746
747void sock_set_sndtimeo(struct sock *sk, s64 secs)
748{
749 lock_sock(sk);
750 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
751 sk->sk_sndtimeo = secs * HZ;
752 else
753 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
754 release_sock(sk);
755}
756EXPORT_SYMBOL(sock_set_sndtimeo);
757
758static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
759{
760 if (val) {
761 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
762 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
763 sock_set_flag(sk, SOCK_RCVTSTAMP);
764 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
765 } else {
766 sock_reset_flag(sk, SOCK_RCVTSTAMP);
767 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
768 }
769}
770
771void sock_enable_timestamps(struct sock *sk)
772{
773 lock_sock(sk);
774 __sock_set_timestamps(sk, true, false, true);
775 release_sock(sk);
776}
777EXPORT_SYMBOL(sock_enable_timestamps);
778
779void sock_set_keepalive(struct sock *sk)
780{
781 lock_sock(sk);
782 if (sk->sk_prot->keepalive)
783 sk->sk_prot->keepalive(sk, true);
784 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
785 release_sock(sk);
786}
787EXPORT_SYMBOL(sock_set_keepalive);
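
/* A minimal sketch of how kernel socket users (e.g. in-kernel RPC or
 * storage transports) use the helpers above instead of sock_setsockopt():
 *
 *	struct socket *sock;
 *
 *	err = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
 *	if (err)
 *		return err;
 *	sock_set_reuseaddr(sock->sk);
 *	sock_no_linger(sock->sk);
 *	sock_set_sndtimeo(sock->sk, 5);
 *	sock_set_keepalive(sock->sk);
 */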
788
789static void __sock_set_rcvbuf(struct sock *sk, int val)
790{
791 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
792 * as a negative value.
793 */
794 val = min_t(int, val, INT_MAX / 2);
795 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
796
797 /* We double it on the way in to account for "struct sk_buff" etc.
798 * overhead. Applications assume that the SO_RCVBUF setting they make
799 * will allow that much actual data to be received on that socket.
800 *
801 * Applications are unaware that "struct sk_buff" and other overheads
802 * allocate from the receive buffer during socket buffer allocation.
803 *
804 * And after considering the possible alternatives, returning the value
805 * we actually used in getsockopt is the most desirable behavior.
806 */
807 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
808}
809
810void sock_set_rcvbuf(struct sock *sk, int val)
811{
812 lock_sock(sk);
813 __sock_set_rcvbuf(sk, val);
814 release_sock(sk);
815}
816EXPORT_SYMBOL(sock_set_rcvbuf);
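
/* Worked example: assuming 65536 does not exceed sysctl_rmem_max, a
 * setsockopt(SO_RCVBUF, 65536) request ends up with sk->sk_rcvbuf == 131072,
 * and a later getsockopt(SO_RCVBUF) reports that doubled value, as described
 * in __sock_set_rcvbuf() above.
 */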
817
818static void __sock_set_mark(struct sock *sk, u32 val)
819{
820 if (val != sk->sk_mark) {
821 sk->sk_mark = val;
822 sk_dst_reset(sk);
823 }
824}
825
826void sock_set_mark(struct sock *sk, u32 val)
827{
828 lock_sock(sk);
829 __sock_set_mark(sk, val);
830 release_sock(sk);
831}
832EXPORT_SYMBOL(sock_set_mark);
833
834/*
835 * This is meant for all protocols to use and covers goings on
836 * at the socket level. Everything here is generic.
837 */
838
839int sock_setsockopt(struct socket *sock, int level, int optname,
840 sockptr_t optval, unsigned int optlen)
841{
842 struct sock_txtime sk_txtime;
843 struct sock *sk = sock->sk;
844 int val;
845 int valbool;
846 struct linger ling;
847 int ret = 0;
848
849 /*
850 * Options without arguments
851 */
852
853 if (optname == SO_BINDTODEVICE)
854 return sock_setbindtodevice(sk, optval, optlen);
855
856 if (optlen < sizeof(int))
857 return -EINVAL;
858
859 if (copy_from_sockptr(&val, optval, sizeof(val)))
860 return -EFAULT;
861
862 valbool = val ? 1 : 0;
863
864 lock_sock(sk);
865
866 switch (optname) {
867 case SO_DEBUG:
868 if (val && !capable(CAP_NET_ADMIN))
869 ret = -EACCES;
870 else
871 sock_valbool_flag(sk, SOCK_DBG, valbool);
872 break;
873 case SO_REUSEADDR:
874 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
875 break;
876 case SO_REUSEPORT:
877 sk->sk_reuseport = valbool;
878 break;
879 case SO_TYPE:
880 case SO_PROTOCOL:
881 case SO_DOMAIN:
882 case SO_ERROR:
883 ret = -ENOPROTOOPT;
884 break;
885 case SO_DONTROUTE:
886 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
887 sk_dst_reset(sk);
888 break;
889 case SO_BROADCAST:
890 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
891 break;
892 case SO_SNDBUF:
893 /* Don't error on this; BSD doesn't, and if you think
894 * about it this is right. Otherwise apps have to
895 * play 'guess the biggest size' games. RCVBUF/SNDBUF
896 * are treated in BSD as hints.
897 */
898 val = min_t(u32, val, sysctl_wmem_max);
899set_sndbuf:
900 /* Ensure val * 2 fits into an int, to prevent max_t()
901 * from treating it as a negative value.
902 */
903 val = min_t(int, val, INT_MAX / 2);
904 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
905 WRITE_ONCE(sk->sk_sndbuf,
906 max_t(int, val * 2, SOCK_MIN_SNDBUF));
907 /* Wake up sending tasks if we upped the value. */
908 sk->sk_write_space(sk);
909 break;
910
911 case SO_SNDBUFFORCE:
912 if (!capable(CAP_NET_ADMIN)) {
913 ret = -EPERM;
914 break;
915 }
916
917 /* No negative values (to prevent underflow, as val will be
918 * multiplied by 2).
919 */
920 if (val < 0)
921 val = 0;
922 goto set_sndbuf;
923
924 case SO_RCVBUF:
925 /* Don't error on this; BSD doesn't, and if you think
926 * about it this is right. Otherwise apps have to
927 * play 'guess the biggest size' games. RCVBUF/SNDBUF
928 * are treated in BSD as hints.
929 */
930 __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
931 break;
932
933 case SO_RCVBUFFORCE:
934 if (!capable(CAP_NET_ADMIN)) {
935 ret = -EPERM;
936 break;
937 }
938
939 /* No negative values (to prevent underflow, as val will be
940 * multiplied by 2).
941 */
942 __sock_set_rcvbuf(sk, max(val, 0));
943 break;
944
945 case SO_KEEPALIVE:
946 if (sk->sk_prot->keepalive)
947 sk->sk_prot->keepalive(sk, valbool);
948 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
949 break;
950
951 case SO_OOBINLINE:
952 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
953 break;
954
955 case SO_NO_CHECK:
956 sk->sk_no_check_tx = valbool;
957 break;
958
959 case SO_PRIORITY:
960 if ((val >= 0 && val <= 6) ||
961 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
962 sk->sk_priority = val;
963 else
964 ret = -EPERM;
965 break;
966
967 case SO_LINGER:
968 if (optlen < sizeof(ling)) {
969 ret = -EINVAL; /* 1003.1g */
970 break;
971 }
972 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
973 ret = -EFAULT;
974 break;
975 }
976 if (!ling.l_onoff)
977 sock_reset_flag(sk, SOCK_LINGER);
978 else {
979#if (BITS_PER_LONG == 32)
980 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
981 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
982 else
983#endif
984 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
985 sock_set_flag(sk, SOCK_LINGER);
986 }
987 break;
988
989 case SO_BSDCOMPAT:
990 break;
991
992 case SO_PASSCRED:
993 if (valbool)
994 set_bit(SOCK_PASSCRED, &sock->flags);
995 else
996 clear_bit(SOCK_PASSCRED, &sock->flags);
997 break;
998
999 case SO_TIMESTAMP_OLD:
1000 __sock_set_timestamps(sk, valbool, false, false);
1001 break;
1002 case SO_TIMESTAMP_NEW:
1003 __sock_set_timestamps(sk, valbool, true, false);
1004 break;
1005 case SO_TIMESTAMPNS_OLD:
1006 __sock_set_timestamps(sk, valbool, false, true);
1007 break;
1008 case SO_TIMESTAMPNS_NEW:
1009 __sock_set_timestamps(sk, valbool, true, true);
1010 break;
1011 case SO_TIMESTAMPING_NEW:
1012 case SO_TIMESTAMPING_OLD:
1013 if (val & ~SOF_TIMESTAMPING_MASK) {
1014 ret = -EINVAL;
1015 break;
1016 }
1017
1018 if (val & SOF_TIMESTAMPING_OPT_ID &&
1019 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
1020 if (sk->sk_protocol == IPPROTO_TCP &&
1021 sk->sk_type == SOCK_STREAM) {
1022 if ((1 << sk->sk_state) &
1023 (TCPF_CLOSE | TCPF_LISTEN)) {
1024 ret = -EINVAL;
1025 break;
1026 }
1027 sk->sk_tskey = tcp_sk(sk)->snd_una;
1028 } else {
1029 sk->sk_tskey = 0;
1030 }
1031 }
1032
1033 if (val & SOF_TIMESTAMPING_OPT_STATS &&
1034 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
1035 ret = -EINVAL;
1036 break;
1037 }
1038
1039 sk->sk_tsflags = val;
1040 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
1041
1042 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
1043 sock_enable_timestamp(sk,
1044 SOCK_TIMESTAMPING_RX_SOFTWARE);
1045 else
1046 sock_disable_timestamp(sk,
1047 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
1048 break;
1049
1050 case SO_RCVLOWAT:
1051 if (val < 0)
1052 val = INT_MAX;
1053 if (sock->ops->set_rcvlowat)
1054 ret = sock->ops->set_rcvlowat(sk, val);
1055 else
1056 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1057 break;
1058
1059 case SO_RCVTIMEO_OLD:
1060 case SO_RCVTIMEO_NEW:
1061 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1062 optlen, optname == SO_RCVTIMEO_OLD);
1063 break;
1064
1065 case SO_SNDTIMEO_OLD:
1066 case SO_SNDTIMEO_NEW:
1067 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1068 optlen, optname == SO_SNDTIMEO_OLD);
1069 break;
1070
1071 case SO_ATTACH_FILTER: {
1072 struct sock_fprog fprog;
1073
1074 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1075 if (!ret)
1076 ret = sk_attach_filter(&fprog, sk);
1077 break;
1078 }
1079 case SO_ATTACH_BPF:
1080 ret = -EINVAL;
1081 if (optlen == sizeof(u32)) {
1082 u32 ufd;
1083
1084 ret = -EFAULT;
1085 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1086 break;
1087
1088 ret = sk_attach_bpf(ufd, sk);
1089 }
1090 break;
1091
1092 case SO_ATTACH_REUSEPORT_CBPF: {
1093 struct sock_fprog fprog;
1094
1095 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1096 if (!ret)
1097 ret = sk_reuseport_attach_filter(&fprog, sk);
1098 break;
1099 }
1100 case SO_ATTACH_REUSEPORT_EBPF:
1101 ret = -EINVAL;
1102 if (optlen == sizeof(u32)) {
1103 u32 ufd;
1104
1105 ret = -EFAULT;
1106 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1107 break;
1108
1109 ret = sk_reuseport_attach_bpf(ufd, sk);
1110 }
1111 break;
1112
1113 case SO_DETACH_REUSEPORT_BPF:
1114 ret = reuseport_detach_prog(sk);
1115 break;
1116
1117 case SO_DETACH_FILTER:
1118 ret = sk_detach_filter(sk);
1119 break;
1120
1121 case SO_LOCK_FILTER:
1122 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1123 ret = -EPERM;
1124 else
1125 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1126 break;
1127
1128 case SO_PASSSEC:
1129 if (valbool)
1130 set_bit(SOCK_PASSSEC, &sock->flags);
1131 else
1132 clear_bit(SOCK_PASSSEC, &sock->flags);
1133 break;
1134 case SO_MARK:
1135 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1136 ret = -EPERM;
1137 break;
1138 }
1139
1140 __sock_set_mark(sk, val);
1141 break;
1142
1143 case SO_RXQ_OVFL:
1144 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1145 break;
1146
1147 case SO_WIFI_STATUS:
1148 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1149 break;
1150
1151 case SO_PEEK_OFF:
1152 if (sock->ops->set_peek_off)
1153 ret = sock->ops->set_peek_off(sk, val);
1154 else
1155 ret = -EOPNOTSUPP;
1156 break;
1157
1158 case SO_NOFCS:
1159 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1160 break;
1161
1162 case SO_SELECT_ERR_QUEUE:
1163 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1164 break;
1165
1166#ifdef CONFIG_NET_RX_BUSY_POLL
1167 case SO_BUSY_POLL:
1168 /* allow unprivileged users to decrease the value */
1169 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1170 ret = -EPERM;
1171 else {
1172 if (val < 0)
1173 ret = -EINVAL;
1174 else
1175 sk->sk_ll_usec = val;
1176 }
1177 break;
1178 case SO_PREFER_BUSY_POLL:
1179 if (valbool && !capable(CAP_NET_ADMIN))
1180 ret = -EPERM;
1181 else
1182 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1183 break;
1184 case SO_BUSY_POLL_BUDGET:
1185 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1186 ret = -EPERM;
1187 } else {
1188 if (val < 0 || val > U16_MAX)
1189 ret = -EINVAL;
1190 else
1191 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1192 }
1193 break;
1194#endif
1195
1196 case SO_MAX_PACING_RATE:
1197 {
1198 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1199
1200 if (sizeof(ulval) != sizeof(val) &&
1201 optlen >= sizeof(ulval) &&
1202 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1203 ret = -EFAULT;
1204 break;
1205 }
1206 if (ulval != ~0UL)
1207 cmpxchg(&sk->sk_pacing_status,
1208 SK_PACING_NONE,
1209 SK_PACING_NEEDED);
1210 sk->sk_max_pacing_rate = ulval;
1211 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1212 break;
1213 }
1214 case SO_INCOMING_CPU:
1215 WRITE_ONCE(sk->sk_incoming_cpu, val);
1216 break;
1217
1218 case SO_CNX_ADVICE:
1219 if (val == 1)
1220 dst_negative_advice(sk);
1221 break;
1222
1223 case SO_ZEROCOPY:
1224 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1225 if (!((sk->sk_type == SOCK_STREAM &&
1226 sk->sk_protocol == IPPROTO_TCP) ||
1227 (sk->sk_type == SOCK_DGRAM &&
1228 sk->sk_protocol == IPPROTO_UDP)))
1229 ret = -ENOTSUPP;
1230 } else if (sk->sk_family != PF_RDS) {
1231 ret = -ENOTSUPP;
1232 }
1233 if (!ret) {
1234 if (val < 0 || val > 1)
1235 ret = -EINVAL;
1236 else
1237 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1238 }
1239 break;
1240
1241 case SO_TXTIME:
1242 if (optlen != sizeof(struct sock_txtime)) {
1243 ret = -EINVAL;
1244 break;
1245 } else if (copy_from_sockptr(&sk_txtime, optval,
1246 sizeof(struct sock_txtime))) {
1247 ret = -EFAULT;
1248 break;
1249 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1250 ret = -EINVAL;
1251 break;
1252 }
1253 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1254 * scheduler has enough safeguards.
1255 */
1256 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1257 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1258 ret = -EPERM;
1259 break;
1260 }
1261 sock_valbool_flag(sk, SOCK_TXTIME, true);
1262 sk->sk_clockid = sk_txtime.clockid;
1263 sk->sk_txtime_deadline_mode =
1264 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1265 sk->sk_txtime_report_errors =
1266 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1267 break;
1268
1269 case SO_BINDTOIFINDEX:
1270 ret = sock_bindtoindex_locked(sk, val);
1271 break;
1272
1273 default:
1274 ret = -ENOPROTOOPT;
1275 break;
1276 }
1277 release_sock(sk);
1278 return ret;
1279}
1280EXPORT_SYMBOL(sock_setsockopt);
1281
1282
1283static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1284 struct ucred *ucred)
1285{
1286 ucred->pid = pid_vnr(pid);
1287 ucred->uid = ucred->gid = -1;
1288 if (cred) {
1289 struct user_namespace *current_ns = current_user_ns();
1290
1291 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1292 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1293 }
1294}
1295
1296static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1297{
1298 struct user_namespace *user_ns = current_user_ns();
1299 int i;
1300
1301 for (i = 0; i < src->ngroups; i++)
1302 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1303 return -EFAULT;
1304
1305 return 0;
1306}
1307
1308int sock_getsockopt(struct socket *sock, int level, int optname,
1309 char __user *optval, int __user *optlen)
1310{
1311 struct sock *sk = sock->sk;
1312
1313 union {
1314 int val;
1315 u64 val64;
1316 unsigned long ulval;
1317 struct linger ling;
1318 struct old_timeval32 tm32;
1319 struct __kernel_old_timeval tm;
1320 struct __kernel_sock_timeval stm;
1321 struct sock_txtime txtime;
1322 } v;
1323
1324 int lv = sizeof(int);
1325 int len;
1326
1327 if (get_user(len, optlen))
1328 return -EFAULT;
1329 if (len < 0)
1330 return -EINVAL;
1331
1332 memset(&v, 0, sizeof(v));
1333
1334 switch (optname) {
1335 case SO_DEBUG:
1336 v.val = sock_flag(sk, SOCK_DBG);
1337 break;
1338
1339 case SO_DONTROUTE:
1340 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1341 break;
1342
1343 case SO_BROADCAST:
1344 v.val = sock_flag(sk, SOCK_BROADCAST);
1345 break;
1346
1347 case SO_SNDBUF:
1348 v.val = sk->sk_sndbuf;
1349 break;
1350
1351 case SO_RCVBUF:
1352 v.val = sk->sk_rcvbuf;
1353 break;
1354
1355 case SO_REUSEADDR:
1356 v.val = sk->sk_reuse;
1357 break;
1358
1359 case SO_REUSEPORT:
1360 v.val = sk->sk_reuseport;
1361 break;
1362
1363 case SO_KEEPALIVE:
1364 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1365 break;
1366
1367 case SO_TYPE:
1368 v.val = sk->sk_type;
1369 break;
1370
1371 case SO_PROTOCOL:
1372 v.val = sk->sk_protocol;
1373 break;
1374
1375 case SO_DOMAIN:
1376 v.val = sk->sk_family;
1377 break;
1378
1379 case SO_ERROR:
1380 v.val = -sock_error(sk);
1381 if (v.val == 0)
1382 v.val = xchg(&sk->sk_err_soft, 0);
1383 break;
1384
1385 case SO_OOBINLINE:
1386 v.val = sock_flag(sk, SOCK_URGINLINE);
1387 break;
1388
1389 case SO_NO_CHECK:
1390 v.val = sk->sk_no_check_tx;
1391 break;
1392
1393 case SO_PRIORITY:
1394 v.val = sk->sk_priority;
1395 break;
1396
1397 case SO_LINGER:
1398 lv = sizeof(v.ling);
1399 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1400 v.ling.l_linger = sk->sk_lingertime / HZ;
1401 break;
1402
1403 case SO_BSDCOMPAT:
1404 break;
1405
1406 case SO_TIMESTAMP_OLD:
1407 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1408 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1409 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1410 break;
1411
1412 case SO_TIMESTAMPNS_OLD:
1413 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1414 break;
1415
1416 case SO_TIMESTAMP_NEW:
1417 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1418 break;
1419
1420 case SO_TIMESTAMPNS_NEW:
1421 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1422 break;
1423
1424 case SO_TIMESTAMPING_OLD:
1425 v.val = sk->sk_tsflags;
1426 break;
1427
1428 case SO_RCVTIMEO_OLD:
1429 case SO_RCVTIMEO_NEW:
1430 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1431 break;
1432
1433 case SO_SNDTIMEO_OLD:
1434 case SO_SNDTIMEO_NEW:
1435 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1436 break;
1437
1438 case SO_RCVLOWAT:
1439 v.val = sk->sk_rcvlowat;
1440 break;
1441
1442 case SO_SNDLOWAT:
1443 v.val = 1;
1444 break;
1445
1446 case SO_PASSCRED:
1447 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1448 break;
1449
1450 case SO_PEERCRED:
1451 {
1452 struct ucred peercred;
1453 if (len > sizeof(peercred))
1454 len = sizeof(peercred);
1455 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1456 if (copy_to_user(optval, &peercred, len))
1457 return -EFAULT;
1458 goto lenout;
1459 }
1460
1461 case SO_PEERGROUPS:
1462 {
1463 int ret, n;
1464
1465 if (!sk->sk_peer_cred)
1466 return -ENODATA;
1467
1468 n = sk->sk_peer_cred->group_info->ngroups;
1469 if (len < n * sizeof(gid_t)) {
1470 len = n * sizeof(gid_t);
1471 return put_user(len, optlen) ? -EFAULT : -ERANGE;
1472 }
1473 len = n * sizeof(gid_t);
1474
1475 ret = groups_to_user((gid_t __user *)optval,
1476 sk->sk_peer_cred->group_info);
1477 if (ret)
1478 return ret;
1479 goto lenout;
1480 }
1481
1482 case SO_PEERNAME:
1483 {
1484 char address[128];
1485
1486 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1487 if (lv < 0)
1488 return -ENOTCONN;
1489 if (lv < len)
1490 return -EINVAL;
1491 if (copy_to_user(optval, address, len))
1492 return -EFAULT;
1493 goto lenout;
1494 }
1495
1496 /* Dubious BSD thing... Probably nobody even uses it, but
1497 * the UNIX standard wants it for whatever reason... -DaveM
1498 */
1499 case SO_ACCEPTCONN:
1500 v.val = sk->sk_state == TCP_LISTEN;
1501 break;
1502
1503 case SO_PASSSEC:
1504 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1505 break;
1506
1507 case SO_PEERSEC:
1508 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1509
1510 case SO_MARK:
1511 v.val = sk->sk_mark;
1512 break;
1513
1514 case SO_RXQ_OVFL:
1515 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1516 break;
1517
1518 case SO_WIFI_STATUS:
1519 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1520 break;
1521
1522 case SO_PEEK_OFF:
1523 if (!sock->ops->set_peek_off)
1524 return -EOPNOTSUPP;
1525
1526 v.val = sk->sk_peek_off;
1527 break;
1528 case SO_NOFCS:
1529 v.val = sock_flag(sk, SOCK_NOFCS);
1530 break;
1531
1532 case SO_BINDTODEVICE:
1533 return sock_getbindtodevice(sk, optval, optlen, len);
1534
1535 case SO_GET_FILTER:
1536 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1537 if (len < 0)
1538 return len;
1539
1540 goto lenout;
1541
1542 case SO_LOCK_FILTER:
1543 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1544 break;
1545
1546 case SO_BPF_EXTENSIONS:
1547 v.val = bpf_tell_extensions();
1548 break;
1549
1550 case SO_SELECT_ERR_QUEUE:
1551 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1552 break;
1553
1554#ifdef CONFIG_NET_RX_BUSY_POLL
1555 case SO_BUSY_POLL:
1556 v.val = sk->sk_ll_usec;
1557 break;
1558 case SO_PREFER_BUSY_POLL:
1559 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1560 break;
1561#endif
1562
1563 case SO_MAX_PACING_RATE:
1564 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1565 lv = sizeof(v.ulval);
1566 v.ulval = sk->sk_max_pacing_rate;
1567 } else {
1568 /* 32bit version */
1569 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1570 }
1571 break;
1572
1573 case SO_INCOMING_CPU:
1574 v.val = READ_ONCE(sk->sk_incoming_cpu);
1575 break;
1576
1577 case SO_MEMINFO:
1578 {
1579 u32 meminfo[SK_MEMINFO_VARS];
1580
1581 sk_get_meminfo(sk, meminfo);
1582
1583 len = min_t(unsigned int, len, sizeof(meminfo));
1584 if (copy_to_user(optval, &meminfo, len))
1585 return -EFAULT;
1586
1587 goto lenout;
1588 }
1589
1590#ifdef CONFIG_NET_RX_BUSY_POLL
1591 case SO_INCOMING_NAPI_ID:
1592 v.val = READ_ONCE(sk->sk_napi_id);
1593
1594 /* aggregate non-NAPI IDs down to 0 */
1595 if (v.val < MIN_NAPI_ID)
1596 v.val = 0;
1597
1598 break;
1599#endif
1600
1601 case SO_COOKIE:
1602 lv = sizeof(u64);
1603 if (len < lv)
1604 return -EINVAL;
1605 v.val64 = sock_gen_cookie(sk);
1606 break;
1607
1608 case SO_ZEROCOPY:
1609 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1610 break;
1611
1612 case SO_TXTIME:
1613 lv = sizeof(v.txtime);
1614 v.txtime.clockid = sk->sk_clockid;
1615 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1616 SOF_TXTIME_DEADLINE_MODE : 0;
1617 v.txtime.flags |= sk->sk_txtime_report_errors ?
1618 SOF_TXTIME_REPORT_ERRORS : 0;
1619 break;
1620
1621 case SO_BINDTOIFINDEX:
1622 v.val = sk->sk_bound_dev_if;
1623 break;
1624
1625 default:
1626 /* We implement the SO_SNDLOWAT etc. to not be settable
1627 * (1003.1g 7).
1628 */
1629 return -ENOPROTOOPT;
1630 }
1631
1632 if (len > lv)
1633 len = lv;
1634 if (copy_to_user(optval, &v, len))
1635 return -EFAULT;
1636lenout:
1637 if (put_user(len, optlen))
1638 return -EFAULT;
1639 return 0;
1640}
1641
1642/*
1643 * Initialize an sk_lock.
1644 *
1645 * (We also register the sk_lock with the lock validator.)
1646 */
1647static inline void sock_lock_init(struct sock *sk)
1648{
1649 if (sk->sk_kern_sock)
1650 sock_lock_init_class_and_name(
1651 sk,
1652 af_family_kern_slock_key_strings[sk->sk_family],
1653 af_family_kern_slock_keys + sk->sk_family,
1654 af_family_kern_key_strings[sk->sk_family],
1655 af_family_kern_keys + sk->sk_family);
1656 else
1657 sock_lock_init_class_and_name(
1658 sk,
1659 af_family_slock_key_strings[sk->sk_family],
1660 af_family_slock_keys + sk->sk_family,
1661 af_family_key_strings[sk->sk_family],
1662 af_family_keys + sk->sk_family);
1663}
1664
1665/*
1666 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1667 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1668 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1669 */
1670static void sock_copy(struct sock *nsk, const struct sock *osk)
1671{
1672 const struct proto *prot = READ_ONCE(osk->sk_prot);
1673#ifdef CONFIG_SECURITY_NETWORK
1674 void *sptr = nsk->sk_security;
1675#endif
1676
1677 /* If we move sk_tx_queue_mapping out of the private section,
1678 * we must check if sk_tx_queue_clear() is called after
1679 * sock_copy() in sk_clone_lock().
1680 */
1681 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1682 offsetof(struct sock, sk_dontcopy_begin) ||
1683 offsetof(struct sock, sk_tx_queue_mapping) >=
1684 offsetof(struct sock, sk_dontcopy_end));
1685
1686 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1687
1688 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1689 prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1690
1691#ifdef CONFIG_SECURITY_NETWORK
1692 nsk->sk_security = sptr;
1693 security_sk_clone(osk, nsk);
1694#endif
1695}
1696
1697static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1698 int family)
1699{
1700 struct sock *sk;
1701 struct kmem_cache *slab;
1702
1703 slab = prot->slab;
1704 if (slab != NULL) {
1705 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1706 if (!sk)
1707 return sk;
1708 if (want_init_on_alloc(priority))
1709 sk_prot_clear_nulls(sk, prot->obj_size);
1710 } else
1711 sk = kmalloc(prot->obj_size, priority);
1712
1713 if (sk != NULL) {
1714 if (security_sk_alloc(sk, family, priority))
1715 goto out_free;
1716
1717 if (!try_module_get(prot->owner))
1718 goto out_free_sec;
1719 }
1720
1721 return sk;
1722
1723out_free_sec:
1724 security_sk_free(sk);
1725out_free:
1726 if (slab != NULL)
1727 kmem_cache_free(slab, sk);
1728 else
1729 kfree(sk);
1730 return NULL;
1731}
1732
1733static void sk_prot_free(struct proto *prot, struct sock *sk)
1734{
1735 struct kmem_cache *slab;
1736 struct module *owner;
1737
1738 owner = prot->owner;
1739 slab = prot->slab;
1740
1741 cgroup_sk_free(&sk->sk_cgrp_data);
1742 mem_cgroup_sk_free(sk);
1743 security_sk_free(sk);
1744 if (slab != NULL)
1745 kmem_cache_free(slab, sk);
1746 else
1747 kfree(sk);
1748 module_put(owner);
1749}
1750
1751/**
1752 * sk_alloc - All socket objects are allocated here
1753 * @net: the applicable net namespace
1754 * @family: protocol family
1755 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1756 * @prot: struct proto associated with this new sock instance
1757 * @kern: is this to be a kernel socket?
1758 */
1759struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1760 struct proto *prot, int kern)
1761{
1762 struct sock *sk;
1763
1764 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1765 if (sk) {
1766 sk->sk_family = family;
1767 /*
1768 * See comment in struct sock definition to understand
1769 * why we need sk_prot_creator -acme
1770 */
1771 sk->sk_prot = sk->sk_prot_creator = prot;
1772 sk->sk_kern_sock = kern;
1773 sock_lock_init(sk);
1774 sk->sk_net_refcnt = kern ? 0 : 1;
1775 if (likely(sk->sk_net_refcnt)) {
1776 get_net(net);
1777 sock_inuse_add(net, 1);
1778 }
1779
1780 sock_net_set(sk, net);
1781 refcount_set(&sk->sk_wmem_alloc, 1);
1782
1783 mem_cgroup_sk_alloc(sk);
1784 cgroup_sk_alloc(&sk->sk_cgrp_data);
1785 sock_update_classid(&sk->sk_cgrp_data);
1786 sock_update_netprioidx(&sk->sk_cgrp_data);
1787 sk_tx_queue_clear(sk);
1788 }
1789
1790 return sk;
1791}
1792EXPORT_SYMBOL(sk_alloc);
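
/* A minimal allocation sketch for a protocol's ->create() handler, where
 * my_proto stands in for that protocol's struct proto:
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *
 * sock_init_data() then wires the freshly allocated sock into the struct
 * socket and sets the generic defaults.
 */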
1793
1794/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1795 * grace period. This is the case for UDP sockets and TCP listeners.
1796 */
1797static void __sk_destruct(struct rcu_head *head)
1798{
1799 struct sock *sk = container_of(head, struct sock, sk_rcu);
1800 struct sk_filter *filter;
1801
1802 if (sk->sk_destruct)
1803 sk->sk_destruct(sk);
1804
1805 filter = rcu_dereference_check(sk->sk_filter,
1806 refcount_read(&sk->sk_wmem_alloc) == 0);
1807 if (filter) {
1808 sk_filter_uncharge(sk, filter);
1809 RCU_INIT_POINTER(sk->sk_filter, NULL);
1810 }
1811
1812 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1813
1814#ifdef CONFIG_BPF_SYSCALL
1815 bpf_sk_storage_free(sk);
1816#endif
1817
1818 if (atomic_read(&sk->sk_omem_alloc))
1819 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1820 __func__, atomic_read(&sk->sk_omem_alloc));
1821
1822 if (sk->sk_frag.page) {
1823 put_page(sk->sk_frag.page);
1824 sk->sk_frag.page = NULL;
1825 }
1826
1827 if (sk->sk_peer_cred)
1828 put_cred(sk->sk_peer_cred);
1829 put_pid(sk->sk_peer_pid);
1830 if (likely(sk->sk_net_refcnt))
1831 put_net(sock_net(sk));
1832 sk_prot_free(sk->sk_prot_creator, sk);
1833}
1834
1835void sk_destruct(struct sock *sk)
1836{
1837 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1838
1839 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1840 reuseport_detach_sock(sk);
1841 use_call_rcu = true;
1842 }
1843
1844 if (use_call_rcu)
1845 call_rcu(&sk->sk_rcu, __sk_destruct);
1846 else
1847 __sk_destruct(&sk->sk_rcu);
1848}
1849
1850static void __sk_free(struct sock *sk)
1851{
1852 if (likely(sk->sk_net_refcnt))
1853 sock_inuse_add(sock_net(sk), -1);
1854
1855 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1856 sock_diag_broadcast_destroy(sk);
1857 else
1858 sk_destruct(sk);
1859}
1860
1861void sk_free(struct sock *sk)
1862{
1863 /*
1864 * We subtract one from sk_wmem_alloc so we can tell whether
1865 * some packets are still in some tx queue.
1866 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1867 */
1868 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1869 __sk_free(sk);
1870}
1871EXPORT_SYMBOL(sk_free);
1872
1873static void sk_init_common(struct sock *sk)
1874{
1875 skb_queue_head_init(&sk->sk_receive_queue);
1876 skb_queue_head_init(&sk->sk_write_queue);
1877 skb_queue_head_init(&sk->sk_error_queue);
1878
1879 rwlock_init(&sk->sk_callback_lock);
1880 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1881 af_rlock_keys + sk->sk_family,
1882 af_family_rlock_key_strings[sk->sk_family]);
1883 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1884 af_wlock_keys + sk->sk_family,
1885 af_family_wlock_key_strings[sk->sk_family]);
1886 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1887 af_elock_keys + sk->sk_family,
1888 af_family_elock_key_strings[sk->sk_family]);
1889 lockdep_set_class_and_name(&sk->sk_callback_lock,
1890 af_callback_keys + sk->sk_family,
1891 af_family_clock_key_strings[sk->sk_family]);
1892}
1893
1894/**
1895 * sk_clone_lock - clone a socket, and lock its clone
1896 * @sk: the socket to clone
1897 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1898 *
1899 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1900 */
1901struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1902{
1903 struct proto *prot = READ_ONCE(sk->sk_prot);
1904 struct sk_filter *filter;
1905 bool is_charged = true;
1906 struct sock *newsk;
1907
1908 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
1909 if (!newsk)
1910 goto out;
1911
1912 sock_copy(newsk, sk);
1913
1914 newsk->sk_prot_creator = prot;
1915
1916 /* SANITY */
1917 if (likely(newsk->sk_net_refcnt))
1918 get_net(sock_net(newsk));
1919 sk_node_init(&newsk->sk_node);
1920 sock_lock_init(newsk);
1921 bh_lock_sock(newsk);
1922 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1923 newsk->sk_backlog.len = 0;
1924
1925 atomic_set(&newsk->sk_rmem_alloc, 0);
1926
1927 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
1928 refcount_set(&newsk->sk_wmem_alloc, 1);
1929
1930 atomic_set(&newsk->sk_omem_alloc, 0);
1931 sk_init_common(newsk);
1932
1933 newsk->sk_dst_cache = NULL;
1934 newsk->sk_dst_pending_confirm = 0;
1935 newsk->sk_wmem_queued = 0;
1936 newsk->sk_forward_alloc = 0;
1937 atomic_set(&newsk->sk_drops, 0);
1938 newsk->sk_send_head = NULL;
1939 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1940 atomic_set(&newsk->sk_zckey, 0);
1941
1942 sock_reset_flag(newsk, SOCK_DONE);
1943
1944 /* sk->sk_memcg will be populated at accept() time */
1945 newsk->sk_memcg = NULL;
1946
1947 cgroup_sk_clone(&newsk->sk_cgrp_data);
1948
1949 rcu_read_lock();
1950 filter = rcu_dereference(sk->sk_filter);
1951 if (filter != NULL)
1952 /* Though it's an empty new sock, the charging may fail
1953 * if sysctl_optmem_max was changed between creation of
1954 * the original socket and cloning.
1955 */
1956 is_charged = sk_filter_charge(newsk, filter);
1957 RCU_INIT_POINTER(newsk->sk_filter, filter);
1958 rcu_read_unlock();
1959
1960 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1961 /* We need to make sure that we don't uncharge the new
1962 * socket if we couldn't charge it in the first place
1963 * as otherwise we uncharge the parent's filter.
1964 */
1965 if (!is_charged)
1966 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1967 sk_free_unlock_clone(newsk);
1968 newsk = NULL;
1969 goto out;
1970 }
1971 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1972
1973 if (bpf_sk_storage_clone(sk, newsk)) {
1974 sk_free_unlock_clone(newsk);
1975 newsk = NULL;
1976 goto out;
1977 }
1978
1979 /* Clear sk_user_data if parent had the pointer tagged
1980 * as not suitable for copying when cloning.
1981 */
1982 if (sk_user_data_is_nocopy(newsk))
1983 newsk->sk_user_data = NULL;
1984
1985 newsk->sk_err = 0;
1986 newsk->sk_err_soft = 0;
1987 newsk->sk_priority = 0;
1988 newsk->sk_incoming_cpu = raw_smp_processor_id();
1989 if (likely(newsk->sk_net_refcnt))
1990 sock_inuse_add(sock_net(newsk), 1);
1991
1992 /* Before updating sk_refcnt, we must commit prior changes to memory
1993 * (Documentation/RCU/rculist_nulls.rst for details)
1994 */
1995 smp_wmb();
1996 refcount_set(&newsk->sk_refcnt, 2);
1997
1998 /* Increment the counter in the same struct proto as the master
1999 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2000 * is the same as sk->sk_prot->socks, as this field was copied
2001 * with memcpy).
2002 *
2003 * This _changes_ the previous behaviour, where
2004 * tcp_create_openreq_child was always incrementing the
2005 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
2006 * to be taken into account in all callers. -acme
2007 */
2008 sk_refcnt_debug_inc(newsk);
2009 sk_set_socket(newsk, NULL);
2010 sk_tx_queue_clear(newsk);
2011 RCU_INIT_POINTER(newsk->sk_wq, NULL);
2012
2013 if (newsk->sk_prot->sockets_allocated)
2014 sk_sockets_allocated_inc(newsk);
2015
2016 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2017 net_enable_timestamp();
2018out:
2019 return newsk;
2020}
2021EXPORT_SYMBOL_GPL(sk_clone_lock);
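
/* A minimal usage sketch, following the locking rule in the kerneldoc
 * above: the clone comes back bh-locked with a refcount of 2, and the
 * caller unlocks it once it has finished fixing up the child:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		... protocol-specific child setup ...
 *		bh_unlock_sock(newsk);
 *	}
 */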
2022
2023void sk_free_unlock_clone(struct sock *sk)
2024{
2025 /* It is still a raw copy of the parent, so invalidate
2026 * its destructor and do a plain sk_free(). */
2027 sk->sk_destruct = NULL;
2028 bh_unlock_sock(sk);
2029 sk_free(sk);
2030}
2031EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2032
2033void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2034{
2035 u32 max_segs = 1;
2036
2037 sk_dst_set(sk, dst);
2038 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2039 if (sk->sk_route_caps & NETIF_F_GSO)
2040 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2041 sk->sk_route_caps &= ~sk->sk_route_nocaps;
2042 if (sk_can_gso(sk)) {
2043 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2044 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2045 } else {
2046 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2047 sk->sk_gso_max_size = dst->dev->gso_max_size;
2048 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2049 }
2050 }
2051 sk->sk_gso_max_segs = max_segs;
2052}
2053EXPORT_SYMBOL_GPL(sk_setup_caps);
2054
2055/*
2056 * Simple resource managers for sockets.
2057 */
2058
2059
2060/*
2061 * Write buffer destructor automatically called from kfree_skb.
2062 */
2063void sock_wfree(struct sk_buff *skb)
2064{
2065 struct sock *sk = skb->sk;
2066 unsigned int len = skb->truesize;
2067
2068 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2069 /*
2070 * Keep a reference on sk_wmem_alloc; this will be released
2071 * after the sk_write_space() call.
2072 */
2073 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2074 sk->sk_write_space(sk);
2075 len = 1;
2076 }
2077 /*
2078 * If sk_wmem_alloc reaches 0, we must finish what sk_free()
2079 * could not do because of in-flight packets.
2080 */
2081 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2082 __sk_free(sk);
2083}
2084EXPORT_SYMBOL(sock_wfree);
2085
2086/* This variant of sock_wfree() is used by TCP,
2087 * since it sets SOCK_USE_WRITE_QUEUE.
2088 */
2089void __sock_wfree(struct sk_buff *skb)
2090{
2091 struct sock *sk = skb->sk;
2092
2093 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2094 __sk_free(sk);
2095}
2096
2097void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2098{
2099 skb_orphan(skb);
2100 skb->sk = sk;
2101#ifdef CONFIG_INET
2102 if (unlikely(!sk_fullsock(sk))) {
2103 skb->destructor = sock_edemux;
2104 sock_hold(sk);
2105 return;
2106 }
2107#endif
2108 skb->destructor = sock_wfree;
2109 skb_set_hash_from_sk(skb, sk);
2110 /*
2111 * We used to take a refcount on sk, but the following operation
2112 * is enough to guarantee sk_free() won't free this sock until
2113 * all in-flight packets are completed
2114 */
2115 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2116}
2117EXPORT_SYMBOL(skb_set_owner_w);
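
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * a protocol that builds its own skb can charge it to the sending socket
 * with skb_set_owner_w(), so that a later kfree_skb() runs sock_wfree()
 * and releases the sk_wmem_alloc taken here.  The function name
 * example_xmit_one is an assumption for illustration only.
 */
static int example_xmit_one(struct sock *sk)
{
	struct sk_buff *skb = alloc_skb(128, sk->sk_allocation);

	if (!skb)
		return -ENOBUFS;
	skb_set_owner_w(skb, sk);	/* charges skb->truesize to sk_wmem_alloc */
	/* ... build headers/payload and hand the skb to the lower layer ... */
	kfree_skb(skb);			/* destructor sock_wfree() uncharges it */
	return 0;
}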
2118
2119static bool can_skb_orphan_partial(const struct sk_buff *skb)
2120{
2121#ifdef CONFIG_TLS_DEVICE
2122 /* Drivers depend on in-order delivery for crypto offload;
2123 * a partial orphan breaks the out-of-order-OK logic.
2124 */
2125 if (skb->decrypted)
2126 return false;
2127#endif
2128 return (skb->destructor == sock_wfree ||
2129 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2130}
2131
2132/* This helper is used by netem, as it can hold packets in its
2133 * delay queue. We want to allow the owner socket to send more
2134 * packets, as if they were already TX completed by a typical driver.
2135 * But we also want to keep skb->sk set because some packet schedulers
2136 * rely on it (sch_fq for example).
2137 */
2138void skb_orphan_partial(struct sk_buff *skb)
2139{
2140 if (skb_is_tcp_pure_ack(skb))
2141 return;
2142
2143 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2144 return;
2145
2146 skb_orphan(skb);
2147}
2148EXPORT_SYMBOL(skb_orphan_partial);
2149
2150/*
2151 * Read buffer destructor automatically called from kfree_skb.
2152 */
2153void sock_rfree(struct sk_buff *skb)
2154{
2155 struct sock *sk = skb->sk;
2156 unsigned int len = skb->truesize;
2157
2158 atomic_sub(len, &sk->sk_rmem_alloc);
2159 sk_mem_uncharge(sk, len);
2160}
2161EXPORT_SYMBOL(sock_rfree);
2162
2163/*
2164 * Buffer destructor for skbs that are not used directly in read or write
2165 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2166 */
2167void sock_efree(struct sk_buff *skb)
2168{
2169 sock_put(skb->sk);
2170}
2171EXPORT_SYMBOL(sock_efree);
2172
2173/* Buffer destructor for prefetch/receive path where reference count may
2174 * not be held, e.g. for listen sockets.
2175 */
2176#ifdef CONFIG_INET
2177void sock_pfree(struct sk_buff *skb)
2178{
2179 if (sk_is_refcounted(skb->sk))
2180 sock_gen_put(skb->sk);
2181}
2182EXPORT_SYMBOL(sock_pfree);
2183#endif /* CONFIG_INET */
2184
2185kuid_t sock_i_uid(struct sock *sk)
2186{
2187 kuid_t uid;
2188
2189 read_lock_bh(&sk->sk_callback_lock);
2190 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2191 read_unlock_bh(&sk->sk_callback_lock);
2192 return uid;
2193}
2194EXPORT_SYMBOL(sock_i_uid);
2195
2196unsigned long sock_i_ino(struct sock *sk)
2197{
2198 unsigned long ino;
2199
2200 read_lock_bh(&sk->sk_callback_lock);
2201 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2202 read_unlock_bh(&sk->sk_callback_lock);
2203 return ino;
2204}
2205EXPORT_SYMBOL(sock_i_ino);
2206
2207/*
2208 * Allocate a skb from the socket's send buffer.
2209 */
2210struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2211 gfp_t priority)
2212{
2213 if (force ||
2214 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2215 struct sk_buff *skb = alloc_skb(size, priority);
2216
2217 if (skb) {
2218 skb_set_owner_w(skb, sk);
2219 return skb;
2220 }
2221 }
2222 return NULL;
2223}
2224EXPORT_SYMBOL(sock_wmalloc);
2225
2226static void sock_ofree(struct sk_buff *skb)
2227{
2228 struct sock *sk = skb->sk;
2229
2230 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2231}
2232
2233struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2234 gfp_t priority)
2235{
2236 struct sk_buff *skb;
2237
2238 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2239 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2240 sysctl_optmem_max)
2241 return NULL;
2242
2243 skb = alloc_skb(size, priority);
2244 if (!skb)
2245 return NULL;
2246
2247 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2248 skb->sk = sk;
2249 skb->destructor = sock_ofree;
2250 return skb;
2251}
2252
2253/*
2254 * Allocate a memory block from the socket's option memory buffer.
2255 */
2256void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2257{
2258 if ((unsigned int)size <= sysctl_optmem_max &&
2259 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2260 void *mem;
2261 /* First do the add, to avoid the race if kmalloc
2262 * might sleep.
2263 */
2264 atomic_add(size, &sk->sk_omem_alloc);
2265 mem = kmalloc(size, priority);
2266 if (mem)
2267 return mem;
2268 atomic_sub(size, &sk->sk_omem_alloc);
2269 }
2270 return NULL;
2271}
2272EXPORT_SYMBOL(sock_kmalloc);
2273
2274 /* Free an option memory block. Note that we actually want the inline
2275 * here as this allows gcc to detect the nullify and fold away the
2276 * condition entirely.
2277 */
2278static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2279 const bool nullify)
2280{
2281 if (WARN_ON_ONCE(!mem))
2282 return;
2283 if (nullify)
2284 kfree_sensitive(mem);
2285 else
2286 kfree(mem);
2287 atomic_sub(size, &sk->sk_omem_alloc);
2288}
2289
2290void sock_kfree_s(struct sock *sk, void *mem, int size)
2291{
2292 __sock_kfree_s(sk, mem, size, false);
2293}
2294EXPORT_SYMBOL(sock_kfree_s);
2295
2296void sock_kzfree_s(struct sock *sk, void *mem, int size)
2297{
2298 __sock_kfree_s(sk, mem, size, true);
2299}
2300EXPORT_SYMBOL(sock_kzfree_s);
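
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * the usual pairing for option memory.  A protocol bounds per-socket
 * option allocations with sock_kmalloc() and returns them with
 * sock_kfree_s(), or sock_kzfree_s() for sensitive data, always passing
 * the same size.  example_set_opt is an assumed, illustrative name.
 */
static int example_set_opt(struct sock *sk, sockptr_t optval, int optlen)
{
	void *buf = sock_kmalloc(sk, optlen, GFP_KERNEL);

	if (!buf)
		return -ENOBUFS;
	if (copy_from_sockptr(buf, optval, optlen)) {
		sock_kfree_s(sk, buf, optlen);
		return -EFAULT;
	}
	/* ... consume buf ... */
	sock_kzfree_s(sk, buf, optlen);	/* zeroes the block before freeing */
	return 0;
}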
2301
2302/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2303 I think these locks should be removed for datagram sockets.
2304 */
2305static long sock_wait_for_wmem(struct sock *sk, long timeo)
2306{
2307 DEFINE_WAIT(wait);
2308
2309 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2310 for (;;) {
2311 if (!timeo)
2312 break;
2313 if (signal_pending(current))
2314 break;
2315 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2316 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2317 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2318 break;
2319 if (sk->sk_shutdown & SEND_SHUTDOWN)
2320 break;
2321 if (sk->sk_err)
2322 break;
2323 timeo = schedule_timeout(timeo);
2324 }
2325 finish_wait(sk_sleep(sk), &wait);
2326 return timeo;
2327}
2328
2329
2330/*
2331 * Generic send/receive buffer handlers
2332 */
2333
2334struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2335 unsigned long data_len, int noblock,
2336 int *errcode, int max_page_order)
2337{
2338 struct sk_buff *skb;
2339 long timeo;
2340 int err;
2341
2342 timeo = sock_sndtimeo(sk, noblock);
2343 for (;;) {
2344 err = sock_error(sk);
2345 if (err != 0)
2346 goto failure;
2347
2348 err = -EPIPE;
2349 if (sk->sk_shutdown & SEND_SHUTDOWN)
2350 goto failure;
2351
2352 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2353 break;
2354
2355 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2356 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2357 err = -EAGAIN;
2358 if (!timeo)
2359 goto failure;
2360 if (signal_pending(current))
2361 goto interrupted;
2362 timeo = sock_wait_for_wmem(sk, timeo);
2363 }
2364 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2365 errcode, sk->sk_allocation);
2366 if (skb)
2367 skb_set_owner_w(skb, sk);
2368 return skb;
2369
2370interrupted:
2371 err = sock_intr_errno(timeo);
2372failure:
2373 *errcode = err;
2374 return NULL;
2375}
2376EXPORT_SYMBOL(sock_alloc_send_pskb);
2377
2378struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2379 int noblock, int *errcode)
2380{
2381 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2382}
2383EXPORT_SYMBOL(sock_alloc_send_skb);
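
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * a datagram sendmsg() typically obtains write space through
 * sock_alloc_send_skb(), which blocks according to the send timeout and
 * reports the failure reason (-EAGAIN, -EPIPE, -ERESTARTSYS, ...) via
 * *errcode.  example_dgram_sendmsg and the 16-byte headroom are
 * illustrative assumptions.
 */
static int example_dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	int err;
	struct sk_buff *skb = sock_alloc_send_skb(sk, len + 16,
						  msg->msg_flags & MSG_DONTWAIT,
						  &err);

	if (!skb)
		return err;
	skb_reserve(skb, 16);
	err = memcpy_from_msg(skb_put(skb, len), msg, len);
	if (err) {
		kfree_skb(skb);
		return err;
	}
	/* ... queue the skb for transmission; freed here only to stay self-contained ... */
	kfree_skb(skb);
	return len;
}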
2384
2385int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2386 struct sockcm_cookie *sockc)
2387{
2388 u32 tsflags;
2389
2390 switch (cmsg->cmsg_type) {
2391 case SO_MARK:
2392 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2393 return -EPERM;
2394 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2395 return -EINVAL;
2396 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2397 break;
2398 case SO_TIMESTAMPING_OLD:
2399 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2400 return -EINVAL;
2401
2402 tsflags = *(u32 *)CMSG_DATA(cmsg);
2403 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2404 return -EINVAL;
2405
2406 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2407 sockc->tsflags |= tsflags;
2408 break;
2409 case SCM_TXTIME:
2410 if (!sock_flag(sk, SOCK_TXTIME))
2411 return -EINVAL;
2412 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2413 return -EINVAL;
2414 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2415 break;
2416 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2417 case SCM_RIGHTS:
2418 case SCM_CREDENTIALS:
2419 break;
2420 default:
2421 return -EINVAL;
2422 }
2423 return 0;
2424}
2425EXPORT_SYMBOL(__sock_cmsg_send);
2426
2427int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2428 struct sockcm_cookie *sockc)
2429{
2430 struct cmsghdr *cmsg;
2431 int ret;
2432
2433 for_each_cmsghdr(cmsg, msg) {
2434 if (!CMSG_OK(msg, cmsg))
2435 return -EINVAL;
2436 if (cmsg->cmsg_level != SOL_SOCKET)
2437 continue;
2438 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2439 if (ret)
2440 return ret;
2441 }
2442 return 0;
2443}
2444EXPORT_SYMBOL(sock_cmsg_send);
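
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * a sendmsg() implementation seeds a sockcm_cookie from the socket
 * defaults with sockcm_init() and then lets sock_cmsg_send() override it
 * from SOL_SOCKET control messages (SO_MARK, SO_TIMESTAMPING_OLD,
 * SCM_TXTIME).  example_parse_cmsgs is an assumed name.
 */
static int example_parse_cmsgs(struct sock *sk, struct msghdr *msg,
			       struct sockcm_cookie *sockc)
{
	sockcm_init(sockc, sk);		/* start from the sk_tsflags defaults */
	if (msg->msg_controllen)
		return sock_cmsg_send(sk, msg, sockc);
	return 0;
}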
2445
2446static void sk_enter_memory_pressure(struct sock *sk)
2447{
2448 if (!sk->sk_prot->enter_memory_pressure)
2449 return;
2450
2451 sk->sk_prot->enter_memory_pressure(sk);
2452}
2453
2454static void sk_leave_memory_pressure(struct sock *sk)
2455{
2456 if (sk->sk_prot->leave_memory_pressure) {
2457 sk->sk_prot->leave_memory_pressure(sk);
2458 } else {
2459 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2460
2461 if (memory_pressure && READ_ONCE(*memory_pressure))
2462 WRITE_ONCE(*memory_pressure, 0);
2463 }
2464}
2465
2466#define SKB_FRAG_PAGE_ORDER get_order(32768)
2467DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2468
2469/**
2470 * skb_page_frag_refill - check that a page_frag contains enough room
2471 * @sz: minimum size of the fragment we want to get
2472 * @pfrag: pointer to page_frag
2473 * @gfp: priority for memory allocation
2474 *
2475 * Note: While this allocator tries to use high order pages, there is
2476 * no guarantee that allocations succeed. Therefore, @sz MUST be
2477 * less than or equal to PAGE_SIZE.
2478 */
2479bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2480{
2481 if (pfrag->page) {
2482 if (page_ref_count(pfrag->page) == 1) {
2483 pfrag->offset = 0;
2484 return true;
2485 }
2486 if (pfrag->offset + sz <= pfrag->size)
2487 return true;
2488 put_page(pfrag->page);
2489 }
2490
2491 pfrag->offset = 0;
2492 if (SKB_FRAG_PAGE_ORDER &&
2493 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2494 /* Avoid direct reclaim but allow kswapd to wake */
2495 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2496 __GFP_COMP | __GFP_NOWARN |
2497 __GFP_NORETRY,
2498 SKB_FRAG_PAGE_ORDER);
2499 if (likely(pfrag->page)) {
2500 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2501 return true;
2502 }
2503 }
2504 pfrag->page = alloc_page(gfp);
2505 if (likely(pfrag->page)) {
2506 pfrag->size = PAGE_SIZE;
2507 return true;
2508 }
2509 return false;
2510}
2511EXPORT_SYMBOL(skb_page_frag_refill);
2512
2513bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2514{
2515 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2516 return true;
2517
2518 sk_enter_memory_pressure(sk);
2519 sk_stream_moderate_sndbuf(sk);
2520 return false;
2521}
2522EXPORT_SYMBOL(sk_page_frag_refill);
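
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * the common append-to-page-frag pattern.  sk_page_frag() selects the
 * per-task or per-socket page_frag, sk_page_frag_refill() guarantees
 * room, and the caller copies at pfrag->offset before advancing it.
 * example_append is an assumed name.
 */
static int example_append(struct sock *sk, struct msghdr *msg, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;
	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy,
				&msg->msg_iter) != copy)
		return -EFAULT;
	/* ... attach (pfrag->page, pfrag->offset, copy) as an skb fragment ... */
	pfrag->offset += copy;
	return copy;
}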
2523
2524void __lock_sock(struct sock *sk)
2525 __releases(&sk->sk_lock.slock)
2526 __acquires(&sk->sk_lock.slock)
2527{
2528 DEFINE_WAIT(wait);
2529
2530 for (;;) {
2531 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2532 TASK_UNINTERRUPTIBLE);
2533 spin_unlock_bh(&sk->sk_lock.slock);
2534 schedule();
2535 spin_lock_bh(&sk->sk_lock.slock);
2536 if (!sock_owned_by_user(sk))
2537 break;
2538 }
2539 finish_wait(&sk->sk_lock.wq, &wait);
2540}
2541
2542void __release_sock(struct sock *sk)
2543 __releases(&sk->sk_lock.slock)
2544 __acquires(&sk->sk_lock.slock)
2545{
2546 struct sk_buff *skb, *next;
2547
2548 while ((skb = sk->sk_backlog.head) != NULL) {
2549 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2550
2551 spin_unlock_bh(&sk->sk_lock.slock);
2552
2553 do {
2554 next = skb->next;
2555 prefetch(next);
2556 WARN_ON_ONCE(skb_dst_is_noref(skb));
2557 skb_mark_not_on_list(skb);
2558 sk_backlog_rcv(sk, skb);
2559
2560 cond_resched();
2561
2562 skb = next;
2563 } while (skb != NULL);
2564
2565 spin_lock_bh(&sk->sk_lock.slock);
2566 }
2567
2568 /*
2569 * Doing the zeroing here guarantees we cannot loop forever
2570 * while a wild producer attempts to flood us.
2571 */
2572 sk->sk_backlog.len = 0;
2573}
2574
2575void __sk_flush_backlog(struct sock *sk)
2576{
2577 spin_lock_bh(&sk->sk_lock.slock);
2578 __release_sock(sk);
2579 spin_unlock_bh(&sk->sk_lock.slock);
2580}
2581
2582/**
2583 * sk_wait_data - wait for data to arrive at sk_receive_queue
2584 * @sk: sock to wait on
2585 * @timeo: for how long
2586 * @skb: last skb seen on sk_receive_queue
2587 *
2588 * Now socket state including sk->sk_err is changed only under the lock,
2589 * hence we may omit checks after joining the wait queue.
2590 * We check the receive queue before schedule() only as an optimization;
2591 * it is very likely that release_sock() added new data.
2592 */
2593int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2594{
2595 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2596 int rc;
2597
2598 add_wait_queue(sk_sleep(sk), &wait);
2599 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2600 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2601 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2602 remove_wait_queue(sk_sleep(sk), &wait);
2603 return rc;
2604}
2605EXPORT_SYMBOL(sk_wait_data);
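
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * the canonical blocking-receive loop around sk_wait_data().  The caller
 * must hold the socket lock; sk_wait_data() drops and retakes it while
 * sleeping.  example_wait_for_skb is an assumed name.
 */
static struct sk_buff *example_wait_for_skb(struct sock *sk, int flags, int *err)
{
	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (!timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			return NULL;
		}
		sk_wait_data(sk, &timeo, NULL);
	}
	return skb;
}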
2606
2607/**
2608 * __sk_mem_raise_allocated - increase memory_allocated
2609 * @sk: socket
2610 * @size: memory size to allocate
2611 * @amt: pages to allocate
2612 * @kind: allocation type
2613 *
2614 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2615 */
2616int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2617{
2618 struct proto *prot = sk->sk_prot;
2619 long allocated = sk_memory_allocated_add(sk, amt);
2620 bool charged = true;
2621
2622 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2623 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2624 goto suppress_allocation;
2625
2626 /* Under limit. */
2627 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2628 sk_leave_memory_pressure(sk);
2629 return 1;
2630 }
2631
2632 /* Under pressure. */
2633 if (allocated > sk_prot_mem_limits(sk, 1))
2634 sk_enter_memory_pressure(sk);
2635
2636 /* Over hard limit. */
2637 if (allocated > sk_prot_mem_limits(sk, 2))
2638 goto suppress_allocation;
2639
2640 /* guarantee minimum buffer size under pressure */
2641 if (kind == SK_MEM_RECV) {
2642 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2643 return 1;
2644
2645 } else { /* SK_MEM_SEND */
2646 int wmem0 = sk_get_wmem0(sk, prot);
2647
2648 if (sk->sk_type == SOCK_STREAM) {
2649 if (sk->sk_wmem_queued < wmem0)
2650 return 1;
2651 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2652 return 1;
2653 }
2654 }
2655
2656 if (sk_has_memory_pressure(sk)) {
2657 u64 alloc;
2658
2659 if (!sk_under_memory_pressure(sk))
2660 return 1;
2661 alloc = sk_sockets_allocated_read_positive(sk);
2662 if (sk_prot_mem_limits(sk, 2) > alloc *
2663 sk_mem_pages(sk->sk_wmem_queued +
2664 atomic_read(&sk->sk_rmem_alloc) +
2665 sk->sk_forward_alloc))
2666 return 1;
2667 }
2668
2669suppress_allocation:
2670
2671 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2672 sk_stream_moderate_sndbuf(sk);
2673
2674 /* Fail only if socket is _under_ its sndbuf.
2675 * In this case we cannot block, so we have to fail.
2676 */
2677 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2678 return 1;
2679 }
2680
2681 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2682 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2683
2684 sk_memory_allocated_sub(sk, amt);
2685
2686 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2687 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2688
2689 return 0;
2690}
2691EXPORT_SYMBOL(__sk_mem_raise_allocated);
2692
2693/**
2694 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2695 * @sk: socket
2696 * @size: memory size to allocate
2697 * @kind: allocation type
2698 *
2699 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2700 * rmem allocation. This function assumes that protocols which have
2701 * memory_pressure use sk_wmem_queued as write buffer accounting.
2702 */
2703int __sk_mem_schedule(struct sock *sk, int size, int kind)
2704{
2705 int ret, amt = sk_mem_pages(size);
2706
2707 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2708 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2709 if (!ret)
2710 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2711 return ret;
2712}
2713EXPORT_SYMBOL(__sk_mem_schedule);
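
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * protocols normally do not call __sk_mem_schedule() directly; they use
 * the sk_wmem_schedule()/sk_rmem_schedule() wrappers, which consume
 * sk_forward_alloc first and fall back to the slow path above only when
 * the per-socket quantum is exhausted.  example_charge_send is assumed.
 */
static bool example_charge_send(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_wmem_schedule(sk, skb->truesize))
		return false;			/* over the protocol memory limits */
	sk_mem_charge(sk, skb->truesize);	/* consume forward allocation */
	return true;
}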
2714
2715/**
2716 * __sk_mem_reduce_allocated - reclaim memory_allocated
2717 * @sk: socket
2718 * @amount: number of quanta
2719 *
2720 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2721 */
2722void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2723{
2724 sk_memory_allocated_sub(sk, amount);
2725
2726 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2727 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2728
2729 if (sk_under_memory_pressure(sk) &&
2730 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2731 sk_leave_memory_pressure(sk);
2732}
2733EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2734
2735/**
2736 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2737 * @sk: socket
2738 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2739 */
2740void __sk_mem_reclaim(struct sock *sk, int amount)
2741{
2742 amount >>= SK_MEM_QUANTUM_SHIFT;
2743 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2744 __sk_mem_reduce_allocated(sk, amount);
2745}
2746EXPORT_SYMBOL(__sk_mem_reclaim);
2747
2748int sk_set_peek_off(struct sock *sk, int val)
2749{
2750 sk->sk_peek_off = val;
2751 return 0;
2752}
2753EXPORT_SYMBOL_GPL(sk_set_peek_off);
2754
2755/*
2756 * Set of default routines for initialising struct proto_ops when
2757 * the protocol does not support a particular function. In certain
2758 * cases where it makes no sense for a protocol to have a "do nothing"
2759 * function, some default processing is provided.
2760 */
2761
2762int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2763{
2764 return -EOPNOTSUPP;
2765}
2766EXPORT_SYMBOL(sock_no_bind);
2767
2768int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2769 int len, int flags)
2770{
2771 return -EOPNOTSUPP;
2772}
2773EXPORT_SYMBOL(sock_no_connect);
2774
2775int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2776{
2777 return -EOPNOTSUPP;
2778}
2779EXPORT_SYMBOL(sock_no_socketpair);
2780
2781int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2782 bool kern)
2783{
2784 return -EOPNOTSUPP;
2785}
2786EXPORT_SYMBOL(sock_no_accept);
2787
2788int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2789 int peer)
2790{
2791 return -EOPNOTSUPP;
2792}
2793EXPORT_SYMBOL(sock_no_getname);
2794
2795int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2796{
2797 return -EOPNOTSUPP;
2798}
2799EXPORT_SYMBOL(sock_no_ioctl);
2800
2801int sock_no_listen(struct socket *sock, int backlog)
2802{
2803 return -EOPNOTSUPP;
2804}
2805EXPORT_SYMBOL(sock_no_listen);
2806
2807int sock_no_shutdown(struct socket *sock, int how)
2808{
2809 return -EOPNOTSUPP;
2810}
2811EXPORT_SYMBOL(sock_no_shutdown);
2812
2813int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2814{
2815 return -EOPNOTSUPP;
2816}
2817EXPORT_SYMBOL(sock_no_sendmsg);
2818
2819int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2820{
2821 return -EOPNOTSUPP;
2822}
2823EXPORT_SYMBOL(sock_no_sendmsg_locked);
2824
2825int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2826 int flags)
2827{
2828 return -EOPNOTSUPP;
2829}
2830EXPORT_SYMBOL(sock_no_recvmsg);
2831
2832int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2833{
2834 /* Mirror missing mmap method error code */
2835 return -ENODEV;
2836}
2837EXPORT_SYMBOL(sock_no_mmap);
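
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * a minimal proto_ops table for a datagram-only family that fills the
 * operations it does not implement with the sock_no_*() stubs above.
 * example_release/example_proto_ops are assumed names and PF_UNSPEC
 * stands in for a real address family.
 */
static int example_release(struct socket *sock)
{
	return 0;	/* a real family would detach and put its sock here */
}

static const struct proto_ops example_proto_ops = {
	.family		= PF_UNSPEC,
	.owner		= THIS_MODULE,
	.release	= example_release,
	.poll		= datagram_poll,
	/* .sendmsg/.recvmsg would point at the family's real handlers */
	.bind		= sock_no_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};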
2838
2839/*
2840 * When a file is received (via SCM_RIGHTS, etc), we must bump the
2841 * various sock-based usage counts.
2842 */
2843void __receive_sock(struct file *file)
2844{
2845 struct socket *sock;
2846
2847 sock = sock_from_file(file);
2848 if (sock) {
2849 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2850 sock_update_classid(&sock->sk->sk_cgrp_data);
2851 }
2852}
2853
2854ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2855{
2856 ssize_t res;
2857 struct msghdr msg = {.msg_flags = flags};
2858 struct kvec iov;
2859 char *kaddr = kmap(page);
2860 iov.iov_base = kaddr + offset;
2861 iov.iov_len = size;
2862 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2863 kunmap(page);
2864 return res;
2865}
2866EXPORT_SYMBOL(sock_no_sendpage);
2867
2868ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2869 int offset, size_t size, int flags)
2870{
2871 ssize_t res;
2872 struct msghdr msg = {.msg_flags = flags};
2873 struct kvec iov;
2874 char *kaddr = kmap(page);
2875
2876 iov.iov_base = kaddr + offset;
2877 iov.iov_len = size;
2878 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2879 kunmap(page);
2880 return res;
2881}
2882EXPORT_SYMBOL(sock_no_sendpage_locked);
2883
2884/*
2885 * Default Socket Callbacks
2886 */
2887
2888static void sock_def_wakeup(struct sock *sk)
2889{
2890 struct socket_wq *wq;
2891
2892 rcu_read_lock();
2893 wq = rcu_dereference(sk->sk_wq);
2894 if (skwq_has_sleeper(wq))
2895 wake_up_interruptible_all(&wq->wait);
2896 rcu_read_unlock();
2897}
2898
2899static void sock_def_error_report(struct sock *sk)
2900{
2901 struct socket_wq *wq;
2902
2903 rcu_read_lock();
2904 wq = rcu_dereference(sk->sk_wq);
2905 if (skwq_has_sleeper(wq))
2906 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2907 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2908 rcu_read_unlock();
2909}
2910
2911void sock_def_readable(struct sock *sk)
2912{
2913 struct socket_wq *wq;
2914
2915 rcu_read_lock();
2916 wq = rcu_dereference(sk->sk_wq);
2917 if (skwq_has_sleeper(wq))
2918 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2919 EPOLLRDNORM | EPOLLRDBAND);
2920 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2921 rcu_read_unlock();
2922}
2923
2924static void sock_def_write_space(struct sock *sk)
2925{
2926 struct socket_wq *wq;
2927
2928 rcu_read_lock();
2929
2930 /* Do not wake up a writer until he can make "significant"
2931 * progress. --DaveM
2932 */
2933 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
2934 wq = rcu_dereference(sk->sk_wq);
2935 if (skwq_has_sleeper(wq))
2936 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2937 EPOLLWRNORM | EPOLLWRBAND);
2938
2939 /* Should agree with poll, otherwise some programs break */
2940 if (sock_writeable(sk))
2941 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2942 }
2943
2944 rcu_read_unlock();
2945}
2946
2947static void sock_def_destruct(struct sock *sk)
2948{
2949}
2950
2951void sk_send_sigurg(struct sock *sk)
2952{
2953 if (sk->sk_socket && sk->sk_socket->file)
2954 if (send_sigurg(&sk->sk_socket->file->f_owner))
2955 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2956}
2957EXPORT_SYMBOL(sk_send_sigurg);
2958
2959void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2960 unsigned long expires)
2961{
2962 if (!mod_timer(timer, expires))
2963 sock_hold(sk);
2964}
2965EXPORT_SYMBOL(sk_reset_timer);
2966
2967void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2968{
2969 if (del_timer(timer))
2970 __sock_put(sk);
2971}
2972EXPORT_SYMBOL(sk_stop_timer);
2973
2974void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
2975{
2976 if (del_timer_sync(timer))
2977 __sock_put(sk);
2978}
2979EXPORT_SYMBOL(sk_stop_timer_sync);
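
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * the reference-counting contract of the timer helpers above.
 * sk_reset_timer() takes a sock reference only when it arms a previously
 * idle timer, and sk_stop_timer() drops one only when it deletes a
 * pending timer, so the handler releases its reference when it finishes.
 * example_timer_handler is an assumed name; sk_timer is the generic
 * timer embedded in struct sock.
 */
static void example_timer_handler(struct timer_list *t)
{
	struct sock *sk = from_timer(sk, t, sk_timer);

	/* ... protocol work, possibly re-arming via sk_reset_timer() ... */
	sock_put(sk);		/* pairs with sock_hold() in sk_reset_timer() */
}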
2980
2981void sock_init_data(struct socket *sock, struct sock *sk)
2982{
2983 sk_init_common(sk);
2984 sk->sk_send_head = NULL;
2985
2986 timer_setup(&sk->sk_timer, NULL, 0);
2987
2988 sk->sk_allocation = GFP_KERNEL;
2989 sk->sk_rcvbuf = sysctl_rmem_default;
2990 sk->sk_sndbuf = sysctl_wmem_default;
2991 sk->sk_state = TCP_CLOSE;
2992 sk_set_socket(sk, sock);
2993
2994 sock_set_flag(sk, SOCK_ZAPPED);
2995
2996 if (sock) {
2997 sk->sk_type = sock->type;
2998 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
2999 sock->sk = sk;
3000 sk->sk_uid = SOCK_INODE(sock)->i_uid;
3001 } else {
3002 RCU_INIT_POINTER(sk->sk_wq, NULL);
3003 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
3004 }
3005
3006 rwlock_init(&sk->sk_callback_lock);
3007 if (sk->sk_kern_sock)
3008 lockdep_set_class_and_name(
3009 &sk->sk_callback_lock,
3010 af_kern_callback_keys + sk->sk_family,
3011 af_family_kern_clock_key_strings[sk->sk_family]);
3012 else
3013 lockdep_set_class_and_name(
3014 &sk->sk_callback_lock,
3015 af_callback_keys + sk->sk_family,
3016 af_family_clock_key_strings[sk->sk_family]);
3017
3018 sk->sk_state_change = sock_def_wakeup;
3019 sk->sk_data_ready = sock_def_readable;
3020 sk->sk_write_space = sock_def_write_space;
3021 sk->sk_error_report = sock_def_error_report;
3022 sk->sk_destruct = sock_def_destruct;
3023
3024 sk->sk_frag.page = NULL;
3025 sk->sk_frag.offset = 0;
3026 sk->sk_peek_off = -1;
3027
3028 sk->sk_peer_pid = NULL;
3029 sk->sk_peer_cred = NULL;
3030 sk->sk_write_pending = 0;
3031 sk->sk_rcvlowat = 1;
3032 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3033 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3034
3035 sk->sk_stamp = SK_DEFAULT_STAMP;
3036#if BITS_PER_LONG==32
3037 seqlock_init(&sk->sk_stamp_seq);
3038#endif
3039 atomic_set(&sk->sk_zckey, 0);
3040
3041#ifdef CONFIG_NET_RX_BUSY_POLL
3042 sk->sk_napi_id = 0;
3043 sk->sk_ll_usec = sysctl_net_busy_read;
3044#endif
3045
3046 sk->sk_max_pacing_rate = ~0UL;
3047 sk->sk_pacing_rate = ~0UL;
3048 WRITE_ONCE(sk->sk_pacing_shift, 10);
3049 sk->sk_incoming_cpu = -1;
3050
3051 sk_rx_queue_clear(sk);
3052 /*
3053 * Before updating sk_refcnt, we must commit prior changes to memory
3054 * (Documentation/RCU/rculist_nulls.rst for details)
3055 */
3056 smp_wmb();
3057 refcount_set(&sk->sk_refcnt, 1);
3058 atomic_set(&sk->sk_drops, 0);
3059}
3060EXPORT_SYMBOL(sock_init_data);
3061
3062void lock_sock_nested(struct sock *sk, int subclass)
3063{
3064 might_sleep();
3065 spin_lock_bh(&sk->sk_lock.slock);
3066 if (sk->sk_lock.owned)
3067 __lock_sock(sk);
3068 sk->sk_lock.owned = 1;
3069 spin_unlock(&sk->sk_lock.slock);
3070 /*
3071 * The sk_lock has mutex_lock() semantics here:
3072 */
3073 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3074 local_bh_enable();
3075}
3076EXPORT_SYMBOL(lock_sock_nested);
3077
3078void release_sock(struct sock *sk)
3079{
3080 spin_lock_bh(&sk->sk_lock.slock);
3081 if (sk->sk_backlog.tail)
3082 __release_sock(sk);
3083
3084 /* Warning : release_cb() might need to release sk ownership,
3085 * ie call sock_release_ownership(sk) before us.
3086 */
3087 if (sk->sk_prot->release_cb)
3088 sk->sk_prot->release_cb(sk);
3089
3090 sock_release_ownership(sk);
3091 if (waitqueue_active(&sk->sk_lock.wq))
3092 wake_up(&sk->sk_lock.wq);
3093 spin_unlock_bh(&sk->sk_lock.slock);
3094}
3095EXPORT_SYMBOL(release_sock);
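
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * the usual process-context bracket.  lock_sock() takes ownership,
 * sleeping in __lock_sock() if needed, and release_sock() above replays
 * any backlog queued while the socket was owned.  example_set_priority
 * is an assumed name.
 */
static void example_set_priority(struct sock *sk, u32 val)
{
	lock_sock(sk);
	sk->sk_priority = val;	/* illustrative field update under the lock */
	release_sock(sk);
}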
3096
3097/**
3098 * lock_sock_fast - fast version of lock_sock
3099 * @sk: socket
3100 *
3101 * This version should be used for very small sections, where the process won't block.
3102 * It returns false if the fast path is taken:
3103 *
3104 * sk_lock.slock locked, owned = 0, BH disabled
3105 *
3106 * It returns true if the slow path is taken:
3107 *
3108 * sk_lock.slock unlocked, owned = 1, BH enabled
3109 */
3110bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3111{
3112 might_sleep();
3113 spin_lock_bh(&sk->sk_lock.slock);
3114
3115 if (!sk->sk_lock.owned)
3116 /*
3117 * Note : We must disable BH
3118 */
3119 return false;
3120
3121 __lock_sock(sk);
3122 sk->sk_lock.owned = 1;
3123 spin_unlock(&sk->sk_lock.slock);
3124 /*
3125 * The sk_lock has mutex_lock() semantics here:
3126 */
3127 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3128 __acquire(&sk->sk_lock.slock);
3129 local_bh_enable();
3130 return true;
3131}
3132EXPORT_SYMBOL(lock_sock_fast);
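
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * callers pair lock_sock_fast() with unlock_sock_fast(), handing back
 * the returned "slow" flag so the matching unlock path (spin_unlock_bh()
 * or release_sock()) is taken.  example_read_err is an assumed name.
 */
static int example_read_err(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int err = sk->sk_err;	/* a short, non-blocking access */

	unlock_sock_fast(sk, slow);
	return err;
}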
3133
3134int sock_gettstamp(struct socket *sock, void __user *userstamp,
3135 bool timeval, bool time32)
3136{
3137 struct sock *sk = sock->sk;
3138 struct timespec64 ts;
3139
3140 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3141 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3142 if (ts.tv_sec == -1)
3143 return -ENOENT;
3144 if (ts.tv_sec == 0) {
3145 ktime_t kt = ktime_get_real();
3146 sock_write_timestamp(sk, kt);
3147 ts = ktime_to_timespec64(kt);
3148 }
3149
3150 if (timeval)
3151 ts.tv_nsec /= 1000;
3152
3153#ifdef CONFIG_COMPAT_32BIT_TIME
3154 if (time32)
3155 return put_old_timespec32(&ts, userstamp);
3156#endif
3157#ifdef CONFIG_SPARC64
3158 /* beware of padding in sparc64 timeval */
3159 if (timeval && !in_compat_syscall()) {
3160 struct __kernel_old_timeval __user tv = {
3161 .tv_sec = ts.tv_sec,
3162 .tv_usec = ts.tv_nsec,
3163 };
3164 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3165 return -EFAULT;
3166 return 0;
3167 }
3168#endif
3169 return put_timespec64(&ts, userstamp);
3170}
3171EXPORT_SYMBOL(sock_gettstamp);
3172
3173void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3174{
3175 if (!sock_flag(sk, flag)) {
3176 unsigned long previous_flags = sk->sk_flags;
3177
3178 sock_set_flag(sk, flag);
3179 /*
3180 * we just set one of the two flags which require net
3181 * time stamping, but time stamping might have been on
3182 * already because of the other one
3183 */
3184 if (sock_needs_netstamp(sk) &&
3185 !(previous_flags & SK_FLAGS_TIMESTAMP))
3186 net_enable_timestamp();
3187 }
3188}
3189
3190int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3191 int level, int type)
3192{
3193 struct sock_exterr_skb *serr;
3194 struct sk_buff *skb;
3195 int copied, err;
3196
3197 err = -EAGAIN;
3198 skb = sock_dequeue_err_skb(sk);
3199 if (skb == NULL)
3200 goto out;
3201
3202 copied = skb->len;
3203 if (copied > len) {
3204 msg->msg_flags |= MSG_TRUNC;
3205 copied = len;
3206 }
3207 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3208 if (err)
3209 goto out_free_skb;
3210
3211 sock_recv_timestamp(msg, sk, skb);
3212
3213 serr = SKB_EXT_ERR(skb);
3214 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3215
3216 msg->msg_flags |= MSG_ERRQUEUE;
3217 err = copied;
3218
3219out_free_skb:
3220 kfree_skb(skb);
3221out:
3222 return err;
3223}
3224EXPORT_SYMBOL(sock_recv_errqueue);
3225
3226/*
3227 * Get a socket option on a socket.
3228 *
3229 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3230 * asynchronous errors should be reported by getsockopt. We assume
3231 * this means if you specify SO_ERROR (otherwise what's the point of it).
3232 */
3233int sock_common_getsockopt(struct socket *sock, int level, int optname,
3234 char __user *optval, int __user *optlen)
3235{
3236 struct sock *sk = sock->sk;
3237
3238 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3239}
3240EXPORT_SYMBOL(sock_common_getsockopt);
3241
3242int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3243 int flags)
3244{
3245 struct sock *sk = sock->sk;
3246 int addr_len = 0;
3247 int err;
3248
3249 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3250 flags & ~MSG_DONTWAIT, &addr_len);
3251 if (err >= 0)
3252 msg->msg_namelen = addr_len;
3253 return err;
3254}
3255EXPORT_SYMBOL(sock_common_recvmsg);
3256
3257/*
3258 * Set socket options on an inet socket.
3259 */
3260int sock_common_setsockopt(struct socket *sock, int level, int optname,
3261 sockptr_t optval, unsigned int optlen)
3262{
3263 struct sock *sk = sock->sk;
3264
3265 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3266}
3267EXPORT_SYMBOL(sock_common_setsockopt);
3268
3269void sk_common_release(struct sock *sk)
3270{
3271 if (sk->sk_prot->destroy)
3272 sk->sk_prot->destroy(sk);
3273
3274 /*
3275 * Observation: when sk_common_release is called, processes have
3276 * no access to socket. But net still has.
3277 * Step one, detach it from networking:
3278 *
3279 * A. Remove from hash tables.
3280 */
3281
3282 sk->sk_prot->unhash(sk);
3283
3284 /*
3285 * At this point the socket cannot receive new packets, but it is possible
3286 * that some packets are in flight because some CPU runs the receiver and
3287 * did a hash table lookup before we unhashed the socket. They will reach the
3288 * receive queue and will be purged by the socket destructor.
3289 *
3290 * Also, we still have packets pending on the receive queue and, probably,
3291 * our own packets waiting in device queues. sock_destroy will drain the
3292 * receive queue, but transmitted packets will delay socket destruction
3293 * until the last reference is released.
3294 */
3295
3296 sock_orphan(sk);
3297
3298 xfrm_sk_free_policy(sk);
3299
3300 sk_refcnt_debug_release(sk);
3301
3302 sock_put(sk);
3303}
3304EXPORT_SYMBOL(sk_common_release);
3305
3306void sk_get_meminfo(const struct sock *sk, u32 *mem)
3307{
3308 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3309
3310 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3311 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3312 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3313 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3314 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3315 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3316 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3317 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3318 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3319}
3320
3321#ifdef CONFIG_PROC_FS
3322#define PROTO_INUSE_NR 64 /* should be enough for the first time */
3323struct prot_inuse {
3324 int val[PROTO_INUSE_NR];
3325};
3326
3327static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3328
3329void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3330{
3331 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3332}
3333EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3334
3335int sock_prot_inuse_get(struct net *net, struct proto *prot)
3336{
3337 int cpu, idx = prot->inuse_idx;
3338 int res = 0;
3339
3340 for_each_possible_cpu(cpu)
3341 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3342
3343 return res >= 0 ? res : 0;
3344}
3345EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3346
3347static void sock_inuse_add(struct net *net, int val)
3348{
3349 this_cpu_add(*net->core.sock_inuse, val);
3350}
3351
3352int sock_inuse_get(struct net *net)
3353{
3354 int cpu, res = 0;
3355
3356 for_each_possible_cpu(cpu)
3357 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3358
3359 return res;
3360}
3361
3362EXPORT_SYMBOL_GPL(sock_inuse_get);
3363
3364static int __net_init sock_inuse_init_net(struct net *net)
3365{
3366 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3367 if (net->core.prot_inuse == NULL)
3368 return -ENOMEM;
3369
3370 net->core.sock_inuse = alloc_percpu(int);
3371 if (net->core.sock_inuse == NULL)
3372 goto out;
3373
3374 return 0;
3375
3376out:
3377 free_percpu(net->core.prot_inuse);
3378 return -ENOMEM;
3379}
3380
3381static void __net_exit sock_inuse_exit_net(struct net *net)
3382{
3383 free_percpu(net->core.prot_inuse);
3384 free_percpu(net->core.sock_inuse);
3385}
3386
3387static struct pernet_operations net_inuse_ops = {
3388 .init = sock_inuse_init_net,
3389 .exit = sock_inuse_exit_net,
3390};
3391
3392static __init int net_inuse_init(void)
3393{
3394 if (register_pernet_subsys(&net_inuse_ops))
3395 panic("Cannot initialize net inuse counters");
3396
3397 return 0;
3398}
3399
3400core_initcall(net_inuse_init);
3401
3402static int assign_proto_idx(struct proto *prot)
3403{
3404 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3405
3406 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3407 pr_err("PROTO_INUSE_NR exhausted\n");
3408 return -ENOSPC;
3409 }
3410
3411 set_bit(prot->inuse_idx, proto_inuse_idx);
3412 return 0;
3413}
3414
3415static void release_proto_idx(struct proto *prot)
3416{
3417 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3418 clear_bit(prot->inuse_idx, proto_inuse_idx);
3419}
3420#else
3421static inline int assign_proto_idx(struct proto *prot)
3422{
3423 return 0;
3424}
3425
3426static inline void release_proto_idx(struct proto *prot)
3427{
3428}
3429
3430static void sock_inuse_add(struct net *net, int val)
3431{
3432}
3433#endif
3434
3435static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3436{
3437 if (!twsk_prot)
3438 return;
3439 kfree(twsk_prot->twsk_slab_name);
3440 twsk_prot->twsk_slab_name = NULL;
3441 kmem_cache_destroy(twsk_prot->twsk_slab);
3442 twsk_prot->twsk_slab = NULL;
3443}
3444
3445static int tw_prot_init(const struct proto *prot)
3446{
3447 struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3448
3449 if (!twsk_prot)
3450 return 0;
3451
3452 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3453 prot->name);
3454 if (!twsk_prot->twsk_slab_name)
3455 return -ENOMEM;
3456
3457 twsk_prot->twsk_slab =
3458 kmem_cache_create(twsk_prot->twsk_slab_name,
3459 twsk_prot->twsk_obj_size, 0,
3460 SLAB_ACCOUNT | prot->slab_flags,
3461 NULL);
3462 if (!twsk_prot->twsk_slab) {
3463 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3464 prot->name);
3465 return -ENOMEM;
3466 }
3467
3468 return 0;
3469}
3470
3471static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3472{
3473 if (!rsk_prot)
3474 return;
3475 kfree(rsk_prot->slab_name);
3476 rsk_prot->slab_name = NULL;
3477 kmem_cache_destroy(rsk_prot->slab);
3478 rsk_prot->slab = NULL;
3479}
3480
3481static int req_prot_init(const struct proto *prot)
3482{
3483 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3484
3485 if (!rsk_prot)
3486 return 0;
3487
3488 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3489 prot->name);
3490 if (!rsk_prot->slab_name)
3491 return -ENOMEM;
3492
3493 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3494 rsk_prot->obj_size, 0,
3495 SLAB_ACCOUNT | prot->slab_flags,
3496 NULL);
3497
3498 if (!rsk_prot->slab) {
3499 pr_crit("%s: Can't create request sock SLAB cache!\n",
3500 prot->name);
3501 return -ENOMEM;
3502 }
3503 return 0;
3504}
3505
3506int proto_register(struct proto *prot, int alloc_slab)
3507{
3508 int ret = -ENOBUFS;
3509
3510 if (alloc_slab) {
3511 prot->slab = kmem_cache_create_usercopy(prot->name,
3512 prot->obj_size, 0,
3513 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3514 prot->slab_flags,
3515 prot->useroffset, prot->usersize,
3516 NULL);
3517
3518 if (prot->slab == NULL) {
3519 pr_crit("%s: Can't create sock SLAB cache!\n",
3520 prot->name);
3521 goto out;
3522 }
3523
3524 if (req_prot_init(prot))
3525 goto out_free_request_sock_slab;
3526
3527 if (tw_prot_init(prot))
3528 goto out_free_timewait_sock_slab;
3529 }
3530
3531 mutex_lock(&proto_list_mutex);
3532 ret = assign_proto_idx(prot);
3533 if (ret) {
3534 mutex_unlock(&proto_list_mutex);
3535 goto out_free_timewait_sock_slab;
3536 }
3537 list_add(&prot->node, &proto_list);
3538 mutex_unlock(&proto_list_mutex);
3539 return ret;
3540
3541out_free_timewait_sock_slab:
3542 if (alloc_slab)
3543 tw_prot_cleanup(prot->twsk_prot);
3544out_free_request_sock_slab:
3545 if (alloc_slab) {
3546 req_prot_cleanup(prot->rsk_prot);
3547
3548 kmem_cache_destroy(prot->slab);
3549 prot->slab = NULL;
3550 }
3551out:
3552 return ret;
3553}
3554EXPORT_SYMBOL(proto_register);
3555
3556void proto_unregister(struct proto *prot)
3557{
3558 mutex_lock(&proto_list_mutex);
3559 release_proto_idx(prot);
3560 list_del(&prot->node);
3561 mutex_unlock(&proto_list_mutex);
3562
3563 kmem_cache_destroy(prot->slab);
3564 prot->slab = NULL;
3565
3566 req_prot_cleanup(prot->rsk_prot);
3567 tw_prot_cleanup(prot->twsk_prot);
3568}
3569EXPORT_SYMBOL(proto_unregister);
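
/*
 * Example (editor's illustrative sketch, not part of the original file):
 * init/exit pairing for the registration API above, as a module would
 * wire it up with module_init()/module_exit().  example_prot is a
 * deliberately minimal, assumed struct proto; passing alloc_slab=1 asks
 * proto_register() to create the per-protocol slab cache of obj_size
 * bytes (plus request/timewait caches when provided).
 */
static struct proto example_prot = {
	.name		= "EXAMPLE",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int __init example_proto_module_init(void)
{
	return proto_register(&example_prot, 1);
}

static void __exit example_proto_module_exit(void)
{
	proto_unregister(&example_prot);
}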
3570
3571int sock_load_diag_module(int family, int protocol)
3572{
3573 if (!protocol) {
3574 if (!sock_is_registered(family))
3575 return -ENOENT;
3576
3577 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3578 NETLINK_SOCK_DIAG, family);
3579 }
3580
3581#ifdef CONFIG_INET
3582 if (family == AF_INET &&
3583 protocol != IPPROTO_RAW &&
3584 protocol < MAX_INET_PROTOS &&
3585 !rcu_access_pointer(inet_protos[protocol]))
3586 return -ENOENT;
3587#endif
3588
3589 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3590 NETLINK_SOCK_DIAG, family, protocol);
3591}
3592EXPORT_SYMBOL(sock_load_diag_module);
3593
3594#ifdef CONFIG_PROC_FS
3595static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3596 __acquires(proto_list_mutex)
3597{
3598 mutex_lock(&proto_list_mutex);
3599 return seq_list_start_head(&proto_list, *pos);
3600}
3601
3602static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3603{
3604 return seq_list_next(v, &proto_list, pos);
3605}
3606
3607static void proto_seq_stop(struct seq_file *seq, void *v)
3608 __releases(proto_list_mutex)
3609{
3610 mutex_unlock(&proto_list_mutex);
3611}
3612
3613static char proto_method_implemented(const void *method)
3614{
3615 return method == NULL ? 'n' : 'y';
3616}
3617static long sock_prot_memory_allocated(struct proto *proto)
3618{
3619 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3620}
3621
3622static const char *sock_prot_memory_pressure(struct proto *proto)
3623{
3624 return proto->memory_pressure != NULL ?
3625 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3626}
3627
3628static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3629{
3630
3631 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
3632 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3633 proto->name,
3634 proto->obj_size,
3635 sock_prot_inuse_get(seq_file_net(seq), proto),
3636 sock_prot_memory_allocated(proto),
3637 sock_prot_memory_pressure(proto),
3638 proto->max_header,
3639 proto->slab == NULL ? "no" : "yes",
3640 module_name(proto->owner),
3641 proto_method_implemented(proto->close),
3642 proto_method_implemented(proto->connect),
3643 proto_method_implemented(proto->disconnect),
3644 proto_method_implemented(proto->accept),
3645 proto_method_implemented(proto->ioctl),
3646 proto_method_implemented(proto->init),
3647 proto_method_implemented(proto->destroy),
3648 proto_method_implemented(proto->shutdown),
3649 proto_method_implemented(proto->setsockopt),
3650 proto_method_implemented(proto->getsockopt),
3651 proto_method_implemented(proto->sendmsg),
3652 proto_method_implemented(proto->recvmsg),
3653 proto_method_implemented(proto->sendpage),
3654 proto_method_implemented(proto->bind),
3655 proto_method_implemented(proto->backlog_rcv),
3656 proto_method_implemented(proto->hash),
3657 proto_method_implemented(proto->unhash),
3658 proto_method_implemented(proto->get_port),
3659 proto_method_implemented(proto->enter_memory_pressure));
3660}
3661
3662static int proto_seq_show(struct seq_file *seq, void *v)
3663{
3664 if (v == &proto_list)
3665 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3666 "protocol",
3667 "size",
3668 "sockets",
3669 "memory",
3670 "press",
3671 "maxhdr",
3672 "slab",
3673 "module",
3674 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3675 else
3676 proto_seq_printf(seq, list_entry(v, struct proto, node));
3677 return 0;
3678}
3679
3680static const struct seq_operations proto_seq_ops = {
3681 .start = proto_seq_start,
3682 .next = proto_seq_next,
3683 .stop = proto_seq_stop,
3684 .show = proto_seq_show,
3685};
3686
3687static __net_init int proto_init_net(struct net *net)
3688{
3689 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3690 sizeof(struct seq_net_private)))
3691 return -ENOMEM;
3692
3693 return 0;
3694}
3695
3696static __net_exit void proto_exit_net(struct net *net)
3697{
3698 remove_proc_entry("protocols", net->proc_net);
3699}
3700
3701
3702static __net_initdata struct pernet_operations proto_net_ops = {
3703 .init = proto_init_net,
3704 .exit = proto_exit_net,
3705};
3706
3707static int __init proto_init(void)
3708{
3709 return register_pernet_subsys(&proto_net_ops);
3710}
3711
3712subsys_initcall(proto_init);
3713
3714 #endif /* CONFIG_PROC_FS */
3715
3716#ifdef CONFIG_NET_RX_BUSY_POLL
3717bool sk_busy_loop_end(void *p, unsigned long start_time)
3718{
3719 struct sock *sk = p;
3720
3721 return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3722 sk_busy_loop_timeout(sk, start_time);
3723}
3724EXPORT_SYMBOL(sk_busy_loop_end);
3725#endif /* CONFIG_NET_RX_BUSY_POLL */
3726
3727int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3728{
3729 if (!sk->sk_prot->bind_add)
3730 return -EOPNOTSUPP;
3731 return sk->sk_prot->bind_add(sk, addr, addr_len);
3732}
3733EXPORT_SYMBOL(sock_bind_add);