1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88#include <asm/unaligned.h>
89#include <linux/capability.h>
90#include <linux/errno.h>
91#include <linux/errqueue.h>
92#include <linux/types.h>
93#include <linux/socket.h>
94#include <linux/in.h>
95#include <linux/kernel.h>
96#include <linux/module.h>
97#include <linux/proc_fs.h>
98#include <linux/seq_file.h>
99#include <linux/sched.h>
100#include <linux/sched/mm.h>
101#include <linux/timer.h>
102#include <linux/string.h>
103#include <linux/sockios.h>
104#include <linux/net.h>
105#include <linux/mm.h>
106#include <linux/slab.h>
107#include <linux/interrupt.h>
108#include <linux/poll.h>
109#include <linux/tcp.h>
110#include <linux/udp.h>
111#include <linux/init.h>
112#include <linux/highmem.h>
113#include <linux/user_namespace.h>
114#include <linux/static_key.h>
115#include <linux/memcontrol.h>
116#include <linux/prefetch.h>
117#include <linux/compat.h>
118#include <linux/mroute.h>
119#include <linux/mroute6.h>
120#include <linux/icmpv6.h>
121
122#include <linux/uaccess.h>
123
124#include <linux/netdevice.h>
125#include <net/protocol.h>
126#include <linux/skbuff.h>
127#include <net/net_namespace.h>
128#include <net/request_sock.h>
129#include <net/sock.h>
130#include <net/proto_memory.h>
131#include <linux/net_tstamp.h>
132#include <net/xfrm.h>
133#include <linux/ipsec.h>
134#include <net/cls_cgroup.h>
135#include <net/netprio_cgroup.h>
136#include <linux/sock_diag.h>
137
138#include <linux/filter.h>
139#include <net/sock_reuseport.h>
140#include <net/bpf_sk_storage.h>
141
142#include <trace/events/sock.h>
143
144#include <net/tcp.h>
145#include <net/busy_poll.h>
146#include <net/phonet/phonet.h>
147
148#include <linux/ethtool.h>
149
150#include "dev.h"
151
152static DEFINE_MUTEX(proto_list_mutex);
153static LIST_HEAD(proto_list);
154
155static void sock_def_write_space_wfree(struct sock *sk);
156static void sock_def_write_space(struct sock *sk);
157
158/**
159 * sk_ns_capable - General socket capability test
160 * @sk: Socket to use a capability on or through
161 * @user_ns: The user namespace of the capability to use
162 * @cap: The capability to use
163 *
164 * Test to see if the opener of the socket had the capability @cap in the
165 * user namespace @user_ns when the socket was created, and that the
166 * current process has it as well.
167 */
168bool sk_ns_capable(const struct sock *sk,
169 struct user_namespace *user_ns, int cap)
170{
171 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
172 ns_capable(user_ns, cap);
173}
174EXPORT_SYMBOL(sk_ns_capable);
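/*
 * Both halves of the test must pass: file_ns_capable() checks the
 * credentials recorded in sk->sk_socket->file->f_cred when the socket was
 * opened, while ns_capable() checks the task making the request now.  A
 * socket fd inherited from a privileged parent therefore does not by itself
 * grant the capability to an unprivileged caller.
 */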
175
176/**
177 * sk_capable - Socket global capability test
178 * @sk: Socket to use a capability on or through
179 * @cap: The global capability to use
180 *
181 * Test to see if the opener of the socket had the capability @cap in all
182 * user namespaces when the socket was created, and that the current
183 * process has it as well.
184 */
185bool sk_capable(const struct sock *sk, int cap)
186{
187 return sk_ns_capable(sk, &init_user_ns, cap);
188}
189EXPORT_SYMBOL(sk_capable);
190
191/**
192 * sk_net_capable - Network namespace socket capability test
193 * @sk: Socket to use a capability on or through
194 * @cap: The capability to use
195 *
196 * Test to see if the opener of the socket had the capability @cap over the
197 * network namespace the socket is a member of when the socket was created,
198 * and that the current process has it as well.
199 */
200bool sk_net_capable(const struct sock *sk, int cap)
201{
202 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
203}
204EXPORT_SYMBOL(sk_net_capable);
205
206/*
207 * Each address family might have different locking rules, so we have
208 * one slock key per address family and separate keys for internal and
209 * userspace sockets.
210 */
211static struct lock_class_key af_family_keys[AF_MAX];
212static struct lock_class_key af_family_kern_keys[AF_MAX];
213static struct lock_class_key af_family_slock_keys[AF_MAX];
214static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
215
216/*
217 * Make lock validator output more readable. (We pre-construct these
218 * strings at build time, so that runtime initialization of socket
219 * locks stays fast.)
220 */
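/*
 * The resulting lockdep class names look like "sk_lock-AF_INET" and
 * "slock-AF_INET" for user sockets, or "k-sk_lock-AF_INET" and
 * "k-slock-AF_INET" for kernel sockets, so a lockdep report identifies the
 * address family at a glance.
 */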
221
222#define _sock_locks(x) \
223 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
224 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
225 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
226 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
227 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
228 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
229 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
230 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
231 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
232 x "27" , x "28" , x "AF_CAN" , \
233 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
234 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
235 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
236 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
237 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
238 x "AF_MCTP" , \
239 x "AF_MAX"
240
241static const char *const af_family_key_strings[AF_MAX+1] = {
242 _sock_locks("sk_lock-")
243};
244static const char *const af_family_slock_key_strings[AF_MAX+1] = {
245 _sock_locks("slock-")
246};
247static const char *const af_family_clock_key_strings[AF_MAX+1] = {
248 _sock_locks("clock-")
249};
250
251static const char *const af_family_kern_key_strings[AF_MAX+1] = {
252 _sock_locks("k-sk_lock-")
253};
254static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
255 _sock_locks("k-slock-")
256};
257static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
258 _sock_locks("k-clock-")
259};
260static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
261 _sock_locks("rlock-")
262};
263static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
264 _sock_locks("wlock-")
265};
266static const char *const af_family_elock_key_strings[AF_MAX+1] = {
267 _sock_locks("elock-")
268};
269
270/*
271 * Locking rules for sk_callback_lock and the sk queues are per-address-family,
272 * so split the lock classes by using a per-AF key:
273 */
274static struct lock_class_key af_callback_keys[AF_MAX];
275static struct lock_class_key af_rlock_keys[AF_MAX];
276static struct lock_class_key af_wlock_keys[AF_MAX];
277static struct lock_class_key af_elock_keys[AF_MAX];
278static struct lock_class_key af_kern_callback_keys[AF_MAX];
279
280/* Run time adjustable parameters. */
281__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
282EXPORT_SYMBOL(sysctl_wmem_max);
283__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
284EXPORT_SYMBOL(sysctl_rmem_max);
285__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
286__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
287
288int sysctl_tstamp_allow_data __read_mostly = 1;
289
290DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
291EXPORT_SYMBOL_GPL(memalloc_socks_key);
292
293/**
294 * sk_set_memalloc - sets %SOCK_MEMALLOC
295 * @sk: socket to set it on
296 *
297 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
298 * It's the responsibility of the admin to adjust min_free_kbytes
299 * to meet the requirements.
300 */
301void sk_set_memalloc(struct sock *sk)
302{
303 sock_set_flag(sk, SOCK_MEMALLOC);
304 sk->sk_allocation |= __GFP_MEMALLOC;
305 static_branch_inc(&memalloc_socks_key);
306}
307EXPORT_SYMBOL_GPL(sk_set_memalloc);
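/*
 * Typically paired with sk_clear_memalloc() below.  Users are kernel
 * sockets that sit on the memory-reclaim path (for instance swap over a
 * network block device), which must keep making progress even while the
 * rest of the system is under memory pressure.
 */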
308
309void sk_clear_memalloc(struct sock *sk)
310{
311 sock_reset_flag(sk, SOCK_MEMALLOC);
312 sk->sk_allocation &= ~__GFP_MEMALLOC;
313 static_branch_dec(&memalloc_socks_key);
314
315 /*
316 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
317 * progress of swapping. SOCK_MEMALLOC may be cleared while
318 * it has rmem allocations due to the last swapfile being deactivated
319 * but there is a risk that the socket is unusable due to exceeding
320 * the rmem limits. Reclaim the reserves and obey rmem limits again.
321 */
322 sk_mem_reclaim(sk);
323}
324EXPORT_SYMBOL_GPL(sk_clear_memalloc);
325
326int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
327{
328 int ret;
329 unsigned int noreclaim_flag;
330
331 /* these should have been dropped before queueing */
332 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
333
334 noreclaim_flag = memalloc_noreclaim_save();
335 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
336 tcp_v6_do_rcv,
337 tcp_v4_do_rcv,
338 sk, skb);
339 memalloc_noreclaim_restore(noreclaim_flag);
340
341 return ret;
342}
343EXPORT_SYMBOL(__sk_backlog_rcv);
344
345void sk_error_report(struct sock *sk)
346{
347 sk->sk_error_report(sk);
348
349 switch (sk->sk_family) {
350 case AF_INET:
351 fallthrough;
352 case AF_INET6:
353 trace_inet_sk_error_report(sk);
354 break;
355 default:
356 break;
357 }
358}
359EXPORT_SYMBOL(sk_error_report);
360
361int sock_get_timeout(long timeo, void *optval, bool old_timeval)
362{
363 struct __kernel_sock_timeval tv;
364
365 if (timeo == MAX_SCHEDULE_TIMEOUT) {
366 tv.tv_sec = 0;
367 tv.tv_usec = 0;
368 } else {
369 tv.tv_sec = timeo / HZ;
370 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
371 }
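	/* Illustration: with HZ=1000 and timeo=2500 jiffies, this yields
	 * tv_sec=2 and tv_usec=500000.
	 */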
372
373 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
374 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
375 *(struct old_timeval32 *)optval = tv32;
376 return sizeof(tv32);
377 }
378
379 if (old_timeval) {
380 struct __kernel_old_timeval old_tv;
381 old_tv.tv_sec = tv.tv_sec;
382 old_tv.tv_usec = tv.tv_usec;
383 *(struct __kernel_old_timeval *)optval = old_tv;
384 return sizeof(old_tv);
385 }
386
387 *(struct __kernel_sock_timeval *)optval = tv;
388 return sizeof(tv);
389}
390EXPORT_SYMBOL(sock_get_timeout);
391
392int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
393 sockptr_t optval, int optlen, bool old_timeval)
394{
395 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
396 struct old_timeval32 tv32;
397
398 if (optlen < sizeof(tv32))
399 return -EINVAL;
400
401 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
402 return -EFAULT;
403 tv->tv_sec = tv32.tv_sec;
404 tv->tv_usec = tv32.tv_usec;
405 } else if (old_timeval) {
406 struct __kernel_old_timeval old_tv;
407
408 if (optlen < sizeof(old_tv))
409 return -EINVAL;
410 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
411 return -EFAULT;
412 tv->tv_sec = old_tv.tv_sec;
413 tv->tv_usec = old_tv.tv_usec;
414 } else {
415 if (optlen < sizeof(*tv))
416 return -EINVAL;
417 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
418 return -EFAULT;
419 }
420
421 return 0;
422}
423EXPORT_SYMBOL(sock_copy_user_timeval);
424
425static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
426 bool old_timeval)
427{
428 struct __kernel_sock_timeval tv;
429 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
430 long val;
431
432 if (err)
433 return err;
434
435 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
436 return -EDOM;
437
438 if (tv.tv_sec < 0) {
439 static int warned __read_mostly;
440
441 WRITE_ONCE(*timeo_p, 0);
442 if (warned < 10 && net_ratelimit()) {
443 warned++;
444 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
445 __func__, current->comm, task_pid_nr(current));
446 }
447 return 0;
448 }
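	/* Convert back to jiffies, rounding the microseconds up so that a
	 * non-zero timeout never truncates to zero (e.g. tv = {0, 1} with
	 * HZ=100 becomes one jiffy, not zero).
	 */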
449 val = MAX_SCHEDULE_TIMEOUT;
450 if ((tv.tv_sec || tv.tv_usec) &&
451 (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
452 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
453 USEC_PER_SEC / HZ);
454 WRITE_ONCE(*timeo_p, val);
455 return 0;
456}
457
458static bool sock_needs_netstamp(const struct sock *sk)
459{
460 switch (sk->sk_family) {
461 case AF_UNSPEC:
462 case AF_UNIX:
463 return false;
464 default:
465 return true;
466 }
467}
468
469static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
470{
471 if (sk->sk_flags & flags) {
472 sk->sk_flags &= ~flags;
473 if (sock_needs_netstamp(sk) &&
474 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
475 net_disable_timestamp();
476 }
477}
478
479
480int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
481{
482 unsigned long flags;
483 struct sk_buff_head *list = &sk->sk_receive_queue;
484
485 if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
486 atomic_inc(&sk->sk_drops);
487 trace_sock_rcvqueue_full(sk, skb);
488 return -ENOMEM;
489 }
490
491 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
492 atomic_inc(&sk->sk_drops);
493 return -ENOBUFS;
494 }
495
496 skb->dev = NULL;
497 skb_set_owner_r(skb, sk);
498
499 /* we escape from the RCU-protected region, make sure we don't leak
500 * a non-refcounted dst
501 */
502 skb_dst_force(skb);
503
504 spin_lock_irqsave(&list->lock, flags);
505 sock_skb_set_dropcount(sk, skb);
506 __skb_queue_tail(list, skb);
507 spin_unlock_irqrestore(&list->lock, flags);
508
509 if (!sock_flag(sk, SOCK_DEAD))
510 sk->sk_data_ready(sk);
511 return 0;
512}
513EXPORT_SYMBOL(__sock_queue_rcv_skb);
514
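/*
 * Like __sock_queue_rcv_skb(), but also reports why an skb was not queued:
 * SKB_DROP_REASON_SOCKET_FILTER when sk_filter() rejects it, and -ENOMEM /
 * -ENOBUFS from __sock_queue_rcv_skb() mapped to SOCKET_RCVBUFF (receive
 * buffer full) and PROTO_MEM (protocol memory accounting) respectively.
 */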
515int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
516 enum skb_drop_reason *reason)
517{
518 enum skb_drop_reason drop_reason;
519 int err;
520
521 err = sk_filter(sk, skb);
522 if (err) {
523 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
524 goto out;
525 }
526 err = __sock_queue_rcv_skb(sk, skb);
527 switch (err) {
528 case -ENOMEM:
529 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
530 break;
531 case -ENOBUFS:
532 drop_reason = SKB_DROP_REASON_PROTO_MEM;
533 break;
534 default:
535 drop_reason = SKB_NOT_DROPPED_YET;
536 break;
537 }
538out:
539 if (reason)
540 *reason = drop_reason;
541 return err;
542}
543EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
544
545int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
546 const int nested, unsigned int trim_cap, bool refcounted)
547{
548 int rc = NET_RX_SUCCESS;
549
550 if (sk_filter_trim_cap(sk, skb, trim_cap))
551 goto discard_and_relse;
552
553 skb->dev = NULL;
554
555 if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
556 atomic_inc(&sk->sk_drops);
557 goto discard_and_relse;
558 }
559 if (nested)
560 bh_lock_sock_nested(sk);
561 else
562 bh_lock_sock(sk);
563 if (!sock_owned_by_user(sk)) {
564 /*
565 * trylock + unlock semantics:
566 */
567 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
568
569 rc = sk_backlog_rcv(sk, skb);
570
571 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
572 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
573 bh_unlock_sock(sk);
574 atomic_inc(&sk->sk_drops);
575 goto discard_and_relse;
576 }
577
578 bh_unlock_sock(sk);
579out:
580 if (refcounted)
581 sock_put(sk);
582 return rc;
583discard_and_relse:
584 kfree_skb(skb);
585 goto out;
586}
587EXPORT_SYMBOL(__sk_receive_skb);
588
589INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
590 u32));
591INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
592 u32));
593struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
594{
595 struct dst_entry *dst = __sk_dst_get(sk);
596
597 if (dst && dst->obsolete &&
598 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
599 dst, cookie) == NULL) {
600 sk_tx_queue_clear(sk);
601 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
602 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
603 dst_release(dst);
604 return NULL;
605 }
606
607 return dst;
608}
609EXPORT_SYMBOL(__sk_dst_check);
610
611struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
612{
613 struct dst_entry *dst = sk_dst_get(sk);
614
615 if (dst && dst->obsolete &&
616 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
617 dst, cookie) == NULL) {
618 sk_dst_reset(sk);
619 dst_release(dst);
620 return NULL;
621 }
622
623 return dst;
624}
625EXPORT_SYMBOL(sk_dst_check);
626
627static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
628{
629 int ret = -ENOPROTOOPT;
630#ifdef CONFIG_NETDEVICES
631 struct net *net = sock_net(sk);
632
633 /* Sorry... */
634 ret = -EPERM;
635 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
636 goto out;
637
638 ret = -EINVAL;
639 if (ifindex < 0)
640 goto out;
641
642 /* Paired with all READ_ONCE() done locklessly. */
643 WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
644
645 if (sk->sk_prot->rehash)
646 sk->sk_prot->rehash(sk);
647 sk_dst_reset(sk);
648
649 ret = 0;
650
651out:
652#endif
653
654 return ret;
655}
656
657int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
658{
659 int ret;
660
661 if (lock_sk)
662 lock_sock(sk);
663 ret = sock_bindtoindex_locked(sk, ifindex);
664 if (lock_sk)
665 release_sock(sk);
666
667 return ret;
668}
669EXPORT_SYMBOL(sock_bindtoindex);
670
671static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
672{
673 int ret = -ENOPROTOOPT;
674#ifdef CONFIG_NETDEVICES
675 struct net *net = sock_net(sk);
676 char devname[IFNAMSIZ];
677 int index;
678
679 ret = -EINVAL;
680 if (optlen < 0)
681 goto out;
682
683 /* Bind this socket to a particular device like "eth0",
684 * as specified in the passed interface name. If the
685 * name is "" or the option length is zero the socket
686 * is not bound.
687 */
688 if (optlen > IFNAMSIZ - 1)
689 optlen = IFNAMSIZ - 1;
690 memset(devname, 0, sizeof(devname));
691
692 ret = -EFAULT;
693 if (copy_from_sockptr(devname, optval, optlen))
694 goto out;
695
696 index = 0;
697 if (devname[0] != '\0') {
698 struct net_device *dev;
699
700 rcu_read_lock();
701 dev = dev_get_by_name_rcu(net, devname);
702 if (dev)
703 index = dev->ifindex;
704 rcu_read_unlock();
705 ret = -ENODEV;
706 if (!dev)
707 goto out;
708 }
709
710 sockopt_lock_sock(sk);
711 ret = sock_bindtoindex_locked(sk, index);
712 sockopt_release_sock(sk);
713out:
714#endif
715
716 return ret;
717}
718
719static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
720 sockptr_t optlen, int len)
721{
722 int ret = -ENOPROTOOPT;
723#ifdef CONFIG_NETDEVICES
724 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
725 struct net *net = sock_net(sk);
726 char devname[IFNAMSIZ];
727
728 if (bound_dev_if == 0) {
729 len = 0;
730 goto zero;
731 }
732
733 ret = -EINVAL;
734 if (len < IFNAMSIZ)
735 goto out;
736
737 ret = netdev_get_name(net, devname, bound_dev_if);
738 if (ret)
739 goto out;
740
741 len = strlen(devname) + 1;
742
743 ret = -EFAULT;
744 if (copy_to_sockptr(optval, devname, len))
745 goto out;
746
747zero:
748 ret = -EFAULT;
749 if (copy_to_sockptr(optlen, &len, sizeof(int)))
750 goto out;
751
752 ret = 0;
753
754out:
755#endif
756
757 return ret;
758}
759
760bool sk_mc_loop(const struct sock *sk)
761{
762 if (dev_recursion_level())
763 return false;
764 if (!sk)
765 return true;
766 /* IPV6_ADDRFORM can change sk->sk_family under us. */
767 switch (READ_ONCE(sk->sk_family)) {
768 case AF_INET:
769 return inet_test_bit(MC_LOOP, sk);
770#if IS_ENABLED(CONFIG_IPV6)
771 case AF_INET6:
772 return inet6_test_bit(MC6_LOOP, sk);
773#endif
774 }
775 WARN_ON_ONCE(1);
776 return true;
777}
778EXPORT_SYMBOL(sk_mc_loop);
779
780void sock_set_reuseaddr(struct sock *sk)
781{
782 lock_sock(sk);
783 sk->sk_reuse = SK_CAN_REUSE;
784 release_sock(sk);
785}
786EXPORT_SYMBOL(sock_set_reuseaddr);
787
788void sock_set_reuseport(struct sock *sk)
789{
790 lock_sock(sk);
791 sk->sk_reuseport = true;
792 release_sock(sk);
793}
794EXPORT_SYMBOL(sock_set_reuseport);
795
796void sock_no_linger(struct sock *sk)
797{
798 lock_sock(sk);
799 WRITE_ONCE(sk->sk_lingertime, 0);
800 sock_set_flag(sk, SOCK_LINGER);
801 release_sock(sk);
802}
803EXPORT_SYMBOL(sock_no_linger);
804
805void sock_set_priority(struct sock *sk, u32 priority)
806{
807 WRITE_ONCE(sk->sk_priority, priority);
808}
809EXPORT_SYMBOL(sock_set_priority);
810
811void sock_set_sndtimeo(struct sock *sk, s64 secs)
812{
813 lock_sock(sk);
814 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
815 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
816 else
817 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
818 release_sock(sk);
819}
820EXPORT_SYMBOL(sock_set_sndtimeo);
821
822static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
823{
824 if (val) {
825 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
826 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
827 sock_set_flag(sk, SOCK_RCVTSTAMP);
828 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
829 } else {
830 sock_reset_flag(sk, SOCK_RCVTSTAMP);
831 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
832 }
833}
834
835void sock_enable_timestamps(struct sock *sk)
836{
837 lock_sock(sk);
838 __sock_set_timestamps(sk, true, false, true);
839 release_sock(sk);
840}
841EXPORT_SYMBOL(sock_enable_timestamps);
842
843void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
844{
845 switch (optname) {
846 case SO_TIMESTAMP_OLD:
847 __sock_set_timestamps(sk, valbool, false, false);
848 break;
849 case SO_TIMESTAMP_NEW:
850 __sock_set_timestamps(sk, valbool, true, false);
851 break;
852 case SO_TIMESTAMPNS_OLD:
853 __sock_set_timestamps(sk, valbool, false, true);
854 break;
855 case SO_TIMESTAMPNS_NEW:
856 __sock_set_timestamps(sk, valbool, true, true);
857 break;
858 }
859}
860
861static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
862{
863 struct net *net = sock_net(sk);
864 struct net_device *dev = NULL;
865 bool match = false;
866 int *vclock_index;
867 int i, num;
868
869 if (sk->sk_bound_dev_if)
870 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
871
872 if (!dev) {
873 pr_err("%s: socket is not bound to a device\n", __func__);
874 return -EOPNOTSUPP;
875 }
876
877 num = ethtool_get_phc_vclocks(dev, &vclock_index);
878 dev_put(dev);
879
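	/* Accept phc_index only if it matches one of the PHC virtual clocks
	 * exposed by the device this socket is bound to.
	 */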
880 for (i = 0; i < num; i++) {
881 if (*(vclock_index + i) == phc_index) {
882 match = true;
883 break;
884 }
885 }
886
887 if (num > 0)
888 kfree(vclock_index);
889
890 if (!match)
891 return -EINVAL;
892
893 WRITE_ONCE(sk->sk_bind_phc, phc_index);
894
895 return 0;
896}
897
898int sock_set_timestamping(struct sock *sk, int optname,
899 struct so_timestamping timestamping)
900{
901 int val = timestamping.flags;
902 int ret;
903
904 if (val & ~SOF_TIMESTAMPING_MASK)
905 return -EINVAL;
906
907 if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
908 !(val & SOF_TIMESTAMPING_OPT_ID))
909 return -EINVAL;
910
911 if (val & SOF_TIMESTAMPING_OPT_ID &&
912 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
913 if (sk_is_tcp(sk)) {
914 if ((1 << sk->sk_state) &
915 (TCPF_CLOSE | TCPF_LISTEN))
916 return -EINVAL;
917 if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
918 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
919 else
920 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
921 } else {
922 atomic_set(&sk->sk_tskey, 0);
923 }
924 }
925
926 if (val & SOF_TIMESTAMPING_OPT_STATS &&
927 !(val & SOF_TIMESTAMPING_OPT_TSONLY))
928 return -EINVAL;
929
930 if (val & SOF_TIMESTAMPING_BIND_PHC) {
931 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
932 if (ret)
933 return ret;
934 }
935
936 WRITE_ONCE(sk->sk_tsflags, val);
937 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
938
939 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
940 sock_enable_timestamp(sk,
941 SOCK_TIMESTAMPING_RX_SOFTWARE);
942 else
943 sock_disable_timestamp(sk,
944 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
945 return 0;
946}
947
948void sock_set_keepalive(struct sock *sk)
949{
950 lock_sock(sk);
951 if (sk->sk_prot->keepalive)
952 sk->sk_prot->keepalive(sk, true);
953 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
954 release_sock(sk);
955}
956EXPORT_SYMBOL(sock_set_keepalive);
957
958static void __sock_set_rcvbuf(struct sock *sk, int val)
959{
960 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
961 * as a negative value.
962 */
963 val = min_t(int, val, INT_MAX / 2);
964 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
965
966 /* We double it on the way in to account for "struct sk_buff" etc.
967 * overhead. Applications assume that the SO_RCVBUF setting they make
968 * will allow that much actual data to be received on that socket.
969 *
970 * Applications are unaware that "struct sk_buff" and other overheads
971 * allocate from the receive buffer during socket buffer allocation.
972 *
973 * And after considering the possible alternatives, returning the value
974 * we actually used in getsockopt is the most desirable behavior.
975 */
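	/* Illustration: setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &(int){ 65536 },
	 * sizeof(int)) stores 131072 in sk->sk_rcvbuf, and a subsequent
	 * getsockopt(SO_RCVBUF) reports 131072.
	 */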
976 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
977}
978
979void sock_set_rcvbuf(struct sock *sk, int val)
980{
981 lock_sock(sk);
982 __sock_set_rcvbuf(sk, val);
983 release_sock(sk);
984}
985EXPORT_SYMBOL(sock_set_rcvbuf);
986
987static void __sock_set_mark(struct sock *sk, u32 val)
988{
989 if (val != sk->sk_mark) {
990 WRITE_ONCE(sk->sk_mark, val);
991 sk_dst_reset(sk);
992 }
993}
994
995void sock_set_mark(struct sock *sk, u32 val)
996{
997 lock_sock(sk);
998 __sock_set_mark(sk, val);
999 release_sock(sk);
1000}
1001EXPORT_SYMBOL(sock_set_mark);
1002
1003static void sock_release_reserved_memory(struct sock *sk, int bytes)
1004{
1005 /* Round down bytes to multiple of pages */
1006 bytes = round_down(bytes, PAGE_SIZE);
1007
1008 WARN_ON(bytes > sk->sk_reserved_mem);
1009 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1010 sk_mem_reclaim(sk);
1011}
1012
1013static int sock_reserve_memory(struct sock *sk, int bytes)
1014{
1015 long allocated;
1016 bool charged;
1017 int pages;
1018
1019 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1020 return -EOPNOTSUPP;
1021
1022 if (!bytes)
1023 return 0;
1024
1025 pages = sk_mem_pages(bytes);
1026
1027 /* pre-charge to memcg */
1028 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1029 GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1030 if (!charged)
1031 return -ENOMEM;
1032
1033 /* pre-charge to forward_alloc */
1034 sk_memory_allocated_add(sk, pages);
1035 allocated = sk_memory_allocated(sk);
1036 /* If the system goes into memory pressure with this
1037 * precharge, give up and return error.
1038 */
1039 if (allocated > sk_prot_mem_limits(sk, 1)) {
1040 sk_memory_allocated_sub(sk, pages);
1041 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1042 return -ENOMEM;
1043 }
1044 sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1045
1046 WRITE_ONCE(sk->sk_reserved_mem,
1047 sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1048
1049 return 0;
1050}
1051
1052void sockopt_lock_sock(struct sock *sk)
1053{
1054 /* When current->bpf_ctx is set, setsockopt() is being called from
1055 * a BPF program, and BPF has already ensured that the sk lock is
1056 * held before calling setsockopt().
1057 */
1058 if (has_current_bpf_ctx())
1059 return;
1060
1061 lock_sock(sk);
1062}
1063EXPORT_SYMBOL(sockopt_lock_sock);
1064
1065void sockopt_release_sock(struct sock *sk)
1066{
1067 if (has_current_bpf_ctx())
1068 return;
1069
1070 release_sock(sk);
1071}
1072EXPORT_SYMBOL(sockopt_release_sock);
1073
1074bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1075{
1076 return has_current_bpf_ctx() || ns_capable(ns, cap);
1077}
1078EXPORT_SYMBOL(sockopt_ns_capable);
1079
1080bool sockopt_capable(int cap)
1081{
1082 return has_current_bpf_ctx() || capable(cap);
1083}
1084EXPORT_SYMBOL(sockopt_capable);
1085
1086static int sockopt_validate_clockid(__kernel_clockid_t value)
1087{
1088 switch (value) {
1089 case CLOCK_REALTIME:
1090 case CLOCK_MONOTONIC:
1091 case CLOCK_TAI:
1092 return 0;
1093 }
1094 return -EINVAL;
1095}
1096
1097/*
1098 * This is meant for all protocols to use and covers goings on
1099 * at the socket level. Everything here is generic.
1100 */
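/*
 * Reached from userspace via setsockopt(2) with level SOL_SOCKET, e.g.
 * (illustrative):
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
 *
 * In-kernel callers go through sock_setsockopt() below, which simply
 * forwards to sk_setsockopt().
 */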
1101
1102int sk_setsockopt(struct sock *sk, int level, int optname,
1103 sockptr_t optval, unsigned int optlen)
1104{
1105 struct so_timestamping timestamping;
1106 struct socket *sock = sk->sk_socket;
1107 struct sock_txtime sk_txtime;
1108 int val;
1109 int valbool;
1110 struct linger ling;
1111 int ret = 0;
1112
1113 /*
1114 * Options without arguments
1115 */
1116
1117 if (optname == SO_BINDTODEVICE)
1118 return sock_setbindtodevice(sk, optval, optlen);
1119
1120 if (optlen < sizeof(int))
1121 return -EINVAL;
1122
1123 if (copy_from_sockptr(&val, optval, sizeof(val)))
1124 return -EFAULT;
1125
1126 valbool = val ? 1 : 0;
1127
1128 /* handle options which do not require locking the socket. */
1129 switch (optname) {
1130 case SO_PRIORITY:
1131 if ((val >= 0 && val <= 6) ||
1132 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1133 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1134 sock_set_priority(sk, val);
1135 return 0;
1136 }
1137 return -EPERM;
1138 case SO_PASSSEC:
1139 assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1140 return 0;
1141 case SO_PASSCRED:
1142 assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1143 return 0;
1144 case SO_PASSPIDFD:
1145 assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1146 return 0;
1147 case SO_TYPE:
1148 case SO_PROTOCOL:
1149 case SO_DOMAIN:
1150 case SO_ERROR:
1151 return -ENOPROTOOPT;
1152#ifdef CONFIG_NET_RX_BUSY_POLL
1153 case SO_BUSY_POLL:
1154 if (val < 0)
1155 return -EINVAL;
1156 WRITE_ONCE(sk->sk_ll_usec, val);
1157 return 0;
1158 case SO_PREFER_BUSY_POLL:
1159 if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1160 return -EPERM;
1161 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1162 return 0;
1163 case SO_BUSY_POLL_BUDGET:
1164 if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1165 !sockopt_capable(CAP_NET_ADMIN))
1166 return -EPERM;
1167 if (val < 0 || val > U16_MAX)
1168 return -EINVAL;
1169 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1170 return 0;
1171#endif
1172 case SO_MAX_PACING_RATE:
1173 {
1174 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1175 unsigned long pacing_rate;
1176
1177 if (sizeof(ulval) != sizeof(val) &&
1178 optlen >= sizeof(ulval) &&
1179 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1180 return -EFAULT;
1181 }
1182 if (ulval != ~0UL)
1183 cmpxchg(&sk->sk_pacing_status,
1184 SK_PACING_NONE,
1185 SK_PACING_NEEDED);
1186 /* Pairs with READ_ONCE() from sk_getsockopt() */
1187 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1188 pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1189 if (ulval < pacing_rate)
1190 WRITE_ONCE(sk->sk_pacing_rate, ulval);
1191 return 0;
1192 }
1193 case SO_TXREHASH:
1194 if (val < -1 || val > 1)
1195 return -EINVAL;
1196 if ((u8)val == SOCK_TXREHASH_DEFAULT)
1197 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1198 /* Paired with READ_ONCE() in tcp_rtx_synack()
1199 * and sk_getsockopt().
1200 */
1201 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1202 return 0;
1203 case SO_PEEK_OFF:
1204 {
1205 int (*set_peek_off)(struct sock *sk, int val);
1206
1207 set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1208 if (set_peek_off)
1209 ret = set_peek_off(sk, val);
1210 else
1211 ret = -EOPNOTSUPP;
1212 return ret;
1213 }
1214 }
1215
1216 sockopt_lock_sock(sk);
1217
1218 switch (optname) {
1219 case SO_DEBUG:
1220 if (val && !sockopt_capable(CAP_NET_ADMIN))
1221 ret = -EACCES;
1222 else
1223 sock_valbool_flag(sk, SOCK_DBG, valbool);
1224 break;
1225 case SO_REUSEADDR:
1226 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1227 break;
1228 case SO_REUSEPORT:
1229 sk->sk_reuseport = valbool;
1230 break;
1231 case SO_DONTROUTE:
1232 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1233 sk_dst_reset(sk);
1234 break;
1235 case SO_BROADCAST:
1236 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1237 break;
1238 case SO_SNDBUF:
1239 /* Don't return an error here; BSD doesn't, and if you think
1240 * about it, this is right. Otherwise apps would have to
1241 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1242 * are treated in BSD as hints.
1243 */
1244 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1245set_sndbuf:
1246 /* Ensure val * 2 fits into an int, to prevent max_t()
1247 * from treating it as a negative value.
1248 */
1249 val = min_t(int, val, INT_MAX / 2);
1250 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1251 WRITE_ONCE(sk->sk_sndbuf,
1252 max_t(int, val * 2, SOCK_MIN_SNDBUF));
1253 /* Wake up sending tasks if we upped the value. */
1254 sk->sk_write_space(sk);
1255 break;
1256
1257 case SO_SNDBUFFORCE:
1258 if (!sockopt_capable(CAP_NET_ADMIN)) {
1259 ret = -EPERM;
1260 break;
1261 }
1262
1263 /* No negative values (to prevent underflow, as val will be
1264 * multiplied by 2).
1265 */
1266 if (val < 0)
1267 val = 0;
1268 goto set_sndbuf;
1269
1270 case SO_RCVBUF:
1271 /* Don't return an error here; BSD doesn't, and if you think
1272 * about it, this is right. Otherwise apps would have to
1273 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1274 * are treated in BSD as hints.
1275 */
1276 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1277 break;
1278
1279 case SO_RCVBUFFORCE:
1280 if (!sockopt_capable(CAP_NET_ADMIN)) {
1281 ret = -EPERM;
1282 break;
1283 }
1284
1285 /* No negative values (to prevent underflow, as val will be
1286 * multiplied by 2).
1287 */
1288 __sock_set_rcvbuf(sk, max(val, 0));
1289 break;
1290
1291 case SO_KEEPALIVE:
1292 if (sk->sk_prot->keepalive)
1293 sk->sk_prot->keepalive(sk, valbool);
1294 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1295 break;
1296
1297 case SO_OOBINLINE:
1298 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1299 break;
1300
1301 case SO_NO_CHECK:
1302 sk->sk_no_check_tx = valbool;
1303 break;
1304
1305 case SO_LINGER:
1306 if (optlen < sizeof(ling)) {
1307 ret = -EINVAL; /* 1003.1g */
1308 break;
1309 }
1310 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1311 ret = -EFAULT;
1312 break;
1313 }
1314 if (!ling.l_onoff) {
1315 sock_reset_flag(sk, SOCK_LINGER);
1316 } else {
1317 unsigned long t_sec = ling.l_linger;
1318
1319 if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1320 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1321 else
1322 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1323 sock_set_flag(sk, SOCK_LINGER);
1324 }
1325 break;
1326
1327 case SO_BSDCOMPAT:
1328 break;
1329
1330 case SO_TIMESTAMP_OLD:
1331 case SO_TIMESTAMP_NEW:
1332 case SO_TIMESTAMPNS_OLD:
1333 case SO_TIMESTAMPNS_NEW:
1334 sock_set_timestamp(sk, optname, valbool);
1335 break;
1336
1337 case SO_TIMESTAMPING_NEW:
1338 case SO_TIMESTAMPING_OLD:
1339 if (optlen == sizeof(timestamping)) {
1340 if (copy_from_sockptr(&timestamping, optval,
1341 sizeof(timestamping))) {
1342 ret = -EFAULT;
1343 break;
1344 }
1345 } else {
1346 memset(&timestamping, 0, sizeof(timestamping));
1347 timestamping.flags = val;
1348 }
1349 ret = sock_set_timestamping(sk, optname, timestamping);
1350 break;
1351
1352 case SO_RCVLOWAT:
1353 {
1354 int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1355
1356 if (val < 0)
1357 val = INT_MAX;
1358 if (sock)
1359 set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1360 if (set_rcvlowat)
1361 ret = set_rcvlowat(sk, val);
1362 else
1363 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1364 break;
1365 }
1366 case SO_RCVTIMEO_OLD:
1367 case SO_RCVTIMEO_NEW:
1368 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1369 optlen, optname == SO_RCVTIMEO_OLD);
1370 break;
1371
1372 case SO_SNDTIMEO_OLD:
1373 case SO_SNDTIMEO_NEW:
1374 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1375 optlen, optname == SO_SNDTIMEO_OLD);
1376 break;
1377
1378 case SO_ATTACH_FILTER: {
1379 struct sock_fprog fprog;
1380
1381 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1382 if (!ret)
1383 ret = sk_attach_filter(&fprog, sk);
1384 break;
1385 }
1386 case SO_ATTACH_BPF:
1387 ret = -EINVAL;
1388 if (optlen == sizeof(u32)) {
1389 u32 ufd;
1390
1391 ret = -EFAULT;
1392 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1393 break;
1394
1395 ret = sk_attach_bpf(ufd, sk);
1396 }
1397 break;
1398
1399 case SO_ATTACH_REUSEPORT_CBPF: {
1400 struct sock_fprog fprog;
1401
1402 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1403 if (!ret)
1404 ret = sk_reuseport_attach_filter(&fprog, sk);
1405 break;
1406 }
1407 case SO_ATTACH_REUSEPORT_EBPF:
1408 ret = -EINVAL;
1409 if (optlen == sizeof(u32)) {
1410 u32 ufd;
1411
1412 ret = -EFAULT;
1413 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1414 break;
1415
1416 ret = sk_reuseport_attach_bpf(ufd, sk);
1417 }
1418 break;
1419
1420 case SO_DETACH_REUSEPORT_BPF:
1421 ret = reuseport_detach_prog(sk);
1422 break;
1423
1424 case SO_DETACH_FILTER:
1425 ret = sk_detach_filter(sk);
1426 break;
1427
1428 case SO_LOCK_FILTER:
1429 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1430 ret = -EPERM;
1431 else
1432 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1433 break;
1434
1435 case SO_MARK:
1436 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1437 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1438 ret = -EPERM;
1439 break;
1440 }
1441
1442 __sock_set_mark(sk, val);
1443 break;
1444 case SO_RCVMARK:
1445 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1446 break;
1447
1448 case SO_RXQ_OVFL:
1449 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1450 break;
1451
1452 case SO_WIFI_STATUS:
1453 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1454 break;
1455
1456 case SO_NOFCS:
1457 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1458 break;
1459
1460 case SO_SELECT_ERR_QUEUE:
1461 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1462 break;
1463
1464
1465 case SO_INCOMING_CPU:
1466 reuseport_update_incoming_cpu(sk, val);
1467 break;
1468
1469 case SO_CNX_ADVICE:
1470 if (val == 1)
1471 dst_negative_advice(sk);
1472 break;
1473
1474 case SO_ZEROCOPY:
1475 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1476 if (!(sk_is_tcp(sk) ||
1477 (sk->sk_type == SOCK_DGRAM &&
1478 sk->sk_protocol == IPPROTO_UDP)))
1479 ret = -EOPNOTSUPP;
1480 } else if (sk->sk_family != PF_RDS) {
1481 ret = -EOPNOTSUPP;
1482 }
1483 if (!ret) {
1484 if (val < 0 || val > 1)
1485 ret = -EINVAL;
1486 else
1487 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1488 }
1489 break;
1490
1491 case SO_TXTIME:
1492 if (optlen != sizeof(struct sock_txtime)) {
1493 ret = -EINVAL;
1494 break;
1495 } else if (copy_from_sockptr(&sk_txtime, optval,
1496 sizeof(struct sock_txtime))) {
1497 ret = -EFAULT;
1498 break;
1499 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1500 ret = -EINVAL;
1501 break;
1502 }
1503 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1504 * scheduler has enough safeguards.
1505 */
1506 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1507 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1508 ret = -EPERM;
1509 break;
1510 }
1511
1512 ret = sockopt_validate_clockid(sk_txtime.clockid);
1513 if (ret)
1514 break;
1515
1516 sock_valbool_flag(sk, SOCK_TXTIME, true);
1517 sk->sk_clockid = sk_txtime.clockid;
1518 sk->sk_txtime_deadline_mode =
1519 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1520 sk->sk_txtime_report_errors =
1521 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1522 break;
1523
1524 case SO_BINDTOIFINDEX:
1525 ret = sock_bindtoindex_locked(sk, val);
1526 break;
1527
1528 case SO_BUF_LOCK:
1529 if (val & ~SOCK_BUF_LOCK_MASK) {
1530 ret = -EINVAL;
1531 break;
1532 }
1533 sk->sk_userlocks = val | (sk->sk_userlocks &
1534 ~SOCK_BUF_LOCK_MASK);
1535 break;
1536
1537 case SO_RESERVE_MEM:
1538 {
1539 int delta;
1540
1541 if (val < 0) {
1542 ret = -EINVAL;
1543 break;
1544 }
1545
1546 delta = val - sk->sk_reserved_mem;
1547 if (delta < 0)
1548 sock_release_reserved_memory(sk, -delta);
1549 else
1550 ret = sock_reserve_memory(sk, delta);
1551 break;
1552 }
1553
1554 default:
1555 ret = -ENOPROTOOPT;
1556 break;
1557 }
1558 sockopt_release_sock(sk);
1559 return ret;
1560}
1561
1562int sock_setsockopt(struct socket *sock, int level, int optname,
1563 sockptr_t optval, unsigned int optlen)
1564{
1565 return sk_setsockopt(sock->sk, level, optname,
1566 optval, optlen);
1567}
1568EXPORT_SYMBOL(sock_setsockopt);
1569
1570static const struct cred *sk_get_peer_cred(struct sock *sk)
1571{
1572 const struct cred *cred;
1573
1574 spin_lock(&sk->sk_peer_lock);
1575 cred = get_cred(sk->sk_peer_cred);
1576 spin_unlock(&sk->sk_peer_lock);
1577
1578 return cred;
1579}
1580
1581static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1582 struct ucred *ucred)
1583{
1584 ucred->pid = pid_vnr(pid);
1585 ucred->uid = ucred->gid = -1;
1586 if (cred) {
1587 struct user_namespace *current_ns = current_user_ns();
1588
1589 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1590 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1591 }
1592}
1593
1594static int groups_to_user(sockptr_t dst, const struct group_info *src)
1595{
1596 struct user_namespace *user_ns = current_user_ns();
1597 int i;
1598
1599 for (i = 0; i < src->ngroups; i++) {
1600 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1601
1602 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1603 return -EFAULT;
1604 }
1605
1606 return 0;
1607}
1608
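/*
 * Read side of the socket-level options, reached from userspace via
 * getsockopt(2) with level SOL_SOCKET.  The value pointed to by @optlen is
 * both an input (size of the user buffer) and an output (bytes written).
 */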
1609int sk_getsockopt(struct sock *sk, int level, int optname,
1610 sockptr_t optval, sockptr_t optlen)
1611{
1612 struct socket *sock = sk->sk_socket;
1613
1614 union {
1615 int val;
1616 u64 val64;
1617 unsigned long ulval;
1618 struct linger ling;
1619 struct old_timeval32 tm32;
1620 struct __kernel_old_timeval tm;
1621 struct __kernel_sock_timeval stm;
1622 struct sock_txtime txtime;
1623 struct so_timestamping timestamping;
1624 } v;
1625
1626 int lv = sizeof(int);
1627 int len;
1628
1629 if (copy_from_sockptr(&len, optlen, sizeof(int)))
1630 return -EFAULT;
1631 if (len < 0)
1632 return -EINVAL;
1633
1634 memset(&v, 0, sizeof(v));
1635
1636 switch (optname) {
1637 case SO_DEBUG:
1638 v.val = sock_flag(sk, SOCK_DBG);
1639 break;
1640
1641 case SO_DONTROUTE:
1642 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1643 break;
1644
1645 case SO_BROADCAST:
1646 v.val = sock_flag(sk, SOCK_BROADCAST);
1647 break;
1648
1649 case SO_SNDBUF:
1650 v.val = READ_ONCE(sk->sk_sndbuf);
1651 break;
1652
1653 case SO_RCVBUF:
1654 v.val = READ_ONCE(sk->sk_rcvbuf);
1655 break;
1656
1657 case SO_REUSEADDR:
1658 v.val = sk->sk_reuse;
1659 break;
1660
1661 case SO_REUSEPORT:
1662 v.val = sk->sk_reuseport;
1663 break;
1664
1665 case SO_KEEPALIVE:
1666 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1667 break;
1668
1669 case SO_TYPE:
1670 v.val = sk->sk_type;
1671 break;
1672
1673 case SO_PROTOCOL:
1674 v.val = sk->sk_protocol;
1675 break;
1676
1677 case SO_DOMAIN:
1678 v.val = sk->sk_family;
1679 break;
1680
1681 case SO_ERROR:
1682 v.val = -sock_error(sk);
1683 if (v.val == 0)
1684 v.val = xchg(&sk->sk_err_soft, 0);
1685 break;
1686
1687 case SO_OOBINLINE:
1688 v.val = sock_flag(sk, SOCK_URGINLINE);
1689 break;
1690
1691 case SO_NO_CHECK:
1692 v.val = sk->sk_no_check_tx;
1693 break;
1694
1695 case SO_PRIORITY:
1696 v.val = READ_ONCE(sk->sk_priority);
1697 break;
1698
1699 case SO_LINGER:
1700 lv = sizeof(v.ling);
1701 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1702 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
1703 break;
1704
1705 case SO_BSDCOMPAT:
1706 break;
1707
1708 case SO_TIMESTAMP_OLD:
1709 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1710 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1711 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1712 break;
1713
1714 case SO_TIMESTAMPNS_OLD:
1715 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1716 break;
1717
1718 case SO_TIMESTAMP_NEW:
1719 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1720 break;
1721
1722 case SO_TIMESTAMPNS_NEW:
1723 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1724 break;
1725
1726 case SO_TIMESTAMPING_OLD:
1727 case SO_TIMESTAMPING_NEW:
1728 lv = sizeof(v.timestamping);
1729 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1730 * returning the flags when they were set through the same option.
1731 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1732 */
1733 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1734 v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1735 v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1736 }
1737 break;
1738
1739 case SO_RCVTIMEO_OLD:
1740 case SO_RCVTIMEO_NEW:
1741 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1742 SO_RCVTIMEO_OLD == optname);
1743 break;
1744
1745 case SO_SNDTIMEO_OLD:
1746 case SO_SNDTIMEO_NEW:
1747 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1748 SO_SNDTIMEO_OLD == optname);
1749 break;
1750
1751 case SO_RCVLOWAT:
1752 v.val = READ_ONCE(sk->sk_rcvlowat);
1753 break;
1754
1755 case SO_SNDLOWAT:
1756 v.val = 1;
1757 break;
1758
1759 case SO_PASSCRED:
1760 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1761 break;
1762
1763 case SO_PASSPIDFD:
1764 v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1765 break;
1766
1767 case SO_PEERCRED:
1768 {
1769 struct ucred peercred;
1770 if (len > sizeof(peercred))
1771 len = sizeof(peercred);
1772
1773 spin_lock(&sk->sk_peer_lock);
1774 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1775 spin_unlock(&sk->sk_peer_lock);
1776
1777 if (copy_to_sockptr(optval, &peercred, len))
1778 return -EFAULT;
1779 goto lenout;
1780 }
1781
1782 case SO_PEERPIDFD:
1783 {
1784 struct pid *peer_pid;
1785 struct file *pidfd_file = NULL;
1786 int pidfd;
1787
1788 if (len > sizeof(pidfd))
1789 len = sizeof(pidfd);
1790
1791 spin_lock(&sk->sk_peer_lock);
1792 peer_pid = get_pid(sk->sk_peer_pid);
1793 spin_unlock(&sk->sk_peer_lock);
1794
1795 if (!peer_pid)
1796 return -ENODATA;
1797
1798 pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1799 put_pid(peer_pid);
1800 if (pidfd < 0)
1801 return pidfd;
1802
1803 if (copy_to_sockptr(optval, &pidfd, len) ||
1804 copy_to_sockptr(optlen, &len, sizeof(int))) {
1805 put_unused_fd(pidfd);
1806 fput(pidfd_file);
1807
1808 return -EFAULT;
1809 }
1810
1811 fd_install(pidfd, pidfd_file);
1812 return 0;
1813 }
1814
1815 case SO_PEERGROUPS:
1816 {
1817 const struct cred *cred;
1818 int ret, n;
1819
1820 cred = sk_get_peer_cred(sk);
1821 if (!cred)
1822 return -ENODATA;
1823
1824 n = cred->group_info->ngroups;
1825 if (len < n * sizeof(gid_t)) {
1826 len = n * sizeof(gid_t);
1827 put_cred(cred);
1828 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1829 }
1830 len = n * sizeof(gid_t);
1831
1832 ret = groups_to_user(optval, cred->group_info);
1833 put_cred(cred);
1834 if (ret)
1835 return ret;
1836 goto lenout;
1837 }
1838
1839 case SO_PEERNAME:
1840 {
1841 struct sockaddr_storage address;
1842
1843 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1844 if (lv < 0)
1845 return -ENOTCONN;
1846 if (lv < len)
1847 return -EINVAL;
1848 if (copy_to_sockptr(optval, &address, len))
1849 return -EFAULT;
1850 goto lenout;
1851 }
1852
1853 /* Dubious BSD thing... Probably nobody even uses it, but
1854 * the UNIX standard wants it for whatever reason... -DaveM
1855 */
1856 case SO_ACCEPTCONN:
1857 v.val = sk->sk_state == TCP_LISTEN;
1858 break;
1859
1860 case SO_PASSSEC:
1861 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1862 break;
1863
1864 case SO_PEERSEC:
1865 return security_socket_getpeersec_stream(sock,
1866 optval, optlen, len);
1867
1868 case SO_MARK:
1869 v.val = READ_ONCE(sk->sk_mark);
1870 break;
1871
1872 case SO_RCVMARK:
1873 v.val = sock_flag(sk, SOCK_RCVMARK);
1874 break;
1875
1876 case SO_RXQ_OVFL:
1877 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1878 break;
1879
1880 case SO_WIFI_STATUS:
1881 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1882 break;
1883
1884 case SO_PEEK_OFF:
1885 if (!READ_ONCE(sock->ops)->set_peek_off)
1886 return -EOPNOTSUPP;
1887
1888 v.val = READ_ONCE(sk->sk_peek_off);
1889 break;
1890 case SO_NOFCS:
1891 v.val = sock_flag(sk, SOCK_NOFCS);
1892 break;
1893
1894 case SO_BINDTODEVICE:
1895 return sock_getbindtodevice(sk, optval, optlen, len);
1896
1897 case SO_GET_FILTER:
1898 len = sk_get_filter(sk, optval, len);
1899 if (len < 0)
1900 return len;
1901
1902 goto lenout;
1903
1904 case SO_LOCK_FILTER:
1905 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1906 break;
1907
1908 case SO_BPF_EXTENSIONS:
1909 v.val = bpf_tell_extensions();
1910 break;
1911
1912 case SO_SELECT_ERR_QUEUE:
1913 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1914 break;
1915
1916#ifdef CONFIG_NET_RX_BUSY_POLL
1917 case SO_BUSY_POLL:
1918 v.val = READ_ONCE(sk->sk_ll_usec);
1919 break;
1920 case SO_PREFER_BUSY_POLL:
1921 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1922 break;
1923#endif
1924
1925 case SO_MAX_PACING_RATE:
1926 /* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
1927 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1928 lv = sizeof(v.ulval);
1929 v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1930 } else {
1931 /* 32bit version */
1932 v.val = min_t(unsigned long, ~0U,
1933 READ_ONCE(sk->sk_max_pacing_rate));
1934 }
1935 break;
1936
1937 case SO_INCOMING_CPU:
1938 v.val = READ_ONCE(sk->sk_incoming_cpu);
1939 break;
1940
1941 case SO_MEMINFO:
1942 {
1943 u32 meminfo[SK_MEMINFO_VARS];
1944
1945 sk_get_meminfo(sk, meminfo);
1946
1947 len = min_t(unsigned int, len, sizeof(meminfo));
1948 if (copy_to_sockptr(optval, &meminfo, len))
1949 return -EFAULT;
1950
1951 goto lenout;
1952 }
1953
1954#ifdef CONFIG_NET_RX_BUSY_POLL
1955 case SO_INCOMING_NAPI_ID:
1956 v.val = READ_ONCE(sk->sk_napi_id);
1957
1958 /* aggregate non-NAPI IDs down to 0 */
1959 if (v.val < MIN_NAPI_ID)
1960 v.val = 0;
1961
1962 break;
1963#endif
1964
1965 case SO_COOKIE:
1966 lv = sizeof(u64);
1967 if (len < lv)
1968 return -EINVAL;
1969 v.val64 = sock_gen_cookie(sk);
1970 break;
1971
1972 case SO_ZEROCOPY:
1973 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1974 break;
1975
1976 case SO_TXTIME:
1977 lv = sizeof(v.txtime);
1978 v.txtime.clockid = sk->sk_clockid;
1979 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1980 SOF_TXTIME_DEADLINE_MODE : 0;
1981 v.txtime.flags |= sk->sk_txtime_report_errors ?
1982 SOF_TXTIME_REPORT_ERRORS : 0;
1983 break;
1984
1985 case SO_BINDTOIFINDEX:
1986 v.val = READ_ONCE(sk->sk_bound_dev_if);
1987 break;
1988
1989 case SO_NETNS_COOKIE:
1990 lv = sizeof(u64);
1991 if (len != lv)
1992 return -EINVAL;
1993 v.val64 = sock_net(sk)->net_cookie;
1994 break;
1995
1996 case SO_BUF_LOCK:
1997 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1998 break;
1999
2000 case SO_RESERVE_MEM:
2001 v.val = READ_ONCE(sk->sk_reserved_mem);
2002 break;
2003
2004 case SO_TXREHASH:
2005 /* Paired with WRITE_ONCE() in sk_setsockopt() */
2006 v.val = READ_ONCE(sk->sk_txrehash);
2007 break;
2008
2009 default:
2010 /* We implement the SO_SNDLOWAT etc to not be settable
2011 * (1003.1g 7).
2012 */
2013 return -ENOPROTOOPT;
2014 }
2015
2016 if (len > lv)
2017 len = lv;
2018 if (copy_to_sockptr(optval, &v, len))
2019 return -EFAULT;
2020lenout:
2021 if (copy_to_sockptr(optlen, &len, sizeof(int)))
2022 return -EFAULT;
2023 return 0;
2024}
2025
2026/*
2027 * Initialize an sk_lock.
2028 *
2029 * (We also register the sk_lock with the lock validator.)
2030 */
2031static inline void sock_lock_init(struct sock *sk)
2032{
2033 if (sk->sk_kern_sock)
2034 sock_lock_init_class_and_name(
2035 sk,
2036 af_family_kern_slock_key_strings[sk->sk_family],
2037 af_family_kern_slock_keys + sk->sk_family,
2038 af_family_kern_key_strings[sk->sk_family],
2039 af_family_kern_keys + sk->sk_family);
2040 else
2041 sock_lock_init_class_and_name(
2042 sk,
2043 af_family_slock_key_strings[sk->sk_family],
2044 af_family_slock_keys + sk->sk_family,
2045 af_family_key_strings[sk->sk_family],
2046 af_family_keys + sk->sk_family);
2047}
2048
2049/*
2050 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2051 * even temporarily, because of RCU lookups. sk_node should also be left as is.
2052 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
2053 */
2054static void sock_copy(struct sock *nsk, const struct sock *osk)
2055{
2056 const struct proto *prot = READ_ONCE(osk->sk_prot);
2057#ifdef CONFIG_SECURITY_NETWORK
2058 void *sptr = nsk->sk_security;
2059#endif
2060
2061 /* If we move sk_tx_queue_mapping out of the private section,
2062 * we must check if sk_tx_queue_clear() is called after
2063 * sock_copy() in sk_clone_lock().
2064 */
2065 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2066 offsetof(struct sock, sk_dontcopy_begin) ||
2067 offsetof(struct sock, sk_tx_queue_mapping) >=
2068 offsetof(struct sock, sk_dontcopy_end));
2069
2070 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2071
2072 unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2073 prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2074 /* alloc is larger than struct, see sk_prot_alloc() */);
2075
2076#ifdef CONFIG_SECURITY_NETWORK
2077 nsk->sk_security = sptr;
2078 security_sk_clone(osk, nsk);
2079#endif
2080}
2081
2082static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2083 int family)
2084{
2085 struct sock *sk;
2086 struct kmem_cache *slab;
2087
2088 slab = prot->slab;
2089 if (slab != NULL) {
2090 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2091 if (!sk)
2092 return sk;
2093 if (want_init_on_alloc(priority))
2094 sk_prot_clear_nulls(sk, prot->obj_size);
2095 } else
2096 sk = kmalloc(prot->obj_size, priority);
2097
2098 if (sk != NULL) {
2099 if (security_sk_alloc(sk, family, priority))
2100 goto out_free;
2101
2102 if (!try_module_get(prot->owner))
2103 goto out_free_sec;
2104 }
2105
2106 return sk;
2107
2108out_free_sec:
2109 security_sk_free(sk);
2110out_free:
2111 if (slab != NULL)
2112 kmem_cache_free(slab, sk);
2113 else
2114 kfree(sk);
2115 return NULL;
2116}
2117
2118static void sk_prot_free(struct proto *prot, struct sock *sk)
2119{
2120 struct kmem_cache *slab;
2121 struct module *owner;
2122
2123 owner = prot->owner;
2124 slab = prot->slab;
2125
2126 cgroup_sk_free(&sk->sk_cgrp_data);
2127 mem_cgroup_sk_free(sk);
2128 security_sk_free(sk);
2129 if (slab != NULL)
2130 kmem_cache_free(slab, sk);
2131 else
2132 kfree(sk);
2133 module_put(owner);
2134}
2135
2136/**
2137 * sk_alloc - All socket objects are allocated here
2138 * @net: the applicable net namespace
2139 * @family: protocol family
2140 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2141 * @prot: struct proto associated with this new sock instance
2142 * @kern: is this to be a kernel socket?
2143 */
2144struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2145 struct proto *prot, int kern)
2146{
2147 struct sock *sk;
2148
2149 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2150 if (sk) {
2151 sk->sk_family = family;
2152 /*
2153 * See comment in struct sock definition to understand
2154 * why we need sk_prot_creator -acme
2155 */
2156 sk->sk_prot = sk->sk_prot_creator = prot;
2157 sk->sk_kern_sock = kern;
2158 sock_lock_init(sk);
2159 sk->sk_net_refcnt = kern ? 0 : 1;
2160 if (likely(sk->sk_net_refcnt)) {
2161 get_net_track(net, &sk->ns_tracker, priority);
2162 sock_inuse_add(net, 1);
2163 } else {
2164 __netns_tracker_alloc(net, &sk->ns_tracker,
2165 false, priority);
2166 }
2167
2168 sock_net_set(sk, net);
2169 refcount_set(&sk->sk_wmem_alloc, 1);
2170
2171 mem_cgroup_sk_alloc(sk);
2172 cgroup_sk_alloc(&sk->sk_cgrp_data);
2173 sock_update_classid(&sk->sk_cgrp_data);
2174 sock_update_netprioidx(&sk->sk_cgrp_data);
2175 sk_tx_queue_clear(sk);
2176 }
2177
2178 return sk;
2179}
2180EXPORT_SYMBOL(sk_alloc);
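
/*
 * Illustrative sketch (not from the upstream file): the usual pairing of
 * sk_alloc() and sock_init_data() in a protocol ->create() handler, in the
 * style of inet_create().  "example_prot" and the function name are
 * hypothetical placeholders supplied for illustration only.
 */
static struct sock *__maybe_unused example_sk_create(struct net *net,
						     struct socket *sock,
						     struct proto *example_prot,
						     int family, int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, family, GFP_KERNEL, example_prot, kern);
	if (!sk)
		return NULL;

	/* attach @sk to @sock and install the generic defaults and callbacks */
	sock_init_data(sock, sk);
	return sk;
}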
2181
2182/* Sockets having SOCK_RCU_FREE will call this function after one RCU
2183 * grace period. This is the case for UDP sockets and TCP listeners.
2184 */
2185static void __sk_destruct(struct rcu_head *head)
2186{
2187 struct sock *sk = container_of(head, struct sock, sk_rcu);
2188 struct sk_filter *filter;
2189
2190 if (sk->sk_destruct)
2191 sk->sk_destruct(sk);
2192
2193 filter = rcu_dereference_check(sk->sk_filter,
2194 refcount_read(&sk->sk_wmem_alloc) == 0);
2195 if (filter) {
2196 sk_filter_uncharge(sk, filter);
2197 RCU_INIT_POINTER(sk->sk_filter, NULL);
2198 }
2199
2200 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2201
2202#ifdef CONFIG_BPF_SYSCALL
2203 bpf_sk_storage_free(sk);
2204#endif
2205
2206 if (atomic_read(&sk->sk_omem_alloc))
2207 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2208 __func__, atomic_read(&sk->sk_omem_alloc));
2209
2210 if (sk->sk_frag.page) {
2211 put_page(sk->sk_frag.page);
2212 sk->sk_frag.page = NULL;
2213 }
2214
2215 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2216 put_cred(sk->sk_peer_cred);
2217 put_pid(sk->sk_peer_pid);
2218
2219 if (likely(sk->sk_net_refcnt))
2220 put_net_track(sock_net(sk), &sk->ns_tracker);
2221 else
2222 __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2223
2224 sk_prot_free(sk->sk_prot_creator, sk);
2225}
2226
2227void sk_destruct(struct sock *sk)
2228{
2229 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2230
2231 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2232 reuseport_detach_sock(sk);
2233 use_call_rcu = true;
2234 }
2235
2236 if (use_call_rcu)
2237 call_rcu(&sk->sk_rcu, __sk_destruct);
2238 else
2239 __sk_destruct(&sk->sk_rcu);
2240}
2241
2242static void __sk_free(struct sock *sk)
2243{
2244 if (likely(sk->sk_net_refcnt))
2245 sock_inuse_add(sock_net(sk), -1);
2246
2247 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2248 sock_diag_broadcast_destroy(sk);
2249 else
2250 sk_destruct(sk);
2251}
2252
2253void sk_free(struct sock *sk)
2254{
2255 /*
2256	 * We subtract one from sk_wmem_alloc so we can tell whether
2257	 * some packets are still in some tx queue.
2258	 * If the count is not yet zero, sock_wfree() will call __sk_free(sk) later
2259 */
2260 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2261 __sk_free(sk);
2262}
2263EXPORT_SYMBOL(sk_free);
2264
2265static void sk_init_common(struct sock *sk)
2266{
2267 skb_queue_head_init(&sk->sk_receive_queue);
2268 skb_queue_head_init(&sk->sk_write_queue);
2269 skb_queue_head_init(&sk->sk_error_queue);
2270
2271 rwlock_init(&sk->sk_callback_lock);
2272 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2273 af_rlock_keys + sk->sk_family,
2274 af_family_rlock_key_strings[sk->sk_family]);
2275 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2276 af_wlock_keys + sk->sk_family,
2277 af_family_wlock_key_strings[sk->sk_family]);
2278 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2279 af_elock_keys + sk->sk_family,
2280 af_family_elock_key_strings[sk->sk_family]);
2281 if (sk->sk_kern_sock)
2282 lockdep_set_class_and_name(&sk->sk_callback_lock,
2283 af_kern_callback_keys + sk->sk_family,
2284 af_family_kern_clock_key_strings[sk->sk_family]);
2285 else
2286 lockdep_set_class_and_name(&sk->sk_callback_lock,
2287 af_callback_keys + sk->sk_family,
2288 af_family_clock_key_strings[sk->sk_family]);
2289}
2290
2291/**
2292 * sk_clone_lock - clone a socket, and lock its clone
2293 * @sk: the socket to clone
2294 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2295 *
2296 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2297 */
2298struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2299{
2300 struct proto *prot = READ_ONCE(sk->sk_prot);
2301 struct sk_filter *filter;
2302 bool is_charged = true;
2303 struct sock *newsk;
2304
2305 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2306 if (!newsk)
2307 goto out;
2308
2309 sock_copy(newsk, sk);
2310
2311 newsk->sk_prot_creator = prot;
2312
2313 /* SANITY */
2314 if (likely(newsk->sk_net_refcnt)) {
2315 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2316 sock_inuse_add(sock_net(newsk), 1);
2317 } else {
2318		/* Kernel sockets do not elevate the struct net refcount.
2319 * Instead, use a tracker to more easily detect if a layer
2320 * is not properly dismantling its kernel sockets at netns
2321 * destroy time.
2322 */
2323 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2324 false, priority);
2325 }
2326 sk_node_init(&newsk->sk_node);
2327 sock_lock_init(newsk);
2328 bh_lock_sock(newsk);
2329 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
2330 newsk->sk_backlog.len = 0;
2331
2332 atomic_set(&newsk->sk_rmem_alloc, 0);
2333
2334 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2335 refcount_set(&newsk->sk_wmem_alloc, 1);
2336
2337 atomic_set(&newsk->sk_omem_alloc, 0);
2338 sk_init_common(newsk);
2339
2340 newsk->sk_dst_cache = NULL;
2341 newsk->sk_dst_pending_confirm = 0;
2342 newsk->sk_wmem_queued = 0;
2343 newsk->sk_forward_alloc = 0;
2344 newsk->sk_reserved_mem = 0;
2345 atomic_set(&newsk->sk_drops, 0);
2346 newsk->sk_send_head = NULL;
2347 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2348 atomic_set(&newsk->sk_zckey, 0);
2349
2350 sock_reset_flag(newsk, SOCK_DONE);
2351
2352 /* sk->sk_memcg will be populated at accept() time */
2353 newsk->sk_memcg = NULL;
2354
2355 cgroup_sk_clone(&newsk->sk_cgrp_data);
2356
2357 rcu_read_lock();
2358 filter = rcu_dereference(sk->sk_filter);
2359 if (filter != NULL)
2360 /* though it's an empty new sock, the charging may fail
2361 * if sysctl_optmem_max was changed between creation of
2362		 * the original socket and cloning
2363 */
2364 is_charged = sk_filter_charge(newsk, filter);
2365 RCU_INIT_POINTER(newsk->sk_filter, filter);
2366 rcu_read_unlock();
2367
2368 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2369 /* We need to make sure that we don't uncharge the new
2370 * socket if we couldn't charge it in the first place
2371 * as otherwise we uncharge the parent's filter.
2372 */
2373 if (!is_charged)
2374 RCU_INIT_POINTER(newsk->sk_filter, NULL);
2375 sk_free_unlock_clone(newsk);
2376 newsk = NULL;
2377 goto out;
2378 }
2379 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2380
2381 if (bpf_sk_storage_clone(sk, newsk)) {
2382 sk_free_unlock_clone(newsk);
2383 newsk = NULL;
2384 goto out;
2385 }
2386
2387 /* Clear sk_user_data if parent had the pointer tagged
2388 * as not suitable for copying when cloning.
2389 */
2390 if (sk_user_data_is_nocopy(newsk))
2391 newsk->sk_user_data = NULL;
2392
2393 newsk->sk_err = 0;
2394 newsk->sk_err_soft = 0;
2395 newsk->sk_priority = 0;
2396 newsk->sk_incoming_cpu = raw_smp_processor_id();
2397
2398 /* Before updating sk_refcnt, we must commit prior changes to memory
2399 * (Documentation/RCU/rculist_nulls.rst for details)
2400 */
2401 smp_wmb();
2402 refcount_set(&newsk->sk_refcnt, 2);
2403
2404 sk_set_socket(newsk, NULL);
2405 sk_tx_queue_clear(newsk);
2406 RCU_INIT_POINTER(newsk->sk_wq, NULL);
2407
2408 if (newsk->sk_prot->sockets_allocated)
2409 sk_sockets_allocated_inc(newsk);
2410
2411 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2412 net_enable_timestamp();
2413out:
2414 return newsk;
2415}
2416EXPORT_SYMBOL_GPL(sk_clone_lock);
2417
2418void sk_free_unlock_clone(struct sock *sk)
2419{
2420	/* It is still a raw copy of the parent, so invalidate
2421	 * the destructor and do a plain sk_free() */
2422 sk->sk_destruct = NULL;
2423 bh_unlock_sock(sk);
2424 sk_free(sk);
2425}
2426EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
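
/*
 * Illustrative sketch (not from the upstream file): the typical calling
 * convention around sk_clone_lock() -- the clone comes back bh-locked, and
 * the caller must bh_unlock_sock() it even when its own follow-up
 * initialisation fails.  The function name is a hypothetical placeholder.
 */
static struct sock *__maybe_unused example_clone(const struct sock *parent)
{
	struct sock *child;

	child = sk_clone_lock(parent, GFP_ATOMIC);
	if (!child)
		return NULL;

	/* ... protocol specific initialisation of @child would go here ... */

	bh_unlock_sock(child);
	return child;
}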
2427
2428static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2429{
2430 bool is_ipv6 = false;
2431 u32 max_size;
2432
2433#if IS_ENABLED(CONFIG_IPV6)
2434 is_ipv6 = (sk->sk_family == AF_INET6 &&
2435 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2436#endif
2437 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2438 max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2439 READ_ONCE(dst->dev->gso_ipv4_max_size);
2440 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2441 max_size = GSO_LEGACY_MAX_SIZE;
2442
2443 return max_size - (MAX_TCP_HEADER + 1);
2444}
2445
2446void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2447{
2448 u32 max_segs = 1;
2449
2450 sk->sk_route_caps = dst->dev->features;
2451 if (sk_is_tcp(sk))
2452 sk->sk_route_caps |= NETIF_F_GSO;
2453 if (sk->sk_route_caps & NETIF_F_GSO)
2454 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2455 if (unlikely(sk->sk_gso_disabled))
2456 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2457 if (sk_can_gso(sk)) {
2458 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2459 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2460 } else {
2461 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2462 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2463 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2464 max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2465 }
2466 }
2467 sk->sk_gso_max_segs = max_segs;
2468 sk_dst_set(sk, dst);
2469}
2470EXPORT_SYMBOL_GPL(sk_setup_caps);
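
/*
 * Illustrative sketch (not from the upstream file): connection-oriented
 * protocols call sk_setup_caps() once a route has been resolved, so GSO and
 * checksum capabilities track the output device; the dst reference is then
 * kept by the socket via sk_dst_set().  The function name is hypothetical.
 */
static void __maybe_unused example_attach_route(struct sock *sk,
						struct dst_entry *dst)
{
	sk_setup_caps(sk, dst);
}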
2471
2472/*
2473 * Simple resource managers for sockets.
2474 */
2475
2476
2477/*
2478 * Write buffer destructor automatically called from kfree_skb.
2479 */
2480void sock_wfree(struct sk_buff *skb)
2481{
2482 struct sock *sk = skb->sk;
2483 unsigned int len = skb->truesize;
2484 bool free;
2485
2486 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2487 if (sock_flag(sk, SOCK_RCU_FREE) &&
2488 sk->sk_write_space == sock_def_write_space) {
2489 rcu_read_lock();
2490 free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2491 sock_def_write_space_wfree(sk);
2492 rcu_read_unlock();
2493 if (unlikely(free))
2494 __sk_free(sk);
2495 return;
2496 }
2497
2498 /*
2499	 * Keep a reference on sk_wmem_alloc; it will be released
2500	 * after the sk_write_space() call
2501 */
2502 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2503 sk->sk_write_space(sk);
2504 len = 1;
2505 }
2506 /*
2507 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2508 * could not do because of in-flight packets
2509 */
2510 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2511 __sk_free(sk);
2512}
2513EXPORT_SYMBOL(sock_wfree);
2514
2515/* This variant of sock_wfree() is used by TCP,
2516 * since it sets SOCK_USE_WRITE_QUEUE.
2517 */
2518void __sock_wfree(struct sk_buff *skb)
2519{
2520 struct sock *sk = skb->sk;
2521
2522 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2523 __sk_free(sk);
2524}
2525
2526void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2527{
2528 skb_orphan(skb);
2529 skb->sk = sk;
2530#ifdef CONFIG_INET
2531 if (unlikely(!sk_fullsock(sk))) {
2532 skb->destructor = sock_edemux;
2533 sock_hold(sk);
2534 return;
2535 }
2536#endif
2537 skb->destructor = sock_wfree;
2538 skb_set_hash_from_sk(skb, sk);
2539 /*
2540	 * We used to take a refcount on sk, but the following operation
2541	 * is enough to guarantee sk_free() won't free this sock until
2542 * all in-flight packets are completed
2543 */
2544 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2545}
2546EXPORT_SYMBOL(skb_set_owner_w);
2547
2548static bool can_skb_orphan_partial(const struct sk_buff *skb)
2549{
2550 /* Drivers depend on in-order delivery for crypto offload,
2551 * partial orphan breaks out-of-order-OK logic.
2552 */
2553 if (skb_is_decrypted(skb))
2554 return false;
2555
2556 return (skb->destructor == sock_wfree ||
2557 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2558}
2559
2560/* This helper is used by netem, as it can hold packets in its
2561 * delay queue. We want to allow the owner socket to send more
2562 * packets, as if they were already TX completed by a typical driver.
2563 * But we also want to keep skb->sk set because some packet schedulers
2564 * rely on it (sch_fq for example).
2565 */
2566void skb_orphan_partial(struct sk_buff *skb)
2567{
2568 if (skb_is_tcp_pure_ack(skb))
2569 return;
2570
2571 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2572 return;
2573
2574 skb_orphan(skb);
2575}
2576EXPORT_SYMBOL(skb_orphan_partial);
2577
2578/*
2579 * Read buffer destructor automatically called from kfree_skb.
2580 */
2581void sock_rfree(struct sk_buff *skb)
2582{
2583 struct sock *sk = skb->sk;
2584 unsigned int len = skb->truesize;
2585
2586 atomic_sub(len, &sk->sk_rmem_alloc);
2587 sk_mem_uncharge(sk, len);
2588}
2589EXPORT_SYMBOL(sock_rfree);
2590
2591/*
2592 * Buffer destructor for skbs that are not used directly in read or write
2593 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2594 */
2595void sock_efree(struct sk_buff *skb)
2596{
2597 sock_put(skb->sk);
2598}
2599EXPORT_SYMBOL(sock_efree);
2600
2601/* Buffer destructor for prefetch/receive path where reference count may
2602 * not be held, e.g. for listen sockets.
2603 */
2604#ifdef CONFIG_INET
2605void sock_pfree(struct sk_buff *skb)
2606{
2607 struct sock *sk = skb->sk;
2608
2609 if (!sk_is_refcounted(sk))
2610 return;
2611
2612 if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2613 inet_reqsk(sk)->rsk_listener = NULL;
2614 reqsk_free(inet_reqsk(sk));
2615 return;
2616 }
2617
2618 sock_gen_put(sk);
2619}
2620EXPORT_SYMBOL(sock_pfree);
2621#endif /* CONFIG_INET */
2622
2623kuid_t sock_i_uid(struct sock *sk)
2624{
2625 kuid_t uid;
2626
2627 read_lock_bh(&sk->sk_callback_lock);
2628 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2629 read_unlock_bh(&sk->sk_callback_lock);
2630 return uid;
2631}
2632EXPORT_SYMBOL(sock_i_uid);
2633
2634unsigned long __sock_i_ino(struct sock *sk)
2635{
2636 unsigned long ino;
2637
2638 read_lock(&sk->sk_callback_lock);
2639 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2640 read_unlock(&sk->sk_callback_lock);
2641 return ino;
2642}
2643EXPORT_SYMBOL(__sock_i_ino);
2644
2645unsigned long sock_i_ino(struct sock *sk)
2646{
2647 unsigned long ino;
2648
2649 local_bh_disable();
2650 ino = __sock_i_ino(sk);
2651 local_bh_enable();
2652 return ino;
2653}
2654EXPORT_SYMBOL(sock_i_ino);
2655
2656/*
2657 * Allocate a skb from the socket's send buffer.
2658 */
2659struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2660 gfp_t priority)
2661{
2662 if (force ||
2663 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2664 struct sk_buff *skb = alloc_skb(size, priority);
2665
2666 if (skb) {
2667 skb_set_owner_w(skb, sk);
2668 return skb;
2669 }
2670 }
2671 return NULL;
2672}
2673EXPORT_SYMBOL(sock_wmalloc);
2674
2675static void sock_ofree(struct sk_buff *skb)
2676{
2677 struct sock *sk = skb->sk;
2678
2679 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2680}
2681
2682struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2683 gfp_t priority)
2684{
2685 struct sk_buff *skb;
2686
2687 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2688 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2689 READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2690 return NULL;
2691
2692 skb = alloc_skb(size, priority);
2693 if (!skb)
2694 return NULL;
2695
2696 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2697 skb->sk = sk;
2698 skb->destructor = sock_ofree;
2699 return skb;
2700}
2701
2702/*
2703 * Allocate a memory block from the socket's option memory buffer.
2704 */
2705void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2706{
2707 int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2708
2709 if ((unsigned int)size <= optmem_max &&
2710 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2711 void *mem;
2712 /* First do the add, to avoid the race if kmalloc
2713 * might sleep.
2714 */
2715 atomic_add(size, &sk->sk_omem_alloc);
2716 mem = kmalloc(size, priority);
2717 if (mem)
2718 return mem;
2719 atomic_sub(size, &sk->sk_omem_alloc);
2720 }
2721 return NULL;
2722}
2723EXPORT_SYMBOL(sock_kmalloc);
2724
2725/* Free an option memory block. Note, we actually want the inline
2726 * here as this allows gcc to detect the nullify and fold away the
2727 * condition entirely.
2728 */
2729static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2730 const bool nullify)
2731{
2732 if (WARN_ON_ONCE(!mem))
2733 return;
2734 if (nullify)
2735 kfree_sensitive(mem);
2736 else
2737 kfree(mem);
2738 atomic_sub(size, &sk->sk_omem_alloc);
2739}
2740
2741void sock_kfree_s(struct sock *sk, void *mem, int size)
2742{
2743 __sock_kfree_s(sk, mem, size, false);
2744}
2745EXPORT_SYMBOL(sock_kfree_s);
2746
2747void sock_kzfree_s(struct sock *sk, void *mem, int size)
2748{
2749 __sock_kfree_s(sk, mem, size, true);
2750}
2751EXPORT_SYMBOL(sock_kzfree_s);
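
/*
 * Illustrative sketch (not from the upstream file): option memory taken with
 * sock_kmalloc() must be returned with sock_kfree_s() -- or sock_kzfree_s()
 * for data that should not linger -- using the same size so the
 * sk_omem_alloc accounting stays balanced.  The 128-byte size and the
 * function name are arbitrary placeholders.
 */
static int __maybe_unused example_optmem_roundtrip(struct sock *sk)
{
	const int size = 128;
	void *mem;

	mem = sock_kmalloc(sk, size, GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	/* ... use @mem as a short-lived setsockopt()-style buffer ... */

	sock_kfree_s(sk, mem, size);
	return 0;
}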
2752
2753/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2754   I think these locks should be removed for datagram sockets.
2755 */
2756static long sock_wait_for_wmem(struct sock *sk, long timeo)
2757{
2758 DEFINE_WAIT(wait);
2759
2760 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2761 for (;;) {
2762 if (!timeo)
2763 break;
2764 if (signal_pending(current))
2765 break;
2766 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2767 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2768 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2769 break;
2770 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2771 break;
2772 if (READ_ONCE(sk->sk_err))
2773 break;
2774 timeo = schedule_timeout(timeo);
2775 }
2776 finish_wait(sk_sleep(sk), &wait);
2777 return timeo;
2778}
2779
2780
2781/*
2782 * Generic send/receive buffer handlers
2783 */
2784
2785struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2786 unsigned long data_len, int noblock,
2787 int *errcode, int max_page_order)
2788{
2789 struct sk_buff *skb;
2790 long timeo;
2791 int err;
2792
2793 timeo = sock_sndtimeo(sk, noblock);
2794 for (;;) {
2795 err = sock_error(sk);
2796 if (err != 0)
2797 goto failure;
2798
2799 err = -EPIPE;
2800 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2801 goto failure;
2802
2803 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2804 break;
2805
2806 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2807 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2808 err = -EAGAIN;
2809 if (!timeo)
2810 goto failure;
2811 if (signal_pending(current))
2812 goto interrupted;
2813 timeo = sock_wait_for_wmem(sk, timeo);
2814 }
2815 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2816 errcode, sk->sk_allocation);
2817 if (skb)
2818 skb_set_owner_w(skb, sk);
2819 return skb;
2820
2821interrupted:
2822 err = sock_intr_errno(timeo);
2823failure:
2824 *errcode = err;
2825 return NULL;
2826}
2827EXPORT_SYMBOL(sock_alloc_send_pskb);
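
/*
 * Illustrative sketch (not from the upstream file): how a datagram protocol
 * might use sock_alloc_send_pskb() -- everything in the linear area, no
 * high-order pages, with blocking governed by the socket's send timeout.
 * The function name is a hypothetical placeholder.
 */
static struct sk_buff *__maybe_unused example_alloc_dgram(struct sock *sk,
							  size_t len,
							  int noblock,
							  int *errcode)
{
	return sock_alloc_send_pskb(sk, len, 0, noblock, errcode, 0);
}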
2828
2829int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2830 struct sockcm_cookie *sockc)
2831{
2832 u32 tsflags;
2833
2834 switch (cmsg->cmsg_type) {
2835 case SO_MARK:
2836 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2837 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2838 return -EPERM;
2839 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2840 return -EINVAL;
2841 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2842 break;
2843 case SO_TIMESTAMPING_OLD:
2844 case SO_TIMESTAMPING_NEW:
2845 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2846 return -EINVAL;
2847
2848 tsflags = *(u32 *)CMSG_DATA(cmsg);
2849 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2850 return -EINVAL;
2851
2852 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2853 sockc->tsflags |= tsflags;
2854 break;
2855 case SCM_TXTIME:
2856 if (!sock_flag(sk, SOCK_TXTIME))
2857 return -EINVAL;
2858 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2859 return -EINVAL;
2860 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2861 break;
2862 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2863 case SCM_RIGHTS:
2864 case SCM_CREDENTIALS:
2865 break;
2866 default:
2867 return -EINVAL;
2868 }
2869 return 0;
2870}
2871EXPORT_SYMBOL(__sock_cmsg_send);
2872
2873int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2874 struct sockcm_cookie *sockc)
2875{
2876 struct cmsghdr *cmsg;
2877 int ret;
2878
2879 for_each_cmsghdr(cmsg, msg) {
2880 if (!CMSG_OK(msg, cmsg))
2881 return -EINVAL;
2882 if (cmsg->cmsg_level != SOL_SOCKET)
2883 continue;
2884 ret = __sock_cmsg_send(sk, cmsg, sockc);
2885 if (ret)
2886 return ret;
2887 }
2888 return 0;
2889}
2890EXPORT_SYMBOL(sock_cmsg_send);
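
/*
 * Illustrative sketch (not from the upstream file): the usual sendmsg()-side
 * pattern around sock_cmsg_send() -- seed a sockcm_cookie from the socket
 * defaults with sockcm_init(), then let SOL_SOCKET control messages override
 * it.  The function name is a hypothetical placeholder.
 */
static int __maybe_unused example_parse_cmsgs(struct sock *sk,
					      struct msghdr *msg,
					      struct sockcm_cookie *sockc)
{
	sockcm_init(sockc, sk);
	if (!msg->msg_controllen)
		return 0;

	return sock_cmsg_send(sk, msg, sockc);
}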
2891
2892static void sk_enter_memory_pressure(struct sock *sk)
2893{
2894 if (!sk->sk_prot->enter_memory_pressure)
2895 return;
2896
2897 sk->sk_prot->enter_memory_pressure(sk);
2898}
2899
2900static void sk_leave_memory_pressure(struct sock *sk)
2901{
2902 if (sk->sk_prot->leave_memory_pressure) {
2903 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2904 tcp_leave_memory_pressure, sk);
2905 } else {
2906 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2907
2908 if (memory_pressure && READ_ONCE(*memory_pressure))
2909 WRITE_ONCE(*memory_pressure, 0);
2910 }
2911}
2912
2913DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2914
2915/**
2916 * skb_page_frag_refill - check that a page_frag contains enough room
2917 * @sz: minimum size of the fragment we want to get
2918 * @pfrag: pointer to page_frag
2919 * @gfp: priority for memory allocation
2920 *
2921 * Note: While this allocator tries to use high order pages, there is
2922 * no guarantee that allocations succeed. Therefore, @sz MUST be
2923 * less than or equal to PAGE_SIZE.
2924 */
2925bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2926{
2927 if (pfrag->page) {
2928 if (page_ref_count(pfrag->page) == 1) {
2929 pfrag->offset = 0;
2930 return true;
2931 }
2932 if (pfrag->offset + sz <= pfrag->size)
2933 return true;
2934 put_page(pfrag->page);
2935 }
2936
2937 pfrag->offset = 0;
2938 if (SKB_FRAG_PAGE_ORDER &&
2939 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2940 /* Avoid direct reclaim but allow kswapd to wake */
2941 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2942 __GFP_COMP | __GFP_NOWARN |
2943 __GFP_NORETRY,
2944 SKB_FRAG_PAGE_ORDER);
2945 if (likely(pfrag->page)) {
2946 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2947 return true;
2948 }
2949 }
2950 pfrag->page = alloc_page(gfp);
2951 if (likely(pfrag->page)) {
2952 pfrag->size = PAGE_SIZE;
2953 return true;
2954 }
2955 return false;
2956}
2957EXPORT_SYMBOL(skb_page_frag_refill);
2958
2959bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2960{
2961 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2962 return true;
2963
2964 sk_enter_memory_pressure(sk);
2965 sk_stream_moderate_sndbuf(sk);
2966 return false;
2967}
2968EXPORT_SYMBOL(sk_page_frag_refill);
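
/*
 * Illustrative sketch (not from the upstream file): the common pattern around
 * sk_page_frag_refill() when a stream protocol appends user data to the
 * socket's page fragment; the caller copies into pfrag->page at pfrag->offset
 * and then advances the offset by what was actually used.  The function name
 * is a hypothetical placeholder.
 */
static int __maybe_unused example_frag_room(struct sock *sk, int want)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;	/* caller enters its wait-for-memory path */

	return min_t(int, want, pfrag->size - pfrag->offset);
}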
2969
2970void __lock_sock(struct sock *sk)
2971 __releases(&sk->sk_lock.slock)
2972 __acquires(&sk->sk_lock.slock)
2973{
2974 DEFINE_WAIT(wait);
2975
2976 for (;;) {
2977 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2978 TASK_UNINTERRUPTIBLE);
2979 spin_unlock_bh(&sk->sk_lock.slock);
2980 schedule();
2981 spin_lock_bh(&sk->sk_lock.slock);
2982 if (!sock_owned_by_user(sk))
2983 break;
2984 }
2985 finish_wait(&sk->sk_lock.wq, &wait);
2986}
2987
2988void __release_sock(struct sock *sk)
2989 __releases(&sk->sk_lock.slock)
2990 __acquires(&sk->sk_lock.slock)
2991{
2992 struct sk_buff *skb, *next;
2993
2994 while ((skb = sk->sk_backlog.head) != NULL) {
2995 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2996
2997 spin_unlock_bh(&sk->sk_lock.slock);
2998
2999 do {
3000 next = skb->next;
3001 prefetch(next);
3002 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3003 skb_mark_not_on_list(skb);
3004 sk_backlog_rcv(sk, skb);
3005
3006 cond_resched();
3007
3008 skb = next;
3009 } while (skb != NULL);
3010
3011 spin_lock_bh(&sk->sk_lock.slock);
3012 }
3013
3014 /*
3015	 * Doing the zeroing here guarantees we cannot loop forever
3016	 * while a wild producer attempts to flood us.
3017 */
3018 sk->sk_backlog.len = 0;
3019}
3020
3021void __sk_flush_backlog(struct sock *sk)
3022{
3023 spin_lock_bh(&sk->sk_lock.slock);
3024 __release_sock(sk);
3025
3026 if (sk->sk_prot->release_cb)
3027 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3028 tcp_release_cb, sk);
3029
3030 spin_unlock_bh(&sk->sk_lock.slock);
3031}
3032EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3033
3034/**
3035 * sk_wait_data - wait for data to arrive at sk_receive_queue
3036 * @sk: sock to wait on
3037 * @timeo: for how long
3038 * @skb: last skb seen on sk_receive_queue
3039 *
3040 * Now socket state including sk->sk_err is changed only under lock,
3041 * hence we may omit checks after joining the wait queue.
3042 * We check the receive queue before schedule() only as an optimization;
3043 * it is very likely that release_sock() added new data.
3044 */
3045int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3046{
3047 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3048 int rc;
3049
3050 add_wait_queue(sk_sleep(sk), &wait);
3051 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3052 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3053 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3054 remove_wait_queue(sk_sleep(sk), &wait);
3055 return rc;
3056}
3057EXPORT_SYMBOL(sk_wait_data);
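
/*
 * Illustrative sketch (not from the upstream file): a ->recvmsg()-style wait
 * in the manner of tcp_recvmsg() -- the socket lock is held by the caller,
 * and the last skb seen is passed so sk_wait_data() only reports genuinely
 * new data.  The function name is a hypothetical placeholder.
 */
static int __maybe_unused example_wait_for_data(struct sock *sk, long *timeo)
{
	const struct sk_buff *last;

	last = skb_peek_tail(&sk->sk_receive_queue);
	return sk_wait_data(sk, timeo, last);
}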
3058
3059/**
3060 * __sk_mem_raise_allocated - increase memory_allocated
3061 * @sk: socket
3062 * @size: memory size to allocate
3063 * @amt: pages to allocate
3064 * @kind: allocation type
3065 *
3066 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3067 *
3068 * Unlike the globally shared limits among the sockets under same protocol,
3069 * consuming the budget of a memcg won't have direct effect on other ones.
3070 * So be optimistic about memcg's tolerance, and leave the callers to decide
3071 * whether or not to raise allocated through sk_under_memory_pressure() or
3072 * its variants.
3073 */
3074int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3075{
3076 struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3077 struct proto *prot = sk->sk_prot;
3078 bool charged = false;
3079 long allocated;
3080
3081 sk_memory_allocated_add(sk, amt);
3082 allocated = sk_memory_allocated(sk);
3083
3084 if (memcg) {
3085 if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3086 goto suppress_allocation;
3087 charged = true;
3088 }
3089
3090 /* Under limit. */
3091 if (allocated <= sk_prot_mem_limits(sk, 0)) {
3092 sk_leave_memory_pressure(sk);
3093 return 1;
3094 }
3095
3096 /* Under pressure. */
3097 if (allocated > sk_prot_mem_limits(sk, 1))
3098 sk_enter_memory_pressure(sk);
3099
3100 /* Over hard limit. */
3101 if (allocated > sk_prot_mem_limits(sk, 2))
3102 goto suppress_allocation;
3103
3104 /* Guarantee minimum buffer size under pressure (either global
3105 * or memcg) to make sure features described in RFC 7323 (TCP
3106 * Extensions for High Performance) work properly.
3107 *
3108	 * This rule does NOT apply when usage exceeds the global or memcg hard
3109	 * limit, or else a DoS attack could take place by spawning
3110	 * lots of sockets whose usage is under the minimum buffer size.
3111 */
3112 if (kind == SK_MEM_RECV) {
3113 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3114 return 1;
3115
3116 } else { /* SK_MEM_SEND */
3117 int wmem0 = sk_get_wmem0(sk, prot);
3118
3119 if (sk->sk_type == SOCK_STREAM) {
3120 if (sk->sk_wmem_queued < wmem0)
3121 return 1;
3122 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3123 return 1;
3124 }
3125 }
3126
3127 if (sk_has_memory_pressure(sk)) {
3128 u64 alloc;
3129
3130 /* The following 'average' heuristic is within the
3131 * scope of global accounting, so it only makes
3132 * sense for global memory pressure.
3133 */
3134 if (!sk_under_global_memory_pressure(sk))
3135 return 1;
3136
3137 /* Try to be fair among all the sockets under global
3138		 * pressure by allowing the ones with below-average
3139		 * usage to raise their allocation.
3140 */
3141 alloc = sk_sockets_allocated_read_positive(sk);
3142 if (sk_prot_mem_limits(sk, 2) > alloc *
3143 sk_mem_pages(sk->sk_wmem_queued +
3144 atomic_read(&sk->sk_rmem_alloc) +
3145 sk->sk_forward_alloc))
3146 return 1;
3147 }
3148
3149suppress_allocation:
3150
3151 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3152 sk_stream_moderate_sndbuf(sk);
3153
3154 /* Fail only if socket is _under_ its sndbuf.
3155 * In this case we cannot block, so that we have to fail.
3156 */
3157 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3158 /* Force charge with __GFP_NOFAIL */
3159 if (memcg && !charged) {
3160 mem_cgroup_charge_skmem(memcg, amt,
3161 gfp_memcg_charge() | __GFP_NOFAIL);
3162 }
3163 return 1;
3164 }
3165 }
3166
3167 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3168 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3169
3170 sk_memory_allocated_sub(sk, amt);
3171
3172 if (charged)
3173 mem_cgroup_uncharge_skmem(memcg, amt);
3174
3175 return 0;
3176}
3177
3178/**
3179 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3180 * @sk: socket
3181 * @size: memory size to allocate
3182 * @kind: allocation type
3183 *
3184 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3185 * rmem allocation. This function assumes that protocols which have
3186 * memory_pressure use sk_wmem_queued as write buffer accounting.
3187 */
3188int __sk_mem_schedule(struct sock *sk, int size, int kind)
3189{
3190 int ret, amt = sk_mem_pages(size);
3191
3192 sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3193 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3194 if (!ret)
3195 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3196 return ret;
3197}
3198EXPORT_SYMBOL(__sk_mem_schedule);
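
/*
 * Illustrative sketch (not from the upstream file): charging receive memory
 * before queueing an skb, the way sock_queue_rcv_skb()-style paths do it --
 * sk_rmem_schedule() falls through to __sk_mem_schedule() when
 * sk_forward_alloc has no room left.  The function name is hypothetical.
 */
static bool __maybe_unused example_charge_rmem(struct sock *sk,
					       struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return false;	/* over limit: the caller drops the skb */

	sk_mem_charge(sk, skb->truesize);
	return true;
}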
3199
3200/**
3201 * __sk_mem_reduce_allocated - reclaim memory_allocated
3202 * @sk: socket
3203 * @amount: number of quanta
3204 *
3205 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3206 */
3207void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3208{
3209 sk_memory_allocated_sub(sk, amount);
3210
3211 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3212 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3213
3214 if (sk_under_global_memory_pressure(sk) &&
3215 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3216 sk_leave_memory_pressure(sk);
3217}
3218
3219/**
3220 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3221 * @sk: socket
3222 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3223 */
3224void __sk_mem_reclaim(struct sock *sk, int amount)
3225{
3226 amount >>= PAGE_SHIFT;
3227 sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3228 __sk_mem_reduce_allocated(sk, amount);
3229}
3230EXPORT_SYMBOL(__sk_mem_reclaim);
3231
3232int sk_set_peek_off(struct sock *sk, int val)
3233{
3234 WRITE_ONCE(sk->sk_peek_off, val);
3235 return 0;
3236}
3237EXPORT_SYMBOL_GPL(sk_set_peek_off);
3238
3239/*
3240 * Set of default routines for initialising struct proto_ops when
3241 * the protocol does not support a particular function. In certain
3242 * cases where it makes no sense for a protocol to have a "do nothing"
3243 * function, some default processing is provided.
3244 */
3245
3246int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3247{
3248 return -EOPNOTSUPP;
3249}
3250EXPORT_SYMBOL(sock_no_bind);
3251
3252int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3253 int len, int flags)
3254{
3255 return -EOPNOTSUPP;
3256}
3257EXPORT_SYMBOL(sock_no_connect);
3258
3259int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3260{
3261 return -EOPNOTSUPP;
3262}
3263EXPORT_SYMBOL(sock_no_socketpair);
3264
3265int sock_no_accept(struct socket *sock, struct socket *newsock,
3266 struct proto_accept_arg *arg)
3267{
3268 return -EOPNOTSUPP;
3269}
3270EXPORT_SYMBOL(sock_no_accept);
3271
3272int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3273 int peer)
3274{
3275 return -EOPNOTSUPP;
3276}
3277EXPORT_SYMBOL(sock_no_getname);
3278
3279int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3280{
3281 return -EOPNOTSUPP;
3282}
3283EXPORT_SYMBOL(sock_no_ioctl);
3284
3285int sock_no_listen(struct socket *sock, int backlog)
3286{
3287 return -EOPNOTSUPP;
3288}
3289EXPORT_SYMBOL(sock_no_listen);
3290
3291int sock_no_shutdown(struct socket *sock, int how)
3292{
3293 return -EOPNOTSUPP;
3294}
3295EXPORT_SYMBOL(sock_no_shutdown);
3296
3297int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3298{
3299 return -EOPNOTSUPP;
3300}
3301EXPORT_SYMBOL(sock_no_sendmsg);
3302
3303int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3304{
3305 return -EOPNOTSUPP;
3306}
3307EXPORT_SYMBOL(sock_no_sendmsg_locked);
3308
3309int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3310 int flags)
3311{
3312 return -EOPNOTSUPP;
3313}
3314EXPORT_SYMBOL(sock_no_recvmsg);
3315
3316int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3317{
3318 /* Mirror missing mmap method error code */
3319 return -ENODEV;
3320}
3321EXPORT_SYMBOL(sock_no_mmap);
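
/*
 * Illustrative sketch (not from the upstream file): how a protocol family
 * typically plugs the sock_no_*() stubs above into its proto_ops table for
 * operations it does not support.  "example_ops" is a hypothetical,
 * deliberately incomplete table shown for illustration only.
 */
static const struct proto_ops example_ops __maybe_unused = {
	.family		= PF_UNSPEC,
	.owner		= THIS_MODULE,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.mmap		= sock_no_mmap,
};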
3322
3323/*
3324 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3325 * various sock-based usage counts.
3326 */
3327void __receive_sock(struct file *file)
3328{
3329 struct socket *sock;
3330
3331 sock = sock_from_file(file);
3332 if (sock) {
3333 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3334 sock_update_classid(&sock->sk->sk_cgrp_data);
3335 }
3336}
3337
3338/*
3339 * Default Socket Callbacks
3340 */
3341
3342static void sock_def_wakeup(struct sock *sk)
3343{
3344 struct socket_wq *wq;
3345
3346 rcu_read_lock();
3347 wq = rcu_dereference(sk->sk_wq);
3348 if (skwq_has_sleeper(wq))
3349 wake_up_interruptible_all(&wq->wait);
3350 rcu_read_unlock();
3351}
3352
3353static void sock_def_error_report(struct sock *sk)
3354{
3355 struct socket_wq *wq;
3356
3357 rcu_read_lock();
3358 wq = rcu_dereference(sk->sk_wq);
3359 if (skwq_has_sleeper(wq))
3360 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3361 sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3362 rcu_read_unlock();
3363}
3364
3365void sock_def_readable(struct sock *sk)
3366{
3367 struct socket_wq *wq;
3368
3369 trace_sk_data_ready(sk);
3370
3371 rcu_read_lock();
3372 wq = rcu_dereference(sk->sk_wq);
3373 if (skwq_has_sleeper(wq))
3374 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3375 EPOLLRDNORM | EPOLLRDBAND);
3376 sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3377 rcu_read_unlock();
3378}
3379
3380static void sock_def_write_space(struct sock *sk)
3381{
3382 struct socket_wq *wq;
3383
3384 rcu_read_lock();
3385
3386 /* Do not wake up a writer until he can make "significant"
3387 * progress. --DaveM
3388 */
3389 if (sock_writeable(sk)) {
3390 wq = rcu_dereference(sk->sk_wq);
3391 if (skwq_has_sleeper(wq))
3392 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3393 EPOLLWRNORM | EPOLLWRBAND);
3394
3395 /* Should agree with poll, otherwise some programs break */
3396 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3397 }
3398
3399 rcu_read_unlock();
3400}
3401
3402/* An optimised version of sock_def_write_space(); it should only be called
3403 * for SOCK_RCU_FREE sockets inside an RCU read-side section and after putting
3404 * ->sk_wmem_alloc.
3405 */
3406static void sock_def_write_space_wfree(struct sock *sk)
3407{
3408 /* Do not wake up a writer until he can make "significant"
3409 * progress. --DaveM
3410 */
3411 if (sock_writeable(sk)) {
3412 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3413
3414 /* rely on refcount_sub from sock_wfree() */
3415 smp_mb__after_atomic();
3416 if (wq && waitqueue_active(&wq->wait))
3417 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3418 EPOLLWRNORM | EPOLLWRBAND);
3419
3420 /* Should agree with poll, otherwise some programs break */
3421 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3422 }
3423}
3424
3425static void sock_def_destruct(struct sock *sk)
3426{
3427}
3428
3429void sk_send_sigurg(struct sock *sk)
3430{
3431 if (sk->sk_socket && sk->sk_socket->file)
3432 if (send_sigurg(&sk->sk_socket->file->f_owner))
3433 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3434}
3435EXPORT_SYMBOL(sk_send_sigurg);
3436
3437void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3438 unsigned long expires)
3439{
3440 if (!mod_timer(timer, expires))
3441 sock_hold(sk);
3442}
3443EXPORT_SYMBOL(sk_reset_timer);
3444
3445void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3446{
3447 if (del_timer(timer))
3448 __sock_put(sk);
3449}
3450EXPORT_SYMBOL(sk_stop_timer);
3451
3452void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3453{
3454 if (del_timer_sync(timer))
3455 __sock_put(sk);
3456}
3457EXPORT_SYMBOL(sk_stop_timer_sync);
3458
3459void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3460{
3461 sk_init_common(sk);
3462 sk->sk_send_head = NULL;
3463
3464 timer_setup(&sk->sk_timer, NULL, 0);
3465
3466 sk->sk_allocation = GFP_KERNEL;
3467 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
3468 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
3469 sk->sk_state = TCP_CLOSE;
3470 sk->sk_use_task_frag = true;
3471 sk_set_socket(sk, sock);
3472
3473 sock_set_flag(sk, SOCK_ZAPPED);
3474
3475 if (sock) {
3476 sk->sk_type = sock->type;
3477 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3478 sock->sk = sk;
3479 } else {
3480 RCU_INIT_POINTER(sk->sk_wq, NULL);
3481 }
3482 sk->sk_uid = uid;
3483
3484 sk->sk_state_change = sock_def_wakeup;
3485 sk->sk_data_ready = sock_def_readable;
3486 sk->sk_write_space = sock_def_write_space;
3487 sk->sk_error_report = sock_def_error_report;
3488 sk->sk_destruct = sock_def_destruct;
3489
3490 sk->sk_frag.page = NULL;
3491 sk->sk_frag.offset = 0;
3492 sk->sk_peek_off = -1;
3493
3494 sk->sk_peer_pid = NULL;
3495 sk->sk_peer_cred = NULL;
3496 spin_lock_init(&sk->sk_peer_lock);
3497
3498 sk->sk_write_pending = 0;
3499 sk->sk_rcvlowat = 1;
3500 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3501 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3502
3503 sk->sk_stamp = SK_DEFAULT_STAMP;
3504#if BITS_PER_LONG==32
3505 seqlock_init(&sk->sk_stamp_seq);
3506#endif
3507 atomic_set(&sk->sk_zckey, 0);
3508
3509#ifdef CONFIG_NET_RX_BUSY_POLL
3510 sk->sk_napi_id = 0;
3511 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
3512#endif
3513
3514 sk->sk_max_pacing_rate = ~0UL;
3515 sk->sk_pacing_rate = ~0UL;
3516 WRITE_ONCE(sk->sk_pacing_shift, 10);
3517 sk->sk_incoming_cpu = -1;
3518
3519 sk_rx_queue_clear(sk);
3520 /*
3521 * Before updating sk_refcnt, we must commit prior changes to memory
3522 * (Documentation/RCU/rculist_nulls.rst for details)
3523 */
3524 smp_wmb();
3525 refcount_set(&sk->sk_refcnt, 1);
3526 atomic_set(&sk->sk_drops, 0);
3527}
3528EXPORT_SYMBOL(sock_init_data_uid);
3529
3530void sock_init_data(struct socket *sock, struct sock *sk)
3531{
3532 kuid_t uid = sock ?
3533 SOCK_INODE(sock)->i_uid :
3534 make_kuid(sock_net(sk)->user_ns, 0);
3535
3536 sock_init_data_uid(sock, sk, uid);
3537}
3538EXPORT_SYMBOL(sock_init_data);
3539
3540void lock_sock_nested(struct sock *sk, int subclass)
3541{
3542 /* The sk_lock has mutex_lock() semantics here. */
3543 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3544
3545 might_sleep();
3546 spin_lock_bh(&sk->sk_lock.slock);
3547 if (sock_owned_by_user_nocheck(sk))
3548 __lock_sock(sk);
3549 sk->sk_lock.owned = 1;
3550 spin_unlock_bh(&sk->sk_lock.slock);
3551}
3552EXPORT_SYMBOL(lock_sock_nested);
3553
3554void release_sock(struct sock *sk)
3555{
3556 spin_lock_bh(&sk->sk_lock.slock);
3557 if (sk->sk_backlog.tail)
3558 __release_sock(sk);
3559
3560 if (sk->sk_prot->release_cb)
3561 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3562 tcp_release_cb, sk);
3563
3564 sock_release_ownership(sk);
3565 if (waitqueue_active(&sk->sk_lock.wq))
3566 wake_up(&sk->sk_lock.wq);
3567 spin_unlock_bh(&sk->sk_lock.slock);
3568}
3569EXPORT_SYMBOL(release_sock);
3570
3571bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3572{
3573 might_sleep();
3574 spin_lock_bh(&sk->sk_lock.slock);
3575
3576 if (!sock_owned_by_user_nocheck(sk)) {
3577 /*
3578 * Fast path return with bottom halves disabled and
3579 * sock::sk_lock.slock held.
3580 *
3581 * The 'mutex' is not contended and holding
3582		 * sock::sk_lock.slock prevents all other lockers from
3583		 * proceeding, so the corresponding unlock_sock_fast() can
3584 * avoid the slow path of release_sock() completely and
3585 * just release slock.
3586 *
3587		 * From a semantic POV this is equivalent to 'acquiring'
3588 * the 'mutex', hence the corresponding lockdep
3589 * mutex_release() has to happen in the fast path of
3590 * unlock_sock_fast().
3591 */
3592 return false;
3593 }
3594
3595 __lock_sock(sk);
3596 sk->sk_lock.owned = 1;
3597 __acquire(&sk->sk_lock.slock);
3598 spin_unlock_bh(&sk->sk_lock.slock);
3599 return true;
3600}
3601EXPORT_SYMBOL(__lock_sock_fast);
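
/*
 * Illustrative sketch (not from the upstream file): the two owner-lock idioms
 * built on the helpers above -- lock_sock()/release_sock() for sections that
 * may sleep, and lock_sock_fast()/unlock_sock_fast() for short non-sleeping
 * sections that usually avoid the slow path entirely.  The function name is
 * a hypothetical placeholder.
 */
static void __maybe_unused example_lock_idioms(struct sock *sk)
{
	bool slow;

	lock_sock(sk);
	/* ... possibly sleeping work under the socket owner lock ... */
	release_sock(sk);

	slow = lock_sock_fast(sk);
	/* ... short critical section ... */
	unlock_sock_fast(sk, slow);
}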
3602
3603int sock_gettstamp(struct socket *sock, void __user *userstamp,
3604 bool timeval, bool time32)
3605{
3606 struct sock *sk = sock->sk;
3607 struct timespec64 ts;
3608
3609 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3610 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3611 if (ts.tv_sec == -1)
3612 return -ENOENT;
3613 if (ts.tv_sec == 0) {
3614 ktime_t kt = ktime_get_real();
3615 sock_write_timestamp(sk, kt);
3616 ts = ktime_to_timespec64(kt);
3617 }
3618
3619 if (timeval)
3620 ts.tv_nsec /= 1000;
3621
3622#ifdef CONFIG_COMPAT_32BIT_TIME
3623 if (time32)
3624 return put_old_timespec32(&ts, userstamp);
3625#endif
3626#ifdef CONFIG_SPARC64
3627 /* beware of padding in sparc64 timeval */
3628 if (timeval && !in_compat_syscall()) {
3629 struct __kernel_old_timeval __user tv = {
3630 .tv_sec = ts.tv_sec,
3631 .tv_usec = ts.tv_nsec,
3632 };
3633 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3634 return -EFAULT;
3635 return 0;
3636 }
3637#endif
3638 return put_timespec64(&ts, userstamp);
3639}
3640EXPORT_SYMBOL(sock_gettstamp);
3641
3642void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3643{
3644 if (!sock_flag(sk, flag)) {
3645 unsigned long previous_flags = sk->sk_flags;
3646
3647 sock_set_flag(sk, flag);
3648 /*
3649 * we just set one of the two flags which require net
3650 * time stamping, but time stamping might have been on
3651 * already because of the other one
3652 */
3653 if (sock_needs_netstamp(sk) &&
3654 !(previous_flags & SK_FLAGS_TIMESTAMP))
3655 net_enable_timestamp();
3656 }
3657}
3658
3659int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3660 int level, int type)
3661{
3662 struct sock_exterr_skb *serr;
3663 struct sk_buff *skb;
3664 int copied, err;
3665
3666 err = -EAGAIN;
3667 skb = sock_dequeue_err_skb(sk);
3668 if (skb == NULL)
3669 goto out;
3670
3671 copied = skb->len;
3672 if (copied > len) {
3673 msg->msg_flags |= MSG_TRUNC;
3674 copied = len;
3675 }
3676 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3677 if (err)
3678 goto out_free_skb;
3679
3680 sock_recv_timestamp(msg, sk, skb);
3681
3682 serr = SKB_EXT_ERR(skb);
3683 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3684
3685 msg->msg_flags |= MSG_ERRQUEUE;
3686 err = copied;
3687
3688out_free_skb:
3689 kfree_skb(skb);
3690out:
3691 return err;
3692}
3693EXPORT_SYMBOL(sock_recv_errqueue);
3694
3695/*
3696 * Get a socket option on a socket.
3697 *
3698 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3699 * asynchronous errors should be reported by getsockopt. We assume
3700 * this means if you specify SO_ERROR (otherwise what's the point of it).
3701 */
3702int sock_common_getsockopt(struct socket *sock, int level, int optname,
3703 char __user *optval, int __user *optlen)
3704{
3705 struct sock *sk = sock->sk;
3706
3707 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3708 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3709}
3710EXPORT_SYMBOL(sock_common_getsockopt);
3711
3712int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3713 int flags)
3714{
3715 struct sock *sk = sock->sk;
3716 int addr_len = 0;
3717 int err;
3718
3719 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3720 if (err >= 0)
3721 msg->msg_namelen = addr_len;
3722 return err;
3723}
3724EXPORT_SYMBOL(sock_common_recvmsg);
3725
3726/*
3727 * Set socket options on an inet socket.
3728 */
3729int sock_common_setsockopt(struct socket *sock, int level, int optname,
3730 sockptr_t optval, unsigned int optlen)
3731{
3732 struct sock *sk = sock->sk;
3733
3734 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3735 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3736}
3737EXPORT_SYMBOL(sock_common_setsockopt);
3738
3739void sk_common_release(struct sock *sk)
3740{
3741 if (sk->sk_prot->destroy)
3742 sk->sk_prot->destroy(sk);
3743
3744 /*
3745	 * Observation: when sk_common_release is called, processes have
3746	 * no access to the socket, but the network stack still does.
3747 * Step one, detach it from networking:
3748 *
3749 * A. Remove from hash tables.
3750 */
3751
3752 sk->sk_prot->unhash(sk);
3753
3754 if (sk->sk_socket)
3755 sk->sk_socket->sk = NULL;
3756
3757 /*
3758	 * At this point the socket cannot receive new packets, but it is possible
3759	 * that some packets are in flight, because some CPU is running the receiver
3760	 * and did a hash table lookup before we unhashed the socket. They will reach
3761	 * the receive queue and will be purged by the socket destructor.
3762	 *
3763	 * Also, we still have packets pending on the receive queue and probably
3764	 * our own packets waiting in device queues. sock_destroy will drain the
3765	 * receive queue, but transmitted packets will delay socket destruction
3766	 * until the last reference is released.
3767 */
3768
3769 sock_orphan(sk);
3770
3771 xfrm_sk_free_policy(sk);
3772
3773 sock_put(sk);
3774}
3775EXPORT_SYMBOL(sk_common_release);
3776
3777void sk_get_meminfo(const struct sock *sk, u32 *mem)
3778{
3779 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3780
3781 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3782 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3783 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3784 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3785 mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3786 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3787 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3788 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3789 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3790}
3791
3792#ifdef CONFIG_PROC_FS
3793static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3794
3795int sock_prot_inuse_get(struct net *net, struct proto *prot)
3796{
3797 int cpu, idx = prot->inuse_idx;
3798 int res = 0;
3799
3800 for_each_possible_cpu(cpu)
3801 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3802
3803 return res >= 0 ? res : 0;
3804}
3805EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3806
3807int sock_inuse_get(struct net *net)
3808{
3809 int cpu, res = 0;
3810
3811 for_each_possible_cpu(cpu)
3812 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3813
3814 return res;
3815}
3816
3817EXPORT_SYMBOL_GPL(sock_inuse_get);
3818
3819static int __net_init sock_inuse_init_net(struct net *net)
3820{
3821 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3822 if (net->core.prot_inuse == NULL)
3823 return -ENOMEM;
3824 return 0;
3825}
3826
3827static void __net_exit sock_inuse_exit_net(struct net *net)
3828{
3829 free_percpu(net->core.prot_inuse);
3830}
3831
3832static struct pernet_operations net_inuse_ops = {
3833 .init = sock_inuse_init_net,
3834 .exit = sock_inuse_exit_net,
3835};
3836
3837static __init int net_inuse_init(void)
3838{
3839 if (register_pernet_subsys(&net_inuse_ops))
3840 panic("Cannot initialize net inuse counters");
3841
3842 return 0;
3843}
3844
3845core_initcall(net_inuse_init);
3846
3847static int assign_proto_idx(struct proto *prot)
3848{
3849 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3850
3851 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3852 pr_err("PROTO_INUSE_NR exhausted\n");
3853 return -ENOSPC;
3854 }
3855
3856 set_bit(prot->inuse_idx, proto_inuse_idx);
3857 return 0;
3858}
3859
3860static void release_proto_idx(struct proto *prot)
3861{
3862 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3863 clear_bit(prot->inuse_idx, proto_inuse_idx);
3864}
3865#else
3866static inline int assign_proto_idx(struct proto *prot)
3867{
3868 return 0;
3869}
3870
3871static inline void release_proto_idx(struct proto *prot)
3872{
3873}
3874
3875#endif
3876
3877static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3878{
3879 if (!twsk_prot)
3880 return;
3881 kfree(twsk_prot->twsk_slab_name);
3882 twsk_prot->twsk_slab_name = NULL;
3883 kmem_cache_destroy(twsk_prot->twsk_slab);
3884 twsk_prot->twsk_slab = NULL;
3885}
3886
3887static int tw_prot_init(const struct proto *prot)
3888{
3889 struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3890
3891 if (!twsk_prot)
3892 return 0;
3893
3894 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3895 prot->name);
3896 if (!twsk_prot->twsk_slab_name)
3897 return -ENOMEM;
3898
3899 twsk_prot->twsk_slab =
3900 kmem_cache_create(twsk_prot->twsk_slab_name,
3901 twsk_prot->twsk_obj_size, 0,
3902 SLAB_ACCOUNT | prot->slab_flags,
3903 NULL);
3904 if (!twsk_prot->twsk_slab) {
3905 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3906 prot->name);
3907 return -ENOMEM;
3908 }
3909
3910 return 0;
3911}
3912
3913static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3914{
3915 if (!rsk_prot)
3916 return;
3917 kfree(rsk_prot->slab_name);
3918 rsk_prot->slab_name = NULL;
3919 kmem_cache_destroy(rsk_prot->slab);
3920 rsk_prot->slab = NULL;
3921}
3922
3923static int req_prot_init(const struct proto *prot)
3924{
3925 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3926
3927 if (!rsk_prot)
3928 return 0;
3929
3930 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3931 prot->name);
3932 if (!rsk_prot->slab_name)
3933 return -ENOMEM;
3934
3935 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3936 rsk_prot->obj_size, 0,
3937 SLAB_ACCOUNT | prot->slab_flags,
3938 NULL);
3939
3940 if (!rsk_prot->slab) {
3941 pr_crit("%s: Can't create request sock SLAB cache!\n",
3942 prot->name);
3943 return -ENOMEM;
3944 }
3945 return 0;
3946}
3947
3948int proto_register(struct proto *prot, int alloc_slab)
3949{
3950 int ret = -ENOBUFS;
3951
3952 if (prot->memory_allocated && !prot->sysctl_mem) {
3953 pr_err("%s: missing sysctl_mem\n", prot->name);
3954 return -EINVAL;
3955 }
3956 if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3957 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3958 return -EINVAL;
3959 }
3960 if (alloc_slab) {
3961 prot->slab = kmem_cache_create_usercopy(prot->name,
3962 prot->obj_size, 0,
3963 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3964 prot->slab_flags,
3965 prot->useroffset, prot->usersize,
3966 NULL);
3967
3968 if (prot->slab == NULL) {
3969 pr_crit("%s: Can't create sock SLAB cache!\n",
3970 prot->name);
3971 goto out;
3972 }
3973
3974 if (req_prot_init(prot))
3975 goto out_free_request_sock_slab;
3976
3977 if (tw_prot_init(prot))
3978 goto out_free_timewait_sock_slab;
3979 }
3980
3981 mutex_lock(&proto_list_mutex);
3982 ret = assign_proto_idx(prot);
3983 if (ret) {
3984 mutex_unlock(&proto_list_mutex);
3985 goto out_free_timewait_sock_slab;
3986 }
3987 list_add(&prot->node, &proto_list);
3988 mutex_unlock(&proto_list_mutex);
3989 return ret;
3990
3991out_free_timewait_sock_slab:
3992 if (alloc_slab)
3993 tw_prot_cleanup(prot->twsk_prot);
3994out_free_request_sock_slab:
3995 if (alloc_slab) {
3996 req_prot_cleanup(prot->rsk_prot);
3997
3998 kmem_cache_destroy(prot->slab);
3999 prot->slab = NULL;
4000 }
4001out:
4002 return ret;
4003}
4004EXPORT_SYMBOL(proto_register);
4005
4006void proto_unregister(struct proto *prot)
4007{
4008 mutex_lock(&proto_list_mutex);
4009 release_proto_idx(prot);
4010 list_del(&prot->node);
4011 mutex_unlock(&proto_list_mutex);
4012
4013 kmem_cache_destroy(prot->slab);
4014 prot->slab = NULL;
4015
4016 req_prot_cleanup(prot->rsk_prot);
4017 tw_prot_cleanup(prot->twsk_prot);
4018}
4019EXPORT_SYMBOL(proto_unregister);
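
/*
 * Illustrative sketch (not from the upstream file): the usual pairing of
 * proto_register() at module init and proto_unregister() at module exit.
 * "example_prot" stands in for a protocol's struct proto with at least
 * .name, .owner and .obj_size filled in; both function names are
 * hypothetical placeholders.
 */
static int __maybe_unused example_proto_module_init(struct proto *example_prot)
{
	/* second argument != 0: also create a dedicated kmem_cache */
	return proto_register(example_prot, 1);
}

static void __maybe_unused example_proto_module_exit(struct proto *example_prot)
{
	proto_unregister(example_prot);
}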
4020
4021int sock_load_diag_module(int family, int protocol)
4022{
4023 if (!protocol) {
4024 if (!sock_is_registered(family))
4025 return -ENOENT;
4026
4027 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4028 NETLINK_SOCK_DIAG, family);
4029 }
4030
4031#ifdef CONFIG_INET
4032 if (family == AF_INET &&
4033 protocol != IPPROTO_RAW &&
4034 protocol < MAX_INET_PROTOS &&
4035 !rcu_access_pointer(inet_protos[protocol]))
4036 return -ENOENT;
4037#endif
4038
4039 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4040 NETLINK_SOCK_DIAG, family, protocol);
4041}
4042EXPORT_SYMBOL(sock_load_diag_module);
4043
4044#ifdef CONFIG_PROC_FS
4045static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4046 __acquires(proto_list_mutex)
4047{
4048 mutex_lock(&proto_list_mutex);
4049 return seq_list_start_head(&proto_list, *pos);
4050}
4051
4052static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4053{
4054 return seq_list_next(v, &proto_list, pos);
4055}
4056
4057static void proto_seq_stop(struct seq_file *seq, void *v)
4058 __releases(proto_list_mutex)
4059{
4060 mutex_unlock(&proto_list_mutex);
4061}
4062
4063static char proto_method_implemented(const void *method)
4064{
4065 return method == NULL ? 'n' : 'y';
4066}
4067static long sock_prot_memory_allocated(struct proto *proto)
4068{
4069 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4070}
4071
4072static const char *sock_prot_memory_pressure(struct proto *proto)
4073{
4074 return proto->memory_pressure != NULL ?
4075 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4076}
4077
4078static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4079{
4080
4081 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
4082 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4083 proto->name,
4084 proto->obj_size,
4085 sock_prot_inuse_get(seq_file_net(seq), proto),
4086 sock_prot_memory_allocated(proto),
4087 sock_prot_memory_pressure(proto),
4088 proto->max_header,
4089 proto->slab == NULL ? "no" : "yes",
4090 module_name(proto->owner),
4091 proto_method_implemented(proto->close),
4092 proto_method_implemented(proto->connect),
4093 proto_method_implemented(proto->disconnect),
4094 proto_method_implemented(proto->accept),
4095 proto_method_implemented(proto->ioctl),
4096 proto_method_implemented(proto->init),
4097 proto_method_implemented(proto->destroy),
4098 proto_method_implemented(proto->shutdown),
4099 proto_method_implemented(proto->setsockopt),
4100 proto_method_implemented(proto->getsockopt),
4101 proto_method_implemented(proto->sendmsg),
4102 proto_method_implemented(proto->recvmsg),
4103 proto_method_implemented(proto->bind),
4104 proto_method_implemented(proto->backlog_rcv),
4105 proto_method_implemented(proto->hash),
4106 proto_method_implemented(proto->unhash),
4107 proto_method_implemented(proto->get_port),
4108 proto_method_implemented(proto->enter_memory_pressure));
4109}
4110
4111static int proto_seq_show(struct seq_file *seq, void *v)
4112{
4113 if (v == &proto_list)
4114 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4115 "protocol",
4116 "size",
4117 "sockets",
4118 "memory",
4119 "press",
4120 "maxhdr",
4121 "slab",
4122 "module",
4123 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4124 else
4125 proto_seq_printf(seq, list_entry(v, struct proto, node));
4126 return 0;
4127}
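/*
 * Editor's note: the seq handlers above back /proc/net/protocols.  The header
 * row is printed by proto_seq_show() and one line per registered proto by
 * proto_seq_printf().  "memory" is -1 and "press" is "NI" (not implemented)
 * when the protocol does no memory accounting / pressure tracking, and the
 * trailing columns are 'y'/'n' flags, one per struct proto callback in the
 * order listed in the header.  The sample below is illustrative only; actual
 * sizes, counts and flags depend on the kernel build.
 *
 * protocol  size sockets  memory press maxhdr  slab module     cl co di ac ...
 * TCP       2368      24      1 no        320 yes  kernel      y  y  y  y  ...
 * UDP       1472       8      0 no          0 yes  kernel      y  y  y  n  ...
 */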
4128
4129static const struct seq_operations proto_seq_ops = {
4130 .start = proto_seq_start,
4131 .next = proto_seq_next,
4132 .stop = proto_seq_stop,
4133 .show = proto_seq_show,
4134};
4135
4136static __net_init int proto_init_net(struct net *net)
4137{
4138 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4139 sizeof(struct seq_net_private)))
4140 return -ENOMEM;
4141
4142 return 0;
4143}
4144
4145static __net_exit void proto_exit_net(struct net *net)
4146{
4147 remove_proc_entry("protocols", net->proc_net);
4148}
4149
4150
4151static __net_initdata struct pernet_operations proto_net_ops = {
4152 .init = proto_init_net,
4153 .exit = proto_exit_net,
4154};
4155
4156static int __init proto_init(void)
4157{
4158 return register_pernet_subsys(&proto_net_ops);
4159}
4160
4161subsys_initcall(proto_init);
4162
4163#endif /* CONFIG_PROC_FS */
4164
4165#ifdef CONFIG_NET_RX_BUSY_POLL
4166bool sk_busy_loop_end(void *p, unsigned long start_time)
4167{
4168 struct sock *sk = p;
4169
4170 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4171 return true;
4172
4173 if (sk_is_udp(sk) &&
4174 !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4175 return true;
4176
4177 return sk_busy_loop_timeout(sk, start_time);
4178}
4179EXPORT_SYMBOL(sk_busy_loop_end);
4180#endif /* CONFIG_NET_RX_BUSY_POLL */
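/*
 * Editor's illustrative sketch: sk_busy_loop_end() is the loop-termination
 * callback handed to napi_busy_loop() by the socket receive path (see
 * sk_busy_loop() in include/net/busy_poll.h).  Polling stops as soon as the
 * callback reports queued data or an expired busy-poll timeout.  The helper
 * below is a simplified, hypothetical caller; the real one also honours
 * sk_prefer_busy_poll and sk_busy_poll_budget.
 */
#if 0
static void foo_busy_poll(struct sock *sk, bool nonblock)
{
        unsigned int napi_id = READ_ONCE(sk->sk_napi_id);

        if (napi_id >= MIN_NAPI_ID)
                napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end,
                               sk, false, BUSY_POLL_BUDGET);
}
#endif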
4181
4182int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4183{
4184 if (!sk->sk_prot->bind_add)
4185 return -EOPNOTSUPP;
4186 return sk->sk_prot->bind_add(sk, addr, addr_len);
4187}
4188EXPORT_SYMBOL(sock_bind_add);
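/*
 * Editor's illustrative sketch: sock_bind_add() simply forwards to an
 * optional ->bind_add() hook, meant for protocols that can attach additional
 * local addresses to an already-bound socket (SCTP implements this hook).
 * The handler and proto below are hypothetical.
 */
#if 0
static int foo_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
{
        if (addr_len < sizeof(struct sockaddr) ||
            addr->sa_family != sk->sk_family)
                return -EINVAL;

        /* ... validate and record one more local address for this socket ... */
        return 0;
}

static struct proto foo_prot = {
        .name           = "FOO",
        .owner          = THIS_MODULE,
        .obj_size       = sizeof(struct sock),
        .bind_add       = foo_bind_add,
};
#endif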
4189
4190/* Copy 'size' bytes in from userspace, run the ioctl and copy the updated 'size' bytes back out to userspace */
4191int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4192 void __user *arg, void *karg, size_t size)
4193{
4194 int ret;
4195
4196 if (copy_from_user(karg, arg, size))
4197 return -EFAULT;
4198
4199 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4200 if (ret)
4201 return ret;
4202
4203 if (copy_to_user(arg, karg, size))
4204 return -EFAULT;
4205
4206 return 0;
4207}
4208EXPORT_SYMBOL(sock_ioctl_inout);
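/*
 * Editor's illustrative sketch: with sock_ioctl_inout() the protocol's
 * ->ioctl() callback only ever sees kernel memory (karg); copy_from_user()
 * and copy_to_user() are done once, here.  The request structure and the
 * af-level wrapper below are hypothetical and only demonstrate the calling
 * convention for an ioctl whose argument is both read and written.
 */
#if 0
struct foo_stats_req {
        __u32 id;       /* in:  object to query          */
        __u32 packets;  /* out: counter filled by kernel */
};

static int foo_af_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
        struct foo_stats_req req;

        /* Copies 'req' in, runs sk->sk_prot->ioctl() on it, copies it back. */
        return sock_ioctl_inout(sk, cmd, arg, &req, sizeof(req));
}
#endif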
4209
4210/* This is the most common ioctl prep function: the result (an int, 4 bytes)
4211 * is copied back to userspace if the ioctl handler returns successfully.
4212 * No input is copied in from userspace.
4213 */
4214static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4215{
4216 int ret, karg = 0;
4217
4218 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4219 if (ret)
4220 return ret;
4221
4222 return put_user(karg, (int __user *)arg);
4223}
4224
4225/* A wrapper around socket ioctls that copies data in from userspace
4226 * (depending on the protocol/ioctl) and copies the result back to userspace.
4227 * The main motivation is to pass kernel memory to the protocol ioctl
4228 * callbacks instead of userspace memory.
4229 */
4230int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4231{
4232 int rc = 1; /* >0: not consumed here, fall back to sock_ioctl_out() */
4233
4234 if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4235 rc = ipmr_sk_ioctl(sk, cmd, arg);
4236 else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4237 rc = ip6mr_sk_ioctl(sk, cmd, arg);
4238 else if (sk_is_phonet(sk))
4239 rc = phonet_sk_ioctl(sk, cmd, arg);
4240
4241 /* If the ioctl was handled by one of the helpers above, return its value */
4242 if (rc <= 0)
4243 return rc;
4244
4245 /* Otherwise call the default handler */
4246 return sock_ioctl_out(sk, cmd, arg);
4247}
4248EXPORT_SYMBOL(sk_ioctl);
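/*
 * Editor's illustrative sketch: a family-level ioctl handler that defers
 * SIOCINQ/SIOCOUTQ-style commands to sk_ioctl().  Commands not claimed by
 * the multicast-routing / phonet helpers above end up in sock_ioctl_out(),
 * which copies the protocol's int result to userspace.  The function name
 * below is hypothetical.
 */
#if 0
static int foo_family_ioctl(struct socket *sock, unsigned int cmd,
                            unsigned long arg)
{
        struct sock *sk = sock->sk;

        switch (cmd) {
        case SIOCINQ:
        case SIOCOUTQ:
                return sk_ioctl(sk, cmd, (void __user *)arg);
        default:
                return -ENOIOCTLCMD;
        }
}
#endif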
4249
4250static int __init sock_struct_check(void)
4251{
4252 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4253 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4254 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4255 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4256 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4257
4258 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4259 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4260 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4261 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4262 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4263 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4264 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4265 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4266 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4267
4268 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4269 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4270 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4271
4272 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4273 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4274 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4275 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4276
4277 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4279 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4280 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4281 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4282 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4283 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4284 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4285 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4286 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4287 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4288 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4289 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4290 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4291 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4292 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4293
4294 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4295 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4296 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4297 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4298 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4299 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4300 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4301 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4302 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4303 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4304 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4305 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4306 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4307 return 0;
4308}
4309
4310core_initcall(sock_struct_check);
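/*
 * Editor's illustrative sketch: CACHELINE_ASSERT_GROUP_MEMBER() pairs with
 * the __cacheline_group_begin()/__cacheline_group_end() markers placed
 * around the corresponding members of struct sock in include/net/sock.h
 * (see include/linux/cache.h).  The hypothetical struct below shows the
 * general pattern: the assertions turn an accidental move of a hot-path
 * field out of its cache-line group into a build failure.
 */
#if 0
struct foo {
        __cacheline_group_begin(foo_write_rx);
        u32     a;
        u32     b;
        __cacheline_group_end(foo_write_rx);
        u32     c;      /* intentionally outside the group */
};

static int __init foo_struct_check(void)
{
        CACHELINE_ASSERT_GROUP_MEMBER(struct foo, foo_write_rx, a);
        CACHELINE_ASSERT_GROUP_MEMBER(struct foo, foo_write_rx, b);
        return 0;
}
core_initcall(foo_struct_check);
#endif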