Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88#include <asm/unaligned.h>
89#include <linux/capability.h>
90#include <linux/errno.h>
91#include <linux/errqueue.h>
92#include <linux/types.h>
93#include <linux/socket.h>
94#include <linux/in.h>
95#include <linux/kernel.h>
96#include <linux/module.h>
97#include <linux/proc_fs.h>
98#include <linux/seq_file.h>
99#include <linux/sched.h>
100#include <linux/sched/mm.h>
101#include <linux/timer.h>
102#include <linux/string.h>
103#include <linux/sockios.h>
104#include <linux/net.h>
105#include <linux/mm.h>
106#include <linux/slab.h>
107#include <linux/interrupt.h>
108#include <linux/poll.h>
109#include <linux/tcp.h>
110#include <linux/init.h>
111#include <linux/highmem.h>
112#include <linux/user_namespace.h>
113#include <linux/static_key.h>
114#include <linux/memcontrol.h>
115#include <linux/prefetch.h>
116#include <linux/compat.h>
117
118#include <linux/uaccess.h>
119
120#include <linux/netdevice.h>
121#include <net/protocol.h>
122#include <linux/skbuff.h>
123#include <net/net_namespace.h>
124#include <net/request_sock.h>
125#include <net/sock.h>
126#include <linux/net_tstamp.h>
127#include <net/xfrm.h>
128#include <linux/ipsec.h>
129#include <net/cls_cgroup.h>
130#include <net/netprio_cgroup.h>
131#include <linux/sock_diag.h>
132
133#include <linux/filter.h>
134#include <net/sock_reuseport.h>
135#include <net/bpf_sk_storage.h>
136
137#include <trace/events/sock.h>
138
139#include <net/tcp.h>
140#include <net/busy_poll.h>
141
142#include <linux/ethtool.h>
143
144#include "dev.h"
145
146static DEFINE_MUTEX(proto_list_mutex);
147static LIST_HEAD(proto_list);
148
149static void sock_def_write_space_wfree(struct sock *sk);
150static void sock_def_write_space(struct sock *sk);
151
152/**
153 * sk_ns_capable - General socket capability test
154 * @sk: Socket to use a capability on or through
155 * @user_ns: The user namespace of the capability to use
156 * @cap: The capability to use
157 *
158 * Test to see if the opener of the socket had the capability @cap in the
159 * user namespace @user_ns when the socket was created, and that the
160 * current process has it as well.
161 */
162bool sk_ns_capable(const struct sock *sk,
163 struct user_namespace *user_ns, int cap)
164{
165 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
166 ns_capable(user_ns, cap);
167}
168EXPORT_SYMBOL(sk_ns_capable);
169
170/**
171 * sk_capable - Socket global capability test
172 * @sk: Socket to use a capability on or through
173 * @cap: The global capability to use
174 *
175 * Test to see if the opener of the socket had the capability @cap in all
176 * user namespaces when the socket was created, and that the current
177 * process has it as well.
178 */
179bool sk_capable(const struct sock *sk, int cap)
180{
181 return sk_ns_capable(sk, &init_user_ns, cap);
182}
183EXPORT_SYMBOL(sk_capable);
184
185/**
186 * sk_net_capable - Network namespace socket capability test
187 * @sk: Socket to use a capability on or through
188 * @cap: The capability to use
189 *
190 * Test to see if the opener of the socket had the capability @cap over
191 * the network namespace the socket is a member of when it was created,
192 * and that the current process has it as well.
193 */
194bool sk_net_capable(const struct sock *sk, int cap)
195{
196 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
197}
198EXPORT_SYMBOL(sk_net_capable);
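
/*
 * Illustrative sketch (not part of the upstream file): a protocol-private
 * setsockopt handler could gate a privileged option with one of the helpers
 * above, checking both the socket opener and the current task in the
 * socket's network namespace:
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 */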
199
200/*
201 * Each address family might have different locking rules, so we have
202 * one slock key per address family and separate keys for internal and
203 * userspace sockets.
204 */
205static struct lock_class_key af_family_keys[AF_MAX];
206static struct lock_class_key af_family_kern_keys[AF_MAX];
207static struct lock_class_key af_family_slock_keys[AF_MAX];
208static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
209
210/*
211 * Make lock validator output more readable. (We pre-construct these
212 * strings at build time, so that runtime initialization of socket
213 * locks is fast):
214 */
215
216#define _sock_locks(x) \
217 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
218 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
219 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
220 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
221 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
222 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
223 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
224 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
225 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
226 x "27" , x "28" , x "AF_CAN" , \
227 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
228 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
229 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
230 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
231 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
232 x "AF_MCTP" , \
233 x "AF_MAX"
234
235static const char *const af_family_key_strings[AF_MAX+1] = {
236 _sock_locks("sk_lock-")
237};
238static const char *const af_family_slock_key_strings[AF_MAX+1] = {
239 _sock_locks("slock-")
240};
241static const char *const af_family_clock_key_strings[AF_MAX+1] = {
242 _sock_locks("clock-")
243};
244
245static const char *const af_family_kern_key_strings[AF_MAX+1] = {
246 _sock_locks("k-sk_lock-")
247};
248static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
249 _sock_locks("k-slock-")
250};
251static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
252 _sock_locks("k-clock-")
253};
254static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
255 _sock_locks("rlock-")
256};
257static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
258 _sock_locks("wlock-")
259};
260static const char *const af_family_elock_key_strings[AF_MAX+1] = {
261 _sock_locks("elock-")
262};
263
264/*
265 * sk_callback_lock and sk queues locking rules are per-address-family,
266 * so split the lock classes by using a per-AF key:
267 */
268static struct lock_class_key af_callback_keys[AF_MAX];
269static struct lock_class_key af_rlock_keys[AF_MAX];
270static struct lock_class_key af_wlock_keys[AF_MAX];
271static struct lock_class_key af_elock_keys[AF_MAX];
272static struct lock_class_key af_kern_callback_keys[AF_MAX];
273
274/* Run time adjustable parameters. */
275__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
276EXPORT_SYMBOL(sysctl_wmem_max);
277__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
278EXPORT_SYMBOL(sysctl_rmem_max);
279__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
280__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
281
282/* Maximal space eaten by iovec or ancillary data plus some space */
283int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
284EXPORT_SYMBOL(sysctl_optmem_max);
285
286int sysctl_tstamp_allow_data __read_mostly = 1;
287
288DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
289EXPORT_SYMBOL_GPL(memalloc_socks_key);
290
291/**
292 * sk_set_memalloc - sets %SOCK_MEMALLOC
293 * @sk: socket to set it on
294 *
295 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
296 * It's the responsibility of the admin to adjust min_free_kbytes
297 * to meet the requirements
298 */
299void sk_set_memalloc(struct sock *sk)
300{
301 sock_set_flag(sk, SOCK_MEMALLOC);
302 sk->sk_allocation |= __GFP_MEMALLOC;
303 static_branch_inc(&memalloc_socks_key);
304}
305EXPORT_SYMBOL_GPL(sk_set_memalloc);
306
307void sk_clear_memalloc(struct sock *sk)
308{
309 sock_reset_flag(sk, SOCK_MEMALLOC);
310 sk->sk_allocation &= ~__GFP_MEMALLOC;
311 static_branch_dec(&memalloc_socks_key);
312
313 /*
314 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
315 * progress of swapping. SOCK_MEMALLOC may be cleared while
316 * it has rmem allocations due to the last swapfile being deactivated
317 * but there is a risk that the socket is unusable due to exceeding
318 * the rmem limits. Reclaim the reserves and obey rmem limits again.
319 */
320 sk_mem_reclaim(sk);
321}
322EXPORT_SYMBOL_GPL(sk_clear_memalloc);
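
/*
 * Illustrative sketch (hypothetical caller, not from this file): a subsystem
 * that carries memory-reclaim-critical traffic over a socket (e.g. swap over
 * a network block device) would bracket that use with these helpers so the
 * socket may dip into the emergency reserves:
 *
 *	sk_set_memalloc(sock->sk);
 *	... perform reclaim-critical I/O over the socket ...
 *	sk_clear_memalloc(sock->sk);
 *
 * As noted above, the admin is still expected to raise min_free_kbytes.
 */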
323
324int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
325{
326 int ret;
327 unsigned int noreclaim_flag;
328
329 /* these should have been dropped before queueing */
330 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
331
332 noreclaim_flag = memalloc_noreclaim_save();
333 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
334 tcp_v6_do_rcv,
335 tcp_v4_do_rcv,
336 sk, skb);
337 memalloc_noreclaim_restore(noreclaim_flag);
338
339 return ret;
340}
341EXPORT_SYMBOL(__sk_backlog_rcv);
342
343void sk_error_report(struct sock *sk)
344{
345 sk->sk_error_report(sk);
346
347 switch (sk->sk_family) {
348 case AF_INET:
349 fallthrough;
350 case AF_INET6:
351 trace_inet_sk_error_report(sk);
352 break;
353 default:
354 break;
355 }
356}
357EXPORT_SYMBOL(sk_error_report);
358
359int sock_get_timeout(long timeo, void *optval, bool old_timeval)
360{
361 struct __kernel_sock_timeval tv;
362
363 if (timeo == MAX_SCHEDULE_TIMEOUT) {
364 tv.tv_sec = 0;
365 tv.tv_usec = 0;
366 } else {
367 tv.tv_sec = timeo / HZ;
368 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
369 }
370
371 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
372 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
373 *(struct old_timeval32 *)optval = tv32;
374 return sizeof(tv32);
375 }
376
377 if (old_timeval) {
378 struct __kernel_old_timeval old_tv;
379 old_tv.tv_sec = tv.tv_sec;
380 old_tv.tv_usec = tv.tv_usec;
381 *(struct __kernel_old_timeval *)optval = old_tv;
382 return sizeof(old_tv);
383 }
384
385 *(struct __kernel_sock_timeval *)optval = tv;
386 return sizeof(tv);
387}
388EXPORT_SYMBOL(sock_get_timeout);
389
390int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
391 sockptr_t optval, int optlen, bool old_timeval)
392{
393 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
394 struct old_timeval32 tv32;
395
396 if (optlen < sizeof(tv32))
397 return -EINVAL;
398
399 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
400 return -EFAULT;
401 tv->tv_sec = tv32.tv_sec;
402 tv->tv_usec = tv32.tv_usec;
403 } else if (old_timeval) {
404 struct __kernel_old_timeval old_tv;
405
406 if (optlen < sizeof(old_tv))
407 return -EINVAL;
408 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
409 return -EFAULT;
410 tv->tv_sec = old_tv.tv_sec;
411 tv->tv_usec = old_tv.tv_usec;
412 } else {
413 if (optlen < sizeof(*tv))
414 return -EINVAL;
415 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
416 return -EFAULT;
417 }
418
419 return 0;
420}
421EXPORT_SYMBOL(sock_copy_user_timeval);
422
423static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
424 bool old_timeval)
425{
426 struct __kernel_sock_timeval tv;
427 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
428
429 if (err)
430 return err;
431
432 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
433 return -EDOM;
434
435 if (tv.tv_sec < 0) {
436 static int warned __read_mostly;
437
438 *timeo_p = 0;
439 if (warned < 10 && net_ratelimit()) {
440 warned++;
441 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
442 __func__, current->comm, task_pid_nr(current));
443 }
444 return 0;
445 }
446 *timeo_p = MAX_SCHEDULE_TIMEOUT;
447 if (tv.tv_sec == 0 && tv.tv_usec == 0)
448 return 0;
449 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
450 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
451 return 0;
452}
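
/*
 * Userspace view (hedged sketch, not kernel code): sock_set_timeout() is
 * what ultimately services SO_RCVTIMEO/SO_SNDTIMEO.  A zeroed timeval means
 * "block forever" and a negative tv_sec is clamped to "do not wait":
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */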
453
454static bool sock_needs_netstamp(const struct sock *sk)
455{
456 switch (sk->sk_family) {
457 case AF_UNSPEC:
458 case AF_UNIX:
459 return false;
460 default:
461 return true;
462 }
463}
464
465static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
466{
467 if (sk->sk_flags & flags) {
468 sk->sk_flags &= ~flags;
469 if (sock_needs_netstamp(sk) &&
470 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
471 net_disable_timestamp();
472 }
473}
474
475
476int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
477{
478 unsigned long flags;
479 struct sk_buff_head *list = &sk->sk_receive_queue;
480
481 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
482 atomic_inc(&sk->sk_drops);
483 trace_sock_rcvqueue_full(sk, skb);
484 return -ENOMEM;
485 }
486
487 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
488 atomic_inc(&sk->sk_drops);
489 return -ENOBUFS;
490 }
491
492 skb->dev = NULL;
493 skb_set_owner_r(skb, sk);
494
495	/* We escape from the RCU-protected region, so make sure we don't
496	 * leak a non-refcounted dst.
497 */
498 skb_dst_force(skb);
499
500 spin_lock_irqsave(&list->lock, flags);
501 sock_skb_set_dropcount(sk, skb);
502 __skb_queue_tail(list, skb);
503 spin_unlock_irqrestore(&list->lock, flags);
504
505 if (!sock_flag(sk, SOCK_DEAD))
506 sk->sk_data_ready(sk);
507 return 0;
508}
509EXPORT_SYMBOL(__sock_queue_rcv_skb);
510
511int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
512 enum skb_drop_reason *reason)
513{
514 enum skb_drop_reason drop_reason;
515 int err;
516
517 err = sk_filter(sk, skb);
518 if (err) {
519 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
520 goto out;
521 }
522 err = __sock_queue_rcv_skb(sk, skb);
523 switch (err) {
524 case -ENOMEM:
525 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
526 break;
527 case -ENOBUFS:
528 drop_reason = SKB_DROP_REASON_PROTO_MEM;
529 break;
530 default:
531 drop_reason = SKB_NOT_DROPPED_YET;
532 break;
533 }
534out:
535 if (reason)
536 *reason = drop_reason;
537 return err;
538}
539EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
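
/*
 * Illustrative sketch (hypothetical protocol receive path): a datagram
 * protocol typically hands an skb to the owning socket like this, freeing it
 * itself on failure and recording why it was dropped:
 *
 *	enum skb_drop_reason reason;
 *
 *	if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0)
 *		kfree_skb_reason(skb, reason);
 */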
540
541int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
542 const int nested, unsigned int trim_cap, bool refcounted)
543{
544 int rc = NET_RX_SUCCESS;
545
546 if (sk_filter_trim_cap(sk, skb, trim_cap))
547 goto discard_and_relse;
548
549 skb->dev = NULL;
550
551 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
552 atomic_inc(&sk->sk_drops);
553 goto discard_and_relse;
554 }
555 if (nested)
556 bh_lock_sock_nested(sk);
557 else
558 bh_lock_sock(sk);
559 if (!sock_owned_by_user(sk)) {
560 /*
561 * trylock + unlock semantics:
562 */
563 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
564
565 rc = sk_backlog_rcv(sk, skb);
566
567 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
568 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
569 bh_unlock_sock(sk);
570 atomic_inc(&sk->sk_drops);
571 goto discard_and_relse;
572 }
573
574 bh_unlock_sock(sk);
575out:
576 if (refcounted)
577 sock_put(sk);
578 return rc;
579discard_and_relse:
580 kfree_skb(skb);
581 goto out;
582}
583EXPORT_SYMBOL(__sk_receive_skb);
584
585INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
586 u32));
587INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
588 u32));
589struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
590{
591 struct dst_entry *dst = __sk_dst_get(sk);
592
593 if (dst && dst->obsolete &&
594 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
595 dst, cookie) == NULL) {
596 sk_tx_queue_clear(sk);
597 sk->sk_dst_pending_confirm = 0;
598 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
599 dst_release(dst);
600 return NULL;
601 }
602
603 return dst;
604}
605EXPORT_SYMBOL(__sk_dst_check);
606
607struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
608{
609 struct dst_entry *dst = sk_dst_get(sk);
610
611 if (dst && dst->obsolete &&
612 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
613 dst, cookie) == NULL) {
614 sk_dst_reset(sk);
615 dst_release(dst);
616 return NULL;
617 }
618
619 return dst;
620}
621EXPORT_SYMBOL(sk_dst_check);
622
623static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
624{
625 int ret = -ENOPROTOOPT;
626#ifdef CONFIG_NETDEVICES
627 struct net *net = sock_net(sk);
628
629 /* Sorry... */
630 ret = -EPERM;
631 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
632 goto out;
633
634 ret = -EINVAL;
635 if (ifindex < 0)
636 goto out;
637
638 /* Paired with all READ_ONCE() done locklessly. */
639 WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
640
641 if (sk->sk_prot->rehash)
642 sk->sk_prot->rehash(sk);
643 sk_dst_reset(sk);
644
645 ret = 0;
646
647out:
648#endif
649
650 return ret;
651}
652
653int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
654{
655 int ret;
656
657 if (lock_sk)
658 lock_sock(sk);
659 ret = sock_bindtoindex_locked(sk, ifindex);
660 if (lock_sk)
661 release_sock(sk);
662
663 return ret;
664}
665EXPORT_SYMBOL(sock_bindtoindex);
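
/*
 * Illustrative sketch (hypothetical kernel caller): in-kernel socket users
 * (tunnels, RPC transports, ...) can pin a socket to a device by ifindex;
 * passing lock_sk=true lets the helper take the socket lock itself:
 *
 *	err = sock_bindtoindex(sock->sk, ifindex, true);
 *	if (err)
 *		goto out_release;
 */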
666
667static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
668{
669 int ret = -ENOPROTOOPT;
670#ifdef CONFIG_NETDEVICES
671 struct net *net = sock_net(sk);
672 char devname[IFNAMSIZ];
673 int index;
674
675 ret = -EINVAL;
676 if (optlen < 0)
677 goto out;
678
679 /* Bind this socket to a particular device like "eth0",
680 * as specified in the passed interface name. If the
681 * name is "" or the option length is zero the socket
682 * is not bound.
683 */
684 if (optlen > IFNAMSIZ - 1)
685 optlen = IFNAMSIZ - 1;
686 memset(devname, 0, sizeof(devname));
687
688 ret = -EFAULT;
689 if (copy_from_sockptr(devname, optval, optlen))
690 goto out;
691
692 index = 0;
693 if (devname[0] != '\0') {
694 struct net_device *dev;
695
696 rcu_read_lock();
697 dev = dev_get_by_name_rcu(net, devname);
698 if (dev)
699 index = dev->ifindex;
700 rcu_read_unlock();
701 ret = -ENODEV;
702 if (!dev)
703 goto out;
704 }
705
706 sockopt_lock_sock(sk);
707 ret = sock_bindtoindex_locked(sk, index);
708 sockopt_release_sock(sk);
709out:
710#endif
711
712 return ret;
713}
714
715static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
716 sockptr_t optlen, int len)
717{
718 int ret = -ENOPROTOOPT;
719#ifdef CONFIG_NETDEVICES
720 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
721 struct net *net = sock_net(sk);
722 char devname[IFNAMSIZ];
723
724 if (bound_dev_if == 0) {
725 len = 0;
726 goto zero;
727 }
728
729 ret = -EINVAL;
730 if (len < IFNAMSIZ)
731 goto out;
732
733 ret = netdev_get_name(net, devname, bound_dev_if);
734 if (ret)
735 goto out;
736
737 len = strlen(devname) + 1;
738
739 ret = -EFAULT;
740 if (copy_to_sockptr(optval, devname, len))
741 goto out;
742
743zero:
744 ret = -EFAULT;
745 if (copy_to_sockptr(optlen, &len, sizeof(int)))
746 goto out;
747
748 ret = 0;
749
750out:
751#endif
752
753 return ret;
754}
755
756bool sk_mc_loop(struct sock *sk)
757{
758 if (dev_recursion_level())
759 return false;
760 if (!sk)
761 return true;
762 switch (sk->sk_family) {
763 case AF_INET:
764 return inet_sk(sk)->mc_loop;
765#if IS_ENABLED(CONFIG_IPV6)
766 case AF_INET6:
767 return inet6_sk(sk)->mc_loop;
768#endif
769 }
770 WARN_ON_ONCE(1);
771 return true;
772}
773EXPORT_SYMBOL(sk_mc_loop);
774
775void sock_set_reuseaddr(struct sock *sk)
776{
777 lock_sock(sk);
778 sk->sk_reuse = SK_CAN_REUSE;
779 release_sock(sk);
780}
781EXPORT_SYMBOL(sock_set_reuseaddr);
782
783void sock_set_reuseport(struct sock *sk)
784{
785 lock_sock(sk);
786 sk->sk_reuseport = true;
787 release_sock(sk);
788}
789EXPORT_SYMBOL(sock_set_reuseport);
790
791void sock_no_linger(struct sock *sk)
792{
793 lock_sock(sk);
794 sk->sk_lingertime = 0;
795 sock_set_flag(sk, SOCK_LINGER);
796 release_sock(sk);
797}
798EXPORT_SYMBOL(sock_no_linger);
799
800void sock_set_priority(struct sock *sk, u32 priority)
801{
802 lock_sock(sk);
803 sk->sk_priority = priority;
804 release_sock(sk);
805}
806EXPORT_SYMBOL(sock_set_priority);
807
808void sock_set_sndtimeo(struct sock *sk, s64 secs)
809{
810 lock_sock(sk);
811 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
812 sk->sk_sndtimeo = secs * HZ;
813 else
814 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
815 release_sock(sk);
816}
817EXPORT_SYMBOL(sock_set_sndtimeo);
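
/*
 * Illustrative sketch (hypothetical kernel user): these small helpers replace
 * open-coded kernel_setsockopt() calls.  An in-kernel TCP client might set up
 * its socket roughly like this (the timeout argument is in seconds):
 *
 *	sock_set_reuseaddr(sock->sk);
 *	sock_no_linger(sock->sk);
 *	sock_set_sndtimeo(sock->sk, 15);
 */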
818
819static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
820{
821 if (val) {
822 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
823 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
824 sock_set_flag(sk, SOCK_RCVTSTAMP);
825 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
826 } else {
827 sock_reset_flag(sk, SOCK_RCVTSTAMP);
828 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
829 }
830}
831
832void sock_enable_timestamps(struct sock *sk)
833{
834 lock_sock(sk);
835 __sock_set_timestamps(sk, true, false, true);
836 release_sock(sk);
837}
838EXPORT_SYMBOL(sock_enable_timestamps);
839
840void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
841{
842 switch (optname) {
843 case SO_TIMESTAMP_OLD:
844 __sock_set_timestamps(sk, valbool, false, false);
845 break;
846 case SO_TIMESTAMP_NEW:
847 __sock_set_timestamps(sk, valbool, true, false);
848 break;
849 case SO_TIMESTAMPNS_OLD:
850 __sock_set_timestamps(sk, valbool, false, true);
851 break;
852 case SO_TIMESTAMPNS_NEW:
853 __sock_set_timestamps(sk, valbool, true, true);
854 break;
855 }
856}
857
858static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
859{
860 struct net *net = sock_net(sk);
861 struct net_device *dev = NULL;
862 bool match = false;
863 int *vclock_index;
864 int i, num;
865
866 if (sk->sk_bound_dev_if)
867 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
868
869 if (!dev) {
870		pr_err("%s: socket is not bound to a device\n", __func__);
871 return -EOPNOTSUPP;
872 }
873
874 num = ethtool_get_phc_vclocks(dev, &vclock_index);
875 dev_put(dev);
876
877 for (i = 0; i < num; i++) {
878 if (*(vclock_index + i) == phc_index) {
879 match = true;
880 break;
881 }
882 }
883
884 if (num > 0)
885 kfree(vclock_index);
886
887 if (!match)
888 return -EINVAL;
889
890 sk->sk_bind_phc = phc_index;
891
892 return 0;
893}
894
895int sock_set_timestamping(struct sock *sk, int optname,
896 struct so_timestamping timestamping)
897{
898 int val = timestamping.flags;
899 int ret;
900
901 if (val & ~SOF_TIMESTAMPING_MASK)
902 return -EINVAL;
903
904 if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
905 !(val & SOF_TIMESTAMPING_OPT_ID))
906 return -EINVAL;
907
908 if (val & SOF_TIMESTAMPING_OPT_ID &&
909 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
910 if (sk_is_tcp(sk)) {
911 if ((1 << sk->sk_state) &
912 (TCPF_CLOSE | TCPF_LISTEN))
913 return -EINVAL;
914 if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
915 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
916 else
917 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
918 } else {
919 atomic_set(&sk->sk_tskey, 0);
920 }
921 }
922
923 if (val & SOF_TIMESTAMPING_OPT_STATS &&
924 !(val & SOF_TIMESTAMPING_OPT_TSONLY))
925 return -EINVAL;
926
927 if (val & SOF_TIMESTAMPING_BIND_PHC) {
928 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
929 if (ret)
930 return ret;
931 }
932
933 sk->sk_tsflags = val;
934 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
935
936 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
937 sock_enable_timestamp(sk,
938 SOCK_TIMESTAMPING_RX_SOFTWARE);
939 else
940 sock_disable_timestamp(sk,
941 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
942 return 0;
943}
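
/*
 * Userspace view (hedged sketch): SO_TIMESTAMPING takes either a plain int of
 * SOF_TIMESTAMPING_* flags or a struct so_timestamping when binding to a PHC
 * vclock (the vclock index 1 below is just an example, and the socket must
 * already be bound to the device that owns the clock):
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_RX_HARDWARE |
 *			 SOF_TIMESTAMPING_RAW_HARDWARE |
 *			 SOF_TIMESTAMPING_BIND_PHC,
 *		.bind_phc = 1,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 */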
944
945void sock_set_keepalive(struct sock *sk)
946{
947 lock_sock(sk);
948 if (sk->sk_prot->keepalive)
949 sk->sk_prot->keepalive(sk, true);
950 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
951 release_sock(sk);
952}
953EXPORT_SYMBOL(sock_set_keepalive);
954
955static void __sock_set_rcvbuf(struct sock *sk, int val)
956{
957 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
958 * as a negative value.
959 */
960 val = min_t(int, val, INT_MAX / 2);
961 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
962
963 /* We double it on the way in to account for "struct sk_buff" etc.
964 * overhead. Applications assume that the SO_RCVBUF setting they make
965 * will allow that much actual data to be received on that socket.
966 *
967 * Applications are unaware that "struct sk_buff" and other overheads
968 * allocate from the receive buffer during socket buffer allocation.
969 *
970 * And after considering the possible alternatives, returning the value
971 * we actually used in getsockopt is the most desirable behavior.
972 */
973 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
974}
975
976void sock_set_rcvbuf(struct sock *sk, int val)
977{
978 lock_sock(sk);
979 __sock_set_rcvbuf(sk, val);
980 release_sock(sk);
981}
982EXPORT_SYMBOL(sock_set_rcvbuf);
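
/*
 * Userspace view (hedged sketch): because the requested size is doubled on
 * the way in to cover struct sk_buff overhead, reading the option back
 * returns roughly twice what was asked for (subject to sysctl_rmem_max):
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 */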
983
984static void __sock_set_mark(struct sock *sk, u32 val)
985{
986 if (val != sk->sk_mark) {
987 sk->sk_mark = val;
988 sk_dst_reset(sk);
989 }
990}
991
992void sock_set_mark(struct sock *sk, u32 val)
993{
994 lock_sock(sk);
995 __sock_set_mark(sk, val);
996 release_sock(sk);
997}
998EXPORT_SYMBOL(sock_set_mark);
999
1000static void sock_release_reserved_memory(struct sock *sk, int bytes)
1001{
1002 /* Round down bytes to multiple of pages */
1003 bytes = round_down(bytes, PAGE_SIZE);
1004
1005 WARN_ON(bytes > sk->sk_reserved_mem);
1006 sk->sk_reserved_mem -= bytes;
1007 sk_mem_reclaim(sk);
1008}
1009
1010static int sock_reserve_memory(struct sock *sk, int bytes)
1011{
1012 long allocated;
1013 bool charged;
1014 int pages;
1015
1016 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1017 return -EOPNOTSUPP;
1018
1019 if (!bytes)
1020 return 0;
1021
1022 pages = sk_mem_pages(bytes);
1023
1024 /* pre-charge to memcg */
1025 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1026 GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1027 if (!charged)
1028 return -ENOMEM;
1029
1030 /* pre-charge to forward_alloc */
1031 sk_memory_allocated_add(sk, pages);
1032 allocated = sk_memory_allocated(sk);
1033 /* If the system goes into memory pressure with this
1034 * precharge, give up and return error.
1035 */
1036 if (allocated > sk_prot_mem_limits(sk, 1)) {
1037 sk_memory_allocated_sub(sk, pages);
1038 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1039 return -ENOMEM;
1040 }
1041 sk->sk_forward_alloc += pages << PAGE_SHIFT;
1042
1043 sk->sk_reserved_mem += pages << PAGE_SHIFT;
1044
1045 return 0;
1046}
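
/*
 * Userspace view (hedged sketch): SO_RESERVE_MEM pre-charges socket memory,
 * accounted in whole pages, against the socket's memory cgroup; it only works
 * with memcg accounting enabled, and later calls adjust the reservation
 * relative to the current one:
 *
 *	int bytes = 1 << 20;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RESERVE_MEM, &bytes, sizeof(bytes));
 */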
1047
1048void sockopt_lock_sock(struct sock *sk)
1049{
1050 /* When current->bpf_ctx is set, the setsockopt is called from
1051 * a bpf prog. bpf has ensured the sk lock has been
1052 * acquired before calling setsockopt().
1053 */
1054 if (has_current_bpf_ctx())
1055 return;
1056
1057 lock_sock(sk);
1058}
1059EXPORT_SYMBOL(sockopt_lock_sock);
1060
1061void sockopt_release_sock(struct sock *sk)
1062{
1063 if (has_current_bpf_ctx())
1064 return;
1065
1066 release_sock(sk);
1067}
1068EXPORT_SYMBOL(sockopt_release_sock);
1069
1070bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1071{
1072 return has_current_bpf_ctx() || ns_capable(ns, cap);
1073}
1074EXPORT_SYMBOL(sockopt_ns_capable);
1075
1076bool sockopt_capable(int cap)
1077{
1078 return has_current_bpf_ctx() || capable(cap);
1079}
1080EXPORT_SYMBOL(sockopt_capable);
1081
1082/*
1083 * This is meant for all protocols to use and covers goings on
1084 * at the socket level. Everything here is generic.
1085 */
1086
1087int sk_setsockopt(struct sock *sk, int level, int optname,
1088 sockptr_t optval, unsigned int optlen)
1089{
1090 struct so_timestamping timestamping;
1091 struct socket *sock = sk->sk_socket;
1092 struct sock_txtime sk_txtime;
1093 int val;
1094 int valbool;
1095 struct linger ling;
1096 int ret = 0;
1097
1098 /*
1099 * Options without arguments
1100 */
1101
1102 if (optname == SO_BINDTODEVICE)
1103 return sock_setbindtodevice(sk, optval, optlen);
1104
1105 if (optlen < sizeof(int))
1106 return -EINVAL;
1107
1108 if (copy_from_sockptr(&val, optval, sizeof(val)))
1109 return -EFAULT;
1110
1111 valbool = val ? 1 : 0;
1112
1113 sockopt_lock_sock(sk);
1114
1115 switch (optname) {
1116 case SO_DEBUG:
1117 if (val && !sockopt_capable(CAP_NET_ADMIN))
1118 ret = -EACCES;
1119 else
1120 sock_valbool_flag(sk, SOCK_DBG, valbool);
1121 break;
1122 case SO_REUSEADDR:
1123 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1124 break;
1125 case SO_REUSEPORT:
1126 sk->sk_reuseport = valbool;
1127 break;
1128 case SO_TYPE:
1129 case SO_PROTOCOL:
1130 case SO_DOMAIN:
1131 case SO_ERROR:
1132 ret = -ENOPROTOOPT;
1133 break;
1134 case SO_DONTROUTE:
1135 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1136 sk_dst_reset(sk);
1137 break;
1138 case SO_BROADCAST:
1139 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1140 break;
1141 case SO_SNDBUF:
1142		/* Don't error on this; BSD doesn't, and if you think
1143		 * about it, this is right. Otherwise apps have to
1144		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1145		 * are treated in BSD as hints.
1146		 */
1147 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1148set_sndbuf:
1149 /* Ensure val * 2 fits into an int, to prevent max_t()
1150 * from treating it as a negative value.
1151 */
1152 val = min_t(int, val, INT_MAX / 2);
1153 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1154 WRITE_ONCE(sk->sk_sndbuf,
1155 max_t(int, val * 2, SOCK_MIN_SNDBUF));
1156 /* Wake up sending tasks if we upped the value. */
1157 sk->sk_write_space(sk);
1158 break;
1159
1160 case SO_SNDBUFFORCE:
1161 if (!sockopt_capable(CAP_NET_ADMIN)) {
1162 ret = -EPERM;
1163 break;
1164 }
1165
1166 /* No negative values (to prevent underflow, as val will be
1167 * multiplied by 2).
1168 */
1169 if (val < 0)
1170 val = 0;
1171 goto set_sndbuf;
1172
1173 case SO_RCVBUF:
1174		/* Don't error on this; BSD doesn't, and if you think
1175		 * about it, this is right. Otherwise apps have to
1176		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1177		 * are treated in BSD as hints.
1178		 */
1179 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1180 break;
1181
1182 case SO_RCVBUFFORCE:
1183 if (!sockopt_capable(CAP_NET_ADMIN)) {
1184 ret = -EPERM;
1185 break;
1186 }
1187
1188 /* No negative values (to prevent underflow, as val will be
1189 * multiplied by 2).
1190 */
1191 __sock_set_rcvbuf(sk, max(val, 0));
1192 break;
1193
1194 case SO_KEEPALIVE:
1195 if (sk->sk_prot->keepalive)
1196 sk->sk_prot->keepalive(sk, valbool);
1197 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1198 break;
1199
1200 case SO_OOBINLINE:
1201 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1202 break;
1203
1204 case SO_NO_CHECK:
1205 sk->sk_no_check_tx = valbool;
1206 break;
1207
1208 case SO_PRIORITY:
1209 if ((val >= 0 && val <= 6) ||
1210 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1211 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1212 sk->sk_priority = val;
1213 else
1214 ret = -EPERM;
1215 break;
1216
1217 case SO_LINGER:
1218 if (optlen < sizeof(ling)) {
1219 ret = -EINVAL; /* 1003.1g */
1220 break;
1221 }
1222 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1223 ret = -EFAULT;
1224 break;
1225 }
1226 if (!ling.l_onoff)
1227 sock_reset_flag(sk, SOCK_LINGER);
1228 else {
1229#if (BITS_PER_LONG == 32)
1230 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1231 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1232 else
1233#endif
1234 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1235 sock_set_flag(sk, SOCK_LINGER);
1236 }
1237 break;
1238
1239 case SO_BSDCOMPAT:
1240 break;
1241
1242 case SO_PASSCRED:
1243 if (valbool)
1244 set_bit(SOCK_PASSCRED, &sock->flags);
1245 else
1246 clear_bit(SOCK_PASSCRED, &sock->flags);
1247 break;
1248
1249 case SO_TIMESTAMP_OLD:
1250 case SO_TIMESTAMP_NEW:
1251 case SO_TIMESTAMPNS_OLD:
1252 case SO_TIMESTAMPNS_NEW:
1253 sock_set_timestamp(sk, optname, valbool);
1254 break;
1255
1256 case SO_TIMESTAMPING_NEW:
1257 case SO_TIMESTAMPING_OLD:
1258 if (optlen == sizeof(timestamping)) {
1259			if (copy_from_sockptr(&timestamping, optval,
1260 sizeof(timestamping))) {
1261 ret = -EFAULT;
1262 break;
1263 }
1264 } else {
1265			memset(&timestamping, 0, sizeof(timestamping));
1266 timestamping.flags = val;
1267 }
1268 ret = sock_set_timestamping(sk, optname, timestamping);
1269 break;
1270
1271 case SO_RCVLOWAT:
1272 if (val < 0)
1273 val = INT_MAX;
1274 if (sock && sock->ops->set_rcvlowat)
1275 ret = sock->ops->set_rcvlowat(sk, val);
1276 else
1277 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1278 break;
1279
1280 case SO_RCVTIMEO_OLD:
1281 case SO_RCVTIMEO_NEW:
1282 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1283 optlen, optname == SO_RCVTIMEO_OLD);
1284 break;
1285
1286 case SO_SNDTIMEO_OLD:
1287 case SO_SNDTIMEO_NEW:
1288 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1289 optlen, optname == SO_SNDTIMEO_OLD);
1290 break;
1291
1292 case SO_ATTACH_FILTER: {
1293 struct sock_fprog fprog;
1294
1295 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1296 if (!ret)
1297 ret = sk_attach_filter(&fprog, sk);
1298 break;
1299 }
1300 case SO_ATTACH_BPF:
1301 ret = -EINVAL;
1302 if (optlen == sizeof(u32)) {
1303 u32 ufd;
1304
1305 ret = -EFAULT;
1306 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1307 break;
1308
1309 ret = sk_attach_bpf(ufd, sk);
1310 }
1311 break;
1312
1313 case SO_ATTACH_REUSEPORT_CBPF: {
1314 struct sock_fprog fprog;
1315
1316 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1317 if (!ret)
1318 ret = sk_reuseport_attach_filter(&fprog, sk);
1319 break;
1320 }
1321 case SO_ATTACH_REUSEPORT_EBPF:
1322 ret = -EINVAL;
1323 if (optlen == sizeof(u32)) {
1324 u32 ufd;
1325
1326 ret = -EFAULT;
1327 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1328 break;
1329
1330 ret = sk_reuseport_attach_bpf(ufd, sk);
1331 }
1332 break;
1333
1334 case SO_DETACH_REUSEPORT_BPF:
1335 ret = reuseport_detach_prog(sk);
1336 break;
1337
1338 case SO_DETACH_FILTER:
1339 ret = sk_detach_filter(sk);
1340 break;
1341
1342 case SO_LOCK_FILTER:
1343 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1344 ret = -EPERM;
1345 else
1346 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1347 break;
1348
1349 case SO_PASSSEC:
1350 if (valbool)
1351 set_bit(SOCK_PASSSEC, &sock->flags);
1352 else
1353 clear_bit(SOCK_PASSSEC, &sock->flags);
1354 break;
1355 case SO_MARK:
1356 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1357 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1358 ret = -EPERM;
1359 break;
1360 }
1361
1362 __sock_set_mark(sk, val);
1363 break;
1364 case SO_RCVMARK:
1365 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1366 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1367 ret = -EPERM;
1368 break;
1369 }
1370
1371 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1372 break;
1373
1374 case SO_RXQ_OVFL:
1375 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1376 break;
1377
1378 case SO_WIFI_STATUS:
1379 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1380 break;
1381
1382 case SO_PEEK_OFF:
1383 if (sock->ops->set_peek_off)
1384 ret = sock->ops->set_peek_off(sk, val);
1385 else
1386 ret = -EOPNOTSUPP;
1387 break;
1388
1389 case SO_NOFCS:
1390 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1391 break;
1392
1393 case SO_SELECT_ERR_QUEUE:
1394 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1395 break;
1396
1397#ifdef CONFIG_NET_RX_BUSY_POLL
1398 case SO_BUSY_POLL:
1399 if (val < 0)
1400 ret = -EINVAL;
1401 else
1402 WRITE_ONCE(sk->sk_ll_usec, val);
1403 break;
1404 case SO_PREFER_BUSY_POLL:
1405 if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1406 ret = -EPERM;
1407 else
1408 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1409 break;
1410 case SO_BUSY_POLL_BUDGET:
1411 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
1412 ret = -EPERM;
1413 } else {
1414 if (val < 0 || val > U16_MAX)
1415 ret = -EINVAL;
1416 else
1417 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1418 }
1419 break;
1420#endif
1421
1422 case SO_MAX_PACING_RATE:
1423 {
1424 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1425
1426 if (sizeof(ulval) != sizeof(val) &&
1427 optlen >= sizeof(ulval) &&
1428 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1429 ret = -EFAULT;
1430 break;
1431 }
1432 if (ulval != ~0UL)
1433 cmpxchg(&sk->sk_pacing_status,
1434 SK_PACING_NONE,
1435 SK_PACING_NEEDED);
1436 sk->sk_max_pacing_rate = ulval;
1437 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1438 break;
1439 }
1440 case SO_INCOMING_CPU:
1441 reuseport_update_incoming_cpu(sk, val);
1442 break;
1443
1444 case SO_CNX_ADVICE:
1445 if (val == 1)
1446 dst_negative_advice(sk);
1447 break;
1448
1449 case SO_ZEROCOPY:
1450 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1451 if (!(sk_is_tcp(sk) ||
1452 (sk->sk_type == SOCK_DGRAM &&
1453 sk->sk_protocol == IPPROTO_UDP)))
1454 ret = -EOPNOTSUPP;
1455 } else if (sk->sk_family != PF_RDS) {
1456 ret = -EOPNOTSUPP;
1457 }
1458 if (!ret) {
1459 if (val < 0 || val > 1)
1460 ret = -EINVAL;
1461 else
1462 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1463 }
1464 break;
1465
1466 case SO_TXTIME:
1467 if (optlen != sizeof(struct sock_txtime)) {
1468 ret = -EINVAL;
1469 break;
1470 } else if (copy_from_sockptr(&sk_txtime, optval,
1471 sizeof(struct sock_txtime))) {
1472 ret = -EFAULT;
1473 break;
1474 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1475 ret = -EINVAL;
1476 break;
1477 }
1478 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1479		 * scheduler has enough safeguards.
1480 */
1481 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1482 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1483 ret = -EPERM;
1484 break;
1485 }
1486 sock_valbool_flag(sk, SOCK_TXTIME, true);
1487 sk->sk_clockid = sk_txtime.clockid;
1488 sk->sk_txtime_deadline_mode =
1489 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1490 sk->sk_txtime_report_errors =
1491 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1492 break;
1493
1494 case SO_BINDTOIFINDEX:
1495 ret = sock_bindtoindex_locked(sk, val);
1496 break;
1497
1498 case SO_BUF_LOCK:
1499 if (val & ~SOCK_BUF_LOCK_MASK) {
1500 ret = -EINVAL;
1501 break;
1502 }
1503 sk->sk_userlocks = val | (sk->sk_userlocks &
1504 ~SOCK_BUF_LOCK_MASK);
1505 break;
1506
1507 case SO_RESERVE_MEM:
1508 {
1509 int delta;
1510
1511 if (val < 0) {
1512 ret = -EINVAL;
1513 break;
1514 }
1515
1516 delta = val - sk->sk_reserved_mem;
1517 if (delta < 0)
1518 sock_release_reserved_memory(sk, -delta);
1519 else
1520 ret = sock_reserve_memory(sk, delta);
1521 break;
1522 }
1523
1524 case SO_TXREHASH:
1525 if (val < -1 || val > 1) {
1526 ret = -EINVAL;
1527 break;
1528 }
1529 if ((u8)val == SOCK_TXREHASH_DEFAULT)
1530 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1531 /* Paired with READ_ONCE() in tcp_rtx_synack() */
1532 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1533 break;
1534
1535 default:
1536 ret = -ENOPROTOOPT;
1537 break;
1538 }
1539 sockopt_release_sock(sk);
1540 return ret;
1541}
1542
1543int sock_setsockopt(struct socket *sock, int level, int optname,
1544 sockptr_t optval, unsigned int optlen)
1545{
1546 return sk_setsockopt(sock->sk, level, optname,
1547 optval, optlen);
1548}
1549EXPORT_SYMBOL(sock_setsockopt);
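
/*
 * Userspace view (hedged sketch): most integer-valued options above follow
 * the same pattern, e.g. pinning a socket to a device by ifindex ("eth0" is
 * only an example name; re-binding an already bound socket needs
 * CAP_NET_RAW):
 *
 *	int ifindex = if_nametoindex("eth0");
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTOIFINDEX,
 *		       &ifindex, sizeof(ifindex)) < 0)
 *		perror("SO_BINDTOIFINDEX");
 */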
1550
1551static const struct cred *sk_get_peer_cred(struct sock *sk)
1552{
1553 const struct cred *cred;
1554
1555 spin_lock(&sk->sk_peer_lock);
1556 cred = get_cred(sk->sk_peer_cred);
1557 spin_unlock(&sk->sk_peer_lock);
1558
1559 return cred;
1560}
1561
1562static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1563 struct ucred *ucred)
1564{
1565 ucred->pid = pid_vnr(pid);
1566 ucred->uid = ucred->gid = -1;
1567 if (cred) {
1568 struct user_namespace *current_ns = current_user_ns();
1569
1570 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1571 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1572 }
1573}
1574
1575static int groups_to_user(sockptr_t dst, const struct group_info *src)
1576{
1577 struct user_namespace *user_ns = current_user_ns();
1578 int i;
1579
1580 for (i = 0; i < src->ngroups; i++) {
1581 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1582
1583 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1584 return -EFAULT;
1585 }
1586
1587 return 0;
1588}
1589
1590int sk_getsockopt(struct sock *sk, int level, int optname,
1591 sockptr_t optval, sockptr_t optlen)
1592{
1593 struct socket *sock = sk->sk_socket;
1594
1595 union {
1596 int val;
1597 u64 val64;
1598 unsigned long ulval;
1599 struct linger ling;
1600 struct old_timeval32 tm32;
1601 struct __kernel_old_timeval tm;
1602 struct __kernel_sock_timeval stm;
1603 struct sock_txtime txtime;
1604 struct so_timestamping timestamping;
1605 } v;
1606
1607 int lv = sizeof(int);
1608 int len;
1609
1610 if (copy_from_sockptr(&len, optlen, sizeof(int)))
1611 return -EFAULT;
1612 if (len < 0)
1613 return -EINVAL;
1614
1615 memset(&v, 0, sizeof(v));
1616
1617 switch (optname) {
1618 case SO_DEBUG:
1619 v.val = sock_flag(sk, SOCK_DBG);
1620 break;
1621
1622 case SO_DONTROUTE:
1623 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1624 break;
1625
1626 case SO_BROADCAST:
1627 v.val = sock_flag(sk, SOCK_BROADCAST);
1628 break;
1629
1630 case SO_SNDBUF:
1631 v.val = sk->sk_sndbuf;
1632 break;
1633
1634 case SO_RCVBUF:
1635 v.val = sk->sk_rcvbuf;
1636 break;
1637
1638 case SO_REUSEADDR:
1639 v.val = sk->sk_reuse;
1640 break;
1641
1642 case SO_REUSEPORT:
1643 v.val = sk->sk_reuseport;
1644 break;
1645
1646 case SO_KEEPALIVE:
1647 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1648 break;
1649
1650 case SO_TYPE:
1651 v.val = sk->sk_type;
1652 break;
1653
1654 case SO_PROTOCOL:
1655 v.val = sk->sk_protocol;
1656 break;
1657
1658 case SO_DOMAIN:
1659 v.val = sk->sk_family;
1660 break;
1661
1662 case SO_ERROR:
1663 v.val = -sock_error(sk);
1664 if (v.val == 0)
1665 v.val = xchg(&sk->sk_err_soft, 0);
1666 break;
1667
1668 case SO_OOBINLINE:
1669 v.val = sock_flag(sk, SOCK_URGINLINE);
1670 break;
1671
1672 case SO_NO_CHECK:
1673 v.val = sk->sk_no_check_tx;
1674 break;
1675
1676 case SO_PRIORITY:
1677 v.val = sk->sk_priority;
1678 break;
1679
1680 case SO_LINGER:
1681 lv = sizeof(v.ling);
1682 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1683 v.ling.l_linger = sk->sk_lingertime / HZ;
1684 break;
1685
1686 case SO_BSDCOMPAT:
1687 break;
1688
1689 case SO_TIMESTAMP_OLD:
1690 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1691 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1692 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1693 break;
1694
1695 case SO_TIMESTAMPNS_OLD:
1696 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1697 break;
1698
1699 case SO_TIMESTAMP_NEW:
1700 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1701 break;
1702
1703 case SO_TIMESTAMPNS_NEW:
1704 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1705 break;
1706
1707 case SO_TIMESTAMPING_OLD:
1708 lv = sizeof(v.timestamping);
1709 v.timestamping.flags = sk->sk_tsflags;
1710 v.timestamping.bind_phc = sk->sk_bind_phc;
1711 break;
1712
1713 case SO_RCVTIMEO_OLD:
1714 case SO_RCVTIMEO_NEW:
1715 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1716 break;
1717
1718 case SO_SNDTIMEO_OLD:
1719 case SO_SNDTIMEO_NEW:
1720 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1721 break;
1722
1723 case SO_RCVLOWAT:
1724 v.val = sk->sk_rcvlowat;
1725 break;
1726
1727 case SO_SNDLOWAT:
1728 v.val = 1;
1729 break;
1730
1731 case SO_PASSCRED:
1732 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1733 break;
1734
1735 case SO_PEERCRED:
1736 {
1737 struct ucred peercred;
1738 if (len > sizeof(peercred))
1739 len = sizeof(peercred);
1740
1741 spin_lock(&sk->sk_peer_lock);
1742 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1743 spin_unlock(&sk->sk_peer_lock);
1744
1745 if (copy_to_sockptr(optval, &peercred, len))
1746 return -EFAULT;
1747 goto lenout;
1748 }
1749
1750 case SO_PEERGROUPS:
1751 {
1752 const struct cred *cred;
1753 int ret, n;
1754
1755 cred = sk_get_peer_cred(sk);
1756 if (!cred)
1757 return -ENODATA;
1758
1759 n = cred->group_info->ngroups;
1760 if (len < n * sizeof(gid_t)) {
1761 len = n * sizeof(gid_t);
1762 put_cred(cred);
1763 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1764 }
1765 len = n * sizeof(gid_t);
1766
1767 ret = groups_to_user(optval, cred->group_info);
1768 put_cred(cred);
1769 if (ret)
1770 return ret;
1771 goto lenout;
1772 }
1773
1774 case SO_PEERNAME:
1775 {
1776 char address[128];
1777
1778 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1779 if (lv < 0)
1780 return -ENOTCONN;
1781 if (lv < len)
1782 return -EINVAL;
1783 if (copy_to_sockptr(optval, address, len))
1784 return -EFAULT;
1785 goto lenout;
1786 }
1787
1788 /* Dubious BSD thing... Probably nobody even uses it, but
1789 * the UNIX standard wants it for whatever reason... -DaveM
1790 */
1791 case SO_ACCEPTCONN:
1792 v.val = sk->sk_state == TCP_LISTEN;
1793 break;
1794
1795 case SO_PASSSEC:
1796 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1797 break;
1798
1799 case SO_PEERSEC:
1800 return security_socket_getpeersec_stream(sock,
1801 optval, optlen, len);
1802
1803 case SO_MARK:
1804 v.val = sk->sk_mark;
1805 break;
1806
1807 case SO_RCVMARK:
1808 v.val = sock_flag(sk, SOCK_RCVMARK);
1809 break;
1810
1811 case SO_RXQ_OVFL:
1812 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1813 break;
1814
1815 case SO_WIFI_STATUS:
1816 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1817 break;
1818
1819 case SO_PEEK_OFF:
1820 if (!sock->ops->set_peek_off)
1821 return -EOPNOTSUPP;
1822
1823 v.val = sk->sk_peek_off;
1824 break;
1825 case SO_NOFCS:
1826 v.val = sock_flag(sk, SOCK_NOFCS);
1827 break;
1828
1829 case SO_BINDTODEVICE:
1830 return sock_getbindtodevice(sk, optval, optlen, len);
1831
1832 case SO_GET_FILTER:
1833 len = sk_get_filter(sk, optval, len);
1834 if (len < 0)
1835 return len;
1836
1837 goto lenout;
1838
1839 case SO_LOCK_FILTER:
1840 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1841 break;
1842
1843 case SO_BPF_EXTENSIONS:
1844 v.val = bpf_tell_extensions();
1845 break;
1846
1847 case SO_SELECT_ERR_QUEUE:
1848 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1849 break;
1850
1851#ifdef CONFIG_NET_RX_BUSY_POLL
1852 case SO_BUSY_POLL:
1853 v.val = sk->sk_ll_usec;
1854 break;
1855 case SO_PREFER_BUSY_POLL:
1856 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1857 break;
1858#endif
1859
1860 case SO_MAX_PACING_RATE:
1861 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1862 lv = sizeof(v.ulval);
1863 v.ulval = sk->sk_max_pacing_rate;
1864 } else {
1865 /* 32bit version */
1866 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1867 }
1868 break;
1869
1870 case SO_INCOMING_CPU:
1871 v.val = READ_ONCE(sk->sk_incoming_cpu);
1872 break;
1873
1874 case SO_MEMINFO:
1875 {
1876 u32 meminfo[SK_MEMINFO_VARS];
1877
1878 sk_get_meminfo(sk, meminfo);
1879
1880 len = min_t(unsigned int, len, sizeof(meminfo));
1881 if (copy_to_sockptr(optval, &meminfo, len))
1882 return -EFAULT;
1883
1884 goto lenout;
1885 }
1886
1887#ifdef CONFIG_NET_RX_BUSY_POLL
1888 case SO_INCOMING_NAPI_ID:
1889 v.val = READ_ONCE(sk->sk_napi_id);
1890
1891 /* aggregate non-NAPI IDs down to 0 */
1892 if (v.val < MIN_NAPI_ID)
1893 v.val = 0;
1894
1895 break;
1896#endif
1897
1898 case SO_COOKIE:
1899 lv = sizeof(u64);
1900 if (len < lv)
1901 return -EINVAL;
1902 v.val64 = sock_gen_cookie(sk);
1903 break;
1904
1905 case SO_ZEROCOPY:
1906 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1907 break;
1908
1909 case SO_TXTIME:
1910 lv = sizeof(v.txtime);
1911 v.txtime.clockid = sk->sk_clockid;
1912 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1913 SOF_TXTIME_DEADLINE_MODE : 0;
1914 v.txtime.flags |= sk->sk_txtime_report_errors ?
1915 SOF_TXTIME_REPORT_ERRORS : 0;
1916 break;
1917
1918 case SO_BINDTOIFINDEX:
1919 v.val = READ_ONCE(sk->sk_bound_dev_if);
1920 break;
1921
1922 case SO_NETNS_COOKIE:
1923 lv = sizeof(u64);
1924 if (len != lv)
1925 return -EINVAL;
1926 v.val64 = sock_net(sk)->net_cookie;
1927 break;
1928
1929 case SO_BUF_LOCK:
1930 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1931 break;
1932
1933 case SO_RESERVE_MEM:
1934 v.val = sk->sk_reserved_mem;
1935 break;
1936
1937 case SO_TXREHASH:
1938 v.val = sk->sk_txrehash;
1939 break;
1940
1941 default:
1942 /* We implement the SO_SNDLOWAT etc to not be settable
1943 * (1003.1g 7).
1944 */
1945 return -ENOPROTOOPT;
1946 }
1947
1948 if (len > lv)
1949 len = lv;
1950 if (copy_to_sockptr(optval, &v, len))
1951 return -EFAULT;
1952lenout:
1953 if (copy_to_sockptr(optlen, &len, sizeof(int)))
1954 return -EFAULT;
1955 return 0;
1956}
1957
1958int sock_getsockopt(struct socket *sock, int level, int optname,
1959 char __user *optval, int __user *optlen)
1960{
1961 return sk_getsockopt(sock->sk, level, optname,
1962 USER_SOCKPTR(optval),
1963 USER_SOCKPTR(optlen));
1964}
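
/*
 * Userspace view (hedged sketch): SO_PEERCRED on a connected AF_UNIX socket
 * (or a socketpair) fills a struct ucred with the peer's pid/uid/gid,
 * translated into the caller's namespaces by cred_to_ucred() above:
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("peer pid %d uid %u\n", peer.pid, peer.uid);
 */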
1965
1966/*
1967 * Initialize an sk_lock.
1968 *
1969 * (We also register the sk_lock with the lock validator.)
1970 */
1971static inline void sock_lock_init(struct sock *sk)
1972{
1973 if (sk->sk_kern_sock)
1974 sock_lock_init_class_and_name(
1975 sk,
1976 af_family_kern_slock_key_strings[sk->sk_family],
1977 af_family_kern_slock_keys + sk->sk_family,
1978 af_family_kern_key_strings[sk->sk_family],
1979 af_family_kern_keys + sk->sk_family);
1980 else
1981 sock_lock_init_class_and_name(
1982 sk,
1983 af_family_slock_key_strings[sk->sk_family],
1984 af_family_slock_keys + sk->sk_family,
1985 af_family_key_strings[sk->sk_family],
1986 af_family_keys + sk->sk_family);
1987}
1988
1989/*
1990 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1991 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1992 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1993 */
1994static void sock_copy(struct sock *nsk, const struct sock *osk)
1995{
1996 const struct proto *prot = READ_ONCE(osk->sk_prot);
1997#ifdef CONFIG_SECURITY_NETWORK
1998 void *sptr = nsk->sk_security;
1999#endif
2000
2001 /* If we move sk_tx_queue_mapping out of the private section,
2002 * we must check if sk_tx_queue_clear() is called after
2003 * sock_copy() in sk_clone_lock().
2004 */
2005 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2006 offsetof(struct sock, sk_dontcopy_begin) ||
2007 offsetof(struct sock, sk_tx_queue_mapping) >=
2008 offsetof(struct sock, sk_dontcopy_end));
2009
2010 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2011
2012 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2013 prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2014
2015#ifdef CONFIG_SECURITY_NETWORK
2016 nsk->sk_security = sptr;
2017 security_sk_clone(osk, nsk);
2018#endif
2019}
2020
2021static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2022 int family)
2023{
2024 struct sock *sk;
2025 struct kmem_cache *slab;
2026
2027 slab = prot->slab;
2028 if (slab != NULL) {
2029 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2030 if (!sk)
2031 return sk;
2032 if (want_init_on_alloc(priority))
2033 sk_prot_clear_nulls(sk, prot->obj_size);
2034 } else
2035 sk = kmalloc(prot->obj_size, priority);
2036
2037 if (sk != NULL) {
2038 if (security_sk_alloc(sk, family, priority))
2039 goto out_free;
2040
2041 if (!try_module_get(prot->owner))
2042 goto out_free_sec;
2043 }
2044
2045 return sk;
2046
2047out_free_sec:
2048 security_sk_free(sk);
2049out_free:
2050 if (slab != NULL)
2051 kmem_cache_free(slab, sk);
2052 else
2053 kfree(sk);
2054 return NULL;
2055}
2056
2057static void sk_prot_free(struct proto *prot, struct sock *sk)
2058{
2059 struct kmem_cache *slab;
2060 struct module *owner;
2061
2062 owner = prot->owner;
2063 slab = prot->slab;
2064
2065 cgroup_sk_free(&sk->sk_cgrp_data);
2066 mem_cgroup_sk_free(sk);
2067 security_sk_free(sk);
2068 if (slab != NULL)
2069 kmem_cache_free(slab, sk);
2070 else
2071 kfree(sk);
2072 module_put(owner);
2073}
2074
2075/**
2076 * sk_alloc - All socket objects are allocated here
2077 * @net: the applicable net namespace
2078 * @family: protocol family
2079 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2080 * @prot: struct proto associated with this new sock instance
2081 * @kern: is this to be a kernel socket?
2082 */
2083struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2084 struct proto *prot, int kern)
2085{
2086 struct sock *sk;
2087
2088 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2089 if (sk) {
2090 sk->sk_family = family;
2091 /*
2092 * See comment in struct sock definition to understand
2093 * why we need sk_prot_creator -acme
2094 */
2095 sk->sk_prot = sk->sk_prot_creator = prot;
2096 sk->sk_kern_sock = kern;
2097 sock_lock_init(sk);
2098 sk->sk_net_refcnt = kern ? 0 : 1;
2099 if (likely(sk->sk_net_refcnt)) {
2100 get_net_track(net, &sk->ns_tracker, priority);
2101 sock_inuse_add(net, 1);
2102 } else {
2103 __netns_tracker_alloc(net, &sk->ns_tracker,
2104 false, priority);
2105 }
2106
2107 sock_net_set(sk, net);
2108 refcount_set(&sk->sk_wmem_alloc, 1);
2109
2110 mem_cgroup_sk_alloc(sk);
2111 cgroup_sk_alloc(&sk->sk_cgrp_data);
2112 sock_update_classid(&sk->sk_cgrp_data);
2113 sock_update_netprioidx(&sk->sk_cgrp_data);
2114 sk_tx_queue_clear(sk);
2115 }
2116
2117 return sk;
2118}
2119EXPORT_SYMBOL(sk_alloc);
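
/*
 * Illustrative sketch (hypothetical protocol family ->create hook; the names
 * PF_EXAMPLE and example_proto are made up): address families allocate their
 * socks here and then initialise the generic state, roughly:
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 */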
2120
2121/* Sockets having SOCK_RCU_FREE will call this function after one RCU
2122 * grace period. This is the case for UDP sockets and TCP listeners.
2123 */
2124static void __sk_destruct(struct rcu_head *head)
2125{
2126 struct sock *sk = container_of(head, struct sock, sk_rcu);
2127 struct sk_filter *filter;
2128
2129 if (sk->sk_destruct)
2130 sk->sk_destruct(sk);
2131
2132 filter = rcu_dereference_check(sk->sk_filter,
2133 refcount_read(&sk->sk_wmem_alloc) == 0);
2134 if (filter) {
2135 sk_filter_uncharge(sk, filter);
2136 RCU_INIT_POINTER(sk->sk_filter, NULL);
2137 }
2138
2139 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2140
2141#ifdef CONFIG_BPF_SYSCALL
2142 bpf_sk_storage_free(sk);
2143#endif
2144
2145 if (atomic_read(&sk->sk_omem_alloc))
2146 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2147 __func__, atomic_read(&sk->sk_omem_alloc));
2148
2149 if (sk->sk_frag.page) {
2150 put_page(sk->sk_frag.page);
2151 sk->sk_frag.page = NULL;
2152 }
2153
2154 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2155 put_cred(sk->sk_peer_cred);
2156 put_pid(sk->sk_peer_pid);
2157
2158 if (likely(sk->sk_net_refcnt))
2159 put_net_track(sock_net(sk), &sk->ns_tracker);
2160 else
2161 __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2162
2163 sk_prot_free(sk->sk_prot_creator, sk);
2164}
2165
2166void sk_destruct(struct sock *sk)
2167{
2168 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2169
2170 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2171 reuseport_detach_sock(sk);
2172 use_call_rcu = true;
2173 }
2174
2175 if (use_call_rcu)
2176 call_rcu(&sk->sk_rcu, __sk_destruct);
2177 else
2178 __sk_destruct(&sk->sk_rcu);
2179}
2180
2181static void __sk_free(struct sock *sk)
2182{
2183 if (likely(sk->sk_net_refcnt))
2184 sock_inuse_add(sock_net(sk), -1);
2185
2186 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2187 sock_diag_broadcast_destroy(sk);
2188 else
2189 sk_destruct(sk);
2190}
2191
2192void sk_free(struct sock *sk)
2193{
2194 /*
2195	 * We subtract one from sk_wmem_alloc and can tell whether
2196	 * some packets are still in some tx queue.
2197	 * If it is not zero, sock_wfree() will call __sk_free(sk) later
2198 */
2199 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2200 __sk_free(sk);
2201}
2202EXPORT_SYMBOL(sk_free);
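/*
 * sk_wmem_alloc is initialised to 1 in sk_alloc(), so its value is the
 * number of bytes queued for transmit plus one "ownership" unit.  sk_free()
 * only drops that unit; if skbs are still sitting in qdiscs or driver
 * queues, the final __sk_free() happens later from sock_wfree() (or
 * __sock_wfree() for TCP) when the last in-flight packet is destructed.
 */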
2203
2204static void sk_init_common(struct sock *sk)
2205{
2206 skb_queue_head_init(&sk->sk_receive_queue);
2207 skb_queue_head_init(&sk->sk_write_queue);
2208 skb_queue_head_init(&sk->sk_error_queue);
2209
2210 rwlock_init(&sk->sk_callback_lock);
2211 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2212 af_rlock_keys + sk->sk_family,
2213 af_family_rlock_key_strings[sk->sk_family]);
2214 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2215 af_wlock_keys + sk->sk_family,
2216 af_family_wlock_key_strings[sk->sk_family]);
2217 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2218 af_elock_keys + sk->sk_family,
2219 af_family_elock_key_strings[sk->sk_family]);
2220 lockdep_set_class_and_name(&sk->sk_callback_lock,
2221 af_callback_keys + sk->sk_family,
2222 af_family_clock_key_strings[sk->sk_family]);
2223}
2224
2225/**
2226 * sk_clone_lock - clone a socket, and lock its clone
2227 * @sk: the socket to clone
2228 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2229 *
2230 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2231 */
2232struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2233{
2234 struct proto *prot = READ_ONCE(sk->sk_prot);
2235 struct sk_filter *filter;
2236 bool is_charged = true;
2237 struct sock *newsk;
2238
2239 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2240 if (!newsk)
2241 goto out;
2242
2243 sock_copy(newsk, sk);
2244
2245 newsk->sk_prot_creator = prot;
2246
2247 /* SANITY */
2248 if (likely(newsk->sk_net_refcnt)) {
2249 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2250 sock_inuse_add(sock_net(newsk), 1);
2251 } else {
2252 /* Kernel sockets are not elevating the struct net refcount.
2253 * Instead, use a tracker to more easily detect if a layer
2254 * is not properly dismantling its kernel sockets at netns
2255 * destroy time.
2256 */
2257 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2258 false, priority);
2259 }
2260 sk_node_init(&newsk->sk_node);
2261 sock_lock_init(newsk);
2262 bh_lock_sock(newsk);
2263 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
2264 newsk->sk_backlog.len = 0;
2265
2266 atomic_set(&newsk->sk_rmem_alloc, 0);
2267
2268 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2269 refcount_set(&newsk->sk_wmem_alloc, 1);
2270
2271 atomic_set(&newsk->sk_omem_alloc, 0);
2272 sk_init_common(newsk);
2273
2274 newsk->sk_dst_cache = NULL;
2275 newsk->sk_dst_pending_confirm = 0;
2276 newsk->sk_wmem_queued = 0;
2277 newsk->sk_forward_alloc = 0;
2278 newsk->sk_reserved_mem = 0;
2279 atomic_set(&newsk->sk_drops, 0);
2280 newsk->sk_send_head = NULL;
2281 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2282 atomic_set(&newsk->sk_zckey, 0);
2283
2284 sock_reset_flag(newsk, SOCK_DONE);
2285
2286 /* sk->sk_memcg will be populated at accept() time */
2287 newsk->sk_memcg = NULL;
2288
2289 cgroup_sk_clone(&newsk->sk_cgrp_data);
2290
2291 rcu_read_lock();
2292 filter = rcu_dereference(sk->sk_filter);
2293 if (filter != NULL)
2294 /* though it's an empty new sock, the charging may fail
2295 * if sysctl_optmem_max was changed between creation of
2296 * original socket and cloning
2297 */
2298 is_charged = sk_filter_charge(newsk, filter);
2299 RCU_INIT_POINTER(newsk->sk_filter, filter);
2300 rcu_read_unlock();
2301
2302 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2303 /* We need to make sure that we don't uncharge the new
2304 * socket if we couldn't charge it in the first place
2305 * as otherwise we uncharge the parent's filter.
2306 */
2307 if (!is_charged)
2308 RCU_INIT_POINTER(newsk->sk_filter, NULL);
2309 sk_free_unlock_clone(newsk);
2310 newsk = NULL;
2311 goto out;
2312 }
2313 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2314
2315 if (bpf_sk_storage_clone(sk, newsk)) {
2316 sk_free_unlock_clone(newsk);
2317 newsk = NULL;
2318 goto out;
2319 }
2320
2321 /* Clear sk_user_data if parent had the pointer tagged
2322 * as not suitable for copying when cloning.
2323 */
2324 if (sk_user_data_is_nocopy(newsk))
2325 newsk->sk_user_data = NULL;
2326
2327 newsk->sk_err = 0;
2328 newsk->sk_err_soft = 0;
2329 newsk->sk_priority = 0;
2330 newsk->sk_incoming_cpu = raw_smp_processor_id();
2331
2332 /* Before updating sk_refcnt, we must commit prior changes to memory
2333 * (Documentation/RCU/rculist_nulls.rst for details)
2334 */
2335 smp_wmb();
2336 refcount_set(&newsk->sk_refcnt, 2);
2337
2338 sk_set_socket(newsk, NULL);
2339 sk_tx_queue_clear(newsk);
2340 RCU_INIT_POINTER(newsk->sk_wq, NULL);
2341
2342 if (newsk->sk_prot->sockets_allocated)
2343 sk_sockets_allocated_inc(newsk);
2344
2345 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2346 net_enable_timestamp();
2347out:
2348 return newsk;
2349}
2350EXPORT_SYMBOL_GPL(sk_clone_lock);
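/*
 * Roughly how a passive-open (accept) path is expected to use the clone
 * helpers; real callers add protocol specific initialisation in between:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		... set up the child socket ...
 *		bh_unlock_sock(newsk);		caller must always unlock
 *	}
 *
 * On an internal failure the function itself uses sk_free_unlock_clone(),
 * which clears sk_destruct before sk_free() so the half-initialised clone
 * never runs the parent's destructor.
 */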
2351
2352void sk_free_unlock_clone(struct sock *sk)
2353{
2354	/* It is still a raw copy of the parent, so invalidate
2355	 * the destructor and do a plain sk_free() */
2356 sk->sk_destruct = NULL;
2357 bh_unlock_sock(sk);
2358 sk_free(sk);
2359}
2360EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2361
2362static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2363{
2364 bool is_ipv6 = false;
2365 u32 max_size;
2366
2367#if IS_ENABLED(CONFIG_IPV6)
2368 is_ipv6 = (sk->sk_family == AF_INET6 &&
2369 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2370#endif
2371 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2372 max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2373 READ_ONCE(dst->dev->gso_ipv4_max_size);
2374 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2375 max_size = GSO_LEGACY_MAX_SIZE;
2376
2377 return max_size - (MAX_TCP_HEADER + 1);
2378}
2379
2380void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2381{
2382 u32 max_segs = 1;
2383
2384 sk_dst_set(sk, dst);
2385 sk->sk_route_caps = dst->dev->features;
2386 if (sk_is_tcp(sk))
2387 sk->sk_route_caps |= NETIF_F_GSO;
2388 if (sk->sk_route_caps & NETIF_F_GSO)
2389 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2390 if (unlikely(sk->sk_gso_disabled))
2391 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2392 if (sk_can_gso(sk)) {
2393 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2394 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2395 } else {
2396 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2397 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2398 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2399 max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2400 }
2401 }
2402 sk->sk_gso_max_segs = max_segs;
2403}
2404EXPORT_SYMBOL_GPL(sk_setup_caps);
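/*
 * Summary of the above: sk_route_caps starts from the output device's
 * feature bits, software GSO is force-enabled for TCP, and all GSO bits
 * are masked out again when the socket disabled GSO or the route carries
 * an xfrm state that cannot be offloaded.  sk_dst_gso_max_size() clamps
 * non-TCP sockets to GSO_LEGACY_MAX_SIZE and subtracts MAX_TCP_HEADER + 1
 * from the device limit so a maximal GSO packet still has header room.
 */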
2405
2406/*
2407 * Simple resource managers for sockets.
2408 */
2409
2410
2411/*
2412 * Write buffer destructor automatically called from kfree_skb.
2413 */
2414void sock_wfree(struct sk_buff *skb)
2415{
2416 struct sock *sk = skb->sk;
2417 unsigned int len = skb->truesize;
2418 bool free;
2419
2420 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2421 if (sock_flag(sk, SOCK_RCU_FREE) &&
2422 sk->sk_write_space == sock_def_write_space) {
2423 rcu_read_lock();
2424 free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2425 sock_def_write_space_wfree(sk);
2426 rcu_read_unlock();
2427 if (unlikely(free))
2428 __sk_free(sk);
2429 return;
2430 }
2431
2432 /*
2433	 * Keep a reference on sk_wmem_alloc; it will be released
2434	 * after the sk_write_space() call
2435 */
2436 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2437 sk->sk_write_space(sk);
2438 len = 1;
2439 }
2440 /*
2441 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2442 * could not do because of in-flight packets
2443 */
2444 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2445 __sk_free(sk);
2446}
2447EXPORT_SYMBOL(sock_wfree);
2448
2449/* This variant of sock_wfree() is used by TCP,
2450 * since it sets SOCK_USE_WRITE_QUEUE.
2451 */
2452void __sock_wfree(struct sk_buff *skb)
2453{
2454 struct sock *sk = skb->sk;
2455
2456 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2457 __sk_free(sk);
2458}
2459
2460void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2461{
2462 skb_orphan(skb);
2463 skb->sk = sk;
2464#ifdef CONFIG_INET
2465 if (unlikely(!sk_fullsock(sk))) {
2466 skb->destructor = sock_edemux;
2467 sock_hold(sk);
2468 return;
2469 }
2470#endif
2471 skb->destructor = sock_wfree;
2472 skb_set_hash_from_sk(skb, sk);
2473 /*
2474	 * We used to take a refcount on sk, but the following operation
2475	 * is enough to guarantee sk_free() won't free this sock until
2476	 * all in-flight packets have completed
2477 */
2478 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2479}
2480EXPORT_SYMBOL(skb_set_owner_w);
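/*
 * Typical transmit-side usage (sock_wmalloc() below wraps exactly this
 * pattern):
 *
 *	skb = alloc_skb(size, GFP_KERNEL);
 *	if (skb)
 *		skb_set_owner_w(skb, sk);
 *	...
 *	kfree_skb(skb);		runs sock_wfree() and uncharges the socket
 *
 * The charge is skb->truesize against sk_wmem_alloc, not skb->len, so the
 * accounting follows real memory consumption.
 */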
2481
2482static bool can_skb_orphan_partial(const struct sk_buff *skb)
2483{
2484#ifdef CONFIG_TLS_DEVICE
2485 /* Drivers depend on in-order delivery for crypto offload,
2486 * partial orphan breaks out-of-order-OK logic.
2487 */
2488 if (skb->decrypted)
2489 return false;
2490#endif
2491 return (skb->destructor == sock_wfree ||
2492 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2493}
2494
2495/* This helper is used by netem, as it can hold packets in its
2496 * delay queue. We want to allow the owner socket to send more
2497 * packets, as if they were already TX completed by a typical driver.
2498 * But we also want to keep skb->sk set because some packet schedulers
2499 * rely on it (sch_fq for example).
2500 */
2501void skb_orphan_partial(struct sk_buff *skb)
2502{
2503 if (skb_is_tcp_pure_ack(skb))
2504 return;
2505
2506 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2507 return;
2508
2509 skb_orphan(skb);
2510}
2511EXPORT_SYMBOL(skb_orphan_partial);
2512
2513/*
2514 * Read buffer destructor automatically called from kfree_skb.
2515 */
2516void sock_rfree(struct sk_buff *skb)
2517{
2518 struct sock *sk = skb->sk;
2519 unsigned int len = skb->truesize;
2520
2521 atomic_sub(len, &sk->sk_rmem_alloc);
2522 sk_mem_uncharge(sk, len);
2523}
2524EXPORT_SYMBOL(sock_rfree);
2525
2526/*
2527 * Buffer destructor for skbs that are not used directly in read or write
2528 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2529 */
2530void sock_efree(struct sk_buff *skb)
2531{
2532 sock_put(skb->sk);
2533}
2534EXPORT_SYMBOL(sock_efree);
2535
2536/* Buffer destructor for prefetch/receive path where reference count may
2537 * not be held, e.g. for listen sockets.
2538 */
2539#ifdef CONFIG_INET
2540void sock_pfree(struct sk_buff *skb)
2541{
2542 if (sk_is_refcounted(skb->sk))
2543 sock_gen_put(skb->sk);
2544}
2545EXPORT_SYMBOL(sock_pfree);
2546#endif /* CONFIG_INET */
2547
2548kuid_t sock_i_uid(struct sock *sk)
2549{
2550 kuid_t uid;
2551
2552 read_lock_bh(&sk->sk_callback_lock);
2553 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2554 read_unlock_bh(&sk->sk_callback_lock);
2555 return uid;
2556}
2557EXPORT_SYMBOL(sock_i_uid);
2558
2559unsigned long sock_i_ino(struct sock *sk)
2560{
2561 unsigned long ino;
2562
2563 read_lock_bh(&sk->sk_callback_lock);
2564 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2565 read_unlock_bh(&sk->sk_callback_lock);
2566 return ino;
2567}
2568EXPORT_SYMBOL(sock_i_ino);
2569
2570/*
2571 * Allocate a skb from the socket's send buffer.
2572 */
2573struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2574 gfp_t priority)
2575{
2576 if (force ||
2577 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2578 struct sk_buff *skb = alloc_skb(size, priority);
2579
2580 if (skb) {
2581 skb_set_owner_w(skb, sk);
2582 return skb;
2583 }
2584 }
2585 return NULL;
2586}
2587EXPORT_SYMBOL(sock_wmalloc);
2588
2589static void sock_ofree(struct sk_buff *skb)
2590{
2591 struct sock *sk = skb->sk;
2592
2593 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2594}
2595
2596struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2597 gfp_t priority)
2598{
2599 struct sk_buff *skb;
2600
2601 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2602 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2603 READ_ONCE(sysctl_optmem_max))
2604 return NULL;
2605
2606 skb = alloc_skb(size, priority);
2607 if (!skb)
2608 return NULL;
2609
2610 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2611 skb->sk = sk;
2612 skb->destructor = sock_ofree;
2613 return skb;
2614}
2615
2616/*
2617 * Allocate a memory block from the socket's option memory buffer.
2618 */
2619void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2620{
2621 int optmem_max = READ_ONCE(sysctl_optmem_max);
2622
2623 if ((unsigned int)size <= optmem_max &&
2624 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2625 void *mem;
2626 /* First do the add, to avoid the race if kmalloc
2627 * might sleep.
2628 */
2629 atomic_add(size, &sk->sk_omem_alloc);
2630 mem = kmalloc(size, priority);
2631 if (mem)
2632 return mem;
2633 atomic_sub(size, &sk->sk_omem_alloc);
2634 }
2635 return NULL;
2636}
2637EXPORT_SYMBOL(sock_kmalloc);
2638
2639/* Free an option memory block. Note, we actually want the inline
2640 * here as this allows gcc to detect the nullify and fold away the
2641 * condition entirely.
2642 */
2643static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2644 const bool nullify)
2645{
2646 if (WARN_ON_ONCE(!mem))
2647 return;
2648 if (nullify)
2649 kfree_sensitive(mem);
2650 else
2651 kfree(mem);
2652 atomic_sub(size, &sk->sk_omem_alloc);
2653}
2654
2655void sock_kfree_s(struct sock *sk, void *mem, int size)
2656{
2657 __sock_kfree_s(sk, mem, size, false);
2658}
2659EXPORT_SYMBOL(sock_kfree_s);
2660
2661void sock_kzfree_s(struct sock *sk, void *mem, int size)
2662{
2663 __sock_kfree_s(sk, mem, size, true);
2664}
2665EXPORT_SYMBOL(sock_kzfree_s);
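/*
 * A minimal sketch of the expected pairing for option memory; the size
 * passed to sock_kfree_s()/sock_kzfree_s() must match the allocation so
 * that sk_omem_alloc balances back to zero:
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);	or sock_kzfree_s() for key material
 */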
2666
2667/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2668   I think these locks should be removed for datagram sockets.
2669 */
2670static long sock_wait_for_wmem(struct sock *sk, long timeo)
2671{
2672 DEFINE_WAIT(wait);
2673
2674 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2675 for (;;) {
2676 if (!timeo)
2677 break;
2678 if (signal_pending(current))
2679 break;
2680 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2681 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2682 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2683 break;
2684 if (sk->sk_shutdown & SEND_SHUTDOWN)
2685 break;
2686 if (sk->sk_err)
2687 break;
2688 timeo = schedule_timeout(timeo);
2689 }
2690 finish_wait(sk_sleep(sk), &wait);
2691 return timeo;
2692}
2693
2694
2695/*
2696 * Generic send/receive buffer handlers
2697 */
2698
2699struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2700 unsigned long data_len, int noblock,
2701 int *errcode, int max_page_order)
2702{
2703 struct sk_buff *skb;
2704 long timeo;
2705 int err;
2706
2707 timeo = sock_sndtimeo(sk, noblock);
2708 for (;;) {
2709 err = sock_error(sk);
2710 if (err != 0)
2711 goto failure;
2712
2713 err = -EPIPE;
2714 if (sk->sk_shutdown & SEND_SHUTDOWN)
2715 goto failure;
2716
2717 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2718 break;
2719
2720 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2721 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2722 err = -EAGAIN;
2723 if (!timeo)
2724 goto failure;
2725 if (signal_pending(current))
2726 goto interrupted;
2727 timeo = sock_wait_for_wmem(sk, timeo);
2728 }
2729 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2730 errcode, sk->sk_allocation);
2731 if (skb)
2732 skb_set_owner_w(skb, sk);
2733 return skb;
2734
2735interrupted:
2736 err = sock_intr_errno(timeo);
2737failure:
2738 *errcode = err;
2739 return NULL;
2740}
2741EXPORT_SYMBOL(sock_alloc_send_pskb);
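/*
 * A datagram-style sendmsg() implementation would use this roughly as
 * follows (simplified, header/page handling omitted):
 *
 *	skb = sock_alloc_send_pskb(sk, hlen, dlen,
 *				   msg->msg_flags & MSG_DONTWAIT, &err, 0);
 *	if (!skb)
 *		return err;
 *
 * The helper sleeps (bounded by the socket's send timeout) until the write
 * allocation fits under sk_sndbuf, and fails with -EAGAIN for non-blocking
 * callers, -EPIPE after SEND_SHUTDOWN, or sock_intr_errno() on a signal.
 */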
2742
2743int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2744 struct sockcm_cookie *sockc)
2745{
2746 u32 tsflags;
2747
2748 switch (cmsg->cmsg_type) {
2749 case SO_MARK:
2750 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2751 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2752 return -EPERM;
2753 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2754 return -EINVAL;
2755 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2756 break;
2757 case SO_TIMESTAMPING_OLD:
2758 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2759 return -EINVAL;
2760
2761 tsflags = *(u32 *)CMSG_DATA(cmsg);
2762 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2763 return -EINVAL;
2764
2765 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2766 sockc->tsflags |= tsflags;
2767 break;
2768 case SCM_TXTIME:
2769 if (!sock_flag(sk, SOCK_TXTIME))
2770 return -EINVAL;
2771 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2772 return -EINVAL;
2773 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2774 break;
2775 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2776 case SCM_RIGHTS:
2777 case SCM_CREDENTIALS:
2778 break;
2779 default:
2780 return -EINVAL;
2781 }
2782 return 0;
2783}
2784EXPORT_SYMBOL(__sock_cmsg_send);
2785
2786int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2787 struct sockcm_cookie *sockc)
2788{
2789 struct cmsghdr *cmsg;
2790 int ret;
2791
2792 for_each_cmsghdr(cmsg, msg) {
2793 if (!CMSG_OK(msg, cmsg))
2794 return -EINVAL;
2795 if (cmsg->cmsg_level != SOL_SOCKET)
2796 continue;
2797 ret = __sock_cmsg_send(sk, cmsg, sockc);
2798 if (ret)
2799 return ret;
2800 }
2801 return 0;
2802}
2803EXPORT_SYMBOL(sock_cmsg_send);
2804
2805static void sk_enter_memory_pressure(struct sock *sk)
2806{
2807 if (!sk->sk_prot->enter_memory_pressure)
2808 return;
2809
2810 sk->sk_prot->enter_memory_pressure(sk);
2811}
2812
2813static void sk_leave_memory_pressure(struct sock *sk)
2814{
2815 if (sk->sk_prot->leave_memory_pressure) {
2816 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2817 tcp_leave_memory_pressure, sk);
2818 } else {
2819 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2820
2821 if (memory_pressure && READ_ONCE(*memory_pressure))
2822 WRITE_ONCE(*memory_pressure, 0);
2823 }
2824}
2825
2826DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2827
2828/**
2829 * skb_page_frag_refill - check that a page_frag contains enough room
2830 * @sz: minimum size of the fragment we want to get
2831 * @pfrag: pointer to page_frag
2832 * @gfp: priority for memory allocation
2833 *
2834 * Note: While this allocator tries to use high order pages, there is
2835 * no guarantee that allocations succeed. Therefore, @sz MUST be
2836 * less than or equal to PAGE_SIZE.
2837 */
2838bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2839{
2840 if (pfrag->page) {
2841 if (page_ref_count(pfrag->page) == 1) {
2842 pfrag->offset = 0;
2843 return true;
2844 }
2845 if (pfrag->offset + sz <= pfrag->size)
2846 return true;
2847 put_page(pfrag->page);
2848 }
2849
2850 pfrag->offset = 0;
2851 if (SKB_FRAG_PAGE_ORDER &&
2852 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2853 /* Avoid direct reclaim but allow kswapd to wake */
2854 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2855 __GFP_COMP | __GFP_NOWARN |
2856 __GFP_NORETRY,
2857 SKB_FRAG_PAGE_ORDER);
2858 if (likely(pfrag->page)) {
2859 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2860 return true;
2861 }
2862 }
2863 pfrag->page = alloc_page(gfp);
2864 if (likely(pfrag->page)) {
2865 pfrag->size = PAGE_SIZE;
2866 return true;
2867 }
2868 return false;
2869}
2870EXPORT_SYMBOL(skb_page_frag_refill);
2871
2872bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2873{
2874 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2875 return true;
2876
2877 sk_enter_memory_pressure(sk);
2878 sk_stream_moderate_sndbuf(sk);
2879 return false;
2880}
2881EXPORT_SYMBOL(sk_page_frag_refill);
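/*
 * Refill policy above: reuse the current page when we hold the only
 * remaining reference (the offset can simply be rewound to 0) or when the
 * requested size still fits behind pfrag->offset; otherwise drop it, try a
 * high-order allocation without direct reclaim, and fall back to a single
 * page.  sk_page_frag_refill() adds the socket-level reaction to failure:
 * enter memory pressure and shrink sk_sndbuf.
 */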
2882
2883void __lock_sock(struct sock *sk)
2884 __releases(&sk->sk_lock.slock)
2885 __acquires(&sk->sk_lock.slock)
2886{
2887 DEFINE_WAIT(wait);
2888
2889 for (;;) {
2890 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2891 TASK_UNINTERRUPTIBLE);
2892 spin_unlock_bh(&sk->sk_lock.slock);
2893 schedule();
2894 spin_lock_bh(&sk->sk_lock.slock);
2895 if (!sock_owned_by_user(sk))
2896 break;
2897 }
2898 finish_wait(&sk->sk_lock.wq, &wait);
2899}
2900
2901void __release_sock(struct sock *sk)
2902 __releases(&sk->sk_lock.slock)
2903 __acquires(&sk->sk_lock.slock)
2904{
2905 struct sk_buff *skb, *next;
2906
2907 while ((skb = sk->sk_backlog.head) != NULL) {
2908 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2909
2910 spin_unlock_bh(&sk->sk_lock.slock);
2911
2912 do {
2913 next = skb->next;
2914 prefetch(next);
2915 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2916 skb_mark_not_on_list(skb);
2917 sk_backlog_rcv(sk, skb);
2918
2919 cond_resched();
2920
2921 skb = next;
2922 } while (skb != NULL);
2923
2924 spin_lock_bh(&sk->sk_lock.slock);
2925 }
2926
2927 /*
2928	 * Doing the zeroing here guarantees we cannot loop forever
2929 * while a wild producer attempts to flood us.
2930 */
2931 sk->sk_backlog.len = 0;
2932}
2933
2934void __sk_flush_backlog(struct sock *sk)
2935{
2936 spin_lock_bh(&sk->sk_lock.slock);
2937 __release_sock(sk);
2938 spin_unlock_bh(&sk->sk_lock.slock);
2939}
2940EXPORT_SYMBOL_GPL(__sk_flush_backlog);
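/*
 * The backlog holds packets that arrived from softirq context while the
 * socket was owned by a process.  They are replayed by __release_sock()
 * through sk_backlog_rcv() with the spinlock dropped, so producers can
 * keep queueing while we drain; because sk_backlog.len is only reset at
 * the very end, the producer-side limit check keeps counting the packets
 * being drained and a flooding sender cannot make the loop unbounded.
 */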
2941
2942/**
2943 * sk_wait_data - wait for data to arrive at sk_receive_queue
2944 * @sk: sock to wait on
2945 * @timeo: for how long
2946 * @skb: last skb seen on sk_receive_queue
2947 *
2948 * Socket state, including sk->sk_err, is changed only under the socket
2949 * lock, hence we may omit checks after joining the wait queue.
2950 * We check the receive queue before schedule() only as an optimization;
2951 * it is very likely that release_sock() added new data.
2952 */
2953int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2954{
2955 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2956 int rc;
2957
2958 add_wait_queue(sk_sleep(sk), &wait);
2959 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2960 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2961 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2962 remove_wait_queue(sk_sleep(sk), &wait);
2963 return rc;
2964}
2965EXPORT_SYMBOL(sk_wait_data);
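/*
 * A rough sketch of the receive loop this is meant for; the socket lock is
 * held by the caller and sk_wait_event() drops/retakes it while sleeping:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */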
2966
2967/**
2968 * __sk_mem_raise_allocated - increase memory_allocated
2969 * @sk: socket
2970 * @size: memory size to allocate
2971 * @amt: pages to allocate
2972 * @kind: allocation type
2973 *
2974 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2975 */
2976int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2977{
2978 bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2979 struct proto *prot = sk->sk_prot;
2980 bool charged = true;
2981 long allocated;
2982
2983 sk_memory_allocated_add(sk, amt);
2984 allocated = sk_memory_allocated(sk);
2985 if (memcg_charge &&
2986 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2987 gfp_memcg_charge())))
2988 goto suppress_allocation;
2989
2990 /* Under limit. */
2991 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2992 sk_leave_memory_pressure(sk);
2993 return 1;
2994 }
2995
2996 /* Under pressure. */
2997 if (allocated > sk_prot_mem_limits(sk, 1))
2998 sk_enter_memory_pressure(sk);
2999
3000 /* Over hard limit. */
3001 if (allocated > sk_prot_mem_limits(sk, 2))
3002 goto suppress_allocation;
3003
3004 /* guarantee minimum buffer size under pressure */
3005 if (kind == SK_MEM_RECV) {
3006 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3007 return 1;
3008
3009 } else { /* SK_MEM_SEND */
3010 int wmem0 = sk_get_wmem0(sk, prot);
3011
3012 if (sk->sk_type == SOCK_STREAM) {
3013 if (sk->sk_wmem_queued < wmem0)
3014 return 1;
3015 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3016 return 1;
3017 }
3018 }
3019
3020 if (sk_has_memory_pressure(sk)) {
3021 u64 alloc;
3022
3023 if (!sk_under_memory_pressure(sk))
3024 return 1;
3025 alloc = sk_sockets_allocated_read_positive(sk);
3026 if (sk_prot_mem_limits(sk, 2) > alloc *
3027 sk_mem_pages(sk->sk_wmem_queued +
3028 atomic_read(&sk->sk_rmem_alloc) +
3029 sk->sk_forward_alloc))
3030 return 1;
3031 }
3032
3033suppress_allocation:
3034
3035 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3036 sk_stream_moderate_sndbuf(sk);
3037
3038 /* Fail only if socket is _under_ its sndbuf.
3039		 * In this case we cannot block, so we have to fail.
3040 */
3041 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3042 /* Force charge with __GFP_NOFAIL */
3043 if (memcg_charge && !charged) {
3044 mem_cgroup_charge_skmem(sk->sk_memcg, amt,
3045 gfp_memcg_charge() | __GFP_NOFAIL);
3046 }
3047 return 1;
3048 }
3049 }
3050
3051 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3052 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3053
3054 sk_memory_allocated_sub(sk, amt);
3055
3056 if (memcg_charge && charged)
3057 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
3058
3059 return 0;
3060}
3061
3062/**
3063 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3064 * @sk: socket
3065 * @size: memory size to allocate
3066 * @kind: allocation type
3067 *
3068 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3069 * rmem allocation. This function assumes that protocols which have
3070 * memory_pressure use sk_wmem_queued as write buffer accounting.
3071 */
3072int __sk_mem_schedule(struct sock *sk, int size, int kind)
3073{
3074 int ret, amt = sk_mem_pages(size);
3075
3076 sk->sk_forward_alloc += amt << PAGE_SHIFT;
3077 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3078 if (!ret)
3079 sk->sk_forward_alloc -= amt << PAGE_SHIFT;
3080 return ret;
3081}
3082EXPORT_SYMBOL(__sk_mem_schedule);
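/*
 * Accounting granularity is whole pages: sk_mem_pages() rounds the request
 * up, memory_allocated is charged in pages, and the byte-level slack stays
 * in sk_forward_alloc.  For example (assuming a 4 KiB PAGE_SIZE), asking to
 * schedule 100 bytes charges one page globally and leaves the unused
 * remainder in sk_forward_alloc, so subsequent small charges on this socket
 * can be satisfied without touching the global counter again.
 */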
3083
3084/**
3085 * __sk_mem_reduce_allocated - reclaim memory_allocated
3086 * @sk: socket
3087 * @amount: number of quanta
3088 *
3089 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3090 */
3091void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3092{
3093 sk_memory_allocated_sub(sk, amount);
3094
3095 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3096 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3097
3098 if (sk_under_memory_pressure(sk) &&
3099 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3100 sk_leave_memory_pressure(sk);
3101}
3102
3103/**
3104 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3105 * @sk: socket
3106 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3107 */
3108void __sk_mem_reclaim(struct sock *sk, int amount)
3109{
3110 amount >>= PAGE_SHIFT;
3111 sk->sk_forward_alloc -= amount << PAGE_SHIFT;
3112 __sk_mem_reduce_allocated(sk, amount);
3113}
3114EXPORT_SYMBOL(__sk_mem_reclaim);
3115
3116int sk_set_peek_off(struct sock *sk, int val)
3117{
3118 sk->sk_peek_off = val;
3119 return 0;
3120}
3121EXPORT_SYMBOL_GPL(sk_set_peek_off);
3122
3123/*
3124 * Set of default routines for initialising struct proto_ops when
3125 * the protocol does not support a particular function. In certain
3126 * cases where it makes no sense for a protocol to have a "do nothing"
3127 * function, some default processing is provided.
3128 */
3129
3130int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3131{
3132 return -EOPNOTSUPP;
3133}
3134EXPORT_SYMBOL(sock_no_bind);
3135
3136int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3137 int len, int flags)
3138{
3139 return -EOPNOTSUPP;
3140}
3141EXPORT_SYMBOL(sock_no_connect);
3142
3143int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3144{
3145 return -EOPNOTSUPP;
3146}
3147EXPORT_SYMBOL(sock_no_socketpair);
3148
3149int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3150 bool kern)
3151{
3152 return -EOPNOTSUPP;
3153}
3154EXPORT_SYMBOL(sock_no_accept);
3155
3156int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3157 int peer)
3158{
3159 return -EOPNOTSUPP;
3160}
3161EXPORT_SYMBOL(sock_no_getname);
3162
3163int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3164{
3165 return -EOPNOTSUPP;
3166}
3167EXPORT_SYMBOL(sock_no_ioctl);
3168
3169int sock_no_listen(struct socket *sock, int backlog)
3170{
3171 return -EOPNOTSUPP;
3172}
3173EXPORT_SYMBOL(sock_no_listen);
3174
3175int sock_no_shutdown(struct socket *sock, int how)
3176{
3177 return -EOPNOTSUPP;
3178}
3179EXPORT_SYMBOL(sock_no_shutdown);
3180
3181int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3182{
3183 return -EOPNOTSUPP;
3184}
3185EXPORT_SYMBOL(sock_no_sendmsg);
3186
3187int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3188{
3189 return -EOPNOTSUPP;
3190}
3191EXPORT_SYMBOL(sock_no_sendmsg_locked);
3192
3193int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3194 int flags)
3195{
3196 return -EOPNOTSUPP;
3197}
3198EXPORT_SYMBOL(sock_no_recvmsg);
3199
3200int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3201{
3202 /* Mirror missing mmap method error code */
3203 return -ENODEV;
3204}
3205EXPORT_SYMBOL(sock_no_mmap);
3206
3207/*
3208 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3209 * various sock-based usage counts.
3210 */
3211void __receive_sock(struct file *file)
3212{
3213 struct socket *sock;
3214
3215 sock = sock_from_file(file);
3216 if (sock) {
3217 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3218 sock_update_classid(&sock->sk->sk_cgrp_data);
3219 }
3220}
3221
3222ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3223{
3224 ssize_t res;
3225 struct msghdr msg = {.msg_flags = flags};
3226 struct kvec iov;
3227 char *kaddr = kmap(page);
3228 iov.iov_base = kaddr + offset;
3229 iov.iov_len = size;
3230 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3231 kunmap(page);
3232 return res;
3233}
3234EXPORT_SYMBOL(sock_no_sendpage);
3235
3236ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3237 int offset, size_t size, int flags)
3238{
3239 ssize_t res;
3240 struct msghdr msg = {.msg_flags = flags};
3241 struct kvec iov;
3242 char *kaddr = kmap(page);
3243
3244 iov.iov_base = kaddr + offset;
3245 iov.iov_len = size;
3246 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3247 kunmap(page);
3248 return res;
3249}
3250EXPORT_SYMBOL(sock_no_sendpage_locked);
3251
3252/*
3253 * Default Socket Callbacks
3254 */
3255
3256static void sock_def_wakeup(struct sock *sk)
3257{
3258 struct socket_wq *wq;
3259
3260 rcu_read_lock();
3261 wq = rcu_dereference(sk->sk_wq);
3262 if (skwq_has_sleeper(wq))
3263 wake_up_interruptible_all(&wq->wait);
3264 rcu_read_unlock();
3265}
3266
3267static void sock_def_error_report(struct sock *sk)
3268{
3269 struct socket_wq *wq;
3270
3271 rcu_read_lock();
3272 wq = rcu_dereference(sk->sk_wq);
3273 if (skwq_has_sleeper(wq))
3274 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3275 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3276 rcu_read_unlock();
3277}
3278
3279void sock_def_readable(struct sock *sk)
3280{
3281 struct socket_wq *wq;
3282
3283 trace_sk_data_ready(sk);
3284
3285 rcu_read_lock();
3286 wq = rcu_dereference(sk->sk_wq);
3287 if (skwq_has_sleeper(wq))
3288 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3289 EPOLLRDNORM | EPOLLRDBAND);
3290 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3291 rcu_read_unlock();
3292}
3293
3294static void sock_def_write_space(struct sock *sk)
3295{
3296 struct socket_wq *wq;
3297
3298 rcu_read_lock();
3299
3300 /* Do not wake up a writer until he can make "significant"
3301 * progress. --DaveM
3302 */
3303 if (sock_writeable(sk)) {
3304 wq = rcu_dereference(sk->sk_wq);
3305 if (skwq_has_sleeper(wq))
3306 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3307 EPOLLWRNORM | EPOLLWRBAND);
3308
3309 /* Should agree with poll, otherwise some programs break */
3310 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3311 }
3312
3313 rcu_read_unlock();
3314}
3315
3316/* An optimised version of sock_def_write_space(); it should only be called
3317 * for SOCK_RCU_FREE sockets under RCU read section and after putting
3318 * ->sk_wmem_alloc.
3319 */
3320static void sock_def_write_space_wfree(struct sock *sk)
3321{
3322 /* Do not wake up a writer until he can make "significant"
3323 * progress. --DaveM
3324 */
3325 if (sock_writeable(sk)) {
3326 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3327
3328 /* rely on refcount_sub from sock_wfree() */
3329 smp_mb__after_atomic();
3330 if (wq && waitqueue_active(&wq->wait))
3331 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3332 EPOLLWRNORM | EPOLLWRBAND);
3333
3334 /* Should agree with poll, otherwise some programs break */
3335 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3336 }
3337}
3338
3339static void sock_def_destruct(struct sock *sk)
3340{
3341}
3342
3343void sk_send_sigurg(struct sock *sk)
3344{
3345 if (sk->sk_socket && sk->sk_socket->file)
3346 if (send_sigurg(&sk->sk_socket->file->f_owner))
3347 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3348}
3349EXPORT_SYMBOL(sk_send_sigurg);
3350
3351void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3352 unsigned long expires)
3353{
3354 if (!mod_timer(timer, expires))
3355 sock_hold(sk);
3356}
3357EXPORT_SYMBOL(sk_reset_timer);
3358
3359void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3360{
3361 if (del_timer(timer))
3362 __sock_put(sk);
3363}
3364EXPORT_SYMBOL(sk_stop_timer);
3365
3366void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3367{
3368 if (del_timer_sync(timer))
3369 __sock_put(sk);
3370}
3371EXPORT_SYMBOL(sk_stop_timer_sync);
3372
3373void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3374{
3375 sk_init_common(sk);
3376 sk->sk_send_head = NULL;
3377
3378 timer_setup(&sk->sk_timer, NULL, 0);
3379
3380 sk->sk_allocation = GFP_KERNEL;
3381 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
3382 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
3383 sk->sk_state = TCP_CLOSE;
3384 sk->sk_use_task_frag = true;
3385 sk_set_socket(sk, sock);
3386
3387 sock_set_flag(sk, SOCK_ZAPPED);
3388
3389 if (sock) {
3390 sk->sk_type = sock->type;
3391 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3392 sock->sk = sk;
3393 } else {
3394 RCU_INIT_POINTER(sk->sk_wq, NULL);
3395 }
3396 sk->sk_uid = uid;
3397
3398 rwlock_init(&sk->sk_callback_lock);
3399 if (sk->sk_kern_sock)
3400 lockdep_set_class_and_name(
3401 &sk->sk_callback_lock,
3402 af_kern_callback_keys + sk->sk_family,
3403 af_family_kern_clock_key_strings[sk->sk_family]);
3404 else
3405 lockdep_set_class_and_name(
3406 &sk->sk_callback_lock,
3407 af_callback_keys + sk->sk_family,
3408 af_family_clock_key_strings[sk->sk_family]);
3409
3410 sk->sk_state_change = sock_def_wakeup;
3411 sk->sk_data_ready = sock_def_readable;
3412 sk->sk_write_space = sock_def_write_space;
3413 sk->sk_error_report = sock_def_error_report;
3414 sk->sk_destruct = sock_def_destruct;
3415
3416 sk->sk_frag.page = NULL;
3417 sk->sk_frag.offset = 0;
3418 sk->sk_peek_off = -1;
3419
3420 sk->sk_peer_pid = NULL;
3421 sk->sk_peer_cred = NULL;
3422 spin_lock_init(&sk->sk_peer_lock);
3423
3424 sk->sk_write_pending = 0;
3425 sk->sk_rcvlowat = 1;
3426 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3427 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3428
3429 sk->sk_stamp = SK_DEFAULT_STAMP;
3430#if BITS_PER_LONG==32
3431 seqlock_init(&sk->sk_stamp_seq);
3432#endif
3433 atomic_set(&sk->sk_zckey, 0);
3434
3435#ifdef CONFIG_NET_RX_BUSY_POLL
3436 sk->sk_napi_id = 0;
3437 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
3438#endif
3439
3440 sk->sk_max_pacing_rate = ~0UL;
3441 sk->sk_pacing_rate = ~0UL;
3442 WRITE_ONCE(sk->sk_pacing_shift, 10);
3443 sk->sk_incoming_cpu = -1;
3444
3445 sk_rx_queue_clear(sk);
3446 /*
3447 * Before updating sk_refcnt, we must commit prior changes to memory
3448 * (Documentation/RCU/rculist_nulls.rst for details)
3449 */
3450 smp_wmb();
3451 refcount_set(&sk->sk_refcnt, 1);
3452 atomic_set(&sk->sk_drops, 0);
3453}
3454EXPORT_SYMBOL(sock_init_data_uid);
3455
3456void sock_init_data(struct socket *sock, struct sock *sk)
3457{
3458 kuid_t uid = sock ?
3459 SOCK_INODE(sock)->i_uid :
3460 make_kuid(sock_net(sk)->user_ns, 0);
3461
3462 sock_init_data_uid(sock, sk, uid);
3463}
3464EXPORT_SYMBOL(sock_init_data);
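/*
 * sock_init_data() gives a freshly allocated sock its generic defaults:
 * buffer sizes from the wmem/rmem sysctls, TCP_CLOSE state, SOCK_ZAPPED
 * set, infinite send/receive timeouts and the sock_def_*() callbacks above.
 * Protocols typically call it (directly or via sock_init_data_uid()) right
 * after sk_alloc() and then override only the pieces they need.
 */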
3465
3466void lock_sock_nested(struct sock *sk, int subclass)
3467{
3468 /* The sk_lock has mutex_lock() semantics here. */
3469 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3470
3471 might_sleep();
3472 spin_lock_bh(&sk->sk_lock.slock);
3473 if (sock_owned_by_user_nocheck(sk))
3474 __lock_sock(sk);
3475 sk->sk_lock.owned = 1;
3476 spin_unlock_bh(&sk->sk_lock.slock);
3477}
3478EXPORT_SYMBOL(lock_sock_nested);
3479
3480void release_sock(struct sock *sk)
3481{
3482 spin_lock_bh(&sk->sk_lock.slock);
3483 if (sk->sk_backlog.tail)
3484 __release_sock(sk);
3485
3486 /* Warning : release_cb() might need to release sk ownership,
3487 * ie call sock_release_ownership(sk) before us.
3488 */
3489 if (sk->sk_prot->release_cb)
3490 sk->sk_prot->release_cb(sk);
3491
3492 sock_release_ownership(sk);
3493 if (waitqueue_active(&sk->sk_lock.wq))
3494 wake_up(&sk->sk_lock.wq);
3495 spin_unlock_bh(&sk->sk_lock.slock);
3496}
3497EXPORT_SYMBOL(release_sock);
3498
3499bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3500{
3501 might_sleep();
3502 spin_lock_bh(&sk->sk_lock.slock);
3503
3504 if (!sock_owned_by_user_nocheck(sk)) {
3505 /*
3506 * Fast path return with bottom halves disabled and
3507 * sock::sk_lock.slock held.
3508 *
3509 * The 'mutex' is not contended and holding
3510 * sock::sk_lock.slock prevents all other lockers to
3511 * proceed so the corresponding unlock_sock_fast() can
3512 * avoid the slow path of release_sock() completely and
3513 * just release slock.
3514 *
3515 * From a semantical POV this is equivalent to 'acquiring'
3516 * the 'mutex', hence the corresponding lockdep
3517 * mutex_release() has to happen in the fast path of
3518 * unlock_sock_fast().
3519 */
3520 return false;
3521 }
3522
3523 __lock_sock(sk);
3524 sk->sk_lock.owned = 1;
3525 __acquire(&sk->sk_lock.slock);
3526 spin_unlock_bh(&sk->sk_lock.slock);
3527 return true;
3528}
3529EXPORT_SYMBOL(__lock_sock_fast);
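/*
 * This is the slow half of the lock_sock_fast()/unlock_sock_fast() pair
 * (the inline wrappers live in include/net/sock.h).  A caller with a short,
 * non-sleeping critical section is expected to do roughly:
 *
 *	slow = lock_sock_fast(sk);
 *	... short work on the socket ...
 *	unlock_sock_fast(sk, slow);
 *
 * When the owner mutex is uncontended the "lock" is just the spinlock with
 * BHs disabled (slow == false); otherwise it degrades to a full
 * lock_sock()/release_sock() cycle.
 */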
3530
3531int sock_gettstamp(struct socket *sock, void __user *userstamp,
3532 bool timeval, bool time32)
3533{
3534 struct sock *sk = sock->sk;
3535 struct timespec64 ts;
3536
3537 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3538 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3539 if (ts.tv_sec == -1)
3540 return -ENOENT;
3541 if (ts.tv_sec == 0) {
3542 ktime_t kt = ktime_get_real();
3543 sock_write_timestamp(sk, kt);
3544 ts = ktime_to_timespec64(kt);
3545 }
3546
3547 if (timeval)
3548 ts.tv_nsec /= 1000;
3549
3550#ifdef CONFIG_COMPAT_32BIT_TIME
3551 if (time32)
3552 return put_old_timespec32(&ts, userstamp);
3553#endif
3554#ifdef CONFIG_SPARC64
3555 /* beware of padding in sparc64 timeval */
3556 if (timeval && !in_compat_syscall()) {
3557 struct __kernel_old_timeval __user tv = {
3558 .tv_sec = ts.tv_sec,
3559 .tv_usec = ts.tv_nsec,
3560 };
3561 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3562 return -EFAULT;
3563 return 0;
3564 }
3565#endif
3566 return put_timespec64(&ts, userstamp);
3567}
3568EXPORT_SYMBOL(sock_gettstamp);
3569
3570void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3571{
3572 if (!sock_flag(sk, flag)) {
3573 unsigned long previous_flags = sk->sk_flags;
3574
3575 sock_set_flag(sk, flag);
3576 /*
3577 * we just set one of the two flags which require net
3578 * time stamping, but time stamping might have been on
3579 * already because of the other one
3580 */
3581 if (sock_needs_netstamp(sk) &&
3582 !(previous_flags & SK_FLAGS_TIMESTAMP))
3583 net_enable_timestamp();
3584 }
3585}
3586
3587int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3588 int level, int type)
3589{
3590 struct sock_exterr_skb *serr;
3591 struct sk_buff *skb;
3592 int copied, err;
3593
3594 err = -EAGAIN;
3595 skb = sock_dequeue_err_skb(sk);
3596 if (skb == NULL)
3597 goto out;
3598
3599 copied = skb->len;
3600 if (copied > len) {
3601 msg->msg_flags |= MSG_TRUNC;
3602 copied = len;
3603 }
3604 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3605 if (err)
3606 goto out_free_skb;
3607
3608 sock_recv_timestamp(msg, sk, skb);
3609
3610 serr = SKB_EXT_ERR(skb);
3611 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3612
3613 msg->msg_flags |= MSG_ERRQUEUE;
3614 err = copied;
3615
3616out_free_skb:
3617 kfree_skb(skb);
3618out:
3619 return err;
3620}
3621EXPORT_SYMBOL(sock_recv_errqueue);
3622
3623/*
3624 * Get a socket option on a socket.
3625 *
3626 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3627 * asynchronous errors should be reported by getsockopt. We assume
3628 * this means if you specify SO_ERROR (otherwise what's the point of it).
3629 */
3630int sock_common_getsockopt(struct socket *sock, int level, int optname,
3631 char __user *optval, int __user *optlen)
3632{
3633 struct sock *sk = sock->sk;
3634
3635 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3636 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3637}
3638EXPORT_SYMBOL(sock_common_getsockopt);
3639
3640int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3641 int flags)
3642{
3643 struct sock *sk = sock->sk;
3644 int addr_len = 0;
3645 int err;
3646
3647 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3648 if (err >= 0)
3649 msg->msg_namelen = addr_len;
3650 return err;
3651}
3652EXPORT_SYMBOL(sock_common_recvmsg);
3653
3654/*
3655 * Set socket options on an inet socket.
3656 */
3657int sock_common_setsockopt(struct socket *sock, int level, int optname,
3658 sockptr_t optval, unsigned int optlen)
3659{
3660 struct sock *sk = sock->sk;
3661
3662 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3663 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3664}
3665EXPORT_SYMBOL(sock_common_setsockopt);
3666
3667void sk_common_release(struct sock *sk)
3668{
3669 if (sk->sk_prot->destroy)
3670 sk->sk_prot->destroy(sk);
3671
3672 /*
3673	 * Observation: when sk_common_release is called, processes have
3674	 * no access to the socket, but the network stack still does.
3675 * Step one, detach it from networking:
3676 *
3677 * A. Remove from hash tables.
3678 */
3679
3680 sk->sk_prot->unhash(sk);
3681
3682 /*
3683	 * At this point the socket cannot receive new packets, but some may
3684	 * still be in flight because another CPU ran the receiver and did a
3685	 * hash table lookup before we unhashed the socket. They will reach
3686	 * the receive queue and be purged by the socket destructor.
3687	 *
3688	 * Also we still have packets pending on the receive queue and,
3689	 * probably, our own packets waiting in device queues. sock_destroy
3690	 * will drain the receive queue, but transmitted packets will delay
3691	 * socket destruction until the last reference is released.
3692 */
3693
3694 sock_orphan(sk);
3695
3696 xfrm_sk_free_policy(sk);
3697
3698 sock_put(sk);
3699}
3700EXPORT_SYMBOL(sk_common_release);
3701
3702void sk_get_meminfo(const struct sock *sk, u32 *mem)
3703{
3704 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3705
3706 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3707 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3708 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3709 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3710 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3711 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3712 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3713 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3714 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3715}
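/*
 * This array, indexed by the SK_MEMINFO_* constants, is what sock_diag
 * reports to userspace (e.g. the skmem:(...) block printed by "ss -m"):
 * receive/send allocations and their limits, forward allocation, queued
 * write bytes, option memory, backlog length and drop count.
 */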
3716
3717#ifdef CONFIG_PROC_FS
3718static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3719
3720int sock_prot_inuse_get(struct net *net, struct proto *prot)
3721{
3722 int cpu, idx = prot->inuse_idx;
3723 int res = 0;
3724
3725 for_each_possible_cpu(cpu)
3726 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3727
3728 return res >= 0 ? res : 0;
3729}
3730EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3731
3732int sock_inuse_get(struct net *net)
3733{
3734 int cpu, res = 0;
3735
3736 for_each_possible_cpu(cpu)
3737 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3738
3739 return res;
3740}
3741
3742EXPORT_SYMBOL_GPL(sock_inuse_get);
3743
3744static int __net_init sock_inuse_init_net(struct net *net)
3745{
3746 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3747 if (net->core.prot_inuse == NULL)
3748 return -ENOMEM;
3749 return 0;
3750}
3751
3752static void __net_exit sock_inuse_exit_net(struct net *net)
3753{
3754 free_percpu(net->core.prot_inuse);
3755}
3756
3757static struct pernet_operations net_inuse_ops = {
3758 .init = sock_inuse_init_net,
3759 .exit = sock_inuse_exit_net,
3760};
3761
3762static __init int net_inuse_init(void)
3763{
3764 if (register_pernet_subsys(&net_inuse_ops))
3765 panic("Cannot initialize net inuse counters");
3766
3767 return 0;
3768}
3769
3770core_initcall(net_inuse_init);
3771
3772static int assign_proto_idx(struct proto *prot)
3773{
3774 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3775
3776 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3777 pr_err("PROTO_INUSE_NR exhausted\n");
3778 return -ENOSPC;
3779 }
3780
3781 set_bit(prot->inuse_idx, proto_inuse_idx);
3782 return 0;
3783}
3784
3785static void release_proto_idx(struct proto *prot)
3786{
3787 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3788 clear_bit(prot->inuse_idx, proto_inuse_idx);
3789}
3790#else
3791static inline int assign_proto_idx(struct proto *prot)
3792{
3793 return 0;
3794}
3795
3796static inline void release_proto_idx(struct proto *prot)
3797{
3798}
3799
3800#endif
3801
3802static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3803{
3804 if (!twsk_prot)
3805 return;
3806 kfree(twsk_prot->twsk_slab_name);
3807 twsk_prot->twsk_slab_name = NULL;
3808 kmem_cache_destroy(twsk_prot->twsk_slab);
3809 twsk_prot->twsk_slab = NULL;
3810}
3811
3812static int tw_prot_init(const struct proto *prot)
3813{
3814 struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3815
3816 if (!twsk_prot)
3817 return 0;
3818
3819 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3820 prot->name);
3821 if (!twsk_prot->twsk_slab_name)
3822 return -ENOMEM;
3823
3824 twsk_prot->twsk_slab =
3825 kmem_cache_create(twsk_prot->twsk_slab_name,
3826 twsk_prot->twsk_obj_size, 0,
3827 SLAB_ACCOUNT | prot->slab_flags,
3828 NULL);
3829 if (!twsk_prot->twsk_slab) {
3830 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3831 prot->name);
3832 return -ENOMEM;
3833 }
3834
3835 return 0;
3836}
3837
3838static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3839{
3840 if (!rsk_prot)
3841 return;
3842 kfree(rsk_prot->slab_name);
3843 rsk_prot->slab_name = NULL;
3844 kmem_cache_destroy(rsk_prot->slab);
3845 rsk_prot->slab = NULL;
3846}
3847
3848static int req_prot_init(const struct proto *prot)
3849{
3850 struct request_sock_ops *rsk_prot = prot->rsk_prot;
3851
3852 if (!rsk_prot)
3853 return 0;
3854
3855 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3856 prot->name);
3857 if (!rsk_prot->slab_name)
3858 return -ENOMEM;
3859
3860 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3861 rsk_prot->obj_size, 0,
3862 SLAB_ACCOUNT | prot->slab_flags,
3863 NULL);
3864
3865 if (!rsk_prot->slab) {
3866 pr_crit("%s: Can't create request sock SLAB cache!\n",
3867 prot->name);
3868 return -ENOMEM;
3869 }
3870 return 0;
3871}
3872
3873int proto_register(struct proto *prot, int alloc_slab)
3874{
3875 int ret = -ENOBUFS;
3876
3877 if (prot->memory_allocated && !prot->sysctl_mem) {
3878 pr_err("%s: missing sysctl_mem\n", prot->name);
3879 return -EINVAL;
3880 }
3881 if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3882 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3883 return -EINVAL;
3884 }
3885 if (alloc_slab) {
3886 prot->slab = kmem_cache_create_usercopy(prot->name,
3887 prot->obj_size, 0,
3888 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3889 prot->slab_flags,
3890 prot->useroffset, prot->usersize,
3891 NULL);
3892
3893 if (prot->slab == NULL) {
3894 pr_crit("%s: Can't create sock SLAB cache!\n",
3895 prot->name);
3896 goto out;
3897 }
3898
3899 if (req_prot_init(prot))
3900 goto out_free_request_sock_slab;
3901
3902 if (tw_prot_init(prot))
3903 goto out_free_timewait_sock_slab;
3904 }
3905
3906 mutex_lock(&proto_list_mutex);
3907 ret = assign_proto_idx(prot);
3908 if (ret) {
3909 mutex_unlock(&proto_list_mutex);
3910 goto out_free_timewait_sock_slab;
3911 }
3912 list_add(&prot->node, &proto_list);
3913 mutex_unlock(&proto_list_mutex);
3914 return ret;
3915
3916out_free_timewait_sock_slab:
3917 if (alloc_slab)
3918 tw_prot_cleanup(prot->twsk_prot);
3919out_free_request_sock_slab:
3920 if (alloc_slab) {
3921 req_prot_cleanup(prot->rsk_prot);
3922
3923 kmem_cache_destroy(prot->slab);
3924 prot->slab = NULL;
3925 }
3926out:
3927 return ret;
3928}
3929EXPORT_SYMBOL(proto_register);
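/*
 * A minimal, illustrative registration; "example_proto"/"example_sock" are
 * not real in-tree names:
 *
 *	static struct proto example_proto = {
 *		.name	  = "EXAMPLE",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);	1 => create a slab
 *	...
 *	proto_unregister(&example_proto);
 *
 * With alloc_slab set, sk_prot_alloc() above carves sockets out of the
 * per-protocol kmem_cache instead of using plain kmalloc().
 */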
3930
3931void proto_unregister(struct proto *prot)
3932{
3933 mutex_lock(&proto_list_mutex);
3934 release_proto_idx(prot);
3935 list_del(&prot->node);
3936 mutex_unlock(&proto_list_mutex);
3937
3938 kmem_cache_destroy(prot->slab);
3939 prot->slab = NULL;
3940
3941 req_prot_cleanup(prot->rsk_prot);
3942 tw_prot_cleanup(prot->twsk_prot);
3943}
3944EXPORT_SYMBOL(proto_unregister);
3945
3946int sock_load_diag_module(int family, int protocol)
3947{
3948 if (!protocol) {
3949 if (!sock_is_registered(family))
3950 return -ENOENT;
3951
3952 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3953 NETLINK_SOCK_DIAG, family);
3954 }
3955
3956#ifdef CONFIG_INET
3957 if (family == AF_INET &&
3958 protocol != IPPROTO_RAW &&
3959 protocol < MAX_INET_PROTOS &&
3960 !rcu_access_pointer(inet_protos[protocol]))
3961 return -ENOENT;
3962#endif
3963
3964 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3965 NETLINK_SOCK_DIAG, family, protocol);
3966}
3967EXPORT_SYMBOL(sock_load_diag_module);
3968
3969#ifdef CONFIG_PROC_FS
3970static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3971 __acquires(proto_list_mutex)
3972{
3973 mutex_lock(&proto_list_mutex);
3974 return seq_list_start_head(&proto_list, *pos);
3975}
3976
3977static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3978{
3979 return seq_list_next(v, &proto_list, pos);
3980}
3981
3982static void proto_seq_stop(struct seq_file *seq, void *v)
3983 __releases(proto_list_mutex)
3984{
3985 mutex_unlock(&proto_list_mutex);
3986}
3987
3988static char proto_method_implemented(const void *method)
3989{
3990 return method == NULL ? 'n' : 'y';
3991}
3992static long sock_prot_memory_allocated(struct proto *proto)
3993{
3994 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3995}
3996
3997static const char *sock_prot_memory_pressure(struct proto *proto)
3998{
3999 return proto->memory_pressure != NULL ?
4000 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4001}
4002
4003static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4004{
4005
4006 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
4007 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4008 proto->name,
4009 proto->obj_size,
4010 sock_prot_inuse_get(seq_file_net(seq), proto),
4011 sock_prot_memory_allocated(proto),
4012 sock_prot_memory_pressure(proto),
4013 proto->max_header,
4014 proto->slab == NULL ? "no" : "yes",
4015 module_name(proto->owner),
4016 proto_method_implemented(proto->close),
4017 proto_method_implemented(proto->connect),
4018 proto_method_implemented(proto->disconnect),
4019 proto_method_implemented(proto->accept),
4020 proto_method_implemented(proto->ioctl),
4021 proto_method_implemented(proto->init),
4022 proto_method_implemented(proto->destroy),
4023 proto_method_implemented(proto->shutdown),
4024 proto_method_implemented(proto->setsockopt),
4025 proto_method_implemented(proto->getsockopt),
4026 proto_method_implemented(proto->sendmsg),
4027 proto_method_implemented(proto->recvmsg),
4028 proto_method_implemented(proto->sendpage),
4029 proto_method_implemented(proto->bind),
4030 proto_method_implemented(proto->backlog_rcv),
4031 proto_method_implemented(proto->hash),
4032 proto_method_implemented(proto->unhash),
4033 proto_method_implemented(proto->get_port),
4034 proto_method_implemented(proto->enter_memory_pressure));
4035}
4036
4037static int proto_seq_show(struct seq_file *seq, void *v)
4038{
4039 if (v == &proto_list)
4040 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4041 "protocol",
4042 "size",
4043 "sockets",
4044 "memory",
4045 "press",
4046 "maxhdr",
4047 "slab",
4048 "module",
4049 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
4050 else
4051 proto_seq_printf(seq, list_entry(v, struct proto, node));
4052 return 0;
4053}
4054
4055static const struct seq_operations proto_seq_ops = {
4056 .start = proto_seq_start,
4057 .next = proto_seq_next,
4058 .stop = proto_seq_stop,
4059 .show = proto_seq_show,
4060};
4061
4062static __net_init int proto_init_net(struct net *net)
4063{
4064 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4065 sizeof(struct seq_net_private)))
4066 return -ENOMEM;
4067
4068 return 0;
4069}
4070
4071static __net_exit void proto_exit_net(struct net *net)
4072{
4073 remove_proc_entry("protocols", net->proc_net);
4074}
4075
4076
4077static __net_initdata struct pernet_operations proto_net_ops = {
4078 .init = proto_init_net,
4079 .exit = proto_exit_net,
4080};
4081
4082static int __init proto_init(void)
4083{
4084 return register_pernet_subsys(&proto_net_ops);
4085}
4086
4087subsys_initcall(proto_init);
4088
4089#endif /* PROC_FS */
4090
4091#ifdef CONFIG_NET_RX_BUSY_POLL
4092bool sk_busy_loop_end(void *p, unsigned long start_time)
4093{
4094 struct sock *sk = p;
4095
4096 return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4097 sk_busy_loop_timeout(sk, start_time);
4098}
4099EXPORT_SYMBOL(sk_busy_loop_end);
4100#endif /* CONFIG_NET_RX_BUSY_POLL */
4101
4102int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4103{
4104 if (!sk->sk_prot->bind_add)
4105 return -EOPNOTSUPP;
4106 return sk->sk_prot->bind_add(sk, addr, addr_len);
4107}
4108EXPORT_SYMBOL(sock_bind_add);