net/core/sock.c at v5.0-rc7

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / net / core / sock.c
at v5.0-rc7 3436 lines 84 kB view raw
wrap content
   1/*
   2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   3 *		operating system.  INET is implemented using the  BSD Socket
   4 *		interface as the means of communication with the user level.
   5 *
   6 *		Generic socket support routines. Memory allocators, socket lock/release
   7 *		handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:	Ross Biro
  11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *		Florian La Roche, <flla@stud.uni-sb.de>
  13 *		Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *		Alan Cox	: 	Numerous verify_area() problems
  17 *		Alan Cox	:	Connecting on a connecting socket
  18 *					now returns an error for tcp.
  19 *		Alan Cox	:	sock->protocol is set correctly.
  20 *					and is not sometimes left as 0.
  21 *		Alan Cox	:	connect handles icmp errors on a
  22 *					connect properly. Unfortunately there
  23 *					is a restart syscall nasty there. I
  24 *					can't match BSD without hacking the C
  25 *					library. Ideas urgently sought!
  26 *		Alan Cox	:	Disallow bind() to addresses that are
  27 *					not ours - especially broadcast ones!!
  28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
  29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
  30 *					instead they leave that for the DESTROY timer.
  31 *		Alan Cox	:	Clean up error flag in accept
  32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
  33 *					was buggy. Put a remove_sock() in the handler
  34 *					for memory when we hit 0. Also altered the timer
  35 *					code. The ACK stuff can wait and needs major
  36 *					TCP layer surgery.
  37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
  38 *					and fixed timer/inet_bh race.
  39 *		Alan Cox	:	Added zapped flag for TCP
  40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
  41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
  46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
  47 *	Pauline Middelink	:	identd support
  48 *		Alan Cox	:	Fixed connect() taking signals I think.
  49 *		Alan Cox	:	SO_LINGER supported
  50 *		Alan Cox	:	Error reporting fixes
  51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
  52 *		Alan Cox	:	inet sockets don't set sk->type!
  53 *		Alan Cox	:	Split socket option code
  54 *		Alan Cox	:	Callbacks
  55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
  56 *		Alex		:	Removed restriction on inet fioctl
  57 *		Alan Cox	:	Splitting INET from NET core
  58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
  59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *		Alan Cox	:	Split IP from generic code
  61 *		Alan Cox	:	New kfree_skbmem()
  62 *		Alan Cox	:	Make SO_DEBUG superuser only.
  63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
  64 *					(compatibility fix)
  65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
  66 *		Alan Cox	:	Allocator for a socket is settable.
  67 *		Alan Cox	:	SO_ERROR includes soft errors.
  68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
  69 *		Alan Cox	: 	Generic socket allocation to make hooks
  70 *					easier (suggested by Craig Metz).
  71 *		Michael Pall	:	SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
  79 *		Andi Kleen	:	Fix write_space callback
  80 *		Chris Evans	:	Security fixes - signedness again
  81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *		This program is free software; you can redistribute it and/or
  87 *		modify it under the terms of the GNU General Public License
  88 *		as published by the Free Software Foundation; either version
  89 *		2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <asm/unaligned.h>
  95#include <linux/capability.h>
  96#include <linux/errno.h>
  97#include <linux/errqueue.h>
  98#include <linux/types.h>
  99#include <linux/socket.h>
 100#include <linux/in.h>
 101#include <linux/kernel.h>
 102#include <linux/module.h>
 103#include <linux/proc_fs.h>
 104#include <linux/seq_file.h>
 105#include <linux/sched.h>
 106#include <linux/sched/mm.h>
 107#include <linux/timer.h>
 108#include <linux/string.h>
 109#include <linux/sockios.h>
 110#include <linux/net.h>
 111#include <linux/mm.h>
 112#include <linux/slab.h>
 113#include <linux/interrupt.h>
 114#include <linux/poll.h>
 115#include <linux/tcp.h>
 116#include <linux/init.h>
 117#include <linux/highmem.h>
 118#include <linux/user_namespace.h>
 119#include <linux/static_key.h>
 120#include <linux/memcontrol.h>
 121#include <linux/prefetch.h>
 122
 123#include <linux/uaccess.h>
 124
 125#include <linux/netdevice.h>
 126#include <net/protocol.h>
 127#include <linux/skbuff.h>
 128#include <net/net_namespace.h>
 129#include <net/request_sock.h>
 130#include <net/sock.h>
 131#include <linux/net_tstamp.h>
 132#include <net/xfrm.h>
 133#include <linux/ipsec.h>
 134#include <net/cls_cgroup.h>
 135#include <net/netprio_cgroup.h>
 136#include <linux/sock_diag.h>
 137
 138#include <linux/filter.h>
 139#include <net/sock_reuseport.h>
 140
 141#include <trace/events/sock.h>
 142
 143#include <net/tcp.h>
 144#include <net/busy_poll.h>
 145
 146static DEFINE_MUTEX(proto_list_mutex);
 147static LIST_HEAD(proto_list);
 148
 149static void sock_inuse_add(struct net *net, int val);
 150
 151/**
 152 * sk_ns_capable - General socket capability test
 153 * @sk: Socket to use a capability on or through
 154 * @user_ns: The user namespace of the capability to use
 155 * @cap: The capability to use
 156 *
 157 * Test to see if the opener of the socket had when the socket was
 158 * created and the current process has the capability @cap in the user
 159 * namespace @user_ns.
 160 */
 161bool sk_ns_capable(const struct sock *sk,
 162		   struct user_namespace *user_ns, int cap)
 163{
 164	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 165		ns_capable(user_ns, cap);
 166}
 167EXPORT_SYMBOL(sk_ns_capable);
 168
 169/**
 170 * sk_capable - Socket global capability test
 171 * @sk: Socket to use a capability on or through
 172 * @cap: The global capability to use
 173 *
 174 * Test to see if the opener of the socket had when the socket was
 175 * created and the current process has the capability @cap in all user
 176 * namespaces.
 177 */
 178bool sk_capable(const struct sock *sk, int cap)
 179{
 180	return sk_ns_capable(sk, &init_user_ns, cap);
 181}
 182EXPORT_SYMBOL(sk_capable);
 183
 184/**
 185 * sk_net_capable - Network namespace socket capability test
 186 * @sk: Socket to use a capability on or through
 187 * @cap: The capability to use
 188 *
 189 * Test to see if the opener of the socket had when the socket was created
 190 * and the current process has the capability @cap over the network namespace
 191 * the socket is a member of.
 192 */
 193bool sk_net_capable(const struct sock *sk, int cap)
 194{
 195	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 196}
 197EXPORT_SYMBOL(sk_net_capable);
 198
 199/*
 200 * Each address family might have different locking rules, so we have
 201 * one slock key per address family and separate keys for internal and
 202 * userspace sockets.
 203 */
 204static struct lock_class_key af_family_keys[AF_MAX];
 205static struct lock_class_key af_family_kern_keys[AF_MAX];
 206static struct lock_class_key af_family_slock_keys[AF_MAX];
 207static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 208
 209/*
 210 * Make lock validator output more readable. (we pre-construct these
 211 * strings build-time, so that runtime initialization of socket
 212 * locks is fast):
 213 */
 214
 215#define _sock_locks(x)						  \
 216  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
 217  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
 218  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
 219  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
 220  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
 221  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
 222  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
 223  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
 224  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
 225  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
 226  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
 227  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
 228  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
 229  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
 230  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
 231  x "AF_MAX"
 232
 233static const char *const af_family_key_strings[AF_MAX+1] = {
 234	_sock_locks("sk_lock-")
 235};
 236static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 237	_sock_locks("slock-")
 238};
 239static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 240	_sock_locks("clock-")
 241};
 242
 243static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 244	_sock_locks("k-sk_lock-")
 245};
 246static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 247	_sock_locks("k-slock-")
 248};
 249static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 250	_sock_locks("k-clock-")
 251};
 252static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 253	_sock_locks("rlock-")
 254};
 255static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 256	_sock_locks("wlock-")
 257};
 258static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 259	_sock_locks("elock-")
 260};
 261
 262/*
 263 * sk_callback_lock and sk queues locking rules are per-address-family,
 264 * so split the lock classes by using a per-AF key:
 265 */
 266static struct lock_class_key af_callback_keys[AF_MAX];
 267static struct lock_class_key af_rlock_keys[AF_MAX];
 268static struct lock_class_key af_wlock_keys[AF_MAX];
 269static struct lock_class_key af_elock_keys[AF_MAX];
 270static struct lock_class_key af_kern_callback_keys[AF_MAX];
 271
 272/* Run time adjustable parameters. */
 273__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 274EXPORT_SYMBOL(sysctl_wmem_max);
 275__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 276EXPORT_SYMBOL(sysctl_rmem_max);
 277__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 278__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 279
 280/* Maximal space eaten by iovec or ancillary data plus some space */
 281int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 282EXPORT_SYMBOL(sysctl_optmem_max);
 283
 284int sysctl_tstamp_allow_data __read_mostly = 1;
 285
 286DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 287EXPORT_SYMBOL_GPL(memalloc_socks_key);
 288
 289/**
 290 * sk_set_memalloc - sets %SOCK_MEMALLOC
 291 * @sk: socket to set it on
 292 *
 293 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 294 * It's the responsibility of the admin to adjust min_free_kbytes
 295 * to meet the requirements
 296 */
 297void sk_set_memalloc(struct sock *sk)
 298{
 299	sock_set_flag(sk, SOCK_MEMALLOC);
 300	sk->sk_allocation |= __GFP_MEMALLOC;
 301	static_branch_inc(&memalloc_socks_key);
 302}
 303EXPORT_SYMBOL_GPL(sk_set_memalloc);
 304
 305void sk_clear_memalloc(struct sock *sk)
 306{
 307	sock_reset_flag(sk, SOCK_MEMALLOC);
 308	sk->sk_allocation &= ~__GFP_MEMALLOC;
 309	static_branch_dec(&memalloc_socks_key);
 310
 311	/*
 312	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 313	 * progress of swapping. SOCK_MEMALLOC may be cleared while
 314	 * it has rmem allocations due to the last swapfile being deactivated
 315	 * but there is a risk that the socket is unusable due to exceeding
 316	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
 317	 */
 318	sk_mem_reclaim(sk);
 319}
 320EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 321
 322int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 323{
 324	int ret;
 325	unsigned int noreclaim_flag;
 326
 327	/* these should have been dropped before queueing */
 328	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 329
 330	noreclaim_flag = memalloc_noreclaim_save();
 331	ret = sk->sk_backlog_rcv(sk, skb);
 332	memalloc_noreclaim_restore(noreclaim_flag);
 333
 334	return ret;
 335}
 336EXPORT_SYMBOL(__sk_backlog_rcv);
 337
 338static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 339{
 340	struct timeval tv;
 341
 342	if (optlen < sizeof(tv))
 343		return -EINVAL;
 344	if (copy_from_user(&tv, optval, sizeof(tv)))
 345		return -EFAULT;
 346	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 347		return -EDOM;
 348
 349	if (tv.tv_sec < 0) {
 350		static int warned __read_mostly;
 351
 352		*timeo_p = 0;
 353		if (warned < 10 && net_ratelimit()) {
 354			warned++;
 355			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 356				__func__, current->comm, task_pid_nr(current));
 357		}
 358		return 0;
 359	}
 360	*timeo_p = MAX_SCHEDULE_TIMEOUT;
 361	if (tv.tv_sec == 0 && tv.tv_usec == 0)
 362		return 0;
 363	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 364		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 365	return 0;
 366}
 367
 368static void sock_warn_obsolete_bsdism(const char *name)
 369{
 370	static int warned;
 371	static char warncomm[TASK_COMM_LEN];
 372	if (strcmp(warncomm, current->comm) && warned < 5) {
 373		strcpy(warncomm,  current->comm);
 374		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 375			warncomm, name);
 376		warned++;
 377	}
 378}
 379
 380static bool sock_needs_netstamp(const struct sock *sk)
 381{
 382	switch (sk->sk_family) {
 383	case AF_UNSPEC:
 384	case AF_UNIX:
 385		return false;
 386	default:
 387		return true;
 388	}
 389}
 390
 391static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 392{
 393	if (sk->sk_flags & flags) {
 394		sk->sk_flags &= ~flags;
 395		if (sock_needs_netstamp(sk) &&
 396		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 397			net_disable_timestamp();
 398	}
 399}
 400
 401
 402int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 403{
 404	unsigned long flags;
 405	struct sk_buff_head *list = &sk->sk_receive_queue;
 406
 407	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 408		atomic_inc(&sk->sk_drops);
 409		trace_sock_rcvqueue_full(sk, skb);
 410		return -ENOMEM;
 411	}
 412
 413	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 414		atomic_inc(&sk->sk_drops);
 415		return -ENOBUFS;
 416	}
 417
 418	skb->dev = NULL;
 419	skb_set_owner_r(skb, sk);
 420
 421	/* we escape from rcu protected region, make sure we dont leak
 422	 * a norefcounted dst
 423	 */
 424	skb_dst_force(skb);
 425
 426	spin_lock_irqsave(&list->lock, flags);
 427	sock_skb_set_dropcount(sk, skb);
 428	__skb_queue_tail(list, skb);
 429	spin_unlock_irqrestore(&list->lock, flags);
 430
 431	if (!sock_flag(sk, SOCK_DEAD))
 432		sk->sk_data_ready(sk);
 433	return 0;
 434}
 435EXPORT_SYMBOL(__sock_queue_rcv_skb);
 436
 437int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 438{
 439	int err;
 440
 441	err = sk_filter(sk, skb);
 442	if (err)
 443		return err;
 444
 445	return __sock_queue_rcv_skb(sk, skb);
 446}
 447EXPORT_SYMBOL(sock_queue_rcv_skb);
 448
 449int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 450		     const int nested, unsigned int trim_cap, bool refcounted)
 451{
 452	int rc = NET_RX_SUCCESS;
 453
 454	if (sk_filter_trim_cap(sk, skb, trim_cap))
 455		goto discard_and_relse;
 456
 457	skb->dev = NULL;
 458
 459	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 460		atomic_inc(&sk->sk_drops);
 461		goto discard_and_relse;
 462	}
 463	if (nested)
 464		bh_lock_sock_nested(sk);
 465	else
 466		bh_lock_sock(sk);
 467	if (!sock_owned_by_user(sk)) {
 468		/*
 469		 * trylock + unlock semantics:
 470		 */
 471		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 472
 473		rc = sk_backlog_rcv(sk, skb);
 474
 475		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 476	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 477		bh_unlock_sock(sk);
 478		atomic_inc(&sk->sk_drops);
 479		goto discard_and_relse;
 480	}
 481
 482	bh_unlock_sock(sk);
 483out:
 484	if (refcounted)
 485		sock_put(sk);
 486	return rc;
 487discard_and_relse:
 488	kfree_skb(skb);
 489	goto out;
 490}
 491EXPORT_SYMBOL(__sk_receive_skb);
 492
 493struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 494{
 495	struct dst_entry *dst = __sk_dst_get(sk);
 496
 497	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 498		sk_tx_queue_clear(sk);
 499		sk->sk_dst_pending_confirm = 0;
 500		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 501		dst_release(dst);
 502		return NULL;
 503	}
 504
 505	return dst;
 506}
 507EXPORT_SYMBOL(__sk_dst_check);
 508
 509struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 510{
 511	struct dst_entry *dst = sk_dst_get(sk);
 512
 513	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 514		sk_dst_reset(sk);
 515		dst_release(dst);
 516		return NULL;
 517	}
 518
 519	return dst;
 520}
 521EXPORT_SYMBOL(sk_dst_check);
 522
 523static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 524				int optlen)
 525{
 526	int ret = -ENOPROTOOPT;
 527#ifdef CONFIG_NETDEVICES
 528	struct net *net = sock_net(sk);
 529	char devname[IFNAMSIZ];
 530	int index;
 531
 532	/* Sorry... */
 533	ret = -EPERM;
 534	if (!ns_capable(net->user_ns, CAP_NET_RAW))
 535		goto out;
 536
 537	ret = -EINVAL;
 538	if (optlen < 0)
 539		goto out;
 540
 541	/* Bind this socket to a particular device like "eth0",
 542	 * as specified in the passed interface name. If the
 543	 * name is "" or the option length is zero the socket
 544	 * is not bound.
 545	 */
 546	if (optlen > IFNAMSIZ - 1)
 547		optlen = IFNAMSIZ - 1;
 548	memset(devname, 0, sizeof(devname));
 549
 550	ret = -EFAULT;
 551	if (copy_from_user(devname, optval, optlen))
 552		goto out;
 553
 554	index = 0;
 555	if (devname[0] != '\0') {
 556		struct net_device *dev;
 557
 558		rcu_read_lock();
 559		dev = dev_get_by_name_rcu(net, devname);
 560		if (dev)
 561			index = dev->ifindex;
 562		rcu_read_unlock();
 563		ret = -ENODEV;
 564		if (!dev)
 565			goto out;
 566	}
 567
 568	lock_sock(sk);
 569	sk->sk_bound_dev_if = index;
 570	if (sk->sk_prot->rehash)
 571		sk->sk_prot->rehash(sk);
 572	sk_dst_reset(sk);
 573	release_sock(sk);
 574
 575	ret = 0;
 576
 577out:
 578#endif
 579
 580	return ret;
 581}
 582
 583static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 584				int __user *optlen, int len)
 585{
 586	int ret = -ENOPROTOOPT;
 587#ifdef CONFIG_NETDEVICES
 588	struct net *net = sock_net(sk);
 589	char devname[IFNAMSIZ];
 590
 591	if (sk->sk_bound_dev_if == 0) {
 592		len = 0;
 593		goto zero;
 594	}
 595
 596	ret = -EINVAL;
 597	if (len < IFNAMSIZ)
 598		goto out;
 599
 600	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 601	if (ret)
 602		goto out;
 603
 604	len = strlen(devname) + 1;
 605
 606	ret = -EFAULT;
 607	if (copy_to_user(optval, devname, len))
 608		goto out;
 609
 610zero:
 611	ret = -EFAULT;
 612	if (put_user(len, optlen))
 613		goto out;
 614
 615	ret = 0;
 616
 617out:
 618#endif
 619
 620	return ret;
 621}
 622
 623static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 624{
 625	if (valbool)
 626		sock_set_flag(sk, bit);
 627	else
 628		sock_reset_flag(sk, bit);
 629}
 630
 631bool sk_mc_loop(struct sock *sk)
 632{
 633	if (dev_recursion_level())
 634		return false;
 635	if (!sk)
 636		return true;
 637	switch (sk->sk_family) {
 638	case AF_INET:
 639		return inet_sk(sk)->mc_loop;
 640#if IS_ENABLED(CONFIG_IPV6)
 641	case AF_INET6:
 642		return inet6_sk(sk)->mc_loop;
 643#endif
 644	}
 645	WARN_ON(1);
 646	return true;
 647}
 648EXPORT_SYMBOL(sk_mc_loop);
 649
 650/*
 651 *	This is meant for all protocols to use and covers goings on
 652 *	at the socket level. Everything here is generic.
 653 */
 654
 655int sock_setsockopt(struct socket *sock, int level, int optname,
 656		    char __user *optval, unsigned int optlen)
 657{
 658	struct sock_txtime sk_txtime;
 659	struct sock *sk = sock->sk;
 660	int val;
 661	int valbool;
 662	struct linger ling;
 663	int ret = 0;
 664
 665	/*
 666	 *	Options without arguments
 667	 */
 668
 669	if (optname == SO_BINDTODEVICE)
 670		return sock_setbindtodevice(sk, optval, optlen);
 671
 672	if (optlen < sizeof(int))
 673		return -EINVAL;
 674
 675	if (get_user(val, (int __user *)optval))
 676		return -EFAULT;
 677
 678	valbool = val ? 1 : 0;
 679
 680	lock_sock(sk);
 681
 682	switch (optname) {
 683	case SO_DEBUG:
 684		if (val && !capable(CAP_NET_ADMIN))
 685			ret = -EACCES;
 686		else
 687			sock_valbool_flag(sk, SOCK_DBG, valbool);
 688		break;
 689	case SO_REUSEADDR:
 690		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 691		break;
 692	case SO_REUSEPORT:
 693		sk->sk_reuseport = valbool;
 694		break;
 695	case SO_TYPE:
 696	case SO_PROTOCOL:
 697	case SO_DOMAIN:
 698	case SO_ERROR:
 699		ret = -ENOPROTOOPT;
 700		break;
 701	case SO_DONTROUTE:
 702		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 703		sk_dst_reset(sk);
 704		break;
 705	case SO_BROADCAST:
 706		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 707		break;
 708	case SO_SNDBUF:
 709		/* Don't error on this BSD doesn't and if you think
 710		 * about it this is right. Otherwise apps have to
 711		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 712		 * are treated in BSD as hints
 713		 */
 714		val = min_t(u32, val, sysctl_wmem_max);
 715set_sndbuf:
 716		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 717		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 718		/* Wake up sending tasks if we upped the value. */
 719		sk->sk_write_space(sk);
 720		break;
 721
 722	case SO_SNDBUFFORCE:
 723		if (!capable(CAP_NET_ADMIN)) {
 724			ret = -EPERM;
 725			break;
 726		}
 727		goto set_sndbuf;
 728
 729	case SO_RCVBUF:
 730		/* Don't error on this BSD doesn't and if you think
 731		 * about it this is right. Otherwise apps have to
 732		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 733		 * are treated in BSD as hints
 734		 */
 735		val = min_t(u32, val, sysctl_rmem_max);
 736set_rcvbuf:
 737		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 738		/*
 739		 * We double it on the way in to account for
 740		 * "struct sk_buff" etc. overhead.   Applications
 741		 * assume that the SO_RCVBUF setting they make will
 742		 * allow that much actual data to be received on that
 743		 * socket.
 744		 *
 745		 * Applications are unaware that "struct sk_buff" and
 746		 * other overheads allocate from the receive buffer
 747		 * during socket buffer allocation.
 748		 *
 749		 * And after considering the possible alternatives,
 750		 * returning the value we actually used in getsockopt
 751		 * is the most desirable behavior.
 752		 */
 753		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 754		break;
 755
 756	case SO_RCVBUFFORCE:
 757		if (!capable(CAP_NET_ADMIN)) {
 758			ret = -EPERM;
 759			break;
 760		}
 761		goto set_rcvbuf;
 762
 763	case SO_KEEPALIVE:
 764		if (sk->sk_prot->keepalive)
 765			sk->sk_prot->keepalive(sk, valbool);
 766		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 767		break;
 768
 769	case SO_OOBINLINE:
 770		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 771		break;
 772
 773	case SO_NO_CHECK:
 774		sk->sk_no_check_tx = valbool;
 775		break;
 776
 777	case SO_PRIORITY:
 778		if ((val >= 0 && val <= 6) ||
 779		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 780			sk->sk_priority = val;
 781		else
 782			ret = -EPERM;
 783		break;
 784
 785	case SO_LINGER:
 786		if (optlen < sizeof(ling)) {
 787			ret = -EINVAL;	/* 1003.1g */
 788			break;
 789		}
 790		if (copy_from_user(&ling, optval, sizeof(ling))) {
 791			ret = -EFAULT;
 792			break;
 793		}
 794		if (!ling.l_onoff)
 795			sock_reset_flag(sk, SOCK_LINGER);
 796		else {
 797#if (BITS_PER_LONG == 32)
 798			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 799				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 800			else
 801#endif
 802				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 803			sock_set_flag(sk, SOCK_LINGER);
 804		}
 805		break;
 806
 807	case SO_BSDCOMPAT:
 808		sock_warn_obsolete_bsdism("setsockopt");
 809		break;
 810
 811	case SO_PASSCRED:
 812		if (valbool)
 813			set_bit(SOCK_PASSCRED, &sock->flags);
 814		else
 815			clear_bit(SOCK_PASSCRED, &sock->flags);
 816		break;
 817
 818	case SO_TIMESTAMP:
 819	case SO_TIMESTAMPNS:
 820		if (valbool)  {
 821			if (optname == SO_TIMESTAMP)
 822				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 823			else
 824				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 825			sock_set_flag(sk, SOCK_RCVTSTAMP);
 826			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 827		} else {
 828			sock_reset_flag(sk, SOCK_RCVTSTAMP);
 829			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 830		}
 831		break;
 832
 833	case SO_TIMESTAMPING:
 834		if (val & ~SOF_TIMESTAMPING_MASK) {
 835			ret = -EINVAL;
 836			break;
 837		}
 838
 839		if (val & SOF_TIMESTAMPING_OPT_ID &&
 840		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 841			if (sk->sk_protocol == IPPROTO_TCP &&
 842			    sk->sk_type == SOCK_STREAM) {
 843				if ((1 << sk->sk_state) &
 844				    (TCPF_CLOSE | TCPF_LISTEN)) {
 845					ret = -EINVAL;
 846					break;
 847				}
 848				sk->sk_tskey = tcp_sk(sk)->snd_una;
 849			} else {
 850				sk->sk_tskey = 0;
 851			}
 852		}
 853
 854		if (val & SOF_TIMESTAMPING_OPT_STATS &&
 855		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 856			ret = -EINVAL;
 857			break;
 858		}
 859
 860		sk->sk_tsflags = val;
 861		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 862			sock_enable_timestamp(sk,
 863					      SOCK_TIMESTAMPING_RX_SOFTWARE);
 864		else
 865			sock_disable_timestamp(sk,
 866					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 867		break;
 868
 869	case SO_RCVLOWAT:
 870		if (val < 0)
 871			val = INT_MAX;
 872		if (sock->ops->set_rcvlowat)
 873			ret = sock->ops->set_rcvlowat(sk, val);
 874		else
 875			sk->sk_rcvlowat = val ? : 1;
 876		break;
 877
 878	case SO_RCVTIMEO:
 879		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 880		break;
 881
 882	case SO_SNDTIMEO:
 883		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 884		break;
 885
 886	case SO_ATTACH_FILTER:
 887		ret = -EINVAL;
 888		if (optlen == sizeof(struct sock_fprog)) {
 889			struct sock_fprog fprog;
 890
 891			ret = -EFAULT;
 892			if (copy_from_user(&fprog, optval, sizeof(fprog)))
 893				break;
 894
 895			ret = sk_attach_filter(&fprog, sk);
 896		}
 897		break;
 898
 899	case SO_ATTACH_BPF:
 900		ret = -EINVAL;
 901		if (optlen == sizeof(u32)) {
 902			u32 ufd;
 903
 904			ret = -EFAULT;
 905			if (copy_from_user(&ufd, optval, sizeof(ufd)))
 906				break;
 907
 908			ret = sk_attach_bpf(ufd, sk);
 909		}
 910		break;
 911
 912	case SO_ATTACH_REUSEPORT_CBPF:
 913		ret = -EINVAL;
 914		if (optlen == sizeof(struct sock_fprog)) {
 915			struct sock_fprog fprog;
 916
 917			ret = -EFAULT;
 918			if (copy_from_user(&fprog, optval, sizeof(fprog)))
 919				break;
 920
 921			ret = sk_reuseport_attach_filter(&fprog, sk);
 922		}
 923		break;
 924
 925	case SO_ATTACH_REUSEPORT_EBPF:
 926		ret = -EINVAL;
 927		if (optlen == sizeof(u32)) {
 928			u32 ufd;
 929
 930			ret = -EFAULT;
 931			if (copy_from_user(&ufd, optval, sizeof(ufd)))
 932				break;
 933
 934			ret = sk_reuseport_attach_bpf(ufd, sk);
 935		}
 936		break;
 937
 938	case SO_DETACH_FILTER:
 939		ret = sk_detach_filter(sk);
 940		break;
 941
 942	case SO_LOCK_FILTER:
 943		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 944			ret = -EPERM;
 945		else
 946			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 947		break;
 948
 949	case SO_PASSSEC:
 950		if (valbool)
 951			set_bit(SOCK_PASSSEC, &sock->flags);
 952		else
 953			clear_bit(SOCK_PASSSEC, &sock->flags);
 954		break;
 955	case SO_MARK:
 956		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
 957			ret = -EPERM;
 958		} else if (val != sk->sk_mark) {
 959			sk->sk_mark = val;
 960			sk_dst_reset(sk);
 961		}
 962		break;
 963
 964	case SO_RXQ_OVFL:
 965		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 966		break;
 967
 968	case SO_WIFI_STATUS:
 969		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 970		break;
 971
 972	case SO_PEEK_OFF:
 973		if (sock->ops->set_peek_off)
 974			ret = sock->ops->set_peek_off(sk, val);
 975		else
 976			ret = -EOPNOTSUPP;
 977		break;
 978
 979	case SO_NOFCS:
 980		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 981		break;
 982
 983	case SO_SELECT_ERR_QUEUE:
 984		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 985		break;
 986
 987#ifdef CONFIG_NET_RX_BUSY_POLL
 988	case SO_BUSY_POLL:
 989		/* allow unprivileged users to decrease the value */
 990		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 991			ret = -EPERM;
 992		else {
 993			if (val < 0)
 994				ret = -EINVAL;
 995			else
 996				sk->sk_ll_usec = val;
 997		}
 998		break;
 999#endif
1000
1001	case SO_MAX_PACING_RATE:
1002		if (val != ~0U)
1003			cmpxchg(&sk->sk_pacing_status,
1004				SK_PACING_NONE,
1005				SK_PACING_NEEDED);
1006		sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
1007		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1008					 sk->sk_max_pacing_rate);
1009		break;
1010
1011	case SO_INCOMING_CPU:
1012		sk->sk_incoming_cpu = val;
1013		break;
1014
1015	case SO_CNX_ADVICE:
1016		if (val == 1)
1017			dst_negative_advice(sk);
1018		break;
1019
1020	case SO_ZEROCOPY:
1021		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1022			if (!((sk->sk_type == SOCK_STREAM &&
1023			       sk->sk_protocol == IPPROTO_TCP) ||
1024			      (sk->sk_type == SOCK_DGRAM &&
1025			       sk->sk_protocol == IPPROTO_UDP)))
1026				ret = -ENOTSUPP;
1027		} else if (sk->sk_family != PF_RDS) {
1028			ret = -ENOTSUPP;
1029		}
1030		if (!ret) {
1031			if (val < 0 || val > 1)
1032				ret = -EINVAL;
1033			else
1034				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1035		}
1036		break;
1037
1038	case SO_TXTIME:
1039		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1040			ret = -EPERM;
1041		} else if (optlen != sizeof(struct sock_txtime)) {
1042			ret = -EINVAL;
1043		} else if (copy_from_user(&sk_txtime, optval,
1044			   sizeof(struct sock_txtime))) {
1045			ret = -EFAULT;
1046		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1047			ret = -EINVAL;
1048		} else {
1049			sock_valbool_flag(sk, SOCK_TXTIME, true);
1050			sk->sk_clockid = sk_txtime.clockid;
1051			sk->sk_txtime_deadline_mode =
1052				!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1053			sk->sk_txtime_report_errors =
1054				!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1055		}
1056		break;
1057
1058	default:
1059		ret = -ENOPROTOOPT;
1060		break;
1061	}
1062	release_sock(sk);
1063	return ret;
1064}
1065EXPORT_SYMBOL(sock_setsockopt);
1066
1067
1068static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1069			  struct ucred *ucred)
1070{
1071	ucred->pid = pid_vnr(pid);
1072	ucred->uid = ucred->gid = -1;
1073	if (cred) {
1074		struct user_namespace *current_ns = current_user_ns();
1075
1076		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1077		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1078	}
1079}
1080
1081static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1082{
1083	struct user_namespace *user_ns = current_user_ns();
1084	int i;
1085
1086	for (i = 0; i < src->ngroups; i++)
1087		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1088			return -EFAULT;
1089
1090	return 0;
1091}
1092
1093int sock_getsockopt(struct socket *sock, int level, int optname,
1094		    char __user *optval, int __user *optlen)
1095{
1096	struct sock *sk = sock->sk;
1097
1098	union {
1099		int val;
1100		u64 val64;
1101		struct linger ling;
1102		struct timeval tm;
1103		struct sock_txtime txtime;
1104	} v;
1105
1106	int lv = sizeof(int);
1107	int len;
1108
1109	if (get_user(len, optlen))
1110		return -EFAULT;
1111	if (len < 0)
1112		return -EINVAL;
1113
1114	memset(&v, 0, sizeof(v));
1115
1116	switch (optname) {
1117	case SO_DEBUG:
1118		v.val = sock_flag(sk, SOCK_DBG);
1119		break;
1120
1121	case SO_DONTROUTE:
1122		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1123		break;
1124
1125	case SO_BROADCAST:
1126		v.val = sock_flag(sk, SOCK_BROADCAST);
1127		break;
1128
1129	case SO_SNDBUF:
1130		v.val = sk->sk_sndbuf;
1131		break;
1132
1133	case SO_RCVBUF:
1134		v.val = sk->sk_rcvbuf;
1135		break;
1136
1137	case SO_REUSEADDR:
1138		v.val = sk->sk_reuse;
1139		break;
1140
1141	case SO_REUSEPORT:
1142		v.val = sk->sk_reuseport;
1143		break;
1144
1145	case SO_KEEPALIVE:
1146		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1147		break;
1148
1149	case SO_TYPE:
1150		v.val = sk->sk_type;
1151		break;
1152
1153	case SO_PROTOCOL:
1154		v.val = sk->sk_protocol;
1155		break;
1156
1157	case SO_DOMAIN:
1158		v.val = sk->sk_family;
1159		break;
1160
1161	case SO_ERROR:
1162		v.val = -sock_error(sk);
1163		if (v.val == 0)
1164			v.val = xchg(&sk->sk_err_soft, 0);
1165		break;
1166
1167	case SO_OOBINLINE:
1168		v.val = sock_flag(sk, SOCK_URGINLINE);
1169		break;
1170
1171	case SO_NO_CHECK:
1172		v.val = sk->sk_no_check_tx;
1173		break;
1174
1175	case SO_PRIORITY:
1176		v.val = sk->sk_priority;
1177		break;
1178
1179	case SO_LINGER:
1180		lv		= sizeof(v.ling);
1181		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1182		v.ling.l_linger	= sk->sk_lingertime / HZ;
1183		break;
1184
1185	case SO_BSDCOMPAT:
1186		sock_warn_obsolete_bsdism("getsockopt");
1187		break;
1188
1189	case SO_TIMESTAMP:
1190		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1191				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1192		break;
1193
1194	case SO_TIMESTAMPNS:
1195		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1196		break;
1197
1198	case SO_TIMESTAMPING:
1199		v.val = sk->sk_tsflags;
1200		break;
1201
1202	case SO_RCVTIMEO:
1203		lv = sizeof(struct timeval);
1204		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1205			v.tm.tv_sec = 0;
1206			v.tm.tv_usec = 0;
1207		} else {
1208			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1209			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1210		}
1211		break;
1212
1213	case SO_SNDTIMEO:
1214		lv = sizeof(struct timeval);
1215		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1216			v.tm.tv_sec = 0;
1217			v.tm.tv_usec = 0;
1218		} else {
1219			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1220			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1221		}
1222		break;
1223
1224	case SO_RCVLOWAT:
1225		v.val = sk->sk_rcvlowat;
1226		break;
1227
1228	case SO_SNDLOWAT:
1229		v.val = 1;
1230		break;
1231
1232	case SO_PASSCRED:
1233		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1234		break;
1235
1236	case SO_PEERCRED:
1237	{
1238		struct ucred peercred;
1239		if (len > sizeof(peercred))
1240			len = sizeof(peercred);
1241		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1242		if (copy_to_user(optval, &peercred, len))
1243			return -EFAULT;
1244		goto lenout;
1245	}
1246
1247	case SO_PEERGROUPS:
1248	{
1249		int ret, n;
1250
1251		if (!sk->sk_peer_cred)
1252			return -ENODATA;
1253
1254		n = sk->sk_peer_cred->group_info->ngroups;
1255		if (len < n * sizeof(gid_t)) {
1256			len = n * sizeof(gid_t);
1257			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1258		}
1259		len = n * sizeof(gid_t);
1260
1261		ret = groups_to_user((gid_t __user *)optval,
1262				     sk->sk_peer_cred->group_info);
1263		if (ret)
1264			return ret;
1265		goto lenout;
1266	}
1267
1268	case SO_PEERNAME:
1269	{
1270		char address[128];
1271
1272		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1273		if (lv < 0)
1274			return -ENOTCONN;
1275		if (lv < len)
1276			return -EINVAL;
1277		if (copy_to_user(optval, address, len))
1278			return -EFAULT;
1279		goto lenout;
1280	}
1281
1282	/* Dubious BSD thing... Probably nobody even uses it, but
1283	 * the UNIX standard wants it for whatever reason... -DaveM
1284	 */
1285	case SO_ACCEPTCONN:
1286		v.val = sk->sk_state == TCP_LISTEN;
1287		break;
1288
1289	case SO_PASSSEC:
1290		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1291		break;
1292
1293	case SO_PEERSEC:
1294		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1295
1296	case SO_MARK:
1297		v.val = sk->sk_mark;
1298		break;
1299
1300	case SO_RXQ_OVFL:
1301		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1302		break;
1303
1304	case SO_WIFI_STATUS:
1305		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1306		break;
1307
1308	case SO_PEEK_OFF:
1309		if (!sock->ops->set_peek_off)
1310			return -EOPNOTSUPP;
1311
1312		v.val = sk->sk_peek_off;
1313		break;
1314	case SO_NOFCS:
1315		v.val = sock_flag(sk, SOCK_NOFCS);
1316		break;
1317
1318	case SO_BINDTODEVICE:
1319		return sock_getbindtodevice(sk, optval, optlen, len);
1320
1321	case SO_GET_FILTER:
1322		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1323		if (len < 0)
1324			return len;
1325
1326		goto lenout;
1327
1328	case SO_LOCK_FILTER:
1329		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1330		break;
1331
1332	case SO_BPF_EXTENSIONS:
1333		v.val = bpf_tell_extensions();
1334		break;
1335
1336	case SO_SELECT_ERR_QUEUE:
1337		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1338		break;
1339
1340#ifdef CONFIG_NET_RX_BUSY_POLL
1341	case SO_BUSY_POLL:
1342		v.val = sk->sk_ll_usec;
1343		break;
1344#endif
1345
1346	case SO_MAX_PACING_RATE:
1347		/* 32bit version */
1348		v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1349		break;
1350
1351	case SO_INCOMING_CPU:
1352		v.val = sk->sk_incoming_cpu;
1353		break;
1354
1355	case SO_MEMINFO:
1356	{
1357		u32 meminfo[SK_MEMINFO_VARS];
1358
1359		if (get_user(len, optlen))
1360			return -EFAULT;
1361
1362		sk_get_meminfo(sk, meminfo);
1363
1364		len = min_t(unsigned int, len, sizeof(meminfo));
1365		if (copy_to_user(optval, &meminfo, len))
1366			return -EFAULT;
1367
1368		goto lenout;
1369	}
1370
1371#ifdef CONFIG_NET_RX_BUSY_POLL
1372	case SO_INCOMING_NAPI_ID:
1373		v.val = READ_ONCE(sk->sk_napi_id);
1374
1375		/* aggregate non-NAPI IDs down to 0 */
1376		if (v.val < MIN_NAPI_ID)
1377			v.val = 0;
1378
1379		break;
1380#endif
1381
1382	case SO_COOKIE:
1383		lv = sizeof(u64);
1384		if (len < lv)
1385			return -EINVAL;
1386		v.val64 = sock_gen_cookie(sk);
1387		break;
1388
1389	case SO_ZEROCOPY:
1390		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1391		break;
1392
1393	case SO_TXTIME:
1394		lv = sizeof(v.txtime);
1395		v.txtime.clockid = sk->sk_clockid;
1396		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1397				  SOF_TXTIME_DEADLINE_MODE : 0;
1398		v.txtime.flags |= sk->sk_txtime_report_errors ?
1399				  SOF_TXTIME_REPORT_ERRORS : 0;
1400		break;
1401
1402	default:
1403		/* We implement the SO_SNDLOWAT etc to not be settable
1404		 * (1003.1g 7).
1405		 */
1406		return -ENOPROTOOPT;
1407	}
1408
1409	if (len > lv)
1410		len = lv;
1411	if (copy_to_user(optval, &v, len))
1412		return -EFAULT;
1413lenout:
1414	if (put_user(len, optlen))
1415		return -EFAULT;
1416	return 0;
1417}
1418
1419/*
1420 * Initialize an sk_lock.
1421 *
1422 * (We also register the sk_lock with the lock validator.)
1423 */
1424static inline void sock_lock_init(struct sock *sk)
1425{
1426	if (sk->sk_kern_sock)
1427		sock_lock_init_class_and_name(
1428			sk,
1429			af_family_kern_slock_key_strings[sk->sk_family],
1430			af_family_kern_slock_keys + sk->sk_family,
1431			af_family_kern_key_strings[sk->sk_family],
1432			af_family_kern_keys + sk->sk_family);
1433	else
1434		sock_lock_init_class_and_name(
1435			sk,
1436			af_family_slock_key_strings[sk->sk_family],
1437			af_family_slock_keys + sk->sk_family,
1438			af_family_key_strings[sk->sk_family],
1439			af_family_keys + sk->sk_family);
1440}
1441
1442/*
1443 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1444 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1445 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1446 */
1447static void sock_copy(struct sock *nsk, const struct sock *osk)
1448{
1449#ifdef CONFIG_SECURITY_NETWORK
1450	void *sptr = nsk->sk_security;
1451#endif
1452	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1453
1454	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1455	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1456
1457#ifdef CONFIG_SECURITY_NETWORK
1458	nsk->sk_security = sptr;
1459	security_sk_clone(osk, nsk);
1460#endif
1461}
1462
1463static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1464		int family)
1465{
1466	struct sock *sk;
1467	struct kmem_cache *slab;
1468
1469	slab = prot->slab;
1470	if (slab != NULL) {
1471		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1472		if (!sk)
1473			return sk;
1474		if (priority & __GFP_ZERO)
1475			sk_prot_clear_nulls(sk, prot->obj_size);
1476	} else
1477		sk = kmalloc(prot->obj_size, priority);
1478
1479	if (sk != NULL) {
1480		if (security_sk_alloc(sk, family, priority))
1481			goto out_free;
1482
1483		if (!try_module_get(prot->owner))
1484			goto out_free_sec;
1485		sk_tx_queue_clear(sk);
1486	}
1487
1488	return sk;
1489
1490out_free_sec:
1491	security_sk_free(sk);
1492out_free:
1493	if (slab != NULL)
1494		kmem_cache_free(slab, sk);
1495	else
1496		kfree(sk);
1497	return NULL;
1498}
1499
1500static void sk_prot_free(struct proto *prot, struct sock *sk)
1501{
1502	struct kmem_cache *slab;
1503	struct module *owner;
1504
1505	owner = prot->owner;
1506	slab = prot->slab;
1507
1508	cgroup_sk_free(&sk->sk_cgrp_data);
1509	mem_cgroup_sk_free(sk);
1510	security_sk_free(sk);
1511	if (slab != NULL)
1512		kmem_cache_free(slab, sk);
1513	else
1514		kfree(sk);
1515	module_put(owner);
1516}
1517
1518/**
1519 *	sk_alloc - All socket objects are allocated here
1520 *	@net: the applicable net namespace
1521 *	@family: protocol family
1522 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1523 *	@prot: struct proto associated with this new sock instance
1524 *	@kern: is this to be a kernel socket?
1525 */
1526struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1527		      struct proto *prot, int kern)
1528{
1529	struct sock *sk;
1530
1531	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1532	if (sk) {
1533		sk->sk_family = family;
1534		/*
1535		 * See comment in struct sock definition to understand
1536		 * why we need sk_prot_creator -acme
1537		 */
1538		sk->sk_prot = sk->sk_prot_creator = prot;
1539		sk->sk_kern_sock = kern;
1540		sock_lock_init(sk);
1541		sk->sk_net_refcnt = kern ? 0 : 1;
1542		if (likely(sk->sk_net_refcnt)) {
1543			get_net(net);
1544			sock_inuse_add(net, 1);
1545		}
1546
1547		sock_net_set(sk, net);
1548		refcount_set(&sk->sk_wmem_alloc, 1);
1549
1550		mem_cgroup_sk_alloc(sk);
1551		cgroup_sk_alloc(&sk->sk_cgrp_data);
1552		sock_update_classid(&sk->sk_cgrp_data);
1553		sock_update_netprioidx(&sk->sk_cgrp_data);
1554	}
1555
1556	return sk;
1557}
1558EXPORT_SYMBOL(sk_alloc);
1559
/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 *
 * Final teardown of a sock: runs the protocol destructor, releases the
 * attached filter, reuseport state, timestamping, page fragment, peer
 * credentials and netns reference, then frees the memory.
 * @head is the sk_rcu member embedded in the sock being destroyed.
 */
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	/* Protocol specific destructor first, if one is installed. */
	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       refcount_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}
	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	/* Option memory should be fully released by now; report it if not. */
	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	/* Only sockets that took a netns reference drop one. */
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	/* Free through the protocol that originally allocated the sock. */
	sk_prot_free(sk->sk_prot_creator, sk);
}
1598
1599void sk_destruct(struct sock *sk)
1600{
1601	if (sock_flag(sk, SOCK_RCU_FREE))
1602		call_rcu(&sk->sk_rcu, __sk_destruct);
1603	else
1604		__sk_destruct(&sk->sk_rcu);
1605}
1606
1607static void __sk_free(struct sock *sk)
1608{
1609	if (likely(sk->sk_net_refcnt))
1610		sock_inuse_add(sock_net(sk), -1);
1611
1612	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1613		sock_diag_broadcast_destroy(sk);
1614	else
1615		sk_destruct(sk);
1616}
1617
1618void sk_free(struct sock *sk)
1619{
1620	/*
1621	 * We subtract one from sk_wmem_alloc and can know if
1622	 * some packets are still in some tx queue.
1623	 * If not null, sock_wfree() will call __sk_free(sk) later
1624	 */
1625	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1626		__sk_free(sk);
1627}
1628EXPORT_SYMBOL(sk_free);
1629
1630static void sk_init_common(struct sock *sk)
1631{
1632	skb_queue_head_init(&sk->sk_receive_queue);
1633	skb_queue_head_init(&sk->sk_write_queue);
1634	skb_queue_head_init(&sk->sk_error_queue);
1635
1636	rwlock_init(&sk->sk_callback_lock);
1637	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1638			af_rlock_keys + sk->sk_family,
1639			af_family_rlock_key_strings[sk->sk_family]);
1640	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1641			af_wlock_keys + sk->sk_family,
1642			af_family_wlock_key_strings[sk->sk_family]);
1643	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1644			af_elock_keys + sk->sk_family,
1645			af_family_elock_key_strings[sk->sk_family]);
1646	lockdep_set_class_and_name(&sk->sk_callback_lock,
1647			af_callback_keys + sk->sk_family,
1648			af_family_clock_key_strings[sk->sk_family]);
1649}
1650
/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 *
 *	Returns the clone, bh-locked and with sk_refcnt set to 2, or NULL if
 *	the allocation, the filter charge or the xfrm policy clone failed.
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;
	/* Stays true when the parent has no filter to charge. */
	bool is_charged = true;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		/* Start from a raw copy of the parent, then reset per-socket state. */
		sock_copy(newsk, sk);

		newsk->sk_prot_creator = sk->sk_prot;

		/* SANITY */
		if (likely(newsk->sk_net_refcnt))
			get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		refcount_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		sk_init_common(newsk);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_dst_pending_confirm = 0;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		atomic_set(&newsk->sk_drops, 0);
		newsk->sk_send_head	= NULL;
		/* The clone inherits the user locks except the bound-port lock. */
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
		atomic_set(&newsk->sk_zckey, 0);

		sock_reset_flag(newsk, SOCK_DONE);
		mem_cgroup_sk_alloc(newsk);
		cgroup_sk_alloc(&newsk->sk_cgrp_data);

		rcu_read_lock();
		filter = rcu_dereference(sk->sk_filter);
		if (filter != NULL)
			/* though it's an empty new sock, the charging may fail
			 * if sysctl_optmem_max was changed between creation of
			 * original socket and cloning
			 */
			is_charged = sk_filter_charge(newsk, filter);
		RCU_INIT_POINTER(newsk->sk_filter, filter);
		rcu_read_unlock();

		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
			/* We need to make sure that we don't uncharge the new
			 * socket if we couldn't charge it in the first place
			 * as otherwise we uncharge the parent's filter.
			 */
			if (!is_charged)
				RCU_INIT_POINTER(newsk->sk_filter, NULL);
			sk_free_unlock_clone(newsk);
			newsk = NULL;
			goto out;
		}
		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

		newsk->sk_err	   = 0;
		newsk->sk_err_soft = 0;
		newsk->sk_priority = 0;
		newsk->sk_incoming_cpu = raw_smp_processor_id();
		atomic64_set(&newsk->sk_cookie, 0);
		if (likely(newsk->sk_net_refcnt))
			sock_inuse_add(sock_net(newsk), 1);

		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		refcount_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		/* The clone is not attached to any struct socket or wait queue yet. */
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (sock_needs_netstamp(sk) &&
		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
1766
/*
 * Dispose of a clone that is still bh-locked: clear its destructor,
 * release the bh lock and drop it with sk_free().
 */
void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still raw copy of parent, so invalidate
	 * destructor and make plain sk_free() */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1776
1777void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1778{
1779	u32 max_segs = 1;
1780
1781	sk_dst_set(sk, dst);
1782	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1783	if (sk->sk_route_caps & NETIF_F_GSO)
1784		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1785	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1786	if (sk_can_gso(sk)) {
1787		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1788			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1789		} else {
1790			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1791			sk->sk_gso_max_size = dst->dev->gso_max_size;
1792			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1793		}
1794	}
1795	sk->sk_gso_max_segs = max_segs;
1796}
1797EXPORT_SYMBOL_GPL(sk_setup_caps);
1798
1799/*
1800 *	Simple resource managers for sockets.
1801 */
1802
1803
1804/*
1805 * Write buffer destructor automatically called from kfree_skb.
1806 */
1807void sock_wfree(struct sk_buff *skb)
1808{
1809	struct sock *sk = skb->sk;
1810	unsigned int len = skb->truesize;
1811
1812	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1813		/*
1814		 * Keep a reference on sk_wmem_alloc, this will be released
1815		 * after sk_write_space() call
1816		 */
1817		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1818		sk->sk_write_space(sk);
1819		len = 1;
1820	}
1821	/*
1822	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1823	 * could not do because of in-flight packets
1824	 */
1825	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1826		__sk_free(sk);
1827}
1828EXPORT_SYMBOL(sock_wfree);
1829
1830/* This variant of sock_wfree() is used by TCP,
1831 * since it sets SOCK_USE_WRITE_QUEUE.
1832 */
1833void __sock_wfree(struct sk_buff *skb)
1834{
1835	struct sock *sk = skb->sk;
1836
1837	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1838		__sk_free(sk);
1839}
1840
1841void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1842{
1843	skb_orphan(skb);
1844	skb->sk = sk;
1845#ifdef CONFIG_INET
1846	if (unlikely(!sk_fullsock(sk))) {
1847		skb->destructor = sock_edemux;
1848		sock_hold(sk);
1849		return;
1850	}
1851#endif
1852	skb->destructor = sock_wfree;
1853	skb_set_hash_from_sk(skb, sk);
1854	/*
1855	 * We used to take a refcount on sk, but following operation
1856	 * is enough to guarantee sk_free() wont free this sock until
1857	 * all in-flight packets are completed
1858	 */
1859	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1860}
1861EXPORT_SYMBOL(skb_set_owner_w);
1862
1863/* This helper is used by netem, as it can hold packets in its
1864 * delay queue. We want to allow the owner socket to send more
1865 * packets, as if they were already TX completed by a typical driver.
1866 * But we also want to keep skb->sk set because some packet schedulers
1867 * rely on it (sch_fq for example).
1868 */
1869void skb_orphan_partial(struct sk_buff *skb)
1870{
1871	if (skb_is_tcp_pure_ack(skb))
1872		return;
1873
1874	if (skb->destructor == sock_wfree
1875#ifdef CONFIG_INET
1876	    || skb->destructor == tcp_wfree
1877#endif
1878		) {
1879		struct sock *sk = skb->sk;
1880
1881		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1882			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1883			skb->destructor = sock_efree;
1884		}
1885	} else {
1886		skb_orphan(skb);
1887	}
1888}
1889EXPORT_SYMBOL(skb_orphan_partial);
1890
1891/*
1892 * Read buffer destructor automatically called from kfree_skb.
1893 */
1894void sock_rfree(struct sk_buff *skb)
1895{
1896	struct sock *sk = skb->sk;
1897	unsigned int len = skb->truesize;
1898
1899	atomic_sub(len, &sk->sk_rmem_alloc);
1900	sk_mem_uncharge(sk, len);
1901}
1902EXPORT_SYMBOL(sock_rfree);
1903
/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 *
 * Only drops the reference on the owning sock; no memory accounting is
 * touched.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);
1913
1914kuid_t sock_i_uid(struct sock *sk)
1915{
1916	kuid_t uid;
1917
1918	read_lock_bh(&sk->sk_callback_lock);
1919	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1920	read_unlock_bh(&sk->sk_callback_lock);
1921	return uid;
1922}
1923EXPORT_SYMBOL(sock_i_uid);
1924
1925unsigned long sock_i_ino(struct sock *sk)
1926{
1927	unsigned long ino;
1928
1929	read_lock_bh(&sk->sk_callback_lock);
1930	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1931	read_unlock_bh(&sk->sk_callback_lock);
1932	return ino;
1933}
1934EXPORT_SYMBOL(sock_i_ino);
1935
1936/*
1937 * Allocate a skb from the socket's send buffer.
1938 */
1939struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1940			     gfp_t priority)
1941{
1942	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1943		struct sk_buff *skb = alloc_skb(size, priority);
1944		if (skb) {
1945			skb_set_owner_w(skb, sk);
1946			return skb;
1947		}
1948	}
1949	return NULL;
1950}
1951EXPORT_SYMBOL(sock_wmalloc);
1952
1953static void sock_ofree(struct sk_buff *skb)
1954{
1955	struct sock *sk = skb->sk;
1956
1957	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1958}
1959
1960struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1961			     gfp_t priority)
1962{
1963	struct sk_buff *skb;
1964
1965	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1966	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1967	    sysctl_optmem_max)
1968		return NULL;
1969
1970	skb = alloc_skb(size, priority);
1971	if (!skb)
1972		return NULL;
1973
1974	atomic_add(skb->truesize, &sk->sk_omem_alloc);
1975	skb->sk = sk;
1976	skb->destructor = sock_ofree;
1977	return skb;
1978}
1979
1980/*
1981 * Allocate a memory block from the socket's option memory buffer.
1982 */
1983void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1984{
1985	if ((unsigned int)size <= sysctl_optmem_max &&
1986	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1987		void *mem;
1988		/* First do the add, to avoid the race if kmalloc
1989		 * might sleep.
1990		 */
1991		atomic_add(size, &sk->sk_omem_alloc);
1992		mem = kmalloc(size, priority);
1993		if (mem)
1994			return mem;
1995		atomic_sub(size, &sk->sk_omem_alloc);
1996	}
1997	return NULL;
1998}
1999EXPORT_SYMBOL(sock_kmalloc);
2000
2001/* Free an option memory block. Note, we actually want the inline
2002 * here as this allows gcc to detect the nullify and fold away the
2003 * condition entirely.
2004 */
2005static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2006				  const bool nullify)
2007{
2008	if (WARN_ON_ONCE(!mem))
2009		return;
2010	if (nullify)
2011		kzfree(mem);
2012	else
2013		kfree(mem);
2014	atomic_sub(size, &sk->sk_omem_alloc);
2015}
2016
/*
 * Free an option memory block of @size bytes and uncharge it from @sk.
 * The memory is released with kfree() (nullify == false).
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);
2022
/*
 * Like sock_kfree_s(), but the block is released with kzfree()
 * (nullify == true).
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
2028
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 * I think these locks should be removed for datagram sockets.
 *
 * Sleeps interruptibly until sk_wmem_alloc drops below sk_sndbuf, the
 * send side is shut down, an error is pending, a signal arrives or
 * @timeo expires.  Returns the remaining timeout.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		/* Queue ourselves before re-checking the wake-up conditions. */
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}
2055
2056
2057/*
2058 *	Generic send/receive buffer handlers
2059 */
2060
2061struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2062				     unsigned long data_len, int noblock,
2063				     int *errcode, int max_page_order)
2064{
2065	struct sk_buff *skb;
2066	long timeo;
2067	int err;
2068
2069	timeo = sock_sndtimeo(sk, noblock);
2070	for (;;) {
2071		err = sock_error(sk);
2072		if (err != 0)
2073			goto failure;
2074
2075		err = -EPIPE;
2076		if (sk->sk_shutdown & SEND_SHUTDOWN)
2077			goto failure;
2078
2079		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2080			break;
2081
2082		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2083		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2084		err = -EAGAIN;
2085		if (!timeo)
2086			goto failure;
2087		if (signal_pending(current))
2088			goto interrupted;
2089		timeo = sock_wait_for_wmem(sk, timeo);
2090	}
2091	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2092				   errcode, sk->sk_allocation);
2093	if (skb)
2094		skb_set_owner_w(skb, sk);
2095	return skb;
2096
2097interrupted:
2098	err = sock_intr_errno(timeo);
2099failure:
2100	*errcode = err;
2101	return NULL;
2102}
2103EXPORT_SYMBOL(sock_alloc_send_pskb);
2104
/*
 * Linear-only variant of sock_alloc_send_pskb(): @size bytes of header
 * space, no paged data, page order 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
2111
2112int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2113		     struct sockcm_cookie *sockc)
2114{
2115	u32 tsflags;
2116
2117	switch (cmsg->cmsg_type) {
2118	case SO_MARK:
2119		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2120			return -EPERM;
2121		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2122			return -EINVAL;
2123		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2124		break;
2125	case SO_TIMESTAMPING:
2126		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2127			return -EINVAL;
2128
2129		tsflags = *(u32 *)CMSG_DATA(cmsg);
2130		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2131			return -EINVAL;
2132
2133		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2134		sockc->tsflags |= tsflags;
2135		break;
2136	case SCM_TXTIME:
2137		if (!sock_flag(sk, SOCK_TXTIME))
2138			return -EINVAL;
2139		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2140			return -EINVAL;
2141		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2142		break;
2143	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2144	case SCM_RIGHTS:
2145	case SCM_CREDENTIALS:
2146		break;
2147	default:
2148		return -EINVAL;
2149	}
2150	return 0;
2151}
2152EXPORT_SYMBOL(__sock_cmsg_send);
2153
2154int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2155		   struct sockcm_cookie *sockc)
2156{
2157	struct cmsghdr *cmsg;
2158	int ret;
2159
2160	for_each_cmsghdr(cmsg, msg) {
2161		if (!CMSG_OK(msg, cmsg))
2162			return -EINVAL;
2163		if (cmsg->cmsg_level != SOL_SOCKET)
2164			continue;
2165		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2166		if (ret)
2167			return ret;
2168	}
2169	return 0;
2170}
2171EXPORT_SYMBOL(sock_cmsg_send);
2172
2173static void sk_enter_memory_pressure(struct sock *sk)
2174{
2175	if (!sk->sk_prot->enter_memory_pressure)
2176		return;
2177
2178	sk->sk_prot->enter_memory_pressure(sk);
2179}
2180
2181static void sk_leave_memory_pressure(struct sock *sk)
2182{
2183	if (sk->sk_prot->leave_memory_pressure) {
2184		sk->sk_prot->leave_memory_pressure(sk);
2185	} else {
2186		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2187
2188		if (memory_pressure && *memory_pressure)
2189			*memory_pressure = 0;
2190	}
2191}
2192
2193/* On 32bit arches, an skb frag is limited to 2^15 */
2194#define SKB_FRAG_PAGE_ORDER	get_order(32768)
2195
2196/**
2197 * skb_page_frag_refill - check that a page_frag contains enough room
2198 * @sz: minimum size of the fragment we want to get
2199 * @pfrag: pointer to page_frag
2200 * @gfp: priority for memory allocation
2201 *
2202 * Note: While this allocator tries to use high order pages, there is
2203 * no guarantee that allocations succeed. Therefore, @sz MUST be
2204 * less or equal than PAGE_SIZE.
2205 */
2206bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2207{
2208	if (pfrag->page) {
2209		if (page_ref_count(pfrag->page) == 1) {
2210			pfrag->offset = 0;
2211			return true;
2212		}
2213		if (pfrag->offset + sz <= pfrag->size)
2214			return true;
2215		put_page(pfrag->page);
2216	}
2217
2218	pfrag->offset = 0;
2219	if (SKB_FRAG_PAGE_ORDER) {
2220		/* Avoid direct reclaim but allow kswapd to wake */
2221		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2222					  __GFP_COMP | __GFP_NOWARN |
2223					  __GFP_NORETRY,
2224					  SKB_FRAG_PAGE_ORDER);
2225		if (likely(pfrag->page)) {
2226			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2227			return true;
2228		}
2229	}
2230	pfrag->page = alloc_page(gfp);
2231	if (likely(pfrag->page)) {
2232		pfrag->size = PAGE_SIZE;
2233		return true;
2234	}
2235	return false;
2236}
2237EXPORT_SYMBOL(skb_page_frag_refill);
2238
2239bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2240{
2241	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2242		return true;
2243
2244	sk_enter_memory_pressure(sk);
2245	sk_stream_moderate_sndbuf(sk);
2246	return false;
2247}
2248EXPORT_SYMBOL(sk_page_frag_refill);
2249
2250static void __lock_sock(struct sock *sk)
2251	__releases(&sk->sk_lock.slock)
2252	__acquires(&sk->sk_lock.slock)
2253{
2254	DEFINE_WAIT(wait);
2255
2256	for (;;) {
2257		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2258					TASK_UNINTERRUPTIBLE);
2259		spin_unlock_bh(&sk->sk_lock.slock);
2260		schedule();
2261		spin_lock_bh(&sk->sk_lock.slock);
2262		if (!sock_owned_by_user(sk))
2263			break;
2264	}
2265	finish_wait(&sk->sk_lock.wq, &wait);
2266}
2267
2268void __release_sock(struct sock *sk)
2269	__releases(&sk->sk_lock.slock)
2270	__acquires(&sk->sk_lock.slock)
2271{
2272	struct sk_buff *skb, *next;
2273
2274	while ((skb = sk->sk_backlog.head) != NULL) {
2275		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2276
2277		spin_unlock_bh(&sk->sk_lock.slock);
2278
2279		do {
2280			next = skb->next;
2281			prefetch(next);
2282			WARN_ON_ONCE(skb_dst_is_noref(skb));
2283			skb_mark_not_on_list(skb);
2284			sk_backlog_rcv(sk, skb);
2285
2286			cond_resched();
2287
2288			skb = next;
2289		} while (skb != NULL);
2290
2291		spin_lock_bh(&sk->sk_lock.slock);
2292	}
2293
2294	/*
2295	 * Doing the zeroing here guarantee we can not loop forever
2296	 * while a wild producer attempts to flood us.
2297	 */
2298	sk->sk_backlog.len = 0;
2299}
2300
/*
 * Run __release_sock() on @sk, taking and releasing sk->sk_lock.slock
 * (with BH disabled) around the call.
 */
void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}
2307
2308/**
2309 * sk_wait_data - wait for data to arrive at sk_receive_queue
2310 * @sk:    sock to wait on
2311 * @timeo: for how long
2312 * @skb:   last skb seen on sk_receive_queue
2313 *
2314 * Now socket state including sk->sk_err is changed only under lock,
2315 * hence we may omit checks after joining wait queue.
2316 * We check receive queue before schedule() only as optimization;
2317 * it is very likely that release_sock() added new data.
2318 */
2319int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2320{
2321	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2322	int rc;
2323
2324	add_wait_queue(sk_sleep(sk), &wait);
2325	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2326	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2327	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2328	remove_wait_queue(sk_sleep(sk), &wait);
2329	return rc;
2330}
2331EXPORT_SYMBOL(sk_wait_data);
2332
/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 *
 *	Return: 1 if the allocation is accepted, 0 if it is refused. On
 *	refusal the @amt added to the protocol counter is subtracted again
 *	and, for a socket with a memcg, the memcg is uncharged.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct proto *prot = sk->sk_prot;
	/* The protocol counter is bumped up front; undone on refusal. */
	long allocated = sk_memory_allocated_add(sk, amt);
	bool charged = true;

	/* A failed memcg charge refuses the allocation outright. */
	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		/* Stream sockets are judged on sk_wmem_queued, others on sk_wmem_alloc. */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
				return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		/*
		 * Accept if the hard limit exceeds the socket count times the
		 * pages this socket holds (queued writes + receive memory +
		 * forward allocation).
		 */
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	/* Receive-side refusals are traced only when the memcg charge succeeded. */
	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	/* Roll back the accounting done at the top of the function. */
	sk_memory_allocated_sub(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_raise_allocated);
2418
2419/**
2420 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2421 *	@sk: socket
2422 *	@size: memory size to allocate
2423 *	@kind: allocation type
2424 *
2425 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2426 *	rmem allocation. This function assumes that protocols which have
2427 *	memory_pressure use sk_wmem_queued as write buffer accounting.
2428 */
2429int __sk_mem_schedule(struct sock *sk, int size, int kind)
2430{
2431	int ret, amt = sk_mem_pages(size);
2432
2433	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2434	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2435	if (!ret)
2436		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2437	return ret;
2438}
2439EXPORT_SYMBOL(__sk_mem_schedule);
2440
2441/**
2442 *	__sk_mem_reduce_allocated - reclaim memory_allocated
2443 *	@sk: socket
2444 *	@amount: number of quanta
2445 *
2446 *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2447 */
2448void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2449{
2450	sk_memory_allocated_sub(sk, amount);
2451
2452	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2453		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2454
2455	if (sk_under_memory_pressure(sk) &&
2456	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2457		sk_leave_memory_pressure(sk);
2458}
2459EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2460
2461/**
2462 *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2463 *	@sk: socket
2464 *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2465 */
2466void __sk_mem_reclaim(struct sock *sk, int amount)
2467{
2468	amount >>= SK_MEM_QUANTUM_SHIFT;
2469	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2470	__sk_mem_reduce_allocated(sk, amount);
2471}
2472EXPORT_SYMBOL(__sk_mem_reclaim);
2473
/* Store @val in sk->sk_peek_off without any validation; always returns 0. */
int sk_set_peek_off(struct sock *sk, int val)
{
	sk->sk_peek_off = val;
	return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);
2480
2481/*
2482 * Set of default routines for initialising struct proto_ops when
2483 * the protocol does not support a particular function. In certain
2484 * cases where it makes no sense for a protocol to have a "do nothing"
2485 * function, some default processing is provided.
2486 */
2487
/* bind stub for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);
2493
/* connect stub for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);
2500
/* socketpair stub for protocols without socketpair support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);
2506
/* accept stub for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);
2513
/* getname stub for protocols without getname support: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);
2520
/* ioctl stub for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);
2526
/* listen stub for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);
2532
/* shutdown stub for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);
2538
/* setsockopt stub for protocols without setsockopt support: always -EOPNOTSUPP. */
int sock_no_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);
2545
/* getsockopt stub for protocols without getsockopt support: always -EOPNOTSUPP. */
int sock_no_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);
2552
/* sendmsg stub for protocols without sendmsg support: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);
2558
/* Variant of sock_no_sendmsg() that takes a struct sock: always -EOPNOTSUPP. */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);
2564
/* recvmsg stub for protocols without recvmsg support: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);
2571
/*
 * mmap stub for protocols without mmap support. Unlike the other
 * sock_no_*() stubs it returns -ENODEV rather than -EOPNOTSUPP.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);
2578
2579ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2580{
2581	ssize_t res;
2582	struct msghdr msg = {.msg_flags = flags};
2583	struct kvec iov;
2584	char *kaddr = kmap(page);
2585	iov.iov_base = kaddr + offset;
2586	iov.iov_len = size;
2587	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2588	kunmap(page);
2589	return res;
2590}
2591EXPORT_SYMBOL(sock_no_sendpage);
2592
2593ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2594				int offset, size_t size, int flags)
2595{
2596	ssize_t res;
2597	struct msghdr msg = {.msg_flags = flags};
2598	struct kvec iov;
2599	char *kaddr = kmap(page);
2600
2601	iov.iov_base = kaddr + offset;
2602	iov.iov_len = size;
2603	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2604	kunmap(page);
2605	return res;
2606}
2607EXPORT_SYMBOL(sock_no_sendpage_locked);
2608
2609/*
2610 *	Default Socket Callbacks
2611 */
2612
/*
 * Default sk_state_change callback (installed by sock_init_data()):
 * wake every interruptible sleeper on the socket's wait queue, if any.
 */
static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	/* sk->sk_wq is read under RCU. */
	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}
2623
/*
 * Default sk_error_report callback (installed by sock_init_data()):
 * wake pollers with EPOLLERR and send the async POLL_ERR notification.
 */
static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
	/* Async notification is sent whether or not anybody was sleeping. */
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}
2635
/*
 * Default sk_data_ready callback (installed by sock_init_data()):
 * wake pollers waiting for readable events and send the async POLL_IN
 * notification.
 */
static void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	/* Async notification is sent whether or not anybody was sleeping. */
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}
2648
2649static void sock_def_write_space(struct sock *sk)
2650{
2651	struct socket_wq *wq;
2652
2653	rcu_read_lock();
2654
2655	/* Do not wake up a writer until he can make "significant"
2656	 * progress.  --DaveM
2657	 */
2658	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2659		wq = rcu_dereference(sk->sk_wq);
2660		if (skwq_has_sleeper(wq))
2661			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2662						EPOLLWRNORM | EPOLLWRBAND);
2663
2664		/* Should agree with poll, otherwise some programs break */
2665		if (sock_writeable(sk))
2666			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2667	}
2668
2669	rcu_read_unlock();
2670}
2671
/* Default sk_destruct callback (installed by sock_init_data()): does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
2675
2676void sk_send_sigurg(struct sock *sk)
2677{
2678	if (sk->sk_socket && sk->sk_socket->file)
2679		if (send_sigurg(&sk->sk_socket->file->f_owner))
2680			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2681}
2682EXPORT_SYMBOL(sk_send_sigurg);
2683
/*
 * (Re)arm @timer to fire at @expires.  A socket reference is taken only
 * when mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2691
/*
 * Cancel @timer.  The socket reference is dropped with __sock_put() only
 * when del_timer() returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2698
/*
 * Initialise the generic part of @sk and, when @sock is non-NULL, bind the
 * two together (sock->sk, sk->sk_wq, sk->sk_type and sk->sk_uid are taken
 * from @sock).
 *
 * Installs the sock_def_*() callbacks, default buffer sizes and timeouts,
 * and finally sets sk_refcnt to 1.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	timer_setup(&sk->sk_timer, NULL, 0);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		sk->sk_wq	=	sock->wq;
		sock->sk	=	sk;
		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
	} else {
		/* No struct socket: no wait queue, uid 0 of the netns's user namespace. */
		sk->sk_wq	=	NULL;
		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
	}

	rwlock_init(&sk->sk_callback_lock);
	/* Kernel and user sockets get separate lockdep classes, per address family. */
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	/* Default callbacks. */
	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	sk->sk_pacing_shift = 10;
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
2779
/*
 * Take ownership of @sk, sleeping in __lock_sock() while somebody else
 * owns it.  @subclass is passed to lockdep via mutex_acquire().
 *
 * On return sk_lock.owned is 1, the spinlock is released and BH is enabled.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	/* Drop only the spinlock here; BH is re-enabled after the lockdep annotation. */
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);
2795
/*
 * Give up ownership of @sk: process any backlog, run the protocol's
 * release_cb() hook if it has one, release ownership and wake up waiters
 * on sk_lock.wq.  Everything runs under sk_lock.slock.
 */
void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
2814
/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Return: false if the fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * true if the slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note : We must disable BH
		 */
		return false;

	/* Somebody owns the socket: wait for it, then take ownership. */
	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);
2850
2851int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2852{
2853	struct timeval tv;
2854
2855	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2856	tv = ktime_to_timeval(sock_read_timestamp(sk));
2857	if (tv.tv_sec == -1)
2858		return -ENOENT;
2859	if (tv.tv_sec == 0) {
2860		ktime_t kt = ktime_get_real();
2861		sock_write_timestamp(sk, kt);
2862		tv = ktime_to_timeval(kt);
2863	}
2864	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2865}
2866EXPORT_SYMBOL(sock_get_timestamp);
2867
2868int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2869{
2870	struct timespec ts;
2871
2872	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2873	ts = ktime_to_timespec(sock_read_timestamp(sk));
2874	if (ts.tv_sec == -1)
2875		return -ENOENT;
2876	if (ts.tv_sec == 0) {
2877		ktime_t kt = ktime_get_real();
2878		sock_write_timestamp(sk, kt);
2879		ts = ktime_to_timespec(sk->sk_stamp);
2880	}
2881	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2882}
2883EXPORT_SYMBOL(sock_get_timestampns);
2884
2885void sock_enable_timestamp(struct sock *sk, int flag)
2886{
2887	if (!sock_flag(sk, flag)) {
2888		unsigned long previous_flags = sk->sk_flags;
2889
2890		sock_set_flag(sk, flag);
2891		/*
2892		 * we just set one of the two flags which require net
2893		 * time stamping, but time stamping might have been on
2894		 * already because of the other one
2895		 */
2896		if (sock_needs_netstamp(sk) &&
2897		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2898			net_enable_timestamp();
2899	}
2900}
2901
2902int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2903		       int level, int type)
2904{
2905	struct sock_exterr_skb *serr;
2906	struct sk_buff *skb;
2907	int copied, err;
2908
2909	err = -EAGAIN;
2910	skb = sock_dequeue_err_skb(sk);
2911	if (skb == NULL)
2912		goto out;
2913
2914	copied = skb->len;
2915	if (copied > len) {
2916		msg->msg_flags |= MSG_TRUNC;
2917		copied = len;
2918	}
2919	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2920	if (err)
2921		goto out_free_skb;
2922
2923	sock_recv_timestamp(msg, sk, skb);
2924
2925	serr = SKB_EXT_ERR(skb);
2926	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2927
2928	msg->msg_flags |= MSG_ERRQUEUE;
2929	err = copied;
2930
2931out_free_skb:
2932	kfree_skb(skb);
2933out:
2934	return err;
2935}
2936EXPORT_SYMBOL(sock_recv_errqueue);
2937
/*
 *	Get a socket option on a socket.
 *
 *	Forwards the request unchanged to the protocol's getsockopt() method
 *	and returns its result.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);
2953
2954#ifdef CONFIG_COMPAT
2955int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2956				  char __user *optval, int __user *optlen)
2957{
2958	struct sock *sk = sock->sk;
2959
2960	if (sk->sk_prot->compat_getsockopt != NULL)
2961		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2962						      optval, optlen);
2963	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2964}
2965EXPORT_SYMBOL(compat_sock_common_getsockopt);
2966#endif
2967
2968int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2969			int flags)
2970{
2971	struct sock *sk = sock->sk;
2972	int addr_len = 0;
2973	int err;
2974
2975	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2976				   flags & ~MSG_DONTWAIT, &addr_len);
2977	if (err >= 0)
2978		msg->msg_namelen = addr_len;
2979	return err;
2980}
2981EXPORT_SYMBOL(sock_common_recvmsg);
2982
/*
 *	Set socket options on an inet socket.
 *
 *	Forwards the request unchanged to the protocol's setsockopt() method
 *	and returns its result.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);
2994
2995#ifdef CONFIG_COMPAT
2996int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2997				  char __user *optval, unsigned int optlen)
2998{
2999	struct sock *sk = sock->sk;
3000
3001	if (sk->sk_prot->compat_setsockopt != NULL)
3002		return sk->sk_prot->compat_setsockopt(sk, level, optname,
3003						      optval, optlen);
3004	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3005}
3006EXPORT_SYMBOL(compat_sock_common_setsockopt);
3007#endif
3008
/*
 * Common release path: run the protocol's destroy() hook if present,
 * unhash the socket, orphan it, free its xfrm policy and drop a
 * reference with sock_put().
 */
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to socket. But net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did its hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and will be purged by
	 * the socket destructor.
	 *
	 * Also we still have packets pending on receive queue and probably,
	 * our own packets waiting in device queues. sock_destroy will drain
	 * receive queue, but transmitted packets will delay socket destruction
	 * until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
3045
/*
 * Fill @mem with a snapshot of the socket's memory counters.
 *
 * @mem must have room for SK_MEMINFO_VARS u32 entries; the whole array is
 * zeroed first, then the entries indexed by the SK_MEMINFO_* constants
 * below are filled in.
 */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
3060
3061#ifdef CONFIG_PROC_FS
/* Number of per-protocol "in use" counter slots. */
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
/* One counter per slot; allocated per CPU and per netns (net->core.prot_inuse). */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

/* Bitmap of slots already handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3068
/* Add @val to this CPU's in-use counter for @prot in @net. */
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3074
3075int sock_prot_inuse_get(struct net *net, struct proto *prot)
3076{
3077	int cpu, idx = prot->inuse_idx;
3078	int res = 0;
3079
3080	for_each_possible_cpu(cpu)
3081		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3082
3083	return res >= 0 ? res : 0;
3084}
3085EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3086
/* Add @val to this CPU's socket in-use counter for @net. */
static void sock_inuse_add(struct net *net, int val)
{
	this_cpu_add(*net->core.sock_inuse, val);
}
3091
3092int sock_inuse_get(struct net *net)
3093{
3094	int cpu, res = 0;
3095
3096	for_each_possible_cpu(cpu)
3097		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3098
3099	return res;
3100}
3101
3102EXPORT_SYMBOL_GPL(sock_inuse_get);
3103
3104static int __net_init sock_inuse_init_net(struct net *net)
3105{
3106	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3107	if (net->core.prot_inuse == NULL)
3108		return -ENOMEM;
3109
3110	net->core.sock_inuse = alloc_percpu(int);
3111	if (net->core.sock_inuse == NULL)
3112		goto out;
3113
3114	return 0;
3115
3116out:
3117	free_percpu(net->core.prot_inuse);
3118	return -ENOMEM;
3119}
3120
/* Per-netns exit: free both per-CPU counter areas allocated at init. */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.prot_inuse);
	free_percpu(net->core.sock_inuse);
}
3126
/* Per-netns hooks that set up and tear down the in-use counters. */
static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};
3131
/*
 * Register net_inuse_ops as a pernet subsystem at core_initcall time.
 * Registration failure is fatal (panic).
 */
static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
3141
/*
 * Give @prot the lowest free slot in proto_inuse_idx and mark it used.
 *
 * Slot PROTO_INUSE_NR - 1 is treated as "table exhausted": it is still
 * stored in prot->inuse_idx and an error is logged, but it is never set
 * in the bitmap (and release_proto_idx() never clears it).
 */
static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}
3153
/*
 * Return @prot's slot to proto_inuse_idx.  The last slot
 * (PROTO_INUSE_NR - 1) is skipped: assign_proto_idx() never sets it.
 */
static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
3159#else
/* No-op variant used when CONFIG_PROC_FS is not set. */
static inline void assign_proto_idx(struct proto *prot)
{
}
3163
/* No-op variant used when CONFIG_PROC_FS is not set. */
static inline void release_proto_idx(struct proto *prot)
{
}
3167
/* No-op variant used when CONFIG_PROC_FS is not set. */
static void sock_inuse_add(struct net *net, int val)
{
}
3171#endif
3172
3173static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3174{
3175	if (!rsk_prot)
3176		return;
3177	kfree(rsk_prot->slab_name);
3178	rsk_prot->slab_name = NULL;
3179	kmem_cache_destroy(rsk_prot->slab);
3180	rsk_prot->slab = NULL;
3181}
3182
3183static int req_prot_init(const struct proto *prot)
3184{
3185	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3186
3187	if (!rsk_prot)
3188		return 0;
3189
3190	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3191					prot->name);
3192	if (!rsk_prot->slab_name)
3193		return -ENOMEM;
3194
3195	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3196					   rsk_prot->obj_size, 0,
3197					   SLAB_ACCOUNT | prot->slab_flags,
3198					   NULL);
3199
3200	if (!rsk_prot->slab) {
3201		pr_crit("%s: Can't create request sock SLAB cache!\n",
3202			prot->name);
3203		return -ENOMEM;
3204	}
3205	return 0;
3206}
3207
/*
 * Register protocol @prot: add it to proto_list and assign its in-use
 * counter slot.
 *
 * When @alloc_slab is non-zero the sock slab cache is created first,
 * followed by the request sock cache and, if the protocol has timewait
 * ops, the "tw_sock_<name>" cache.
 *
 * Returns 0 on success and -ENOBUFS if any cache could not be created;
 * caches created before the failure are destroyed again.
 */
int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  SLAB_ACCOUNT |
						  prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	/* The list insertion and slot assignment are serialised by proto_list_mutex. */
	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	mutex_unlock(&proto_list_mutex);
	return 0;

	/* Error unwinding, in reverse order of setup. */
out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	req_prot_cleanup(prot->rsk_prot);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
3262
/*
 * Unregister protocol @prot: release its in-use counter slot, remove it
 * from proto_list and destroy the sock, request sock and timewait sock
 * slab caches that exist for it.
 */
void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);

	/* The timewait cache is torn down only if it was actually created. */
	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);
3282
3283int sock_load_diag_module(int family, int protocol)
3284{
3285	if (!protocol) {
3286		if (!sock_is_registered(family))
3287			return -ENOENT;
3288
3289		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3290				      NETLINK_SOCK_DIAG, family);
3291	}
3292
3293#ifdef CONFIG_INET
3294	if (family == AF_INET &&
3295	    protocol != IPPROTO_RAW &&
3296	    !rcu_access_pointer(inet_protos[protocol]))
3297		return -ENOENT;
3298#endif
3299
3300	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3301			      NETLINK_SOCK_DIAG, family, protocol);
3302}
3303EXPORT_SYMBOL(sock_load_diag_module);
3304
3305#ifdef CONFIG_PROC_FS
/*
 * seq_file start: take proto_list_mutex and position on entry *pos of
 * proto_list, counting the list head itself as an entry.  The mutex is
 * released in proto_seq_stop().
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}
3312
/* seq_file next: advance to the entry after @v in proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}
3317
/* seq_file stop: release the mutex taken in proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}
3323
/* Map a method pointer to 'y' (set) or 'n' (NULL) for /proc output. */
static char proto_method_implemented(const void *method)
{
	if (method == NULL)
		return 'n';

	return 'y';
}
3328static long sock_prot_memory_allocated(struct proto *proto)
3329{
3330	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3331}
3332
3333static char *sock_prot_memory_pressure(struct proto *proto)
3334{
3335	return proto->memory_pressure != NULL ?
3336	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3337}
3338
/*
 * Emit one line describing @proto: name, object size, sockets in use,
 * memory allocated, memory pressure, max header, slab presence, owning
 * module, then one y/n column per method, in the order of the header
 * printed by proto_seq_show().
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
3372
3373static int proto_seq_show(struct seq_file *seq, void *v)
3374{
3375	if (v == &proto_list)
3376		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3377			   "protocol",
3378			   "size",
3379			   "sockets",
3380			   "memory",
3381			   "press",
3382			   "maxhdr",
3383			   "slab",
3384			   "module",
3385			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3386	else
3387		proto_seq_printf(seq, list_entry(v, struct proto, node));
3388	return 0;
3389}
3390
3391static const struct seq_operations proto_seq_ops = {
3392	.start  = proto_seq_start,
3393	.next   = proto_seq_next,
3394	.stop   = proto_seq_stop,
3395	.show   = proto_seq_show,
3396};
3397
3398static __net_init int proto_init_net(struct net *net)
3399{
3400	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3401			sizeof(struct seq_net_private)))
3402		return -ENOMEM;
3403
3404	return 0;
3405}
3406
3407static __net_exit void proto_exit_net(struct net *net)
3408{
3409	remove_proc_entry("protocols", net->proc_net);
3410}
3411
3412
3413static __net_initdata struct pernet_operations proto_net_ops = {
3414	.init = proto_init_net,
3415	.exit = proto_exit_net,
3416};
3417
3418static int __init proto_init(void)
3419{
3420	return register_pernet_subsys(&proto_net_ops);
3421}
3422
3423subsys_initcall(proto_init);
3424
3425#endif /* PROC_FS */
3426
3427#ifdef CONFIG_NET_RX_BUSY_POLL
3428bool sk_busy_loop_end(void *p, unsigned long start_time)
3429{
3430	struct sock *sk = p;
3431
3432	return !skb_queue_empty(&sk->sk_receive_queue) ||
3433	       sk_busy_loop_timeout(sk, start_time);
3434}
3435EXPORT_SYMBOL(sk_busy_loop_end);
3436#endif /* CONFIG_NET_RX_BUSY_POLL */
Configure Feed

Configure Feed