net/core/sock.c at v4.11-rc3 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / sock.c
at v4.11-rc3 3138 lines 77 kB view raw
   1/*
   2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   3 *		operating system.  INET is implemented using the  BSD Socket
   4 *		interface as the means of communication with the user level.
   5 *
   6 *		Generic socket support routines. Memory allocators, socket lock/release
   7 *		handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:	Ross Biro
  11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *		Florian La Roche, <flla@stud.uni-sb.de>
  13 *		Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *		Alan Cox	: 	Numerous verify_area() problems
  17 *		Alan Cox	:	Connecting on a connecting socket
  18 *					now returns an error for tcp.
  19 *		Alan Cox	:	sock->protocol is set correctly.
  20 *					and is not sometimes left as 0.
  21 *		Alan Cox	:	connect handles icmp errors on a
  22 *					connect properly. Unfortunately there
  23 *					is a restart syscall nasty there. I
  24 *					can't match BSD without hacking the C
  25 *					library. Ideas urgently sought!
  26 *		Alan Cox	:	Disallow bind() to addresses that are
  27 *					not ours - especially broadcast ones!!
  28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
  29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
  30 *					instead they leave that for the DESTROY timer.
  31 *		Alan Cox	:	Clean up error flag in accept
  32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
  33 *					was buggy. Put a remove_sock() in the handler
  34 *					for memory when we hit 0. Also altered the timer
  35 *					code. The ACK stuff can wait and needs major
  36 *					TCP layer surgery.
  37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
  38 *					and fixed timer/inet_bh race.
  39 *		Alan Cox	:	Added zapped flag for TCP
  40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
  41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
  46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
  47 *	Pauline Middelink	:	identd support
  48 *		Alan Cox	:	Fixed connect() taking signals I think.
  49 *		Alan Cox	:	SO_LINGER supported
  50 *		Alan Cox	:	Error reporting fixes
  51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
  52 *		Alan Cox	:	inet sockets don't set sk->type!
  53 *		Alan Cox	:	Split socket option code
  54 *		Alan Cox	:	Callbacks
  55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
  56 *		Alex		:	Removed restriction on inet fioctl
  57 *		Alan Cox	:	Splitting INET from NET core
  58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
  59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *		Alan Cox	:	Split IP from generic code
  61 *		Alan Cox	:	New kfree_skbmem()
  62 *		Alan Cox	:	Make SO_DEBUG superuser only.
  63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
  64 *					(compatibility fix)
  65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
  66 *		Alan Cox	:	Allocator for a socket is settable.
  67 *		Alan Cox	:	SO_ERROR includes soft errors.
  68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
  69 *		Alan Cox	: 	Generic socket allocation to make hooks
  70 *					easier (suggested by Craig Metz).
  71 *		Michael Pall	:	SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
  79 *		Andi Kleen	:	Fix write_space callback
  80 *		Chris Evans	:	Security fixes - signedness again
  81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *		This program is free software; you can redistribute it and/or
  87 *		modify it under the terms of the GNU General Public License
  88 *		as published by the Free Software Foundation; either version
  89 *		2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/errqueue.h>
  97#include <linux/types.h>
  98#include <linux/socket.h>
  99#include <linux/in.h>
 100#include <linux/kernel.h>
 101#include <linux/module.h>
 102#include <linux/proc_fs.h>
 103#include <linux/seq_file.h>
 104#include <linux/sched.h>
 105#include <linux/timer.h>
 106#include <linux/string.h>
 107#include <linux/sockios.h>
 108#include <linux/net.h>
 109#include <linux/mm.h>
 110#include <linux/slab.h>
 111#include <linux/interrupt.h>
 112#include <linux/poll.h>
 113#include <linux/tcp.h>
 114#include <linux/init.h>
 115#include <linux/highmem.h>
 116#include <linux/user_namespace.h>
 117#include <linux/static_key.h>
 118#include <linux/memcontrol.h>
 119#include <linux/prefetch.h>
 120
 121#include <linux/uaccess.h>
 122
 123#include <linux/netdevice.h>
 124#include <net/protocol.h>
 125#include <linux/skbuff.h>
 126#include <net/net_namespace.h>
 127#include <net/request_sock.h>
 128#include <net/sock.h>
 129#include <linux/net_tstamp.h>
 130#include <net/xfrm.h>
 131#include <linux/ipsec.h>
 132#include <net/cls_cgroup.h>
 133#include <net/netprio_cgroup.h>
 134#include <linux/sock_diag.h>
 135
 136#include <linux/filter.h>
 137#include <net/sock_reuseport.h>
 138
 139#include <trace/events/sock.h>
 140
 141#ifdef CONFIG_INET
 142#include <net/tcp.h>
 143#endif
 144
 145#include <net/busy_poll.h>
 146
/*
 * A mutex and a list head declared as a pair.
 * NOTE(review): presumably proto_list holds the registered protocols and
 * proto_list_mutex serializes access to it; the users are outside this
 * part of the file - confirm there.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
 149
 150/**
 151 * sk_ns_capable - General socket capability test
 152 * @sk: Socket to use a capability on or through
 153 * @user_ns: The user namespace of the capability to use
 154 * @cap: The capability to use
 155 *
 156 * Test to see if the opener of the socket had when the socket was
 157 * created and the current process has the capability @cap in the user
 158 * namespace @user_ns.
 159 */
 160bool sk_ns_capable(const struct sock *sk,
 161		   struct user_namespace *user_ns, int cap)
 162{
 163	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 164		ns_capable(user_ns, cap);
 165}
 166EXPORT_SYMBOL(sk_ns_capable);
 167
 168/**
 169 * sk_capable - Socket global capability test
 170 * @sk: Socket to use a capability on or through
 171 * @cap: The global capability to use
 172 *
 173 * Test to see if the opener of the socket had when the socket was
 174 * created and the current process has the capability @cap in all user
 175 * namespaces.
 176 */
 177bool sk_capable(const struct sock *sk, int cap)
 178{
 179	return sk_ns_capable(sk, &init_user_ns, cap);
 180}
 181EXPORT_SYMBOL(sk_capable);
 182
 183/**
 184 * sk_net_capable - Network namespace socket capability test
 185 * @sk: Socket to use a capability on or through
 186 * @cap: The capability to use
 187 *
 188 * Test to see if the opener of the socket had when the socket was created
 189 * and the current process has the capability @cap over the network namespace
 190 * the socket is a member of.
 191 */
 192bool sk_net_capable(const struct sock *sk, int cap)
 193{
 194	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 195}
 196EXPORT_SYMBOL(sk_net_capable);
 197
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 *
 * All four arrays have AF_MAX entries; the "kern" variants are the ones
 * reserved for internal (kernel) sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 *
 * _sock_locks(x) expands to a comma-separated list of string literals,
 * each one the prefix x pasted in front of an address-family name; the
 * last entry is "AF_MAX".
 * NOTE(review): the list order is presumably the numeric order of the
 * AF_* constants ("27" and "28" standing in for unnamed values) - confirm
 * against the AF_* definitions whenever a family is added.
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_MAX"

/*
 * Name tables with AF_MAX+1 slots each, one table per lock kind
 * ("sk_lock-", "slock-", "clock-" prefixes).
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

/* Same three tables again, with a "k-" prefix, for the kernel-socket keys. */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
 257
/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 *
 * Both maxima work out to _SK_MEM_PACKETS buffers of SKB_TRUESIZE(256)
 * bytes each.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
/*
 * The *_max values are the caps applied to SO_SNDBUF / SO_RCVBUF requests
 * in sock_setsockopt(); the *_default values start out equal to the caps.
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/* Enabled (1) by default; NOTE(review): consumers are outside this view. */
int sysctl_tstamp_allow_data __read_mostly = 1;

/*
 * Static key that starts out false; it is incremented by sk_set_memalloc()
 * and decremented by sk_clear_memalloc().
 */
struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);
 284
 285/**
 286 * sk_set_memalloc - sets %SOCK_MEMALLOC
 287 * @sk: socket to set it on
 288 *
 289 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 290 * It's the responsibility of the admin to adjust min_free_kbytes
 291 * to meet the requirements
 292 */
 293void sk_set_memalloc(struct sock *sk)
 294{
 295	sock_set_flag(sk, SOCK_MEMALLOC);
 296	sk->sk_allocation |= __GFP_MEMALLOC;
 297	static_key_slow_inc(&memalloc_socks);
 298}
 299EXPORT_SYMBOL_GPL(sk_set_memalloc);
 300
 301void sk_clear_memalloc(struct sock *sk)
 302{
 303	sock_reset_flag(sk, SOCK_MEMALLOC);
 304	sk->sk_allocation &= ~__GFP_MEMALLOC;
 305	static_key_slow_dec(&memalloc_socks);
 306
 307	/*
 308	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 309	 * progress of swapping. SOCK_MEMALLOC may be cleared while
 310	 * it has rmem allocations due to the last swapfile being deactivated
 311	 * but there is a risk that the socket is unusable due to exceeding
 312	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
 313	 */
 314	sk_mem_reclaim(sk);
 315}
 316EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 317
 318int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 319{
 320	int ret;
 321	unsigned long pflags = current->flags;
 322
 323	/* these should have been dropped before queueing */
 324	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 325
 326	current->flags |= PF_MEMALLOC;
 327	ret = sk->sk_backlog_rcv(sk, skb);
 328	tsk_restore_flags(current, pflags, PF_MEMALLOC);
 329
 330	return ret;
 331}
 332EXPORT_SYMBOL(__sk_backlog_rcv);
 333
 334static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 335{
 336	struct timeval tv;
 337
 338	if (optlen < sizeof(tv))
 339		return -EINVAL;
 340	if (copy_from_user(&tv, optval, sizeof(tv)))
 341		return -EFAULT;
 342	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 343		return -EDOM;
 344
 345	if (tv.tv_sec < 0) {
 346		static int warned __read_mostly;
 347
 348		*timeo_p = 0;
 349		if (warned < 10 && net_ratelimit()) {
 350			warned++;
 351			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 352				__func__, current->comm, task_pid_nr(current));
 353		}
 354		return 0;
 355	}
 356	*timeo_p = MAX_SCHEDULE_TIMEOUT;
 357	if (tv.tv_sec == 0 && tv.tv_usec == 0)
 358		return 0;
 359	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 360		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 361	return 0;
 362}
 363
 364static void sock_warn_obsolete_bsdism(const char *name)
 365{
 366	static int warned;
 367	static char warncomm[TASK_COMM_LEN];
 368	if (strcmp(warncomm, current->comm) && warned < 5) {
 369		strcpy(warncomm,  current->comm);
 370		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 371			warncomm, name);
 372		warned++;
 373	}
 374}
 375
 376static bool sock_needs_netstamp(const struct sock *sk)
 377{
 378	switch (sk->sk_family) {
 379	case AF_UNSPEC:
 380	case AF_UNIX:
 381		return false;
 382	default:
 383		return true;
 384	}
 385}
 386
 387static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 388{
 389	if (sk->sk_flags & flags) {
 390		sk->sk_flags &= ~flags;
 391		if (sock_needs_netstamp(sk) &&
 392		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 393			net_disable_timestamp();
 394	}
 395}
 396
 397
 398int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 399{
 400	unsigned long flags;
 401	struct sk_buff_head *list = &sk->sk_receive_queue;
 402
 403	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 404		atomic_inc(&sk->sk_drops);
 405		trace_sock_rcvqueue_full(sk, skb);
 406		return -ENOMEM;
 407	}
 408
 409	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 410		atomic_inc(&sk->sk_drops);
 411		return -ENOBUFS;
 412	}
 413
 414	skb->dev = NULL;
 415	skb_set_owner_r(skb, sk);
 416
 417	/* we escape from rcu protected region, make sure we dont leak
 418	 * a norefcounted dst
 419	 */
 420	skb_dst_force(skb);
 421
 422	spin_lock_irqsave(&list->lock, flags);
 423	sock_skb_set_dropcount(sk, skb);
 424	__skb_queue_tail(list, skb);
 425	spin_unlock_irqrestore(&list->lock, flags);
 426
 427	if (!sock_flag(sk, SOCK_DEAD))
 428		sk->sk_data_ready(sk);
 429	return 0;
 430}
 431EXPORT_SYMBOL(__sock_queue_rcv_skb);
 432
 433int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 434{
 435	int err;
 436
 437	err = sk_filter(sk, skb);
 438	if (err)
 439		return err;
 440
 441	return __sock_queue_rcv_skb(sk, skb);
 442}
 443EXPORT_SYMBOL(sock_queue_rcv_skb);
 444
 445int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 446		     const int nested, unsigned int trim_cap, bool refcounted)
 447{
 448	int rc = NET_RX_SUCCESS;
 449
 450	if (sk_filter_trim_cap(sk, skb, trim_cap))
 451		goto discard_and_relse;
 452
 453	skb->dev = NULL;
 454
 455	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 456		atomic_inc(&sk->sk_drops);
 457		goto discard_and_relse;
 458	}
 459	if (nested)
 460		bh_lock_sock_nested(sk);
 461	else
 462		bh_lock_sock(sk);
 463	if (!sock_owned_by_user(sk)) {
 464		/*
 465		 * trylock + unlock semantics:
 466		 */
 467		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 468
 469		rc = sk_backlog_rcv(sk, skb);
 470
 471		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 472	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 473		bh_unlock_sock(sk);
 474		atomic_inc(&sk->sk_drops);
 475		goto discard_and_relse;
 476	}
 477
 478	bh_unlock_sock(sk);
 479out:
 480	if (refcounted)
 481		sock_put(sk);
 482	return rc;
 483discard_and_relse:
 484	kfree_skb(skb);
 485	goto out;
 486}
 487EXPORT_SYMBOL(__sk_receive_skb);
 488
 489struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 490{
 491	struct dst_entry *dst = __sk_dst_get(sk);
 492
 493	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 494		sk_tx_queue_clear(sk);
 495		sk->sk_dst_pending_confirm = 0;
 496		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 497		dst_release(dst);
 498		return NULL;
 499	}
 500
 501	return dst;
 502}
 503EXPORT_SYMBOL(__sk_dst_check);
 504
 505struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 506{
 507	struct dst_entry *dst = sk_dst_get(sk);
 508
 509	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 510		sk_dst_reset(sk);
 511		dst_release(dst);
 512		return NULL;
 513	}
 514
 515	return dst;
 516}
 517EXPORT_SYMBOL(sk_dst_check);
 518
 519static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 520				int optlen)
 521{
 522	int ret = -ENOPROTOOPT;
 523#ifdef CONFIG_NETDEVICES
 524	struct net *net = sock_net(sk);
 525	char devname[IFNAMSIZ];
 526	int index;
 527
 528	/* Sorry... */
 529	ret = -EPERM;
 530	if (!ns_capable(net->user_ns, CAP_NET_RAW))
 531		goto out;
 532
 533	ret = -EINVAL;
 534	if (optlen < 0)
 535		goto out;
 536
 537	/* Bind this socket to a particular device like "eth0",
 538	 * as specified in the passed interface name. If the
 539	 * name is "" or the option length is zero the socket
 540	 * is not bound.
 541	 */
 542	if (optlen > IFNAMSIZ - 1)
 543		optlen = IFNAMSIZ - 1;
 544	memset(devname, 0, sizeof(devname));
 545
 546	ret = -EFAULT;
 547	if (copy_from_user(devname, optval, optlen))
 548		goto out;
 549
 550	index = 0;
 551	if (devname[0] != '\0') {
 552		struct net_device *dev;
 553
 554		rcu_read_lock();
 555		dev = dev_get_by_name_rcu(net, devname);
 556		if (dev)
 557			index = dev->ifindex;
 558		rcu_read_unlock();
 559		ret = -ENODEV;
 560		if (!dev)
 561			goto out;
 562	}
 563
 564	lock_sock(sk);
 565	sk->sk_bound_dev_if = index;
 566	sk_dst_reset(sk);
 567	release_sock(sk);
 568
 569	ret = 0;
 570
 571out:
 572#endif
 573
 574	return ret;
 575}
 576
 577static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 578				int __user *optlen, int len)
 579{
 580	int ret = -ENOPROTOOPT;
 581#ifdef CONFIG_NETDEVICES
 582	struct net *net = sock_net(sk);
 583	char devname[IFNAMSIZ];
 584
 585	if (sk->sk_bound_dev_if == 0) {
 586		len = 0;
 587		goto zero;
 588	}
 589
 590	ret = -EINVAL;
 591	if (len < IFNAMSIZ)
 592		goto out;
 593
 594	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 595	if (ret)
 596		goto out;
 597
 598	len = strlen(devname) + 1;
 599
 600	ret = -EFAULT;
 601	if (copy_to_user(optval, devname, len))
 602		goto out;
 603
 604zero:
 605	ret = -EFAULT;
 606	if (put_user(len, optlen))
 607		goto out;
 608
 609	ret = 0;
 610
 611out:
 612#endif
 613
 614	return ret;
 615}
 616
 617static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 618{
 619	if (valbool)
 620		sock_set_flag(sk, bit);
 621	else
 622		sock_reset_flag(sk, bit);
 623}
 624
 625bool sk_mc_loop(struct sock *sk)
 626{
 627	if (dev_recursion_level())
 628		return false;
 629	if (!sk)
 630		return true;
 631	switch (sk->sk_family) {
 632	case AF_INET:
 633		return inet_sk(sk)->mc_loop;
 634#if IS_ENABLED(CONFIG_IPV6)
 635	case AF_INET6:
 636		return inet6_sk(sk)->mc_loop;
 637#endif
 638	}
 639	WARN_ON(1);
 640	return true;
 641}
 642EXPORT_SYMBOL(sk_mc_loop);
 643
 644/*
 645 *	This is meant for all protocols to use and covers goings on
 646 *	at the socket level. Everything here is generic.
 647 */
 648
 649int sock_setsockopt(struct socket *sock, int level, int optname,
 650		    char __user *optval, unsigned int optlen)
 651{
 652	struct sock *sk = sock->sk;
 653	int val;
 654	int valbool;
 655	struct linger ling;
 656	int ret = 0;
 657
 658	/*
 659	 *	Options without arguments
 660	 */
 661
 662	if (optname == SO_BINDTODEVICE)
 663		return sock_setbindtodevice(sk, optval, optlen);
 664
 665	if (optlen < sizeof(int))
 666		return -EINVAL;
 667
 668	if (get_user(val, (int __user *)optval))
 669		return -EFAULT;
 670
 671	valbool = val ? 1 : 0;
 672
 673	lock_sock(sk);
 674
 675	switch (optname) {
 676	case SO_DEBUG:
 677		if (val && !capable(CAP_NET_ADMIN))
 678			ret = -EACCES;
 679		else
 680			sock_valbool_flag(sk, SOCK_DBG, valbool);
 681		break;
 682	case SO_REUSEADDR:
 683		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 684		break;
 685	case SO_REUSEPORT:
 686		sk->sk_reuseport = valbool;
 687		break;
 688	case SO_TYPE:
 689	case SO_PROTOCOL:
 690	case SO_DOMAIN:
 691	case SO_ERROR:
 692		ret = -ENOPROTOOPT;
 693		break;
 694	case SO_DONTROUTE:
 695		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 696		break;
 697	case SO_BROADCAST:
 698		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 699		break;
 700	case SO_SNDBUF:
 701		/* Don't error on this BSD doesn't and if you think
 702		 * about it this is right. Otherwise apps have to
 703		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 704		 * are treated in BSD as hints
 705		 */
 706		val = min_t(u32, val, sysctl_wmem_max);
 707set_sndbuf:
 708		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 709		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 710		/* Wake up sending tasks if we upped the value. */
 711		sk->sk_write_space(sk);
 712		break;
 713
 714	case SO_SNDBUFFORCE:
 715		if (!capable(CAP_NET_ADMIN)) {
 716			ret = -EPERM;
 717			break;
 718		}
 719		goto set_sndbuf;
 720
 721	case SO_RCVBUF:
 722		/* Don't error on this BSD doesn't and if you think
 723		 * about it this is right. Otherwise apps have to
 724		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 725		 * are treated in BSD as hints
 726		 */
 727		val = min_t(u32, val, sysctl_rmem_max);
 728set_rcvbuf:
 729		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 730		/*
 731		 * We double it on the way in to account for
 732		 * "struct sk_buff" etc. overhead.   Applications
 733		 * assume that the SO_RCVBUF setting they make will
 734		 * allow that much actual data to be received on that
 735		 * socket.
 736		 *
 737		 * Applications are unaware that "struct sk_buff" and
 738		 * other overheads allocate from the receive buffer
 739		 * during socket buffer allocation.
 740		 *
 741		 * And after considering the possible alternatives,
 742		 * returning the value we actually used in getsockopt
 743		 * is the most desirable behavior.
 744		 */
 745		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 746		break;
 747
 748	case SO_RCVBUFFORCE:
 749		if (!capable(CAP_NET_ADMIN)) {
 750			ret = -EPERM;
 751			break;
 752		}
 753		goto set_rcvbuf;
 754
 755	case SO_KEEPALIVE:
 756		if (sk->sk_prot->keepalive)
 757			sk->sk_prot->keepalive(sk, valbool);
 758		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 759		break;
 760
 761	case SO_OOBINLINE:
 762		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 763		break;
 764
 765	case SO_NO_CHECK:
 766		sk->sk_no_check_tx = valbool;
 767		break;
 768
 769	case SO_PRIORITY:
 770		if ((val >= 0 && val <= 6) ||
 771		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 772			sk->sk_priority = val;
 773		else
 774			ret = -EPERM;
 775		break;
 776
 777	case SO_LINGER:
 778		if (optlen < sizeof(ling)) {
 779			ret = -EINVAL;	/* 1003.1g */
 780			break;
 781		}
 782		if (copy_from_user(&ling, optval, sizeof(ling))) {
 783			ret = -EFAULT;
 784			break;
 785		}
 786		if (!ling.l_onoff)
 787			sock_reset_flag(sk, SOCK_LINGER);
 788		else {
 789#if (BITS_PER_LONG == 32)
 790			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 791				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 792			else
 793#endif
 794				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 795			sock_set_flag(sk, SOCK_LINGER);
 796		}
 797		break;
 798
 799	case SO_BSDCOMPAT:
 800		sock_warn_obsolete_bsdism("setsockopt");
 801		break;
 802
 803	case SO_PASSCRED:
 804		if (valbool)
 805			set_bit(SOCK_PASSCRED, &sock->flags);
 806		else
 807			clear_bit(SOCK_PASSCRED, &sock->flags);
 808		break;
 809
 810	case SO_TIMESTAMP:
 811	case SO_TIMESTAMPNS:
 812		if (valbool)  {
 813			if (optname == SO_TIMESTAMP)
 814				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 815			else
 816				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 817			sock_set_flag(sk, SOCK_RCVTSTAMP);
 818			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 819		} else {
 820			sock_reset_flag(sk, SOCK_RCVTSTAMP);
 821			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 822		}
 823		break;
 824
 825	case SO_TIMESTAMPING:
 826		if (val & ~SOF_TIMESTAMPING_MASK) {
 827			ret = -EINVAL;
 828			break;
 829		}
 830
 831		if (val & SOF_TIMESTAMPING_OPT_ID &&
 832		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 833			if (sk->sk_protocol == IPPROTO_TCP &&
 834			    sk->sk_type == SOCK_STREAM) {
 835				if ((1 << sk->sk_state) &
 836				    (TCPF_CLOSE | TCPF_LISTEN)) {
 837					ret = -EINVAL;
 838					break;
 839				}
 840				sk->sk_tskey = tcp_sk(sk)->snd_una;
 841			} else {
 842				sk->sk_tskey = 0;
 843			}
 844		}
 845
 846		if (val & SOF_TIMESTAMPING_OPT_STATS &&
 847		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 848			ret = -EINVAL;
 849			break;
 850		}
 851
 852		sk->sk_tsflags = val;
 853		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 854			sock_enable_timestamp(sk,
 855					      SOCK_TIMESTAMPING_RX_SOFTWARE);
 856		else
 857			sock_disable_timestamp(sk,
 858					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 859		break;
 860
 861	case SO_RCVLOWAT:
 862		if (val < 0)
 863			val = INT_MAX;
 864		sk->sk_rcvlowat = val ? : 1;
 865		break;
 866
 867	case SO_RCVTIMEO:
 868		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 869		break;
 870
 871	case SO_SNDTIMEO:
 872		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 873		break;
 874
 875	case SO_ATTACH_FILTER:
 876		ret = -EINVAL;
 877		if (optlen == sizeof(struct sock_fprog)) {
 878			struct sock_fprog fprog;
 879
 880			ret = -EFAULT;
 881			if (copy_from_user(&fprog, optval, sizeof(fprog)))
 882				break;
 883
 884			ret = sk_attach_filter(&fprog, sk);
 885		}
 886		break;
 887
 888	case SO_ATTACH_BPF:
 889		ret = -EINVAL;
 890		if (optlen == sizeof(u32)) {
 891			u32 ufd;
 892
 893			ret = -EFAULT;
 894			if (copy_from_user(&ufd, optval, sizeof(ufd)))
 895				break;
 896
 897			ret = sk_attach_bpf(ufd, sk);
 898		}
 899		break;
 900
 901	case SO_ATTACH_REUSEPORT_CBPF:
 902		ret = -EINVAL;
 903		if (optlen == sizeof(struct sock_fprog)) {
 904			struct sock_fprog fprog;
 905
 906			ret = -EFAULT;
 907			if (copy_from_user(&fprog, optval, sizeof(fprog)))
 908				break;
 909
 910			ret = sk_reuseport_attach_filter(&fprog, sk);
 911		}
 912		break;
 913
 914	case SO_ATTACH_REUSEPORT_EBPF:
 915		ret = -EINVAL;
 916		if (optlen == sizeof(u32)) {
 917			u32 ufd;
 918
 919			ret = -EFAULT;
 920			if (copy_from_user(&ufd, optval, sizeof(ufd)))
 921				break;
 922
 923			ret = sk_reuseport_attach_bpf(ufd, sk);
 924		}
 925		break;
 926
 927	case SO_DETACH_FILTER:
 928		ret = sk_detach_filter(sk);
 929		break;
 930
 931	case SO_LOCK_FILTER:
 932		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 933			ret = -EPERM;
 934		else
 935			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 936		break;
 937
 938	case SO_PASSSEC:
 939		if (valbool)
 940			set_bit(SOCK_PASSSEC, &sock->flags);
 941		else
 942			clear_bit(SOCK_PASSSEC, &sock->flags);
 943		break;
 944	case SO_MARK:
 945		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 946			ret = -EPERM;
 947		else
 948			sk->sk_mark = val;
 949		break;
 950
 951	case SO_RXQ_OVFL:
 952		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 953		break;
 954
 955	case SO_WIFI_STATUS:
 956		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 957		break;
 958
 959	case SO_PEEK_OFF:
 960		if (sock->ops->set_peek_off)
 961			ret = sock->ops->set_peek_off(sk, val);
 962		else
 963			ret = -EOPNOTSUPP;
 964		break;
 965
 966	case SO_NOFCS:
 967		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 968		break;
 969
 970	case SO_SELECT_ERR_QUEUE:
 971		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
 972		break;
 973
 974#ifdef CONFIG_NET_RX_BUSY_POLL
 975	case SO_BUSY_POLL:
 976		/* allow unprivileged users to decrease the value */
 977		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
 978			ret = -EPERM;
 979		else {
 980			if (val < 0)
 981				ret = -EINVAL;
 982			else
 983				sk->sk_ll_usec = val;
 984		}
 985		break;
 986#endif
 987
 988	case SO_MAX_PACING_RATE:
 989		sk->sk_max_pacing_rate = val;
 990		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 991					 sk->sk_max_pacing_rate);
 992		break;
 993
 994	case SO_INCOMING_CPU:
 995		sk->sk_incoming_cpu = val;
 996		break;
 997
 998	case SO_CNX_ADVICE:
 999		if (val == 1)
1000			dst_negative_advice(sk);
1001		break;
1002	default:
1003		ret = -ENOPROTOOPT;
1004		break;
1005	}
1006	release_sock(sk);
1007	return ret;
1008}
1009EXPORT_SYMBOL(sock_setsockopt);
1010
1011
1012static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1013			  struct ucred *ucred)
1014{
1015	ucred->pid = pid_vnr(pid);
1016	ucred->uid = ucred->gid = -1;
1017	if (cred) {
1018		struct user_namespace *current_ns = current_user_ns();
1019
1020		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1021		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1022	}
1023}
1024
1025int sock_getsockopt(struct socket *sock, int level, int optname,
1026		    char __user *optval, int __user *optlen)
1027{
1028	struct sock *sk = sock->sk;
1029
1030	union {
1031		int val;
1032		struct linger ling;
1033		struct timeval tm;
1034	} v;
1035
1036	int lv = sizeof(int);
1037	int len;
1038
1039	if (get_user(len, optlen))
1040		return -EFAULT;
1041	if (len < 0)
1042		return -EINVAL;
1043
1044	memset(&v, 0, sizeof(v));
1045
1046	switch (optname) {
1047	case SO_DEBUG:
1048		v.val = sock_flag(sk, SOCK_DBG);
1049		break;
1050
1051	case SO_DONTROUTE:
1052		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1053		break;
1054
1055	case SO_BROADCAST:
1056		v.val = sock_flag(sk, SOCK_BROADCAST);
1057		break;
1058
1059	case SO_SNDBUF:
1060		v.val = sk->sk_sndbuf;
1061		break;
1062
1063	case SO_RCVBUF:
1064		v.val = sk->sk_rcvbuf;
1065		break;
1066
1067	case SO_REUSEADDR:
1068		v.val = sk->sk_reuse;
1069		break;
1070
1071	case SO_REUSEPORT:
1072		v.val = sk->sk_reuseport;
1073		break;
1074
1075	case SO_KEEPALIVE:
1076		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1077		break;
1078
1079	case SO_TYPE:
1080		v.val = sk->sk_type;
1081		break;
1082
1083	case SO_PROTOCOL:
1084		v.val = sk->sk_protocol;
1085		break;
1086
1087	case SO_DOMAIN:
1088		v.val = sk->sk_family;
1089		break;
1090
1091	case SO_ERROR:
1092		v.val = -sock_error(sk);
1093		if (v.val == 0)
1094			v.val = xchg(&sk->sk_err_soft, 0);
1095		break;
1096
1097	case SO_OOBINLINE:
1098		v.val = sock_flag(sk, SOCK_URGINLINE);
1099		break;
1100
1101	case SO_NO_CHECK:
1102		v.val = sk->sk_no_check_tx;
1103		break;
1104
1105	case SO_PRIORITY:
1106		v.val = sk->sk_priority;
1107		break;
1108
1109	case SO_LINGER:
1110		lv		= sizeof(v.ling);
1111		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1112		v.ling.l_linger	= sk->sk_lingertime / HZ;
1113		break;
1114
1115	case SO_BSDCOMPAT:
1116		sock_warn_obsolete_bsdism("getsockopt");
1117		break;
1118
1119	case SO_TIMESTAMP:
1120		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1121				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1122		break;
1123
1124	case SO_TIMESTAMPNS:
1125		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1126		break;
1127
1128	case SO_TIMESTAMPING:
1129		v.val = sk->sk_tsflags;
1130		break;
1131
1132	case SO_RCVTIMEO:
1133		lv = sizeof(struct timeval);
1134		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1135			v.tm.tv_sec = 0;
1136			v.tm.tv_usec = 0;
1137		} else {
1138			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1139			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1140		}
1141		break;
1142
1143	case SO_SNDTIMEO:
1144		lv = sizeof(struct timeval);
1145		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1146			v.tm.tv_sec = 0;
1147			v.tm.tv_usec = 0;
1148		} else {
1149			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1150			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1151		}
1152		break;
1153
1154	case SO_RCVLOWAT:
1155		v.val = sk->sk_rcvlowat;
1156		break;
1157
1158	case SO_SNDLOWAT:
1159		v.val = 1;
1160		break;
1161
1162	case SO_PASSCRED:
1163		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1164		break;
1165
1166	case SO_PEERCRED:
1167	{
1168		struct ucred peercred;
1169		if (len > sizeof(peercred))
1170			len = sizeof(peercred);
1171		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1172		if (copy_to_user(optval, &peercred, len))
1173			return -EFAULT;
1174		goto lenout;
1175	}
1176
1177	case SO_PEERNAME:
1178	{
1179		char address[128];
1180
1181		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1182			return -ENOTCONN;
1183		if (lv < len)
1184			return -EINVAL;
1185		if (copy_to_user(optval, address, len))
1186			return -EFAULT;
1187		goto lenout;
1188	}
1189
1190	/* Dubious BSD thing... Probably nobody even uses it, but
1191	 * the UNIX standard wants it for whatever reason... -DaveM
1192	 */
1193	case SO_ACCEPTCONN:
1194		v.val = sk->sk_state == TCP_LISTEN;
1195		break;
1196
1197	case SO_PASSSEC:
1198		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1199		break;
1200
1201	case SO_PEERSEC:
1202		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1203
1204	case SO_MARK:
1205		v.val = sk->sk_mark;
1206		break;
1207
1208	case SO_RXQ_OVFL:
1209		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1210		break;
1211
1212	case SO_WIFI_STATUS:
1213		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1214		break;
1215
1216	case SO_PEEK_OFF:
1217		if (!sock->ops->set_peek_off)
1218			return -EOPNOTSUPP;
1219
1220		v.val = sk->sk_peek_off;
1221		break;
1222	case SO_NOFCS:
1223		v.val = sock_flag(sk, SOCK_NOFCS);
1224		break;
1225
1226	case SO_BINDTODEVICE:
1227		return sock_getbindtodevice(sk, optval, optlen, len);
1228
1229	case SO_GET_FILTER:
1230		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1231		if (len < 0)
1232			return len;
1233
1234		goto lenout;
1235
1236	case SO_LOCK_FILTER:
1237		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1238		break;
1239
1240	case SO_BPF_EXTENSIONS:
1241		v.val = bpf_tell_extensions();
1242		break;
1243
1244	case SO_SELECT_ERR_QUEUE:
1245		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1246		break;
1247
1248#ifdef CONFIG_NET_RX_BUSY_POLL
1249	case SO_BUSY_POLL:
1250		v.val = sk->sk_ll_usec;
1251		break;
1252#endif
1253
1254	case SO_MAX_PACING_RATE:
1255		v.val = sk->sk_max_pacing_rate;
1256		break;
1257
1258	case SO_INCOMING_CPU:
1259		v.val = sk->sk_incoming_cpu;
1260		break;
1261
1262	default:
1263		/* We implement the SO_SNDLOWAT etc to not be settable
1264		 * (1003.1g 7).
1265		 */
1266		return -ENOPROTOOPT;
1267	}
1268
1269	if (len > lv)
1270		len = lv;
1271	if (copy_to_user(optval, &v, len))
1272		return -EFAULT;
1273lenout:
1274	if (put_user(len, optlen))
1275		return -EFAULT;
1276	return 0;
1277}
1278
1279/*
1280 * Initialize an sk_lock.
1281 *
1282 * (We also register the sk_lock with the lock validator.)
1283 */
1284static inline void sock_lock_init(struct sock *sk)
1285{
1286	if (sk->sk_kern_sock)
1287		sock_lock_init_class_and_name(
1288			sk,
1289			af_family_kern_slock_key_strings[sk->sk_family],
1290			af_family_kern_slock_keys + sk->sk_family,
1291			af_family_kern_key_strings[sk->sk_family],
1292			af_family_kern_keys + sk->sk_family);
1293	else
1294		sock_lock_init_class_and_name(
1295			sk,
1296			af_family_slock_key_strings[sk->sk_family],
1297			af_family_slock_keys + sk->sk_family,
1298			af_family_key_strings[sk->sk_family],
1299			af_family_keys + sk->sk_family);
1300}
1301
1302/*
1303 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1304 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1305 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1306 */
1307static void sock_copy(struct sock *nsk, const struct sock *osk)
1308{
1309#ifdef CONFIG_SECURITY_NETWORK
1310	void *sptr = nsk->sk_security;
1311#endif
1312	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1313
1314	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1315	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1316
1317#ifdef CONFIG_SECURITY_NETWORK
1318	nsk->sk_security = sptr;
1319	security_sk_clone(osk, nsk);
1320#endif
1321}
1322
1323static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1324		int family)
1325{
1326	struct sock *sk;
1327	struct kmem_cache *slab;
1328
1329	slab = prot->slab;
1330	if (slab != NULL) {
1331		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1332		if (!sk)
1333			return sk;
1334		if (priority & __GFP_ZERO)
1335			sk_prot_clear_nulls(sk, prot->obj_size);
1336	} else
1337		sk = kmalloc(prot->obj_size, priority);
1338
1339	if (sk != NULL) {
1340		kmemcheck_annotate_bitfield(sk, flags);
1341
1342		if (security_sk_alloc(sk, family, priority))
1343			goto out_free;
1344
1345		if (!try_module_get(prot->owner))
1346			goto out_free_sec;
1347		sk_tx_queue_clear(sk);
1348	}
1349
1350	return sk;
1351
1352out_free_sec:
1353	security_sk_free(sk);
1354out_free:
1355	if (slab != NULL)
1356		kmem_cache_free(slab, sk);
1357	else
1358		kfree(sk);
1359	return NULL;
1360}
1361
1362static void sk_prot_free(struct proto *prot, struct sock *sk)
1363{
1364	struct kmem_cache *slab;
1365	struct module *owner;
1366
1367	owner = prot->owner;
1368	slab = prot->slab;
1369
1370	cgroup_sk_free(&sk->sk_cgrp_data);
1371	mem_cgroup_sk_free(sk);
1372	security_sk_free(sk);
1373	if (slab != NULL)
1374		kmem_cache_free(slab, sk);
1375	else
1376		kfree(sk);
1377	module_put(owner);
1378}
1379
1380/**
1381 *	sk_alloc - All socket objects are allocated here
1382 *	@net: the applicable net namespace
1383 *	@family: protocol family
1384 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1385 *	@prot: struct proto associated with this new sock instance
1386 *	@kern: is this to be a kernel socket?
1387 */
1388struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1389		      struct proto *prot, int kern)
1390{
1391	struct sock *sk;
1392
1393	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1394	if (sk) {
1395		sk->sk_family = family;
1396		/*
1397		 * See comment in struct sock definition to understand
1398		 * why we need sk_prot_creator -acme
1399		 */
1400		sk->sk_prot = sk->sk_prot_creator = prot;
1401		sk->sk_kern_sock = kern;
1402		sock_lock_init(sk);
1403		sk->sk_net_refcnt = kern ? 0 : 1;
1404		if (likely(sk->sk_net_refcnt))
1405			get_net(net);
1406		sock_net_set(sk, net);
1407		atomic_set(&sk->sk_wmem_alloc, 1);
1408
1409		mem_cgroup_sk_alloc(sk);
1410		cgroup_sk_alloc(&sk->sk_cgrp_data);
1411		sock_update_classid(&sk->sk_cgrp_data);
1412		sock_update_netprioidx(&sk->sk_cgrp_data);
1413	}
1414
1415	return sk;
1416}
1417EXPORT_SYMBOL(sk_alloc);
1418
1419/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1420 * grace period. This is the case for UDP sockets and TCP listeners.
1421 */
1422static void __sk_destruct(struct rcu_head *head)
1423{
1424	struct sock *sk = container_of(head, struct sock, sk_rcu);
1425	struct sk_filter *filter;
1426
1427	if (sk->sk_destruct)
1428		sk->sk_destruct(sk);
1429
1430	filter = rcu_dereference_check(sk->sk_filter,
1431				       atomic_read(&sk->sk_wmem_alloc) == 0);
1432	if (filter) {
1433		sk_filter_uncharge(sk, filter);
1434		RCU_INIT_POINTER(sk->sk_filter, NULL);
1435	}
1436	if (rcu_access_pointer(sk->sk_reuseport_cb))
1437		reuseport_detach_sock(sk);
1438
1439	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1440
1441	if (atomic_read(&sk->sk_omem_alloc))
1442		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1443			 __func__, atomic_read(&sk->sk_omem_alloc));
1444
1445	if (sk->sk_peer_cred)
1446		put_cred(sk->sk_peer_cred);
1447	put_pid(sk->sk_peer_pid);
1448	if (likely(sk->sk_net_refcnt))
1449		put_net(sock_net(sk));
1450	sk_prot_free(sk->sk_prot_creator, sk);
1451}
1452
1453void sk_destruct(struct sock *sk)
1454{
1455	if (sock_flag(sk, SOCK_RCU_FREE))
1456		call_rcu(&sk->sk_rcu, __sk_destruct);
1457	else
1458		__sk_destruct(&sk->sk_rcu);
1459}
1460
1461static void __sk_free(struct sock *sk)
1462{
1463	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1464		sock_diag_broadcast_destroy(sk);
1465	else
1466		sk_destruct(sk);
1467}
1468
1469void sk_free(struct sock *sk)
1470{
1471	/*
1472	 * We subtract one from sk_wmem_alloc and can know if
1473	 * some packets are still in some tx queue.
1474	 * If not null, sock_wfree() will call __sk_free(sk) later
1475	 */
1476	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1477		__sk_free(sk);
1478}
1479EXPORT_SYMBOL(sk_free);
1480
1481/**
1482 *	sk_clone_lock - clone a socket, and lock its clone
1483 *	@sk: the socket to clone
1484 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1485 *
1486 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1487 */
1488struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1489{
1490	struct sock *newsk;
1491	bool is_charged = true;
1492
1493	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1494	if (newsk != NULL) {
1495		struct sk_filter *filter;
1496
1497		sock_copy(newsk, sk);
1498
1499		/* SANITY */
1500		if (likely(newsk->sk_net_refcnt))
1501			get_net(sock_net(newsk));
1502		sk_node_init(&newsk->sk_node);
1503		sock_lock_init(newsk);
1504		bh_lock_sock(newsk);
1505		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1506		newsk->sk_backlog.len = 0;
1507
1508		atomic_set(&newsk->sk_rmem_alloc, 0);
1509		/*
1510		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1511		 */
1512		atomic_set(&newsk->sk_wmem_alloc, 1);
1513		atomic_set(&newsk->sk_omem_alloc, 0);
1514		skb_queue_head_init(&newsk->sk_receive_queue);
1515		skb_queue_head_init(&newsk->sk_write_queue);
1516
1517		rwlock_init(&newsk->sk_callback_lock);
1518		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1519				af_callback_keys + newsk->sk_family,
1520				af_family_clock_key_strings[newsk->sk_family]);
1521
1522		newsk->sk_dst_cache	= NULL;
1523		newsk->sk_dst_pending_confirm = 0;
1524		newsk->sk_wmem_queued	= 0;
1525		newsk->sk_forward_alloc = 0;
1526		atomic_set(&newsk->sk_drops, 0);
1527		newsk->sk_send_head	= NULL;
1528		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1529
1530		sock_reset_flag(newsk, SOCK_DONE);
1531		skb_queue_head_init(&newsk->sk_error_queue);
1532
1533		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1534		if (filter != NULL)
1535			/* though it's an empty new sock, the charging may fail
1536			 * if sysctl_optmem_max was changed between creation of
1537			 * original socket and cloning
1538			 */
1539			is_charged = sk_filter_charge(newsk, filter);
1540
1541		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1542			sk_free_unlock_clone(newsk);
1543			newsk = NULL;
1544			goto out;
1545		}
1546		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1547
1548		newsk->sk_err	   = 0;
1549		newsk->sk_err_soft = 0;
1550		newsk->sk_priority = 0;
1551		newsk->sk_incoming_cpu = raw_smp_processor_id();
1552		atomic64_set(&newsk->sk_cookie, 0);
1553
1554		mem_cgroup_sk_alloc(newsk);
1555		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1556
1557		/*
1558		 * Before updating sk_refcnt, we must commit prior changes to memory
1559		 * (Documentation/RCU/rculist_nulls.txt for details)
1560		 */
1561		smp_wmb();
1562		atomic_set(&newsk->sk_refcnt, 2);
1563
1564		/*
1565		 * Increment the counter in the same struct proto as the master
1566		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1567		 * is the same as sk->sk_prot->socks, as this field was copied
1568		 * with memcpy).
1569		 *
1570		 * This _changes_ the previous behaviour, where
1571		 * tcp_create_openreq_child always was incrementing the
1572		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1573		 * to be taken into account in all callers. -acme
1574		 */
1575		sk_refcnt_debug_inc(newsk);
1576		sk_set_socket(newsk, NULL);
1577		newsk->sk_wq = NULL;
1578
1579		if (newsk->sk_prot->sockets_allocated)
1580			sk_sockets_allocated_inc(newsk);
1581
1582		if (sock_needs_netstamp(sk) &&
1583		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1584			net_enable_timestamp();
1585	}
1586out:
1587	return newsk;
1588}
1589EXPORT_SYMBOL_GPL(sk_clone_lock);
1590
1591void sk_free_unlock_clone(struct sock *sk)
1592{
1593	/* It is still raw copy of parent, so invalidate
1594	 * destructor and make plain sk_free() */
1595	sk->sk_destruct = NULL;
1596	bh_unlock_sock(sk);
1597	sk_free(sk);
1598}
1599EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1600
1601void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1602{
1603	u32 max_segs = 1;
1604
1605	sk_dst_set(sk, dst);
1606	sk->sk_route_caps = dst->dev->features;
1607	if (sk->sk_route_caps & NETIF_F_GSO)
1608		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1609	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1610	if (sk_can_gso(sk)) {
1611		if (dst->header_len) {
1612			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1613		} else {
1614			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1615			sk->sk_gso_max_size = dst->dev->gso_max_size;
1616			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1617		}
1618	}
1619	sk->sk_gso_max_segs = max_segs;
1620}
1621EXPORT_SYMBOL_GPL(sk_setup_caps);
1622
1623/*
1624 *	Simple resource managers for sockets.
1625 */
1626
1627
1628/*
1629 * Write buffer destructor automatically called from kfree_skb.
1630 */
1631void sock_wfree(struct sk_buff *skb)
1632{
1633	struct sock *sk = skb->sk;
1634	unsigned int len = skb->truesize;
1635
1636	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1637		/*
1638		 * Keep a reference on sk_wmem_alloc, this will be released
1639		 * after sk_write_space() call
1640		 */
1641		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1642		sk->sk_write_space(sk);
1643		len = 1;
1644	}
1645	/*
1646	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1647	 * could not do because of in-flight packets
1648	 */
1649	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1650		__sk_free(sk);
1651}
1652EXPORT_SYMBOL(sock_wfree);
1653
1654/* This variant of sock_wfree() is used by TCP,
1655 * since it sets SOCK_USE_WRITE_QUEUE.
1656 */
1657void __sock_wfree(struct sk_buff *skb)
1658{
1659	struct sock *sk = skb->sk;
1660
1661	if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1662		__sk_free(sk);
1663}
1664
1665void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1666{
1667	skb_orphan(skb);
1668	skb->sk = sk;
1669#ifdef CONFIG_INET
1670	if (unlikely(!sk_fullsock(sk))) {
1671		skb->destructor = sock_edemux;
1672		sock_hold(sk);
1673		return;
1674	}
1675#endif
1676	skb->destructor = sock_wfree;
1677	skb_set_hash_from_sk(skb, sk);
1678	/*
1679	 * We used to take a refcount on sk, but following operation
1680	 * is enough to guarantee sk_free() wont free this sock until
1681	 * all in-flight packets are completed
1682	 */
1683	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1684}
1685EXPORT_SYMBOL(skb_set_owner_w);
1686
1687/* This helper is used by netem, as it can hold packets in its
1688 * delay queue. We want to allow the owner socket to send more
1689 * packets, as if they were already TX completed by a typical driver.
1690 * But we also want to keep skb->sk set because some packet schedulers
1691 * rely on it (sch_fq for example). So we set skb->truesize to a small
1692 * amount (1) and decrease sk_wmem_alloc accordingly.
1693 */
1694void skb_orphan_partial(struct sk_buff *skb)
1695{
1696	/* If this skb is a TCP pure ACK or already went here,
1697	 * we have nothing to do. 2 is already a very small truesize.
1698	 */
1699	if (skb->truesize <= 2)
1700		return;
1701
1702	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1703	 * so we do not completely orphan skb, but transfert all
1704	 * accounted bytes but one, to avoid unexpected reorders.
1705	 */
1706	if (skb->destructor == sock_wfree
1707#ifdef CONFIG_INET
1708	    || skb->destructor == tcp_wfree
1709#endif
1710		) {
1711		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1712		skb->truesize = 1;
1713	} else {
1714		skb_orphan(skb);
1715	}
1716}
1717EXPORT_SYMBOL(skb_orphan_partial);
1718
1719/*
1720 * Read buffer destructor automatically called from kfree_skb.
1721 */
1722void sock_rfree(struct sk_buff *skb)
1723{
1724	struct sock *sk = skb->sk;
1725	unsigned int len = skb->truesize;
1726
1727	atomic_sub(len, &sk->sk_rmem_alloc);
1728	sk_mem_uncharge(sk, len);
1729}
1730EXPORT_SYMBOL(sock_rfree);
1731
1732/*
1733 * Buffer destructor for skbs that are not used directly in read or write
1734 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1735 */
1736void sock_efree(struct sk_buff *skb)
1737{
1738	sock_put(skb->sk);
1739}
1740EXPORT_SYMBOL(sock_efree);
1741
1742kuid_t sock_i_uid(struct sock *sk)
1743{
1744	kuid_t uid;
1745
1746	read_lock_bh(&sk->sk_callback_lock);
1747	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1748	read_unlock_bh(&sk->sk_callback_lock);
1749	return uid;
1750}
1751EXPORT_SYMBOL(sock_i_uid);
1752
1753unsigned long sock_i_ino(struct sock *sk)
1754{
1755	unsigned long ino;
1756
1757	read_lock_bh(&sk->sk_callback_lock);
1758	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1759	read_unlock_bh(&sk->sk_callback_lock);
1760	return ino;
1761}
1762EXPORT_SYMBOL(sock_i_ino);
1763
1764/*
1765 * Allocate a skb from the socket's send buffer.
1766 */
1767struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1768			     gfp_t priority)
1769{
1770	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1771		struct sk_buff *skb = alloc_skb(size, priority);
1772		if (skb) {
1773			skb_set_owner_w(skb, sk);
1774			return skb;
1775		}
1776	}
1777	return NULL;
1778}
1779EXPORT_SYMBOL(sock_wmalloc);
1780
1781/*
1782 * Allocate a memory block from the socket's option memory buffer.
1783 */
1784void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1785{
1786	if ((unsigned int)size <= sysctl_optmem_max &&
1787	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1788		void *mem;
1789		/* First do the add, to avoid the race if kmalloc
1790		 * might sleep.
1791		 */
1792		atomic_add(size, &sk->sk_omem_alloc);
1793		mem = kmalloc(size, priority);
1794		if (mem)
1795			return mem;
1796		atomic_sub(size, &sk->sk_omem_alloc);
1797	}
1798	return NULL;
1799}
1800EXPORT_SYMBOL(sock_kmalloc);
1801
1802/* Free an option memory block. Note, we actually want the inline
1803 * here as this allows gcc to detect the nullify and fold away the
1804 * condition entirely.
1805 */
1806static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1807				  const bool nullify)
1808{
1809	if (WARN_ON_ONCE(!mem))
1810		return;
1811	if (nullify)
1812		kzfree(mem);
1813	else
1814		kfree(mem);
1815	atomic_sub(size, &sk->sk_omem_alloc);
1816}
1817
/*
 * Free an option memory block of @size bytes with kfree() and uncharge it
 * from the socket's option memory.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	const bool nullify = false;

	__sock_kfree_s(sk, mem, size, nullify);
}
1822EXPORT_SYMBOL(sock_kfree_s);
1823
/*
 * Same as sock_kfree_s(), but the block is released through kzfree().
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	const bool nullify = true;

	__sock_kfree_s(sk, mem, size, nullify);
}
1828EXPORT_SYMBOL(sock_kzfree_s);
1829
1830/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1831   I think, these locks should be removed for datagram sockets.
1832 */
1833static long sock_wait_for_wmem(struct sock *sk, long timeo)
1834{
1835	DEFINE_WAIT(wait);
1836
1837	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1838	for (;;) {
1839		if (!timeo)
1840			break;
1841		if (signal_pending(current))
1842			break;
1843		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1844		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1845		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1846			break;
1847		if (sk->sk_shutdown & SEND_SHUTDOWN)
1848			break;
1849		if (sk->sk_err)
1850			break;
1851		timeo = schedule_timeout(timeo);
1852	}
1853	finish_wait(sk_sleep(sk), &wait);
1854	return timeo;
1855}
1856
1857
1858/*
1859 *	Generic send/receive buffer handlers
1860 */
1861
1862struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1863				     unsigned long data_len, int noblock,
1864				     int *errcode, int max_page_order)
1865{
1866	struct sk_buff *skb;
1867	long timeo;
1868	int err;
1869
1870	timeo = sock_sndtimeo(sk, noblock);
1871	for (;;) {
1872		err = sock_error(sk);
1873		if (err != 0)
1874			goto failure;
1875
1876		err = -EPIPE;
1877		if (sk->sk_shutdown & SEND_SHUTDOWN)
1878			goto failure;
1879
1880		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1881			break;
1882
1883		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1884		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1885		err = -EAGAIN;
1886		if (!timeo)
1887			goto failure;
1888		if (signal_pending(current))
1889			goto interrupted;
1890		timeo = sock_wait_for_wmem(sk, timeo);
1891	}
1892	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1893				   errcode, sk->sk_allocation);
1894	if (skb)
1895		skb_set_owner_w(skb, sk);
1896	return skb;
1897
1898interrupted:
1899	err = sock_intr_errno(timeo);
1900failure:
1901	*errcode = err;
1902	return NULL;
1903}
1904EXPORT_SYMBOL(sock_alloc_send_pskb);
1905
/*
 * Linear-only variant of sock_alloc_send_pskb(): all @size bytes go into
 * the header area, with no paged data and page order 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
1911EXPORT_SYMBOL(sock_alloc_send_skb);
1912
1913int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1914		     struct sockcm_cookie *sockc)
1915{
1916	u32 tsflags;
1917
1918	switch (cmsg->cmsg_type) {
1919	case SO_MARK:
1920		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1921			return -EPERM;
1922		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1923			return -EINVAL;
1924		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1925		break;
1926	case SO_TIMESTAMPING:
1927		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1928			return -EINVAL;
1929
1930		tsflags = *(u32 *)CMSG_DATA(cmsg);
1931		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
1932			return -EINVAL;
1933
1934		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
1935		sockc->tsflags |= tsflags;
1936		break;
1937	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
1938	case SCM_RIGHTS:
1939	case SCM_CREDENTIALS:
1940		break;
1941	default:
1942		return -EINVAL;
1943	}
1944	return 0;
1945}
1946EXPORT_SYMBOL(__sock_cmsg_send);
1947
1948int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1949		   struct sockcm_cookie *sockc)
1950{
1951	struct cmsghdr *cmsg;
1952	int ret;
1953
1954	for_each_cmsghdr(cmsg, msg) {
1955		if (!CMSG_OK(msg, cmsg))
1956			return -EINVAL;
1957		if (cmsg->cmsg_level != SOL_SOCKET)
1958			continue;
1959		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
1960		if (ret)
1961			return ret;
1962	}
1963	return 0;
1964}
1965EXPORT_SYMBOL(sock_cmsg_send);
1966
1967/* On 32bit arches, an skb frag is limited to 2^15 */
1968#define SKB_FRAG_PAGE_ORDER	get_order(32768)
1969
1970/**
1971 * skb_page_frag_refill - check that a page_frag contains enough room
1972 * @sz: minimum size of the fragment we want to get
1973 * @pfrag: pointer to page_frag
1974 * @gfp: priority for memory allocation
1975 *
1976 * Note: While this allocator tries to use high order pages, there is
1977 * no guarantee that allocations succeed. Therefore, @sz MUST be
1978 * less or equal than PAGE_SIZE.
1979 */
1980bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1981{
1982	if (pfrag->page) {
1983		if (page_ref_count(pfrag->page) == 1) {
1984			pfrag->offset = 0;
1985			return true;
1986		}
1987		if (pfrag->offset + sz <= pfrag->size)
1988			return true;
1989		put_page(pfrag->page);
1990	}
1991
1992	pfrag->offset = 0;
1993	if (SKB_FRAG_PAGE_ORDER) {
1994		/* Avoid direct reclaim but allow kswapd to wake */
1995		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1996					  __GFP_COMP | __GFP_NOWARN |
1997					  __GFP_NORETRY,
1998					  SKB_FRAG_PAGE_ORDER);
1999		if (likely(pfrag->page)) {
2000			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2001			return true;
2002		}
2003	}
2004	pfrag->page = alloc_page(gfp);
2005	if (likely(pfrag->page)) {
2006		pfrag->size = PAGE_SIZE;
2007		return true;
2008	}
2009	return false;
2010}
2011EXPORT_SYMBOL(skb_page_frag_refill);
2012
2013bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2014{
2015	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2016		return true;
2017
2018	sk_enter_memory_pressure(sk);
2019	sk_stream_moderate_sndbuf(sk);
2020	return false;
2021}
2022EXPORT_SYMBOL(sk_page_frag_refill);
2023
2024static void __lock_sock(struct sock *sk)
2025	__releases(&sk->sk_lock.slock)
2026	__acquires(&sk->sk_lock.slock)
2027{
2028	DEFINE_WAIT(wait);
2029
2030	for (;;) {
2031		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2032					TASK_UNINTERRUPTIBLE);
2033		spin_unlock_bh(&sk->sk_lock.slock);
2034		schedule();
2035		spin_lock_bh(&sk->sk_lock.slock);
2036		if (!sock_owned_by_user(sk))
2037			break;
2038	}
2039	finish_wait(&sk->sk_lock.wq, &wait);
2040}
2041
2042static void __release_sock(struct sock *sk)
2043	__releases(&sk->sk_lock.slock)
2044	__acquires(&sk->sk_lock.slock)
2045{
2046	struct sk_buff *skb, *next;
2047
2048	while ((skb = sk->sk_backlog.head) != NULL) {
2049		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2050
2051		spin_unlock_bh(&sk->sk_lock.slock);
2052
2053		do {
2054			next = skb->next;
2055			prefetch(next);
2056			WARN_ON_ONCE(skb_dst_is_noref(skb));
2057			skb->next = NULL;
2058			sk_backlog_rcv(sk, skb);
2059
2060			cond_resched();
2061
2062			skb = next;
2063		} while (skb != NULL);
2064
2065		spin_lock_bh(&sk->sk_lock.slock);
2066	}
2067
2068	/*
2069	 * Doing the zeroing here guarantee we can not loop forever
2070	 * while a wild producer attempts to flood us.
2071	 */
2072	sk->sk_backlog.len = 0;
2073}
2074
2075void __sk_flush_backlog(struct sock *sk)
2076{
2077	spin_lock_bh(&sk->sk_lock.slock);
2078	__release_sock(sk);
2079	spin_unlock_bh(&sk->sk_lock.slock);
2080}
2081
2082/**
2083 * sk_wait_data - wait for data to arrive at sk_receive_queue
2084 * @sk:    sock to wait on
2085 * @timeo: for how long
2086 * @skb:   last skb seen on sk_receive_queue
2087 *
2088 * Now socket state including sk->sk_err is changed only under lock,
2089 * hence we may omit checks after joining wait queue.
2090 * We check receive queue before schedule() only as optimization;
2091 * it is very likely that release_sock() added new data.
2092 */
2093int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2094{
2095	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2096	int rc;
2097
2098	add_wait_queue(sk_sleep(sk), &wait);
2099	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2100	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2101	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2102	remove_wait_queue(sk_sleep(sk), &wait);
2103	return rc;
2104}
2105EXPORT_SYMBOL(sk_wait_data);
2106
2107/**
2108 *	__sk_mem_raise_allocated - increase memory_allocated
2109 *	@sk: socket
2110 *	@size: memory size to allocate
2111 *	@amt: pages to allocate
2112 *	@kind: allocation type
2113 *
2114 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2115 */
2116int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2117{
2118	struct proto *prot = sk->sk_prot;
2119	long allocated = sk_memory_allocated_add(sk, amt);
2120
2121	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2122	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2123		goto suppress_allocation;
2124
2125	/* Under limit. */
2126	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2127		sk_leave_memory_pressure(sk);
2128		return 1;
2129	}
2130
2131	/* Under pressure. */
2132	if (allocated > sk_prot_mem_limits(sk, 1))
2133		sk_enter_memory_pressure(sk);
2134
2135	/* Over hard limit. */
2136	if (allocated > sk_prot_mem_limits(sk, 2))
2137		goto suppress_allocation;
2138
2139	/* guarantee minimum buffer size under pressure */
2140	if (kind == SK_MEM_RECV) {
2141		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2142			return 1;
2143
2144	} else { /* SK_MEM_SEND */
2145		if (sk->sk_type == SOCK_STREAM) {
2146			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2147				return 1;
2148		} else if (atomic_read(&sk->sk_wmem_alloc) <
2149			   prot->sysctl_wmem[0])
2150				return 1;
2151	}
2152
2153	if (sk_has_memory_pressure(sk)) {
2154		int alloc;
2155
2156		if (!sk_under_memory_pressure(sk))
2157			return 1;
2158		alloc = sk_sockets_allocated_read_positive(sk);
2159		if (sk_prot_mem_limits(sk, 2) > alloc *
2160		    sk_mem_pages(sk->sk_wmem_queued +
2161				 atomic_read(&sk->sk_rmem_alloc) +
2162				 sk->sk_forward_alloc))
2163			return 1;
2164	}
2165
2166suppress_allocation:
2167
2168	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2169		sk_stream_moderate_sndbuf(sk);
2170
2171		/* Fail only if socket is _under_ its sndbuf.
2172		 * In this case we cannot block, so that we have to fail.
2173		 */
2174		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2175			return 1;
2176	}
2177
2178	trace_sock_exceed_buf_limit(sk, prot, allocated);
2179
2180	sk_memory_allocated_sub(sk, amt);
2181
2182	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2183		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2184
2185	return 0;
2186}
2187EXPORT_SYMBOL(__sk_mem_raise_allocated);
2188
2189/**
2190 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2191 *	@sk: socket
2192 *	@size: memory size to allocate
2193 *	@kind: allocation type
2194 *
2195 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2196 *	rmem allocation. This function assumes that protocols which have
2197 *	memory_pressure use sk_wmem_queued as write buffer accounting.
2198 */
2199int __sk_mem_schedule(struct sock *sk, int size, int kind)
2200{
2201	int ret, amt = sk_mem_pages(size);
2202
2203	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2204	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2205	if (!ret)
2206		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2207	return ret;
2208}
2209EXPORT_SYMBOL(__sk_mem_schedule);
2210
2211/**
2212 *	__sk_mem_reduce_allocated - reclaim memory_allocated
2213 *	@sk: socket
2214 *	@amount: number of quanta
2215 *
2216 *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2217 */
2218void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2219{
2220	sk_memory_allocated_sub(sk, amount);
2221
2222	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2223		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2224
2225	if (sk_under_memory_pressure(sk) &&
2226	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2227		sk_leave_memory_pressure(sk);
2228}
2229EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2230
2231/**
2232 *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2233 *	@sk: socket
2234 *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2235 */
2236void __sk_mem_reclaim(struct sock *sk, int amount)
2237{
2238	amount >>= SK_MEM_QUANTUM_SHIFT;
2239	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2240	__sk_mem_reduce_allocated(sk, amount);
2241}
2242EXPORT_SYMBOL(__sk_mem_reclaim);
2243
2244int sk_set_peek_off(struct sock *sk, int val)
2245{
2246	if (val < 0)
2247		return -EINVAL;
2248
2249	sk->sk_peek_off = val;
2250	return 0;
2251}
2252EXPORT_SYMBOL_GPL(sk_set_peek_off);
2253
2254/*
2255 * Set of default routines for initialising struct proto_ops when
2256 * the protocol does not support a particular function. In certain
2257 * cases where it makes no sense for a protocol to have a "do nothing"
2258 * function, some default processing is provided.
2259 */
2260
/* Default bind handler for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
2265EXPORT_SYMBOL(sock_no_bind);
2266
/* Default connect handler for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr, int len,
		    int flags)
{
	return -EOPNOTSUPP;
}
2272EXPORT_SYMBOL(sock_no_connect);
2273
/* Default socketpair handler for protocols without socketpair support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
2278EXPORT_SYMBOL(sock_no_socketpair);
2279
/* Default accept handler for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock,
		   int flags, bool kern)
{
	return -EOPNOTSUPP;
}
2285EXPORT_SYMBOL(sock_no_accept);
2286
/* Default getname handler for protocols without getname support: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr, int *len,
		    int peer)
{
	return -EOPNOTSUPP;
}
2292EXPORT_SYMBOL(sock_no_getname);
2293
2294unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2295{
2296	return 0;
2297}
2298EXPORT_SYMBOL(sock_no_poll);
2299
/* Default ioctl handler for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
2304EXPORT_SYMBOL(sock_no_ioctl);
2305
/* Default listen handler for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
2310EXPORT_SYMBOL(sock_no_listen);
2311
/* Default shutdown handler for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
2316EXPORT_SYMBOL(sock_no_shutdown);
2317
2318int sock_no_setsockopt(struct socket *sock, int level, int optname,
2319		    char __user *optval, unsigned int optlen)
2320{
2321	return -EOPNOTSUPP;
2322}
2323EXPORT_SYMBOL(sock_no_setsockopt);
2324
2325int sock_no_getsockopt(struct socket *sock, int level, int optname,
2326		    char __user *optval, int __user *optlen)
2327{
2328	return -EOPNOTSUPP;
2329}
2330EXPORT_SYMBOL(sock_no_getsockopt);
2331
/* Default sendmsg handler for protocols without sendmsg support: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
2336EXPORT_SYMBOL(sock_no_sendmsg);
2337
/* Default recvmsg handler for protocols without recvmsg support: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
2343EXPORT_SYMBOL(sock_no_recvmsg);
2344
/*
 * Default mmap handler for protocols without mmap support.
 *
 * Returns -ENODEV rather than -EOPNOTSUPP so that the error matches the
 * one reported when no mmap method exists at all.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma)
{
	return -ENODEV;
}
2350EXPORT_SYMBOL(sock_no_mmap);
2351
2352ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2353{
2354	ssize_t res;
2355	struct msghdr msg = {.msg_flags = flags};
2356	struct kvec iov;
2357	char *kaddr = kmap(page);
2358	iov.iov_base = kaddr + offset;
2359	iov.iov_len = size;
2360	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2361	kunmap(page);
2362	return res;
2363}
2364EXPORT_SYMBOL(sock_no_sendpage);
2365
2366/*
2367 *	Default Socket Callbacks
2368 */
2369
2370static void sock_def_wakeup(struct sock *sk)
2371{
2372	struct socket_wq *wq;
2373
2374	rcu_read_lock();
2375	wq = rcu_dereference(sk->sk_wq);
2376	if (skwq_has_sleeper(wq))
2377		wake_up_interruptible_all(&wq->wait);
2378	rcu_read_unlock();
2379}
2380
2381static void sock_def_error_report(struct sock *sk)
2382{
2383	struct socket_wq *wq;
2384
2385	rcu_read_lock();
2386	wq = rcu_dereference(sk->sk_wq);
2387	if (skwq_has_sleeper(wq))
2388		wake_up_interruptible_poll(&wq->wait, POLLERR);
2389	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2390	rcu_read_unlock();
2391}
2392
2393static void sock_def_readable(struct sock *sk)
2394{
2395	struct socket_wq *wq;
2396
2397	rcu_read_lock();
2398	wq = rcu_dereference(sk->sk_wq);
2399	if (skwq_has_sleeper(wq))
2400		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2401						POLLRDNORM | POLLRDBAND);
2402	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2403	rcu_read_unlock();
2404}
2405
2406static void sock_def_write_space(struct sock *sk)
2407{
2408	struct socket_wq *wq;
2409
2410	rcu_read_lock();
2411
2412	/* Do not wake up a writer until he can make "significant"
2413	 * progress.  --DaveM
2414	 */
2415	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2416		wq = rcu_dereference(sk->sk_wq);
2417		if (skwq_has_sleeper(wq))
2418			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2419						POLLWRNORM | POLLWRBAND);
2420
2421		/* Should agree with poll, otherwise some programs break */
2422		if (sock_writeable(sk))
2423			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2424	}
2425
2426	rcu_read_unlock();
2427}
2428
/* Default sk_destruct callback (installed by sock_init_data()). */
static void sock_def_destruct(struct sock *sk)
{
	/* Intentionally empty. */
}
2432
2433void sk_send_sigurg(struct sock *sk)
2434{
2435	if (sk->sk_socket && sk->sk_socket->file)
2436		if (send_sigurg(&sk->sk_socket->file->f_owner))
2437			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2438}
2439EXPORT_SYMBOL(sk_send_sigurg);
2440
/*
 * (Re)arm @timer to fire at @expires.
 *
 * A reference on @sk is taken only when mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
2447EXPORT_SYMBOL(sk_reset_timer);
2448
/*
 * Cancel @timer.
 *
 * A reference on @sk is dropped with __sock_put() only when del_timer()
 * returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
2454EXPORT_SYMBOL(sk_stop_timer);
2455
/*
 * Initialise the generic fields of a freshly allocated @sk and, when
 * @sock is non-NULL, link the two together.
 *
 * Queues, timer, buffer sizes, default callbacks, timeouts and pacing
 * limits are reset to their defaults.  The reference count is written
 * last, behind a write barrier.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	sk->sk_send_head	=	NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		/* Inherit type, wait queue and owner uid from the socket. */
		sk->sk_type	=	sock->type;
		sk->sk_wq	=	sock->wq;
		sock->sk	=	sk;
		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
	} else {
		/* No struct socket: no wait queue, uid 0 of the netns owner. */
		sk->sk_wq	=	NULL;
		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
	}

	/* Separate lockdep classes per family, and for kernel sockets. */
	rwlock_init(&sk->sk_callback_lock);
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	/* Default callbacks. */
	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	/* tv_sec == -1 is what sock_get_timestamp() reports as -ENOENT. */
	sk->sk_stamp = ktime_set(-1L, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0U;
	sk->sk_pacing_rate = ~0U;
	sk->sk_incoming_cpu = -1;
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
2531EXPORT_SYMBOL(sock_init_data);
2532
/*
 * Take ownership of @sk, sleeping if necessary.
 *
 * @subclass is passed to lockdep for the mutex-like annotation.
 * May sleep (see might_sleep()).
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	/* Somebody else owns the socket: wait in __lock_sock(). */
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	/* Plain unlock: BH is re-enabled separately by local_bh_enable() below. */
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
2547EXPORT_SYMBOL(lock_sock_nested);
2548
/*
 * Give up ownership of @sk.
 *
 * Under sk_lock.slock: handles a non-empty backlog via __release_sock(),
 * runs the protocol's optional release_cb(), drops ownership and wakes
 * anybody waiting on sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	/* Only issue the wakeup when there is a waiter queued. */
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
2566EXPORT_SYMBOL(release_sock);
2567
/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Return: false if the fast path is taken
 *   (sk_lock.slock locked, owned = 0, BH disabled);
 * true if the slow path is taken
 *   (sk_lock.slock unlocked, owned = 1, BH enabled).
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Fast path: return with slock still held and BH still
		 * disabled, as described in the function comment.
		 */
		return false;

	/* Slow path: wait for the current owner, then take ownership. */
	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
2599EXPORT_SYMBOL(lock_sock_fast);
2600
2601int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2602{
2603	struct timeval tv;
2604	if (!sock_flag(sk, SOCK_TIMESTAMP))
2605		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2606	tv = ktime_to_timeval(sk->sk_stamp);
2607	if (tv.tv_sec == -1)
2608		return -ENOENT;
2609	if (tv.tv_sec == 0) {
2610		sk->sk_stamp = ktime_get_real();
2611		tv = ktime_to_timeval(sk->sk_stamp);
2612	}
2613	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2614}
2615EXPORT_SYMBOL(sock_get_timestamp);
2616
2617int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2618{
2619	struct timespec ts;
2620	if (!sock_flag(sk, SOCK_TIMESTAMP))
2621		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2622	ts = ktime_to_timespec(sk->sk_stamp);
2623	if (ts.tv_sec == -1)
2624		return -ENOENT;
2625	if (ts.tv_sec == 0) {
2626		sk->sk_stamp = ktime_get_real();
2627		ts = ktime_to_timespec(sk->sk_stamp);
2628	}
2629	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2630}
2631EXPORT_SYMBOL(sock_get_timestampns);
2632
2633void sock_enable_timestamp(struct sock *sk, int flag)
2634{
2635	if (!sock_flag(sk, flag)) {
2636		unsigned long previous_flags = sk->sk_flags;
2637
2638		sock_set_flag(sk, flag);
2639		/*
2640		 * we just set one of the two flags which require net
2641		 * time stamping, but time stamping might have been on
2642		 * already because of the other one
2643		 */
2644		if (sock_needs_netstamp(sk) &&
2645		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2646			net_enable_timestamp();
2647	}
2648}
2649
2650int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2651		       int level, int type)
2652{
2653	struct sock_exterr_skb *serr;
2654	struct sk_buff *skb;
2655	int copied, err;
2656
2657	err = -EAGAIN;
2658	skb = sock_dequeue_err_skb(sk);
2659	if (skb == NULL)
2660		goto out;
2661
2662	copied = skb->len;
2663	if (copied > len) {
2664		msg->msg_flags |= MSG_TRUNC;
2665		copied = len;
2666	}
2667	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2668	if (err)
2669		goto out_free_skb;
2670
2671	sock_recv_timestamp(msg, sk, skb);
2672
2673	serr = SKB_EXT_ERR(skb);
2674	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2675
2676	msg->msg_flags |= MSG_ERRQUEUE;
2677	err = copied;
2678
2679out_free_skb:
2680	kfree_skb(skb);
2681out:
2682	return err;
2683}
2684EXPORT_SYMBOL(sock_recv_errqueue);
2685
2686/*
2687 *	Get a socket option on an socket.
2688 *
2689 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2690 *	asynchronous errors should be reported by getsockopt. We assume
2691 *	this means if you specify SO_ERROR (otherwise whats the point of it).
2692 */
2693int sock_common_getsockopt(struct socket *sock, int level, int optname,
2694			   char __user *optval, int __user *optlen)
2695{
2696	struct sock *sk = sock->sk;
2697
2698	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2699}
2700EXPORT_SYMBOL(sock_common_getsockopt);
2701
2702#ifdef CONFIG_COMPAT
2703int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2704				  char __user *optval, int __user *optlen)
2705{
2706	struct sock *sk = sock->sk;
2707
2708	if (sk->sk_prot->compat_getsockopt != NULL)
2709		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2710						      optval, optlen);
2711	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2712}
2713EXPORT_SYMBOL(compat_sock_common_getsockopt);
2714#endif
2715
2716int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2717			int flags)
2718{
2719	struct sock *sk = sock->sk;
2720	int addr_len = 0;
2721	int err;
2722
2723	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2724				   flags & ~MSG_DONTWAIT, &addr_len);
2725	if (err >= 0)
2726		msg->msg_namelen = addr_len;
2727	return err;
2728}
2729EXPORT_SYMBOL(sock_common_recvmsg);
2730
2731/*
2732 *	Set socket options on an inet socket.
2733 */
2734int sock_common_setsockopt(struct socket *sock, int level, int optname,
2735			   char __user *optval, unsigned int optlen)
2736{
2737	struct sock *sk = sock->sk;
2738
2739	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2740}
2741EXPORT_SYMBOL(sock_common_setsockopt);
2742
2743#ifdef CONFIG_COMPAT
2744int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2745				  char __user *optval, unsigned int optlen)
2746{
2747	struct sock *sk = sock->sk;
2748
2749	if (sk->sk_prot->compat_setsockopt != NULL)
2750		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2751						      optval, optlen);
2752	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2753}
2754EXPORT_SYMBOL(compat_sock_common_setsockopt);
2755#endif
2756
/*
 * Common socket release path: run the protocol's optional destroy hook,
 * unhash the socket, orphan it, release its xfrm policy and cached page
 * fragment, and drop a reference.
 */
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when this function is called, processes no longer
	 * have access to the socket, but the network stack still does.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but some
	 * packets may still be in flight because another CPU ran the
	 * receiver and did its hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and will be purged by
	 * the socket destructor.
	 *
	 * We may also still have packets pending on the receive queue and,
	 * probably, our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	/* Drop the cached page fragment, if one is still held. */
	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	sock_put(sk);
}
2797EXPORT_SYMBOL(sk_common_release);
2798
2799#ifdef CONFIG_PROC_FS
/* Number of per-protocol "in use" counter slots. */
#define PROTO_INUSE_NR	64	/* should be enough for now */
/* One counter per slot, indexed by proto->inuse_idx. */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

/* Slot indices currently handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2806
2807#ifdef CONFIG_NET_NS
2808void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2809{
2810	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2811}
2812EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2813
2814int sock_prot_inuse_get(struct net *net, struct proto *prot)
2815{
2816	int cpu, idx = prot->inuse_idx;
2817	int res = 0;
2818
2819	for_each_possible_cpu(cpu)
2820		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2821
2822	return res >= 0 ? res : 0;
2823}
2824EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2825
2826static int __net_init sock_inuse_init_net(struct net *net)
2827{
2828	net->core.inuse = alloc_percpu(struct prot_inuse);
2829	return net->core.inuse ? 0 : -ENOMEM;
2830}
2831
2832static void __net_exit sock_inuse_exit_net(struct net *net)
2833{
2834	free_percpu(net->core.inuse);
2835}
2836
/* Per-namespace hooks that allocate and free the "in use" counters. */
static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};
2841
2842static __init int net_inuse_init(void)
2843{
2844	if (register_pernet_subsys(&net_inuse_ops))
2845		panic("Cannot initialize net inuse counters");
2846
2847	return 0;
2848}
2849
2850core_initcall(net_inuse_init);
2851#else
/* Without CONFIG_NET_NS a single global per-CPU counter block is used. */
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2853
2854void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2855{
2856	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2857}
2858EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2859
2860int sock_prot_inuse_get(struct net *net, struct proto *prot)
2861{
2862	int cpu, idx = prot->inuse_idx;
2863	int res = 0;
2864
2865	for_each_possible_cpu(cpu)
2866		res += per_cpu(prot_inuse, cpu).val[idx];
2867
2868	return res >= 0 ? res : 0;
2869}
2870EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2871#endif
2872
2873static void assign_proto_idx(struct proto *prot)
2874{
2875	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2876
2877	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2878		pr_err("PROTO_INUSE_NR exhausted\n");
2879		return;
2880	}
2881
2882	set_bit(prot->inuse_idx, proto_inuse_idx);
2883}
2884
2885static void release_proto_idx(struct proto *prot)
2886{
2887	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2888		clear_bit(prot->inuse_idx, proto_inuse_idx);
2889}
2890#else
/* No-op variant used when CONFIG_PROC_FS is not set. */
static inline void assign_proto_idx(struct proto *prot)
{
	/* Nothing to assign. */
}
2894
/* No-op variant used when CONFIG_PROC_FS is not set. */
static inline void release_proto_idx(struct proto *prot)
{
	/* Nothing to release. */
}
2898#endif
2899
2900static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2901{
2902	if (!rsk_prot)
2903		return;
2904	kfree(rsk_prot->slab_name);
2905	rsk_prot->slab_name = NULL;
2906	kmem_cache_destroy(rsk_prot->slab);
2907	rsk_prot->slab = NULL;
2908}
2909
2910static int req_prot_init(const struct proto *prot)
2911{
2912	struct request_sock_ops *rsk_prot = prot->rsk_prot;
2913
2914	if (!rsk_prot)
2915		return 0;
2916
2917	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2918					prot->name);
2919	if (!rsk_prot->slab_name)
2920		return -ENOMEM;
2921
2922	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2923					   rsk_prot->obj_size, 0,
2924					   prot->slab_flags, NULL);
2925
2926	if (!rsk_prot->slab) {
2927		pr_crit("%s: Can't create request sock SLAB cache!\n",
2928			prot->name);
2929		return -ENOMEM;
2930	}
2931	return 0;
2932}
2933
2934int proto_register(struct proto *prot, int alloc_slab)
2935{
2936	if (alloc_slab) {
2937		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2938					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2939					NULL);
2940
2941		if (prot->slab == NULL) {
2942			pr_crit("%s: Can't create sock SLAB cache!\n",
2943				prot->name);
2944			goto out;
2945		}
2946
2947		if (req_prot_init(prot))
2948			goto out_free_request_sock_slab;
2949
2950		if (prot->twsk_prot != NULL) {
2951			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2952
2953			if (prot->twsk_prot->twsk_slab_name == NULL)
2954				goto out_free_request_sock_slab;
2955
2956			prot->twsk_prot->twsk_slab =
2957				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2958						  prot->twsk_prot->twsk_obj_size,
2959						  0,
2960						  prot->slab_flags,
2961						  NULL);
2962			if (prot->twsk_prot->twsk_slab == NULL)
2963				goto out_free_timewait_sock_slab_name;
2964		}
2965	}
2966
2967	mutex_lock(&proto_list_mutex);
2968	list_add(&prot->node, &proto_list);
2969	assign_proto_idx(prot);
2970	mutex_unlock(&proto_list_mutex);
2971	return 0;
2972
2973out_free_timewait_sock_slab_name:
2974	kfree(prot->twsk_prot->twsk_slab_name);
2975out_free_request_sock_slab:
2976	req_prot_cleanup(prot->rsk_prot);
2977
2978	kmem_cache_destroy(prot->slab);
2979	prot->slab = NULL;
2980out:
2981	return -ENOBUFS;
2982}
2983EXPORT_SYMBOL(proto_register);
2984
2985void proto_unregister(struct proto *prot)
2986{
2987	mutex_lock(&proto_list_mutex);
2988	release_proto_idx(prot);
2989	list_del(&prot->node);
2990	mutex_unlock(&proto_list_mutex);
2991
2992	kmem_cache_destroy(prot->slab);
2993	prot->slab = NULL;
2994
2995	req_prot_cleanup(prot->rsk_prot);
2996
2997	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2998		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2999		kfree(prot->twsk_prot->twsk_slab_name);
3000		prot->twsk_prot->twsk_slab = NULL;
3001	}
3002}
3003EXPORT_SYMBOL(proto_unregister);
3004
3005#ifdef CONFIG_PROC_FS
3006static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3007	__acquires(proto_list_mutex)
3008{
3009	mutex_lock(&proto_list_mutex);
3010	return seq_list_start_head(&proto_list, *pos);
3011}
3012
3013static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3014{
3015	return seq_list_next(v, &proto_list, pos);
3016}
3017
3018static void proto_seq_stop(struct seq_file *seq, void *v)
3019	__releases(proto_list_mutex)
3020{
3021	mutex_unlock(&proto_list_mutex);
3022}
3023
/* Map a method pointer to 'y' (set) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
3028static long sock_prot_memory_allocated(struct proto *proto)
3029{
3030	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3031}
3032
3033static char *sock_prot_memory_pressure(struct proto *proto)
3034{
3035	return proto->memory_pressure != NULL ?
3036	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3037}
3038
/*
 * Print one protocol line: name, object size, sockets in use, memory
 * allocated, memory pressure, max header, whether a slab cache exists,
 * owning module, and then one 'y'/'n' column per protocol method
 * (see proto_method_implemented()).
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   /* The method columns follow, in this fixed order. */
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
3072
3073static int proto_seq_show(struct seq_file *seq, void *v)
3074{
3075	if (v == &proto_list)
3076		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3077			   "protocol",
3078			   "size",
3079			   "sockets",
3080			   "memory",
3081			   "press",
3082			   "maxhdr",
3083			   "slab",
3084			   "module",
3085			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3086	else
3087		proto_seq_printf(seq, list_entry(v, struct proto, node));
3088	return 0;
3089}
3090
/* seq_file iterator over proto_list. */
static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};
3097
3098static int proto_seq_open(struct inode *inode, struct file *file)
3099{
3100	return seq_open_net(inode, file, &proto_seq_ops,
3101			    sizeof(struct seq_net_private));
3102}
3103
/* File operations of the "protocols" proc entry created in proto_init_net(). */
static const struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
3111
3112static __net_init int proto_init_net(struct net *net)
3113{
3114	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3115		return -ENOMEM;
3116
3117	return 0;
3118}
3119
3120static __net_exit void proto_exit_net(struct net *net)
3121{
3122	remove_proc_entry("protocols", net->proc_net);
3123}
3124
3125
/* Per-namespace hooks that create and remove the "protocols" proc entry. */
static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};
3130
3131static int __init proto_init(void)
3132{
3133	return register_pernet_subsys(&proto_net_ops);
3134}
3135
3136subsys_initcall(proto_init);
3137
3138#endif /* PROC_FS */