net/core/sock.c at v4.15-rc4 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / core / sock.c
at v4.15-rc4 3366 lines 84 kB view raw
   1/*
   2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   3 *		operating system.  INET is implemented using the  BSD Socket
   4 *		interface as the means of communication with the user level.
   5 *
   6 *		Generic socket support routines. Memory allocators, socket lock/release
   7 *		handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:	Ross Biro
  11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *		Florian La Roche, <flla@stud.uni-sb.de>
  13 *		Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *		Alan Cox	: 	Numerous verify_area() problems
  17 *		Alan Cox	:	Connecting on a connecting socket
  18 *					now returns an error for tcp.
  19 *		Alan Cox	:	sock->protocol is set correctly.
  20 *					and is not sometimes left as 0.
  21 *		Alan Cox	:	connect handles icmp errors on a
  22 *					connect properly. Unfortunately there
  23 *					is a restart syscall nasty there. I
  24 *					can't match BSD without hacking the C
  25 *					library. Ideas urgently sought!
  26 *		Alan Cox	:	Disallow bind() to addresses that are
  27 *					not ours - especially broadcast ones!!
  28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
  29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
  30 *					instead they leave that for the DESTROY timer.
  31 *		Alan Cox	:	Clean up error flag in accept
  32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
  33 *					was buggy. Put a remove_sock() in the handler
  34 *					for memory when we hit 0. Also altered the timer
  35 *					code. The ACK stuff can wait and needs major
  36 *					TCP layer surgery.
  37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
  38 *					and fixed timer/inet_bh race.
  39 *		Alan Cox	:	Added zapped flag for TCP
  40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
  41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
  46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
  47 *	Pauline Middelink	:	identd support
  48 *		Alan Cox	:	Fixed connect() taking signals I think.
  49 *		Alan Cox	:	SO_LINGER supported
  50 *		Alan Cox	:	Error reporting fixes
  51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
  52 *		Alan Cox	:	inet sockets don't set sk->type!
  53 *		Alan Cox	:	Split socket option code
  54 *		Alan Cox	:	Callbacks
  55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
  56 *		Alex		:	Removed restriction on inet fioctl
  57 *		Alan Cox	:	Splitting INET from NET core
  58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
  59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *		Alan Cox	:	Split IP from generic code
  61 *		Alan Cox	:	New kfree_skbmem()
  62 *		Alan Cox	:	Make SO_DEBUG superuser only.
  63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
  64 *					(compatibility fix)
  65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
  66 *		Alan Cox	:	Allocator for a socket is settable.
  67 *		Alan Cox	:	SO_ERROR includes soft errors.
  68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
  69 *		Alan Cox	: 	Generic socket allocation to make hooks
  70 *					easier (suggested by Craig Metz).
  71 *		Michael Pall	:	SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
  79 *		Andi Kleen	:	Fix write_space callback
  80 *		Chris Evans	:	Security fixes - signedness again
  81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *		This program is free software; you can redistribute it and/or
  87 *		modify it under the terms of the GNU General Public License
  88 *		as published by the Free Software Foundation; either version
  89 *		2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/errqueue.h>
  97#include <linux/types.h>
  98#include <linux/socket.h>
  99#include <linux/in.h>
 100#include <linux/kernel.h>
 101#include <linux/module.h>
 102#include <linux/proc_fs.h>
 103#include <linux/seq_file.h>
 104#include <linux/sched.h>
 105#include <linux/sched/mm.h>
 106#include <linux/timer.h>
 107#include <linux/string.h>
 108#include <linux/sockios.h>
 109#include <linux/net.h>
 110#include <linux/mm.h>
 111#include <linux/slab.h>
 112#include <linux/interrupt.h>
 113#include <linux/poll.h>
 114#include <linux/tcp.h>
 115#include <linux/init.h>
 116#include <linux/highmem.h>
 117#include <linux/user_namespace.h>
 118#include <linux/static_key.h>
 119#include <linux/memcontrol.h>
 120#include <linux/prefetch.h>
 121
 122#include <linux/uaccess.h>
 123
 124#include <linux/netdevice.h>
 125#include <net/protocol.h>
 126#include <linux/skbuff.h>
 127#include <net/net_namespace.h>
 128#include <net/request_sock.h>
 129#include <net/sock.h>
 130#include <linux/net_tstamp.h>
 131#include <net/xfrm.h>
 132#include <linux/ipsec.h>
 133#include <net/cls_cgroup.h>
 134#include <net/netprio_cgroup.h>
 135#include <linux/sock_diag.h>
 136
 137#include <linux/filter.h>
 138#include <net/sock_reuseport.h>
 139
 140#include <trace/events/sock.h>
 141
 142#include <net/tcp.h>
 143#include <net/busy_poll.h>
 144
/*
 * File-local list head and mutex. NOTE(review): judging by the names,
 * proto_list_mutex serializes access to proto_list; the users of both
 * objects are outside this part of the file -- confirm there.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
 147
 148/**
 149 * sk_ns_capable - General socket capability test
 150 * @sk: Socket to use a capability on or through
 151 * @user_ns: The user namespace of the capability to use
 152 * @cap: The capability to use
 153 *
 154 * Test to see if the opener of the socket had when the socket was
 155 * created and the current process has the capability @cap in the user
 156 * namespace @user_ns.
 157 */
 158bool sk_ns_capable(const struct sock *sk,
 159		   struct user_namespace *user_ns, int cap)
 160{
 161	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 162		ns_capable(user_ns, cap);
 163}
 164EXPORT_SYMBOL(sk_ns_capable);
 165
 166/**
 167 * sk_capable - Socket global capability test
 168 * @sk: Socket to use a capability on or through
 169 * @cap: The global capability to use
 170 *
 171 * Test to see if the opener of the socket had when the socket was
 172 * created and the current process has the capability @cap in all user
 173 * namespaces.
 174 */
 175bool sk_capable(const struct sock *sk, int cap)
 176{
 177	return sk_ns_capable(sk, &init_user_ns, cap);
 178}
 179EXPORT_SYMBOL(sk_capable);
 180
 181/**
 182 * sk_net_capable - Network namespace socket capability test
 183 * @sk: Socket to use a capability on or through
 184 * @cap: The capability to use
 185 *
 186 * Test to see if the opener of the socket had when the socket was created
 187 * and the current process has the capability @cap over the network namespace
 188 * the socket is a member of.
 189 */
 190bool sk_net_capable(const struct sock *sk, int cap)
 191{
 192	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 193}
 194EXPORT_SYMBOL(sk_net_capable);
 195
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 *
 * Every array holds one lock_class_key per address family (AF_MAX
 * entries). The "kern" variants are the ones meant for internal
 * (kernel) sockets, the others for userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 205
 206/*
 207 * Make lock validator output more readable. (we pre-construct these
 208 * strings build-time, so that runtime initialization of socket
 209 * locks is fast):
 210 */
 211
 212#define _sock_locks(x)						  \
 213  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
 214  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
 215  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
 216  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
 217  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
 218  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
 219  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
 220  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
 221  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
 222  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
 223  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
 224  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
 225  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
 226  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
 227  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_MAX"
 228
 229static const char *const af_family_key_strings[AF_MAX+1] = {
 230	_sock_locks("sk_lock-")
 231};
 232static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 233	_sock_locks("slock-")
 234};
 235static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 236	_sock_locks("clock-")
 237};
 238
 239static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 240	_sock_locks("k-sk_lock-")
 241};
 242static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 243	_sock_locks("k-slock-")
 244};
 245static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 246	_sock_locks("k-clock-")
 247};
 248static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 249  "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
 250  "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
 251  "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
 252  "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
 253  "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
 254  "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
 255  "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
 256  "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
 257  "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
 258  "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
 259  "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
 260  "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
 261  "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
 262  "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
 263  "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
 264};
 265static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 266  "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
 267  "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
 268  "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
 269  "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
 270  "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
 271  "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
 272  "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
 273  "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
 274  "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
 275  "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
 276  "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
 277  "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
 278  "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
 279  "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
 280  "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
 281};
 282static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 283  "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
 284  "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
 285  "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
 286  "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
 287  "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
 288  "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
 289  "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
 290  "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
 291  "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
 292  "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
 293  "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
 294  "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
 295  "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
 296  "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
 297  "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
 298};
 299
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 *
 * One lock_class_key per address family in each array. NOTE(review):
 * the r/w/e arrays presumably pair with the "rlock-"/"wlock-"/"elock-"
 * name tables above; the code that registers them is outside this
 * part of the file -- confirm there.
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
 309
/* Run time adjustable parameters. */
/* Upper bound applied to SO_SNDBUF requests in sock_setsockopt(). */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
/* Upper bound applied to SO_RCVBUF requests in sock_setsockopt(). */
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
/*
 * Defaults start out equal to the maxima. NOTE(review): presumably the
 * initial send/receive buffer sizes of new sockets; the users are
 * outside this part of the file -- confirm there.
 */
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/* Enabled (1) by default; consumers are not visible in this part of the file. */
int sysctl_tstamp_allow_data __read_mostly = 1;

/*
 * Static key that starts out false; incremented by sk_set_memalloc()
 * and decremented by sk_clear_memalloc().
 */
struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);
 326
 327/**
 328 * sk_set_memalloc - sets %SOCK_MEMALLOC
 329 * @sk: socket to set it on
 330 *
 331 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 332 * It's the responsibility of the admin to adjust min_free_kbytes
 333 * to meet the requirements
 334 */
 335void sk_set_memalloc(struct sock *sk)
 336{
 337	sock_set_flag(sk, SOCK_MEMALLOC);
 338	sk->sk_allocation |= __GFP_MEMALLOC;
 339	static_key_slow_inc(&memalloc_socks);
 340}
 341EXPORT_SYMBOL_GPL(sk_set_memalloc);
 342
 343void sk_clear_memalloc(struct sock *sk)
 344{
 345	sock_reset_flag(sk, SOCK_MEMALLOC);
 346	sk->sk_allocation &= ~__GFP_MEMALLOC;
 347	static_key_slow_dec(&memalloc_socks);
 348
 349	/*
 350	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 351	 * progress of swapping. SOCK_MEMALLOC may be cleared while
 352	 * it has rmem allocations due to the last swapfile being deactivated
 353	 * but there is a risk that the socket is unusable due to exceeding
 354	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
 355	 */
 356	sk_mem_reclaim(sk);
 357}
 358EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 359
 360int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 361{
 362	int ret;
 363	unsigned int noreclaim_flag;
 364
 365	/* these should have been dropped before queueing */
 366	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 367
 368	noreclaim_flag = memalloc_noreclaim_save();
 369	ret = sk->sk_backlog_rcv(sk, skb);
 370	memalloc_noreclaim_restore(noreclaim_flag);
 371
 372	return ret;
 373}
 374EXPORT_SYMBOL(__sk_backlog_rcv);
 375
 376static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 377{
 378	struct timeval tv;
 379
 380	if (optlen < sizeof(tv))
 381		return -EINVAL;
 382	if (copy_from_user(&tv, optval, sizeof(tv)))
 383		return -EFAULT;
 384	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 385		return -EDOM;
 386
 387	if (tv.tv_sec < 0) {
 388		static int warned __read_mostly;
 389
 390		*timeo_p = 0;
 391		if (warned < 10 && net_ratelimit()) {
 392			warned++;
 393			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 394				__func__, current->comm, task_pid_nr(current));
 395		}
 396		return 0;
 397	}
 398	*timeo_p = MAX_SCHEDULE_TIMEOUT;
 399	if (tv.tv_sec == 0 && tv.tv_usec == 0)
 400		return 0;
 401	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 402		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 403	return 0;
 404}
 405
 406static void sock_warn_obsolete_bsdism(const char *name)
 407{
 408	static int warned;
 409	static char warncomm[TASK_COMM_LEN];
 410	if (strcmp(warncomm, current->comm) && warned < 5) {
 411		strcpy(warncomm,  current->comm);
 412		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 413			warncomm, name);
 414		warned++;
 415	}
 416}
 417
 418static bool sock_needs_netstamp(const struct sock *sk)
 419{
 420	switch (sk->sk_family) {
 421	case AF_UNSPEC:
 422	case AF_UNIX:
 423		return false;
 424	default:
 425		return true;
 426	}
 427}
 428
 429static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 430{
 431	if (sk->sk_flags & flags) {
 432		sk->sk_flags &= ~flags;
 433		if (sock_needs_netstamp(sk) &&
 434		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 435			net_disable_timestamp();
 436	}
 437}
 438
 439
 440int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 441{
 442	unsigned long flags;
 443	struct sk_buff_head *list = &sk->sk_receive_queue;
 444
 445	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 446		atomic_inc(&sk->sk_drops);
 447		trace_sock_rcvqueue_full(sk, skb);
 448		return -ENOMEM;
 449	}
 450
 451	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 452		atomic_inc(&sk->sk_drops);
 453		return -ENOBUFS;
 454	}
 455
 456	skb->dev = NULL;
 457	skb_set_owner_r(skb, sk);
 458
 459	/* we escape from rcu protected region, make sure we dont leak
 460	 * a norefcounted dst
 461	 */
 462	skb_dst_force(skb);
 463
 464	spin_lock_irqsave(&list->lock, flags);
 465	sock_skb_set_dropcount(sk, skb);
 466	__skb_queue_tail(list, skb);
 467	spin_unlock_irqrestore(&list->lock, flags);
 468
 469	if (!sock_flag(sk, SOCK_DEAD))
 470		sk->sk_data_ready(sk);
 471	return 0;
 472}
 473EXPORT_SYMBOL(__sock_queue_rcv_skb);
 474
 475int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 476{
 477	int err;
 478
 479	err = sk_filter(sk, skb);
 480	if (err)
 481		return err;
 482
 483	return __sock_queue_rcv_skb(sk, skb);
 484}
 485EXPORT_SYMBOL(sock_queue_rcv_skb);
 486
 487int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 488		     const int nested, unsigned int trim_cap, bool refcounted)
 489{
 490	int rc = NET_RX_SUCCESS;
 491
 492	if (sk_filter_trim_cap(sk, skb, trim_cap))
 493		goto discard_and_relse;
 494
 495	skb->dev = NULL;
 496
 497	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 498		atomic_inc(&sk->sk_drops);
 499		goto discard_and_relse;
 500	}
 501	if (nested)
 502		bh_lock_sock_nested(sk);
 503	else
 504		bh_lock_sock(sk);
 505	if (!sock_owned_by_user(sk)) {
 506		/*
 507		 * trylock + unlock semantics:
 508		 */
 509		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 510
 511		rc = sk_backlog_rcv(sk, skb);
 512
 513		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 514	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 515		bh_unlock_sock(sk);
 516		atomic_inc(&sk->sk_drops);
 517		goto discard_and_relse;
 518	}
 519
 520	bh_unlock_sock(sk);
 521out:
 522	if (refcounted)
 523		sock_put(sk);
 524	return rc;
 525discard_and_relse:
 526	kfree_skb(skb);
 527	goto out;
 528}
 529EXPORT_SYMBOL(__sk_receive_skb);
 530
 531struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 532{
 533	struct dst_entry *dst = __sk_dst_get(sk);
 534
 535	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 536		sk_tx_queue_clear(sk);
 537		sk->sk_dst_pending_confirm = 0;
 538		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 539		dst_release(dst);
 540		return NULL;
 541	}
 542
 543	return dst;
 544}
 545EXPORT_SYMBOL(__sk_dst_check);
 546
 547struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 548{
 549	struct dst_entry *dst = sk_dst_get(sk);
 550
 551	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 552		sk_dst_reset(sk);
 553		dst_release(dst);
 554		return NULL;
 555	}
 556
 557	return dst;
 558}
 559EXPORT_SYMBOL(sk_dst_check);
 560
 561static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 562				int optlen)
 563{
 564	int ret = -ENOPROTOOPT;
 565#ifdef CONFIG_NETDEVICES
 566	struct net *net = sock_net(sk);
 567	char devname[IFNAMSIZ];
 568	int index;
 569
 570	/* Sorry... */
 571	ret = -EPERM;
 572	if (!ns_capable(net->user_ns, CAP_NET_RAW))
 573		goto out;
 574
 575	ret = -EINVAL;
 576	if (optlen < 0)
 577		goto out;
 578
 579	/* Bind this socket to a particular device like "eth0",
 580	 * as specified in the passed interface name. If the
 581	 * name is "" or the option length is zero the socket
 582	 * is not bound.
 583	 */
 584	if (optlen > IFNAMSIZ - 1)
 585		optlen = IFNAMSIZ - 1;
 586	memset(devname, 0, sizeof(devname));
 587
 588	ret = -EFAULT;
 589	if (copy_from_user(devname, optval, optlen))
 590		goto out;
 591
 592	index = 0;
 593	if (devname[0] != '\0') {
 594		struct net_device *dev;
 595
 596		rcu_read_lock();
 597		dev = dev_get_by_name_rcu(net, devname);
 598		if (dev)
 599			index = dev->ifindex;
 600		rcu_read_unlock();
 601		ret = -ENODEV;
 602		if (!dev)
 603			goto out;
 604	}
 605
 606	lock_sock(sk);
 607	sk->sk_bound_dev_if = index;
 608	sk_dst_reset(sk);
 609	release_sock(sk);
 610
 611	ret = 0;
 612
 613out:
 614#endif
 615
 616	return ret;
 617}
 618
 619static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 620				int __user *optlen, int len)
 621{
 622	int ret = -ENOPROTOOPT;
 623#ifdef CONFIG_NETDEVICES
 624	struct net *net = sock_net(sk);
 625	char devname[IFNAMSIZ];
 626
 627	if (sk->sk_bound_dev_if == 0) {
 628		len = 0;
 629		goto zero;
 630	}
 631
 632	ret = -EINVAL;
 633	if (len < IFNAMSIZ)
 634		goto out;
 635
 636	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 637	if (ret)
 638		goto out;
 639
 640	len = strlen(devname) + 1;
 641
 642	ret = -EFAULT;
 643	if (copy_to_user(optval, devname, len))
 644		goto out;
 645
 646zero:
 647	ret = -EFAULT;
 648	if (put_user(len, optlen))
 649		goto out;
 650
 651	ret = 0;
 652
 653out:
 654#endif
 655
 656	return ret;
 657}
 658
 659static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 660{
 661	if (valbool)
 662		sock_set_flag(sk, bit);
 663	else
 664		sock_reset_flag(sk, bit);
 665}
 666
 667bool sk_mc_loop(struct sock *sk)
 668{
 669	if (dev_recursion_level())
 670		return false;
 671	if (!sk)
 672		return true;
 673	switch (sk->sk_family) {
 674	case AF_INET:
 675		return inet_sk(sk)->mc_loop;
 676#if IS_ENABLED(CONFIG_IPV6)
 677	case AF_INET6:
 678		return inet6_sk(sk)->mc_loop;
 679#endif
 680	}
 681	WARN_ON(1);
 682	return true;
 683}
 684EXPORT_SYMBOL(sk_mc_loop);
 685
 686/*
 687 *	This is meant for all protocols to use and covers goings on
 688 *	at the socket level. Everything here is generic.
 689 */
 690
 691int sock_setsockopt(struct socket *sock, int level, int optname,
 692		    char __user *optval, unsigned int optlen)
 693{
 694	struct sock *sk = sock->sk;
 695	int val;
 696	int valbool;
 697	struct linger ling;
 698	int ret = 0;
 699
 700	/*
 701	 *	Options without arguments
 702	 */
 703
 704	if (optname == SO_BINDTODEVICE)
 705		return sock_setbindtodevice(sk, optval, optlen);
 706
 707	if (optlen < sizeof(int))
 708		return -EINVAL;
 709
 710	if (get_user(val, (int __user *)optval))
 711		return -EFAULT;
 712
 713	valbool = val ? 1 : 0;
 714
 715	lock_sock(sk);
 716
 717	switch (optname) {
 718	case SO_DEBUG:
 719		if (val && !capable(CAP_NET_ADMIN))
 720			ret = -EACCES;
 721		else
 722			sock_valbool_flag(sk, SOCK_DBG, valbool);
 723		break;
 724	case SO_REUSEADDR:
 725		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 726		break;
 727	case SO_REUSEPORT:
 728		sk->sk_reuseport = valbool;
 729		break;
 730	case SO_TYPE:
 731	case SO_PROTOCOL:
 732	case SO_DOMAIN:
 733	case SO_ERROR:
 734		ret = -ENOPROTOOPT;
 735		break;
 736	case SO_DONTROUTE:
 737		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 738		break;
 739	case SO_BROADCAST:
 740		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 741		break;
 742	case SO_SNDBUF:
 743		/* Don't error on this BSD doesn't and if you think
 744		 * about it this is right. Otherwise apps have to
 745		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 746		 * are treated in BSD as hints
 747		 */
 748		val = min_t(u32, val, sysctl_wmem_max);
 749set_sndbuf:
 750		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 751		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 752		/* Wake up sending tasks if we upped the value. */
 753		sk->sk_write_space(sk);
 754		break;
 755
 756	case SO_SNDBUFFORCE:
 757		if (!capable(CAP_NET_ADMIN)) {
 758			ret = -EPERM;
 759			break;
 760		}
 761		goto set_sndbuf;
 762
 763	case SO_RCVBUF:
 764		/* Don't error on this BSD doesn't and if you think
 765		 * about it this is right. Otherwise apps have to
 766		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 767		 * are treated in BSD as hints
 768		 */
 769		val = min_t(u32, val, sysctl_rmem_max);
 770set_rcvbuf:
 771		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 772		/*
 773		 * We double it on the way in to account for
 774		 * "struct sk_buff" etc. overhead.   Applications
 775		 * assume that the SO_RCVBUF setting they make will
 776		 * allow that much actual data to be received on that
 777		 * socket.
 778		 *
 779		 * Applications are unaware that "struct sk_buff" and
 780		 * other overheads allocate from the receive buffer
 781		 * during socket buffer allocation.
 782		 *
 783		 * And after considering the possible alternatives,
 784		 * returning the value we actually used in getsockopt
 785		 * is the most desirable behavior.
 786		 */
 787		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 788		break;
 789
 790	case SO_RCVBUFFORCE:
 791		if (!capable(CAP_NET_ADMIN)) {
 792			ret = -EPERM;
 793			break;
 794		}
 795		goto set_rcvbuf;
 796
 797	case SO_KEEPALIVE:
 798		if (sk->sk_prot->keepalive)
 799			sk->sk_prot->keepalive(sk, valbool);
 800		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 801		break;
 802
 803	case SO_OOBINLINE:
 804		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 805		break;
 806
 807	case SO_NO_CHECK:
 808		sk->sk_no_check_tx = valbool;
 809		break;
 810
 811	case SO_PRIORITY:
 812		if ((val >= 0 && val <= 6) ||
 813		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 814			sk->sk_priority = val;
 815		else
 816			ret = -EPERM;
 817		break;
 818
 819	case SO_LINGER:
 820		if (optlen < sizeof(ling)) {
 821			ret = -EINVAL;	/* 1003.1g */
 822			break;
 823		}
 824		if (copy_from_user(&ling, optval, sizeof(ling))) {
 825			ret = -EFAULT;
 826			break;
 827		}
 828		if (!ling.l_onoff)
 829			sock_reset_flag(sk, SOCK_LINGER);
 830		else {
 831#if (BITS_PER_LONG == 32)
 832			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 833				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 834			else
 835#endif
 836				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 837			sock_set_flag(sk, SOCK_LINGER);
 838		}
 839		break;
 840
 841	case SO_BSDCOMPAT:
 842		sock_warn_obsolete_bsdism("setsockopt");
 843		break;
 844
 845	case SO_PASSCRED:
 846		if (valbool)
 847			set_bit(SOCK_PASSCRED, &sock->flags);
 848		else
 849			clear_bit(SOCK_PASSCRED, &sock->flags);
 850		break;
 851
 852	case SO_TIMESTAMP:
 853	case SO_TIMESTAMPNS:
 854		if (valbool)  {
 855			if (optname == SO_TIMESTAMP)
 856				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 857			else
 858				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 859			sock_set_flag(sk, SOCK_RCVTSTAMP);
 860			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 861		} else {
 862			sock_reset_flag(sk, SOCK_RCVTSTAMP);
 863			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 864		}
 865		break;
 866
 867	case SO_TIMESTAMPING:
 868		if (val & ~SOF_TIMESTAMPING_MASK) {
 869			ret = -EINVAL;
 870			break;
 871		}
 872
 873		if (val & SOF_TIMESTAMPING_OPT_ID &&
 874		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 875			if (sk->sk_protocol == IPPROTO_TCP &&
 876			    sk->sk_type == SOCK_STREAM) {
 877				if ((1 << sk->sk_state) &
 878				    (TCPF_CLOSE | TCPF_LISTEN)) {
 879					ret = -EINVAL;
 880					break;
 881				}
 882				sk->sk_tskey = tcp_sk(sk)->snd_una;
 883			} else {
 884				sk->sk_tskey = 0;
 885			}
 886		}
 887
 888		if (val & SOF_TIMESTAMPING_OPT_STATS &&
 889		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 890			ret = -EINVAL;
 891			break;
 892		}
 893
 894		sk->sk_tsflags = val;
 895		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 896			sock_enable_timestamp(sk,
 897					      SOCK_TIMESTAMPING_RX_SOFTWARE);
 898		else
 899			sock_disable_timestamp(sk,
 900					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 901		break;
 902
 903	case SO_RCVLOWAT:
 904		if (val < 0)
 905			val = INT_MAX;
 906		sk->sk_rcvlowat = val ? : 1;
 907		break;
 908
 909	case SO_RCVTIMEO:
 910		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 911		break;
 912
 913	case SO_SNDTIMEO:
 914		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 915		break;
 916
 917	case SO_ATTACH_FILTER:
 918		ret = -EINVAL;
 919		if (optlen == sizeof(struct sock_fprog)) {
 920			struct sock_fprog fprog;
 921
 922			ret = -EFAULT;
 923			if (copy_from_user(&fprog, optval, sizeof(fprog)))
 924				break;
 925
 926			ret = sk_attach_filter(&fprog, sk);
 927		}
 928		break;
 929
 930	case SO_ATTACH_BPF:
 931		ret = -EINVAL;
 932		if (optlen == sizeof(u32)) {
 933			u32 ufd;
 934
 935			ret = -EFAULT;
 936			if (copy_from_user(&ufd, optval, sizeof(ufd)))
 937				break;
 938
 939			ret = sk_attach_bpf(ufd, sk);
 940		}
 941		break;
 942
 943	case SO_ATTACH_REUSEPORT_CBPF:
 944		ret = -EINVAL;
 945		if (optlen == sizeof(struct sock_fprog)) {
 946			struct sock_fprog fprog;
 947
 948			ret = -EFAULT;
 949			if (copy_from_user(&fprog, optval, sizeof(fprog)))
 950				break;
 951
 952			ret = sk_reuseport_attach_filter(&fprog, sk);
 953		}
 954		break;
 955
 956	case SO_ATTACH_REUSEPORT_EBPF:
 957		ret = -EINVAL;
 958		if (optlen == sizeof(u32)) {
 959			u32 ufd;
 960
 961			ret = -EFAULT;
 962			if (copy_from_user(&ufd, optval, sizeof(ufd)))
 963				break;
 964
 965			ret = sk_reuseport_attach_bpf(ufd, sk);
 966		}
 967		break;
 968
 969	case SO_DETACH_FILTER:
 970		ret = sk_detach_filter(sk);
 971		break;
 972
 973	case SO_LOCK_FILTER:
 974		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 975			ret = -EPERM;
 976		else
 977			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 978		break;
 979
 980	case SO_PASSSEC:
 981		if (valbool)
 982			set_bit(SOCK_PASSSEC, &sock->flags);
 983		else
 984			clear_bit(SOCK_PASSSEC, &sock->flags);
 985		break;
 986	case SO_MARK:
 987		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 988			ret = -EPERM;
 989		else
 990			sk->sk_mark = val;
 991		break;
 992
 993	case SO_RXQ_OVFL:
 994		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 995		break;
 996
 997	case SO_WIFI_STATUS:
 998		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 999		break;
1000
1001	case SO_PEEK_OFF:
1002		if (sock->ops->set_peek_off)
1003			ret = sock->ops->set_peek_off(sk, val);
1004		else
1005			ret = -EOPNOTSUPP;
1006		break;
1007
1008	case SO_NOFCS:
1009		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1010		break;
1011
1012	case SO_SELECT_ERR_QUEUE:
1013		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1014		break;
1015
1016#ifdef CONFIG_NET_RX_BUSY_POLL
1017	case SO_BUSY_POLL:
1018		/* allow unprivileged users to decrease the value */
1019		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1020			ret = -EPERM;
1021		else {
1022			if (val < 0)
1023				ret = -EINVAL;
1024			else
1025				sk->sk_ll_usec = val;
1026		}
1027		break;
1028#endif
1029
1030	case SO_MAX_PACING_RATE:
1031		if (val != ~0U)
1032			cmpxchg(&sk->sk_pacing_status,
1033				SK_PACING_NONE,
1034				SK_PACING_NEEDED);
1035		sk->sk_max_pacing_rate = val;
1036		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1037					 sk->sk_max_pacing_rate);
1038		break;
1039
1040	case SO_INCOMING_CPU:
1041		sk->sk_incoming_cpu = val;
1042		break;
1043
1044	case SO_CNX_ADVICE:
1045		if (val == 1)
1046			dst_negative_advice(sk);
1047		break;
1048
1049	case SO_ZEROCOPY:
1050		if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
1051			ret = -ENOTSUPP;
1052		else if (sk->sk_protocol != IPPROTO_TCP)
1053			ret = -ENOTSUPP;
1054		else if (sk->sk_state != TCP_CLOSE)
1055			ret = -EBUSY;
1056		else if (val < 0 || val > 1)
1057			ret = -EINVAL;
1058		else
1059			sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1060		break;
1061
1062	default:
1063		ret = -ENOPROTOOPT;
1064		break;
1065	}
1066	release_sock(sk);
1067	return ret;
1068}
1069EXPORT_SYMBOL(sock_setsockopt);
1070
1071
1072static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1073			  struct ucred *ucred)
1074{
1075	ucred->pid = pid_vnr(pid);
1076	ucred->uid = ucred->gid = -1;
1077	if (cred) {
1078		struct user_namespace *current_ns = current_user_ns();
1079
1080		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1081		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1082	}
1083}
1084
1085static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1086{
1087	struct user_namespace *user_ns = current_user_ns();
1088	int i;
1089
1090	for (i = 0; i < src->ngroups; i++)
1091		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1092			return -EFAULT;
1093
1094	return 0;
1095}
1096
1097int sock_getsockopt(struct socket *sock, int level, int optname,
1098		    char __user *optval, int __user *optlen)
1099{
1100	struct sock *sk = sock->sk;
1101
1102	union {
1103		int val;
1104		u64 val64;
1105		struct linger ling;
1106		struct timeval tm;
1107	} v;
1108
1109	int lv = sizeof(int);
1110	int len;
1111
1112	if (get_user(len, optlen))
1113		return -EFAULT;
1114	if (len < 0)
1115		return -EINVAL;
1116
1117	memset(&v, 0, sizeof(v));
1118
1119	switch (optname) {
1120	case SO_DEBUG:
1121		v.val = sock_flag(sk, SOCK_DBG);
1122		break;
1123
1124	case SO_DONTROUTE:
1125		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1126		break;
1127
1128	case SO_BROADCAST:
1129		v.val = sock_flag(sk, SOCK_BROADCAST);
1130		break;
1131
1132	case SO_SNDBUF:
1133		v.val = sk->sk_sndbuf;
1134		break;
1135
1136	case SO_RCVBUF:
1137		v.val = sk->sk_rcvbuf;
1138		break;
1139
1140	case SO_REUSEADDR:
1141		v.val = sk->sk_reuse;
1142		break;
1143
1144	case SO_REUSEPORT:
1145		v.val = sk->sk_reuseport;
1146		break;
1147
1148	case SO_KEEPALIVE:
1149		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1150		break;
1151
1152	case SO_TYPE:
1153		v.val = sk->sk_type;
1154		break;
1155
1156	case SO_PROTOCOL:
1157		v.val = sk->sk_protocol;
1158		break;
1159
1160	case SO_DOMAIN:
1161		v.val = sk->sk_family;
1162		break;
1163
1164	case SO_ERROR:
1165		v.val = -sock_error(sk);
1166		if (v.val == 0)
1167			v.val = xchg(&sk->sk_err_soft, 0);
1168		break;
1169
1170	case SO_OOBINLINE:
1171		v.val = sock_flag(sk, SOCK_URGINLINE);
1172		break;
1173
1174	case SO_NO_CHECK:
1175		v.val = sk->sk_no_check_tx;
1176		break;
1177
1178	case SO_PRIORITY:
1179		v.val = sk->sk_priority;
1180		break;
1181
1182	case SO_LINGER:
1183		lv		= sizeof(v.ling);
1184		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1185		v.ling.l_linger	= sk->sk_lingertime / HZ;
1186		break;
1187
1188	case SO_BSDCOMPAT:
1189		sock_warn_obsolete_bsdism("getsockopt");
1190		break;
1191
1192	case SO_TIMESTAMP:
1193		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1194				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1195		break;
1196
1197	case SO_TIMESTAMPNS:
1198		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1199		break;
1200
1201	case SO_TIMESTAMPING:
1202		v.val = sk->sk_tsflags;
1203		break;
1204
1205	case SO_RCVTIMEO:
1206		lv = sizeof(struct timeval);
1207		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1208			v.tm.tv_sec = 0;
1209			v.tm.tv_usec = 0;
1210		} else {
1211			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1212			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1213		}
1214		break;
1215
1216	case SO_SNDTIMEO:
1217		lv = sizeof(struct timeval);
1218		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1219			v.tm.tv_sec = 0;
1220			v.tm.tv_usec = 0;
1221		} else {
1222			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1223			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1224		}
1225		break;
1226
1227	case SO_RCVLOWAT:
1228		v.val = sk->sk_rcvlowat;
1229		break;
1230
1231	case SO_SNDLOWAT:
1232		v.val = 1;
1233		break;
1234
1235	case SO_PASSCRED:
1236		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1237		break;
1238
1239	case SO_PEERCRED:
1240	{
1241		struct ucred peercred;
1242		if (len > sizeof(peercred))
1243			len = sizeof(peercred);
1244		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1245		if (copy_to_user(optval, &peercred, len))
1246			return -EFAULT;
1247		goto lenout;
1248	}
1249
1250	case SO_PEERGROUPS:
1251	{
1252		int ret, n;
1253
1254		if (!sk->sk_peer_cred)
1255			return -ENODATA;
1256
1257		n = sk->sk_peer_cred->group_info->ngroups;
1258		if (len < n * sizeof(gid_t)) {
1259			len = n * sizeof(gid_t);
1260			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1261		}
1262		len = n * sizeof(gid_t);
1263
1264		ret = groups_to_user((gid_t __user *)optval,
1265				     sk->sk_peer_cred->group_info);
1266		if (ret)
1267			return ret;
1268		goto lenout;
1269	}
1270
1271	case SO_PEERNAME:
1272	{
1273		char address[128];
1274
1275		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1276			return -ENOTCONN;
1277		if (lv < len)
1278			return -EINVAL;
1279		if (copy_to_user(optval, address, len))
1280			return -EFAULT;
1281		goto lenout;
1282	}
1283
1284	/* Dubious BSD thing... Probably nobody even uses it, but
1285	 * the UNIX standard wants it for whatever reason... -DaveM
1286	 */
1287	case SO_ACCEPTCONN:
1288		v.val = sk->sk_state == TCP_LISTEN;
1289		break;
1290
1291	case SO_PASSSEC:
1292		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1293		break;
1294
1295	case SO_PEERSEC:
1296		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1297
1298	case SO_MARK:
1299		v.val = sk->sk_mark;
1300		break;
1301
1302	case SO_RXQ_OVFL:
1303		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1304		break;
1305
1306	case SO_WIFI_STATUS:
1307		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1308		break;
1309
1310	case SO_PEEK_OFF:
1311		if (!sock->ops->set_peek_off)
1312			return -EOPNOTSUPP;
1313
1314		v.val = sk->sk_peek_off;
1315		break;
1316	case SO_NOFCS:
1317		v.val = sock_flag(sk, SOCK_NOFCS);
1318		break;
1319
1320	case SO_BINDTODEVICE:
1321		return sock_getbindtodevice(sk, optval, optlen, len);
1322
1323	case SO_GET_FILTER:
1324		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1325		if (len < 0)
1326			return len;
1327
1328		goto lenout;
1329
1330	case SO_LOCK_FILTER:
1331		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1332		break;
1333
1334	case SO_BPF_EXTENSIONS:
1335		v.val = bpf_tell_extensions();
1336		break;
1337
1338	case SO_SELECT_ERR_QUEUE:
1339		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1340		break;
1341
1342#ifdef CONFIG_NET_RX_BUSY_POLL
1343	case SO_BUSY_POLL:
1344		v.val = sk->sk_ll_usec;
1345		break;
1346#endif
1347
1348	case SO_MAX_PACING_RATE:
1349		v.val = sk->sk_max_pacing_rate;
1350		break;
1351
1352	case SO_INCOMING_CPU:
1353		v.val = sk->sk_incoming_cpu;
1354		break;
1355
1356	case SO_MEMINFO:
1357	{
1358		u32 meminfo[SK_MEMINFO_VARS];
1359
1360		if (get_user(len, optlen))
1361			return -EFAULT;
1362
1363		sk_get_meminfo(sk, meminfo);
1364
1365		len = min_t(unsigned int, len, sizeof(meminfo));
1366		if (copy_to_user(optval, &meminfo, len))
1367			return -EFAULT;
1368
1369		goto lenout;
1370	}
1371
1372#ifdef CONFIG_NET_RX_BUSY_POLL
1373	case SO_INCOMING_NAPI_ID:
1374		v.val = READ_ONCE(sk->sk_napi_id);
1375
1376		/* aggregate non-NAPI IDs down to 0 */
1377		if (v.val < MIN_NAPI_ID)
1378			v.val = 0;
1379
1380		break;
1381#endif
1382
1383	case SO_COOKIE:
1384		lv = sizeof(u64);
1385		if (len < lv)
1386			return -EINVAL;
1387		v.val64 = sock_gen_cookie(sk);
1388		break;
1389
1390	case SO_ZEROCOPY:
1391		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1392		break;
1393
1394	default:
1395		/* We implement the SO_SNDLOWAT etc to not be settable
1396		 * (1003.1g 7).
1397		 */
1398		return -ENOPROTOOPT;
1399	}
1400
1401	if (len > lv)
1402		len = lv;
1403	if (copy_to_user(optval, &v, len))
1404		return -EFAULT;
1405lenout:
1406	if (put_user(len, optlen))
1407		return -EFAULT;
1408	return 0;
1409}
1410
1411/*
1412 * Initialize an sk_lock.
1413 *
1414 * (We also register the sk_lock with the lock validator.)
1415 */
1416static inline void sock_lock_init(struct sock *sk)
1417{
1418	if (sk->sk_kern_sock)
1419		sock_lock_init_class_and_name(
1420			sk,
1421			af_family_kern_slock_key_strings[sk->sk_family],
1422			af_family_kern_slock_keys + sk->sk_family,
1423			af_family_kern_key_strings[sk->sk_family],
1424			af_family_kern_keys + sk->sk_family);
1425	else
1426		sock_lock_init_class_and_name(
1427			sk,
1428			af_family_slock_key_strings[sk->sk_family],
1429			af_family_slock_keys + sk->sk_family,
1430			af_family_key_strings[sk->sk_family],
1431			af_family_keys + sk->sk_family);
1432}
1433
1434/*
1435 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1436 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1437 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1438 */
1439static void sock_copy(struct sock *nsk, const struct sock *osk)
1440{
1441#ifdef CONFIG_SECURITY_NETWORK
1442	void *sptr = nsk->sk_security;
1443#endif
1444	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1445
1446	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1447	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1448
1449#ifdef CONFIG_SECURITY_NETWORK
1450	nsk->sk_security = sptr;
1451	security_sk_clone(osk, nsk);
1452#endif
1453}
1454
1455static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1456		int family)
1457{
1458	struct sock *sk;
1459	struct kmem_cache *slab;
1460
1461	slab = prot->slab;
1462	if (slab != NULL) {
1463		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1464		if (!sk)
1465			return sk;
1466		if (priority & __GFP_ZERO)
1467			sk_prot_clear_nulls(sk, prot->obj_size);
1468	} else
1469		sk = kmalloc(prot->obj_size, priority);
1470
1471	if (sk != NULL) {
1472		if (security_sk_alloc(sk, family, priority))
1473			goto out_free;
1474
1475		if (!try_module_get(prot->owner))
1476			goto out_free_sec;
1477		sk_tx_queue_clear(sk);
1478	}
1479
1480	return sk;
1481
1482out_free_sec:
1483	security_sk_free(sk);
1484out_free:
1485	if (slab != NULL)
1486		kmem_cache_free(slab, sk);
1487	else
1488		kfree(sk);
1489	return NULL;
1490}
1491
1492static void sk_prot_free(struct proto *prot, struct sock *sk)
1493{
1494	struct kmem_cache *slab;
1495	struct module *owner;
1496
1497	owner = prot->owner;
1498	slab = prot->slab;
1499
1500	cgroup_sk_free(&sk->sk_cgrp_data);
1501	mem_cgroup_sk_free(sk);
1502	security_sk_free(sk);
1503	if (slab != NULL)
1504		kmem_cache_free(slab, sk);
1505	else
1506		kfree(sk);
1507	module_put(owner);
1508}
1509
1510/**
1511 *	sk_alloc - All socket objects are allocated here
1512 *	@net: the applicable net namespace
1513 *	@family: protocol family
1514 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1515 *	@prot: struct proto associated with this new sock instance
1516 *	@kern: is this to be a kernel socket?
1517 */
1518struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1519		      struct proto *prot, int kern)
1520{
1521	struct sock *sk;
1522
1523	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1524	if (sk) {
1525		sk->sk_family = family;
1526		/*
1527		 * See comment in struct sock definition to understand
1528		 * why we need sk_prot_creator -acme
1529		 */
1530		sk->sk_prot = sk->sk_prot_creator = prot;
1531		sk->sk_kern_sock = kern;
1532		sock_lock_init(sk);
1533		sk->sk_net_refcnt = kern ? 0 : 1;
1534		if (likely(sk->sk_net_refcnt))
1535			get_net(net);
1536		sock_net_set(sk, net);
1537		refcount_set(&sk->sk_wmem_alloc, 1);
1538
1539		mem_cgroup_sk_alloc(sk);
1540		cgroup_sk_alloc(&sk->sk_cgrp_data);
1541		sock_update_classid(&sk->sk_cgrp_data);
1542		sock_update_netprioidx(&sk->sk_cgrp_data);
1543	}
1544
1545	return sk;
1546}
1547EXPORT_SYMBOL(sk_alloc);
1548
1549/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1550 * grace period. This is the case for UDP sockets and TCP listeners.
1551 */
1552static void __sk_destruct(struct rcu_head *head)
1553{
1554	struct sock *sk = container_of(head, struct sock, sk_rcu);
1555	struct sk_filter *filter;
1556
1557	if (sk->sk_destruct)
1558		sk->sk_destruct(sk);
1559
1560	filter = rcu_dereference_check(sk->sk_filter,
1561				       refcount_read(&sk->sk_wmem_alloc) == 0);
1562	if (filter) {
1563		sk_filter_uncharge(sk, filter);
1564		RCU_INIT_POINTER(sk->sk_filter, NULL);
1565	}
1566	if (rcu_access_pointer(sk->sk_reuseport_cb))
1567		reuseport_detach_sock(sk);
1568
1569	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1570
1571	if (atomic_read(&sk->sk_omem_alloc))
1572		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1573			 __func__, atomic_read(&sk->sk_omem_alloc));
1574
1575	if (sk->sk_frag.page) {
1576		put_page(sk->sk_frag.page);
1577		sk->sk_frag.page = NULL;
1578	}
1579
1580	if (sk->sk_peer_cred)
1581		put_cred(sk->sk_peer_cred);
1582	put_pid(sk->sk_peer_pid);
1583	if (likely(sk->sk_net_refcnt))
1584		put_net(sock_net(sk));
1585	sk_prot_free(sk->sk_prot_creator, sk);
1586}
1587
1588void sk_destruct(struct sock *sk)
1589{
1590	if (sock_flag(sk, SOCK_RCU_FREE))
1591		call_rcu(&sk->sk_rcu, __sk_destruct);
1592	else
1593		__sk_destruct(&sk->sk_rcu);
1594}
1595
1596static void __sk_free(struct sock *sk)
1597{
1598	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1599		sock_diag_broadcast_destroy(sk);
1600	else
1601		sk_destruct(sk);
1602}
1603
1604void sk_free(struct sock *sk)
1605{
1606	/*
1607	 * We subtract one from sk_wmem_alloc and can know if
1608	 * some packets are still in some tx queue.
1609	 * If not null, sock_wfree() will call __sk_free(sk) later
1610	 */
1611	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1612		__sk_free(sk);
1613}
1614EXPORT_SYMBOL(sk_free);
1615
1616static void sk_init_common(struct sock *sk)
1617{
1618	skb_queue_head_init(&sk->sk_receive_queue);
1619	skb_queue_head_init(&sk->sk_write_queue);
1620	skb_queue_head_init(&sk->sk_error_queue);
1621
1622	rwlock_init(&sk->sk_callback_lock);
1623	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1624			af_rlock_keys + sk->sk_family,
1625			af_family_rlock_key_strings[sk->sk_family]);
1626	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1627			af_wlock_keys + sk->sk_family,
1628			af_family_wlock_key_strings[sk->sk_family]);
1629	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1630			af_elock_keys + sk->sk_family,
1631			af_family_elock_key_strings[sk->sk_family]);
1632	lockdep_set_class_and_name(&sk->sk_callback_lock,
1633			af_callback_keys + sk->sk_family,
1634			af_family_clock_key_strings[sk->sk_family]);
1635}
1636
1637/**
1638 *	sk_clone_lock - clone a socket, and lock its clone
1639 *	@sk: the socket to clone
1640 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1641 *
1642 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1643 */
1644struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1645{
1646	struct sock *newsk;
1647	bool is_charged = true;
1648
1649	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1650	if (newsk != NULL) {
1651		struct sk_filter *filter;
1652
1653		sock_copy(newsk, sk);
1654
1655		newsk->sk_prot_creator = sk->sk_prot;
1656
1657		/* SANITY */
1658		if (likely(newsk->sk_net_refcnt))
1659			get_net(sock_net(newsk));
1660		sk_node_init(&newsk->sk_node);
1661		sock_lock_init(newsk);
1662		bh_lock_sock(newsk);
1663		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1664		newsk->sk_backlog.len = 0;
1665
1666		atomic_set(&newsk->sk_rmem_alloc, 0);
1667		/*
1668		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1669		 */
1670		refcount_set(&newsk->sk_wmem_alloc, 1);
1671		atomic_set(&newsk->sk_omem_alloc, 0);
1672		sk_init_common(newsk);
1673
1674		newsk->sk_dst_cache	= NULL;
1675		newsk->sk_dst_pending_confirm = 0;
1676		newsk->sk_wmem_queued	= 0;
1677		newsk->sk_forward_alloc = 0;
1678
1679		/* sk->sk_memcg will be populated at accept() time */
1680		newsk->sk_memcg = NULL;
1681
1682		atomic_set(&newsk->sk_drops, 0);
1683		newsk->sk_send_head	= NULL;
1684		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1685		atomic_set(&newsk->sk_zckey, 0);
1686
1687		sock_reset_flag(newsk, SOCK_DONE);
1688		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1689
1690		rcu_read_lock();
1691		filter = rcu_dereference(sk->sk_filter);
1692		if (filter != NULL)
1693			/* though it's an empty new sock, the charging may fail
1694			 * if sysctl_optmem_max was changed between creation of
1695			 * original socket and cloning
1696			 */
1697			is_charged = sk_filter_charge(newsk, filter);
1698		RCU_INIT_POINTER(newsk->sk_filter, filter);
1699		rcu_read_unlock();
1700
1701		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1702			/* We need to make sure that we don't uncharge the new
1703			 * socket if we couldn't charge it in the first place
1704			 * as otherwise we uncharge the parent's filter.
1705			 */
1706			if (!is_charged)
1707				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1708			sk_free_unlock_clone(newsk);
1709			newsk = NULL;
1710			goto out;
1711		}
1712		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1713
1714		newsk->sk_err	   = 0;
1715		newsk->sk_err_soft = 0;
1716		newsk->sk_priority = 0;
1717		newsk->sk_incoming_cpu = raw_smp_processor_id();
1718		atomic64_set(&newsk->sk_cookie, 0);
1719
1720		/*
1721		 * Before updating sk_refcnt, we must commit prior changes to memory
1722		 * (Documentation/RCU/rculist_nulls.txt for details)
1723		 */
1724		smp_wmb();
1725		refcount_set(&newsk->sk_refcnt, 2);
1726
1727		/*
1728		 * Increment the counter in the same struct proto as the master
1729		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1730		 * is the same as sk->sk_prot->socks, as this field was copied
1731		 * with memcpy).
1732		 *
1733		 * This _changes_ the previous behaviour, where
1734		 * tcp_create_openreq_child always was incrementing the
1735		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1736		 * to be taken into account in all callers. -acme
1737		 */
1738		sk_refcnt_debug_inc(newsk);
1739		sk_set_socket(newsk, NULL);
1740		newsk->sk_wq = NULL;
1741
1742		if (newsk->sk_prot->sockets_allocated)
1743			sk_sockets_allocated_inc(newsk);
1744
1745		if (sock_needs_netstamp(sk) &&
1746		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1747			net_enable_timestamp();
1748	}
1749out:
1750	return newsk;
1751}
1752EXPORT_SYMBOL_GPL(sk_clone_lock);
1753
1754void sk_free_unlock_clone(struct sock *sk)
1755{
1756	/* It is still raw copy of parent, so invalidate
1757	 * destructor and make plain sk_free() */
1758	sk->sk_destruct = NULL;
1759	bh_unlock_sock(sk);
1760	sk_free(sk);
1761}
1762EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1763
1764void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1765{
1766	u32 max_segs = 1;
1767
1768	sk_dst_set(sk, dst);
1769	sk->sk_route_caps = dst->dev->features;
1770	if (sk->sk_route_caps & NETIF_F_GSO)
1771		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1772	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1773	if (sk_can_gso(sk)) {
1774		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1775			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1776		} else {
1777			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1778			sk->sk_gso_max_size = dst->dev->gso_max_size;
1779			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1780		}
1781	}
1782	sk->sk_gso_max_segs = max_segs;
1783}
1784EXPORT_SYMBOL_GPL(sk_setup_caps);
1785
1786/*
1787 *	Simple resource managers for sockets.
1788 */
1789
1790
1791/*
1792 * Write buffer destructor automatically called from kfree_skb.
1793 */
1794void sock_wfree(struct sk_buff *skb)
1795{
1796	struct sock *sk = skb->sk;
1797	unsigned int len = skb->truesize;
1798
1799	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1800		/*
1801		 * Keep a reference on sk_wmem_alloc, this will be released
1802		 * after sk_write_space() call
1803		 */
1804		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1805		sk->sk_write_space(sk);
1806		len = 1;
1807	}
1808	/*
1809	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1810	 * could not do because of in-flight packets
1811	 */
1812	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1813		__sk_free(sk);
1814}
1815EXPORT_SYMBOL(sock_wfree);
1816
1817/* This variant of sock_wfree() is used by TCP,
1818 * since it sets SOCK_USE_WRITE_QUEUE.
1819 */
1820void __sock_wfree(struct sk_buff *skb)
1821{
1822	struct sock *sk = skb->sk;
1823
1824	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1825		__sk_free(sk);
1826}
1827
1828void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1829{
1830	skb_orphan(skb);
1831	skb->sk = sk;
1832#ifdef CONFIG_INET
1833	if (unlikely(!sk_fullsock(sk))) {
1834		skb->destructor = sock_edemux;
1835		sock_hold(sk);
1836		return;
1837	}
1838#endif
1839	skb->destructor = sock_wfree;
1840	skb_set_hash_from_sk(skb, sk);
1841	/*
1842	 * We used to take a refcount on sk, but following operation
1843	 * is enough to guarantee sk_free() wont free this sock until
1844	 * all in-flight packets are completed
1845	 */
1846	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1847}
1848EXPORT_SYMBOL(skb_set_owner_w);
1849
1850/* This helper is used by netem, as it can hold packets in its
1851 * delay queue. We want to allow the owner socket to send more
1852 * packets, as if they were already TX completed by a typical driver.
1853 * But we also want to keep skb->sk set because some packet schedulers
1854 * rely on it (sch_fq for example).
1855 */
1856void skb_orphan_partial(struct sk_buff *skb)
1857{
1858	if (skb_is_tcp_pure_ack(skb))
1859		return;
1860
1861	if (skb->destructor == sock_wfree
1862#ifdef CONFIG_INET
1863	    || skb->destructor == tcp_wfree
1864#endif
1865		) {
1866		struct sock *sk = skb->sk;
1867
1868		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1869			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1870			skb->destructor = sock_efree;
1871		}
1872	} else {
1873		skb_orphan(skb);
1874	}
1875}
1876EXPORT_SYMBOL(skb_orphan_partial);
1877
1878/*
1879 * Read buffer destructor automatically called from kfree_skb.
1880 */
1881void sock_rfree(struct sk_buff *skb)
1882{
1883	struct sock *sk = skb->sk;
1884	unsigned int len = skb->truesize;
1885
1886	atomic_sub(len, &sk->sk_rmem_alloc);
1887	sk_mem_uncharge(sk, len);
1888}
1889EXPORT_SYMBOL(sock_rfree);
1890
1891/*
1892 * Buffer destructor for skbs that are not used directly in read or write
1893 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1894 */
1895void sock_efree(struct sk_buff *skb)
1896{
1897	sock_put(skb->sk);
1898}
1899EXPORT_SYMBOL(sock_efree);
1900
1901kuid_t sock_i_uid(struct sock *sk)
1902{
1903	kuid_t uid;
1904
1905	read_lock_bh(&sk->sk_callback_lock);
1906	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1907	read_unlock_bh(&sk->sk_callback_lock);
1908	return uid;
1909}
1910EXPORT_SYMBOL(sock_i_uid);
1911
1912unsigned long sock_i_ino(struct sock *sk)
1913{
1914	unsigned long ino;
1915
1916	read_lock_bh(&sk->sk_callback_lock);
1917	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1918	read_unlock_bh(&sk->sk_callback_lock);
1919	return ino;
1920}
1921EXPORT_SYMBOL(sock_i_ino);
1922
1923/*
1924 * Allocate a skb from the socket's send buffer.
1925 */
1926struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1927			     gfp_t priority)
1928{
1929	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1930		struct sk_buff *skb = alloc_skb(size, priority);
1931		if (skb) {
1932			skb_set_owner_w(skb, sk);
1933			return skb;
1934		}
1935	}
1936	return NULL;
1937}
1938EXPORT_SYMBOL(sock_wmalloc);
1939
1940static void sock_ofree(struct sk_buff *skb)
1941{
1942	struct sock *sk = skb->sk;
1943
1944	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1945}
1946
1947struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1948			     gfp_t priority)
1949{
1950	struct sk_buff *skb;
1951
1952	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1953	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1954	    sysctl_optmem_max)
1955		return NULL;
1956
1957	skb = alloc_skb(size, priority);
1958	if (!skb)
1959		return NULL;
1960
1961	atomic_add(skb->truesize, &sk->sk_omem_alloc);
1962	skb->sk = sk;
1963	skb->destructor = sock_ofree;
1964	return skb;
1965}
1966
1967/*
1968 * Allocate a memory block from the socket's option memory buffer.
1969 */
1970void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1971{
1972	if ((unsigned int)size <= sysctl_optmem_max &&
1973	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1974		void *mem;
1975		/* First do the add, to avoid the race if kmalloc
1976		 * might sleep.
1977		 */
1978		atomic_add(size, &sk->sk_omem_alloc);
1979		mem = kmalloc(size, priority);
1980		if (mem)
1981			return mem;
1982		atomic_sub(size, &sk->sk_omem_alloc);
1983	}
1984	return NULL;
1985}
1986EXPORT_SYMBOL(sock_kmalloc);
1987
1988/* Free an option memory block. Note, we actually want the inline
1989 * here as this allows gcc to detect the nullify and fold away the
1990 * condition entirely.
1991 */
1992static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1993				  const bool nullify)
1994{
1995	if (WARN_ON_ONCE(!mem))
1996		return;
1997	if (nullify)
1998		kzfree(mem);
1999	else
2000		kfree(mem);
2001	atomic_sub(size, &sk->sk_omem_alloc);
2002}
2003
2004void sock_kfree_s(struct sock *sk, void *mem, int size)
2005{
2006	__sock_kfree_s(sk, mem, size, false);
2007}
2008EXPORT_SYMBOL(sock_kfree_s);
2009
2010void sock_kzfree_s(struct sock *sk, void *mem, int size)
2011{
2012	__sock_kfree_s(sk, mem, size, true);
2013}
2014EXPORT_SYMBOL(sock_kzfree_s);
2015
2016/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2017   I think, these locks should be removed for datagram sockets.
2018 */
2019static long sock_wait_for_wmem(struct sock *sk, long timeo)
2020{
2021	DEFINE_WAIT(wait);
2022
2023	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2024	for (;;) {
2025		if (!timeo)
2026			break;
2027		if (signal_pending(current))
2028			break;
2029		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2030		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2031		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2032			break;
2033		if (sk->sk_shutdown & SEND_SHUTDOWN)
2034			break;
2035		if (sk->sk_err)
2036			break;
2037		timeo = schedule_timeout(timeo);
2038	}
2039	finish_wait(sk_sleep(sk), &wait);
2040	return timeo;
2041}
2042
2043
2044/*
2045 *	Generic send/receive buffer handlers
2046 */
2047
2048struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2049				     unsigned long data_len, int noblock,
2050				     int *errcode, int max_page_order)
2051{
2052	struct sk_buff *skb;
2053	long timeo;
2054	int err;
2055
2056	timeo = sock_sndtimeo(sk, noblock);
2057	for (;;) {
2058		err = sock_error(sk);
2059		if (err != 0)
2060			goto failure;
2061
2062		err = -EPIPE;
2063		if (sk->sk_shutdown & SEND_SHUTDOWN)
2064			goto failure;
2065
2066		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2067			break;
2068
2069		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2070		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2071		err = -EAGAIN;
2072		if (!timeo)
2073			goto failure;
2074		if (signal_pending(current))
2075			goto interrupted;
2076		timeo = sock_wait_for_wmem(sk, timeo);
2077	}
2078	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2079				   errcode, sk->sk_allocation);
2080	if (skb)
2081		skb_set_owner_w(skb, sk);
2082	return skb;
2083
2084interrupted:
2085	err = sock_intr_errno(timeo);
2086failure:
2087	*errcode = err;
2088	return NULL;
2089}
2090EXPORT_SYMBOL(sock_alloc_send_pskb);
2091
/*
 * Convenience wrapper around sock_alloc_send_pskb() for a purely linear
 * skb: no paged data and a maximum page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
2097EXPORT_SYMBOL(sock_alloc_send_skb);
2098
2099int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2100		     struct sockcm_cookie *sockc)
2101{
2102	u32 tsflags;
2103
2104	switch (cmsg->cmsg_type) {
2105	case SO_MARK:
2106		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2107			return -EPERM;
2108		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2109			return -EINVAL;
2110		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2111		break;
2112	case SO_TIMESTAMPING:
2113		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2114			return -EINVAL;
2115
2116		tsflags = *(u32 *)CMSG_DATA(cmsg);
2117		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2118			return -EINVAL;
2119
2120		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2121		sockc->tsflags |= tsflags;
2122		break;
2123	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2124	case SCM_RIGHTS:
2125	case SCM_CREDENTIALS:
2126		break;
2127	default:
2128		return -EINVAL;
2129	}
2130	return 0;
2131}
2132EXPORT_SYMBOL(__sock_cmsg_send);
2133
2134int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2135		   struct sockcm_cookie *sockc)
2136{
2137	struct cmsghdr *cmsg;
2138	int ret;
2139
2140	for_each_cmsghdr(cmsg, msg) {
2141		if (!CMSG_OK(msg, cmsg))
2142			return -EINVAL;
2143		if (cmsg->cmsg_level != SOL_SOCKET)
2144			continue;
2145		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2146		if (ret)
2147			return ret;
2148	}
2149	return 0;
2150}
2151EXPORT_SYMBOL(sock_cmsg_send);
2152
2153static void sk_enter_memory_pressure(struct sock *sk)
2154{
2155	if (!sk->sk_prot->enter_memory_pressure)
2156		return;
2157
2158	sk->sk_prot->enter_memory_pressure(sk);
2159}
2160
2161static void sk_leave_memory_pressure(struct sock *sk)
2162{
2163	if (sk->sk_prot->leave_memory_pressure) {
2164		sk->sk_prot->leave_memory_pressure(sk);
2165	} else {
2166		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2167
2168		if (memory_pressure && *memory_pressure)
2169			*memory_pressure = 0;
2170	}
2171}
2172
2173/* On 32bit arches, an skb frag is limited to 2^15 */
2174#define SKB_FRAG_PAGE_ORDER	get_order(32768)
2175
2176/**
2177 * skb_page_frag_refill - check that a page_frag contains enough room
2178 * @sz: minimum size of the fragment we want to get
2179 * @pfrag: pointer to page_frag
2180 * @gfp: priority for memory allocation
2181 *
2182 * Note: While this allocator tries to use high order pages, there is
2183 * no guarantee that allocations succeed. Therefore, @sz MUST be
2184 * less or equal than PAGE_SIZE.
2185 */
2186bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2187{
2188	if (pfrag->page) {
2189		if (page_ref_count(pfrag->page) == 1) {
2190			pfrag->offset = 0;
2191			return true;
2192		}
2193		if (pfrag->offset + sz <= pfrag->size)
2194			return true;
2195		put_page(pfrag->page);
2196	}
2197
2198	pfrag->offset = 0;
2199	if (SKB_FRAG_PAGE_ORDER) {
2200		/* Avoid direct reclaim but allow kswapd to wake */
2201		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2202					  __GFP_COMP | __GFP_NOWARN |
2203					  __GFP_NORETRY,
2204					  SKB_FRAG_PAGE_ORDER);
2205		if (likely(pfrag->page)) {
2206			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2207			return true;
2208		}
2209	}
2210	pfrag->page = alloc_page(gfp);
2211	if (likely(pfrag->page)) {
2212		pfrag->size = PAGE_SIZE;
2213		return true;
2214	}
2215	return false;
2216}
2217EXPORT_SYMBOL(skb_page_frag_refill);
2218
2219bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2220{
2221	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2222		return true;
2223
2224	sk_enter_memory_pressure(sk);
2225	sk_stream_moderate_sndbuf(sk);
2226	return false;
2227}
2228EXPORT_SYMBOL(sk_page_frag_refill);
2229
2230static void __lock_sock(struct sock *sk)
2231	__releases(&sk->sk_lock.slock)
2232	__acquires(&sk->sk_lock.slock)
2233{
2234	DEFINE_WAIT(wait);
2235
2236	for (;;) {
2237		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2238					TASK_UNINTERRUPTIBLE);
2239		spin_unlock_bh(&sk->sk_lock.slock);
2240		schedule();
2241		spin_lock_bh(&sk->sk_lock.slock);
2242		if (!sock_owned_by_user(sk))
2243			break;
2244	}
2245	finish_wait(&sk->sk_lock.wq, &wait);
2246}
2247
2248static void __release_sock(struct sock *sk)
2249	__releases(&sk->sk_lock.slock)
2250	__acquires(&sk->sk_lock.slock)
2251{
2252	struct sk_buff *skb, *next;
2253
2254	while ((skb = sk->sk_backlog.head) != NULL) {
2255		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2256
2257		spin_unlock_bh(&sk->sk_lock.slock);
2258
2259		do {
2260			next = skb->next;
2261			prefetch(next);
2262			WARN_ON_ONCE(skb_dst_is_noref(skb));
2263			skb->next = NULL;
2264			sk_backlog_rcv(sk, skb);
2265
2266			cond_resched();
2267
2268			skb = next;
2269		} while (skb != NULL);
2270
2271		spin_lock_bh(&sk->sk_lock.slock);
2272	}
2273
2274	/*
2275	 * Doing the zeroing here guarantee we can not loop forever
2276	 * while a wild producer attempts to flood us.
2277	 */
2278	sk->sk_backlog.len = 0;
2279}
2280
2281void __sk_flush_backlog(struct sock *sk)
2282{
2283	spin_lock_bh(&sk->sk_lock.slock);
2284	__release_sock(sk);
2285	spin_unlock_bh(&sk->sk_lock.slock);
2286}
2287
2288/**
2289 * sk_wait_data - wait for data to arrive at sk_receive_queue
2290 * @sk:    sock to wait on
2291 * @timeo: for how long
2292 * @skb:   last skb seen on sk_receive_queue
2293 *
2294 * Now socket state including sk->sk_err is changed only under lock,
2295 * hence we may omit checks after joining wait queue.
2296 * We check receive queue before schedule() only as optimization;
2297 * it is very likely that release_sock() added new data.
2298 */
2299int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2300{
2301	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2302	int rc;
2303
2304	add_wait_queue(sk_sleep(sk), &wait);
2305	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2306	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2307	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2308	remove_wait_queue(sk_sleep(sk), &wait);
2309	return rc;
2310}
2311EXPORT_SYMBOL(sk_wait_data);
2312
2313/**
2314 *	__sk_mem_raise_allocated - increase memory_allocated
2315 *	@sk: socket
2316 *	@size: memory size to allocate
2317 *	@amt: pages to allocate
2318 *	@kind: allocation type
2319 *
2320 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2321 */
2322int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2323{
2324	struct proto *prot = sk->sk_prot;
2325	long allocated = sk_memory_allocated_add(sk, amt);
2326
2327	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2328	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2329		goto suppress_allocation;
2330
2331	/* Under limit. */
2332	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2333		sk_leave_memory_pressure(sk);
2334		return 1;
2335	}
2336
2337	/* Under pressure. */
2338	if (allocated > sk_prot_mem_limits(sk, 1))
2339		sk_enter_memory_pressure(sk);
2340
2341	/* Over hard limit. */
2342	if (allocated > sk_prot_mem_limits(sk, 2))
2343		goto suppress_allocation;
2344
2345	/* guarantee minimum buffer size under pressure */
2346	if (kind == SK_MEM_RECV) {
2347		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2348			return 1;
2349
2350	} else { /* SK_MEM_SEND */
2351		int wmem0 = sk_get_wmem0(sk, prot);
2352
2353		if (sk->sk_type == SOCK_STREAM) {
2354			if (sk->sk_wmem_queued < wmem0)
2355				return 1;
2356		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2357				return 1;
2358		}
2359	}
2360
2361	if (sk_has_memory_pressure(sk)) {
2362		int alloc;
2363
2364		if (!sk_under_memory_pressure(sk))
2365			return 1;
2366		alloc = sk_sockets_allocated_read_positive(sk);
2367		if (sk_prot_mem_limits(sk, 2) > alloc *
2368		    sk_mem_pages(sk->sk_wmem_queued +
2369				 atomic_read(&sk->sk_rmem_alloc) +
2370				 sk->sk_forward_alloc))
2371			return 1;
2372	}
2373
2374suppress_allocation:
2375
2376	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2377		sk_stream_moderate_sndbuf(sk);
2378
2379		/* Fail only if socket is _under_ its sndbuf.
2380		 * In this case we cannot block, so that we have to fail.
2381		 */
2382		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2383			return 1;
2384	}
2385
2386	trace_sock_exceed_buf_limit(sk, prot, allocated);
2387
2388	sk_memory_allocated_sub(sk, amt);
2389
2390	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2391		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2392
2393	return 0;
2394}
2395EXPORT_SYMBOL(__sk_mem_raise_allocated);
2396
2397/**
2398 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2399 *	@sk: socket
2400 *	@size: memory size to allocate
2401 *	@kind: allocation type
2402 *
2403 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2404 *	rmem allocation. This function assumes that protocols which have
2405 *	memory_pressure use sk_wmem_queued as write buffer accounting.
2406 */
2407int __sk_mem_schedule(struct sock *sk, int size, int kind)
2408{
2409	int ret, amt = sk_mem_pages(size);
2410
2411	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2412	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2413	if (!ret)
2414		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2415	return ret;
2416}
2417EXPORT_SYMBOL(__sk_mem_schedule);
2418
2419/**
2420 *	__sk_mem_reduce_allocated - reclaim memory_allocated
2421 *	@sk: socket
2422 *	@amount: number of quanta
2423 *
2424 *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2425 */
2426void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2427{
2428	sk_memory_allocated_sub(sk, amount);
2429
2430	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2431		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2432
2433	if (sk_under_memory_pressure(sk) &&
2434	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2435		sk_leave_memory_pressure(sk);
2436}
2437EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2438
2439/**
2440 *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2441 *	@sk: socket
2442 *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2443 */
2444void __sk_mem_reclaim(struct sock *sk, int amount)
2445{
2446	amount >>= SK_MEM_QUANTUM_SHIFT;
2447	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2448	__sk_mem_reduce_allocated(sk, amount);
2449}
2450EXPORT_SYMBOL(__sk_mem_reclaim);
2451
2452int sk_set_peek_off(struct sock *sk, int val)
2453{
2454	sk->sk_peek_off = val;
2455	return 0;
2456}
2457EXPORT_SYMBOL_GPL(sk_set_peek_off);
2458
2459/*
2460 * Set of default routines for initialising struct proto_ops when
2461 * the protocol does not support a particular function. In certain
2462 * cases where it makes no sense for a protocol to have a "do nothing"
2463 * function, some default processing is provided.
2464 */
2465
2466int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2467{
2468	return -EOPNOTSUPP;
2469}
2470EXPORT_SYMBOL(sock_no_bind);
2471
2472int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2473		    int len, int flags)
2474{
2475	return -EOPNOTSUPP;
2476}
2477EXPORT_SYMBOL(sock_no_connect);
2478
2479int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2480{
2481	return -EOPNOTSUPP;
2482}
2483EXPORT_SYMBOL(sock_no_socketpair);
2484
2485int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2486		   bool kern)
2487{
2488	return -EOPNOTSUPP;
2489}
2490EXPORT_SYMBOL(sock_no_accept);
2491
2492int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2493		    int *len, int peer)
2494{
2495	return -EOPNOTSUPP;
2496}
2497EXPORT_SYMBOL(sock_no_getname);
2498
2499unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2500{
2501	return 0;
2502}
2503EXPORT_SYMBOL(sock_no_poll);
2504
2505int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2506{
2507	return -EOPNOTSUPP;
2508}
2509EXPORT_SYMBOL(sock_no_ioctl);
2510
2511int sock_no_listen(struct socket *sock, int backlog)
2512{
2513	return -EOPNOTSUPP;
2514}
2515EXPORT_SYMBOL(sock_no_listen);
2516
2517int sock_no_shutdown(struct socket *sock, int how)
2518{
2519	return -EOPNOTSUPP;
2520}
2521EXPORT_SYMBOL(sock_no_shutdown);
2522
2523int sock_no_setsockopt(struct socket *sock, int level, int optname,
2524		    char __user *optval, unsigned int optlen)
2525{
2526	return -EOPNOTSUPP;
2527}
2528EXPORT_SYMBOL(sock_no_setsockopt);
2529
2530int sock_no_getsockopt(struct socket *sock, int level, int optname,
2531		    char __user *optval, int __user *optlen)
2532{
2533	return -EOPNOTSUPP;
2534}
2535EXPORT_SYMBOL(sock_no_getsockopt);
2536
2537int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2538{
2539	return -EOPNOTSUPP;
2540}
2541EXPORT_SYMBOL(sock_no_sendmsg);
2542
2543int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2544{
2545	return -EOPNOTSUPP;
2546}
2547EXPORT_SYMBOL(sock_no_sendmsg_locked);
2548
2549int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2550		    int flags)
2551{
2552	return -EOPNOTSUPP;
2553}
2554EXPORT_SYMBOL(sock_no_recvmsg);
2555
2556int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2557{
2558	/* Mirror missing mmap method error code */
2559	return -ENODEV;
2560}
2561EXPORT_SYMBOL(sock_no_mmap);
2562
2563ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2564{
2565	ssize_t res;
2566	struct msghdr msg = {.msg_flags = flags};
2567	struct kvec iov;
2568	char *kaddr = kmap(page);
2569	iov.iov_base = kaddr + offset;
2570	iov.iov_len = size;
2571	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2572	kunmap(page);
2573	return res;
2574}
2575EXPORT_SYMBOL(sock_no_sendpage);
2576
2577ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2578				int offset, size_t size, int flags)
2579{
2580	ssize_t res;
2581	struct msghdr msg = {.msg_flags = flags};
2582	struct kvec iov;
2583	char *kaddr = kmap(page);
2584
2585	iov.iov_base = kaddr + offset;
2586	iov.iov_len = size;
2587	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2588	kunmap(page);
2589	return res;
2590}
2591EXPORT_SYMBOL(sock_no_sendpage_locked);
2592
2593/*
2594 *	Default Socket Callbacks
2595 */
2596
2597static void sock_def_wakeup(struct sock *sk)
2598{
2599	struct socket_wq *wq;
2600
2601	rcu_read_lock();
2602	wq = rcu_dereference(sk->sk_wq);
2603	if (skwq_has_sleeper(wq))
2604		wake_up_interruptible_all(&wq->wait);
2605	rcu_read_unlock();
2606}
2607
2608static void sock_def_error_report(struct sock *sk)
2609{
2610	struct socket_wq *wq;
2611
2612	rcu_read_lock();
2613	wq = rcu_dereference(sk->sk_wq);
2614	if (skwq_has_sleeper(wq))
2615		wake_up_interruptible_poll(&wq->wait, POLLERR);
2616	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2617	rcu_read_unlock();
2618}
2619
2620static void sock_def_readable(struct sock *sk)
2621{
2622	struct socket_wq *wq;
2623
2624	rcu_read_lock();
2625	wq = rcu_dereference(sk->sk_wq);
2626	if (skwq_has_sleeper(wq))
2627		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2628						POLLRDNORM | POLLRDBAND);
2629	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2630	rcu_read_unlock();
2631}
2632
2633static void sock_def_write_space(struct sock *sk)
2634{
2635	struct socket_wq *wq;
2636
2637	rcu_read_lock();
2638
2639	/* Do not wake up a writer until he can make "significant"
2640	 * progress.  --DaveM
2641	 */
2642	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2643		wq = rcu_dereference(sk->sk_wq);
2644		if (skwq_has_sleeper(wq))
2645			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2646						POLLWRNORM | POLLWRBAND);
2647
2648		/* Should agree with poll, otherwise some programs break */
2649		if (sock_writeable(sk))
2650			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2651	}
2652
2653	rcu_read_unlock();
2654}
2655
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
	/* nothing to release */
}
2659
2660void sk_send_sigurg(struct sock *sk)
2661{
2662	if (sk->sk_socket && sk->sk_socket->file)
2663		if (send_sigurg(&sk->sk_socket->file->f_owner))
2664			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2665}
2666EXPORT_SYMBOL(sk_send_sigurg);
2667
/*
 * (Re)arm @timer to fire at @expires.  A socket reference is taken when
 * mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires) == 0)
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2675
/*
 * Cancel @timer.  A socket reference is dropped with __sock_put() when
 * del_timer() returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (del_timer(timer) != 0)
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2682
2683void sock_init_data(struct socket *sock, struct sock *sk)
2684{
2685	sk_init_common(sk);
2686	sk->sk_send_head	=	NULL;
2687
2688	timer_setup(&sk->sk_timer, NULL, 0);
2689
2690	sk->sk_allocation	=	GFP_KERNEL;
2691	sk->sk_rcvbuf		=	sysctl_rmem_default;
2692	sk->sk_sndbuf		=	sysctl_wmem_default;
2693	sk->sk_state		=	TCP_CLOSE;
2694	sk_set_socket(sk, sock);
2695
2696	sock_set_flag(sk, SOCK_ZAPPED);
2697
2698	if (sock) {
2699		sk->sk_type	=	sock->type;
2700		sk->sk_wq	=	sock->wq;
2701		sock->sk	=	sk;
2702		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2703	} else {
2704		sk->sk_wq	=	NULL;
2705		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2706	}
2707
2708	rwlock_init(&sk->sk_callback_lock);
2709	if (sk->sk_kern_sock)
2710		lockdep_set_class_and_name(
2711			&sk->sk_callback_lock,
2712			af_kern_callback_keys + sk->sk_family,
2713			af_family_kern_clock_key_strings[sk->sk_family]);
2714	else
2715		lockdep_set_class_and_name(
2716			&sk->sk_callback_lock,
2717			af_callback_keys + sk->sk_family,
2718			af_family_clock_key_strings[sk->sk_family]);
2719
2720	sk->sk_state_change	=	sock_def_wakeup;
2721	sk->sk_data_ready	=	sock_def_readable;
2722	sk->sk_write_space	=	sock_def_write_space;
2723	sk->sk_error_report	=	sock_def_error_report;
2724	sk->sk_destruct		=	sock_def_destruct;
2725
2726	sk->sk_frag.page	=	NULL;
2727	sk->sk_frag.offset	=	0;
2728	sk->sk_peek_off		=	-1;
2729
2730	sk->sk_peer_pid 	=	NULL;
2731	sk->sk_peer_cred	=	NULL;
2732	sk->sk_write_pending	=	0;
2733	sk->sk_rcvlowat		=	1;
2734	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2735	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2736
2737	sk->sk_stamp = SK_DEFAULT_STAMP;
2738	atomic_set(&sk->sk_zckey, 0);
2739
2740#ifdef CONFIG_NET_RX_BUSY_POLL
2741	sk->sk_napi_id		=	0;
2742	sk->sk_ll_usec		=	sysctl_net_busy_read;
2743#endif
2744
2745	sk->sk_max_pacing_rate = ~0U;
2746	sk->sk_pacing_rate = ~0U;
2747	sk->sk_pacing_shift = 10;
2748	sk->sk_incoming_cpu = -1;
2749	/*
2750	 * Before updating sk_refcnt, we must commit prior changes to memory
2751	 * (Documentation/RCU/rculist_nulls.txt for details)
2752	 */
2753	smp_wmb();
2754	refcount_set(&sk->sk_refcnt, 1);
2755	atomic_set(&sk->sk_drops, 0);
2756}
2757EXPORT_SYMBOL(sock_init_data);
2758
2759void lock_sock_nested(struct sock *sk, int subclass)
2760{
2761	might_sleep();
2762	spin_lock_bh(&sk->sk_lock.slock);
2763	if (sk->sk_lock.owned)
2764		__lock_sock(sk);
2765	sk->sk_lock.owned = 1;
2766	spin_unlock(&sk->sk_lock.slock);
2767	/*
2768	 * The sk_lock has mutex_lock() semantics here:
2769	 */
2770	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2771	local_bh_enable();
2772}
2773EXPORT_SYMBOL(lock_sock_nested);
2774
2775void release_sock(struct sock *sk)
2776{
2777	spin_lock_bh(&sk->sk_lock.slock);
2778	if (sk->sk_backlog.tail)
2779		__release_sock(sk);
2780
2781	/* Warning : release_cb() might need to release sk ownership,
2782	 * ie call sock_release_ownership(sk) before us.
2783	 */
2784	if (sk->sk_prot->release_cb)
2785		sk->sk_prot->release_cb(sk);
2786
2787	sock_release_ownership(sk);
2788	if (waitqueue_active(&sk->sk_lock.wq))
2789		wake_up(&sk->sk_lock.wq);
2790	spin_unlock_bh(&sk->sk_lock.slock);
2791}
2792EXPORT_SYMBOL(release_sock);
2793
2794/**
2795 * lock_sock_fast - fast version of lock_sock
2796 * @sk: socket
2797 *
2798 * This version should be used for very small section, where process wont block
2799 * return false if fast path is taken:
2800 *
2801 *   sk_lock.slock locked, owned = 0, BH disabled
2802 *
2803 * return true if slow path is taken:
2804 *
2805 *   sk_lock.slock unlocked, owned = 1, BH enabled
2806 */
2807bool lock_sock_fast(struct sock *sk)
2808{
2809	might_sleep();
2810	spin_lock_bh(&sk->sk_lock.slock);
2811
2812	if (!sk->sk_lock.owned)
2813		/*
2814		 * Note : We must disable BH
2815		 */
2816		return false;
2817
2818	__lock_sock(sk);
2819	sk->sk_lock.owned = 1;
2820	spin_unlock(&sk->sk_lock.slock);
2821	/*
2822	 * The sk_lock has mutex_lock() semantics here:
2823	 */
2824	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2825	local_bh_enable();
2826	return true;
2827}
2828EXPORT_SYMBOL(lock_sock_fast);
2829
2830int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2831{
2832	struct timeval tv;
2833	if (!sock_flag(sk, SOCK_TIMESTAMP))
2834		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2835	tv = ktime_to_timeval(sk->sk_stamp);
2836	if (tv.tv_sec == -1)
2837		return -ENOENT;
2838	if (tv.tv_sec == 0) {
2839		sk->sk_stamp = ktime_get_real();
2840		tv = ktime_to_timeval(sk->sk_stamp);
2841	}
2842	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2843}
2844EXPORT_SYMBOL(sock_get_timestamp);
2845
2846int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2847{
2848	struct timespec ts;
2849	if (!sock_flag(sk, SOCK_TIMESTAMP))
2850		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2851	ts = ktime_to_timespec(sk->sk_stamp);
2852	if (ts.tv_sec == -1)
2853		return -ENOENT;
2854	if (ts.tv_sec == 0) {
2855		sk->sk_stamp = ktime_get_real();
2856		ts = ktime_to_timespec(sk->sk_stamp);
2857	}
2858	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2859}
2860EXPORT_SYMBOL(sock_get_timestampns);
2861
2862void sock_enable_timestamp(struct sock *sk, int flag)
2863{
2864	if (!sock_flag(sk, flag)) {
2865		unsigned long previous_flags = sk->sk_flags;
2866
2867		sock_set_flag(sk, flag);
2868		/*
2869		 * we just set one of the two flags which require net
2870		 * time stamping, but time stamping might have been on
2871		 * already because of the other one
2872		 */
2873		if (sock_needs_netstamp(sk) &&
2874		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2875			net_enable_timestamp();
2876	}
2877}
2878
2879int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2880		       int level, int type)
2881{
2882	struct sock_exterr_skb *serr;
2883	struct sk_buff *skb;
2884	int copied, err;
2885
2886	err = -EAGAIN;
2887	skb = sock_dequeue_err_skb(sk);
2888	if (skb == NULL)
2889		goto out;
2890
2891	copied = skb->len;
2892	if (copied > len) {
2893		msg->msg_flags |= MSG_TRUNC;
2894		copied = len;
2895	}
2896	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2897	if (err)
2898		goto out_free_skb;
2899
2900	sock_recv_timestamp(msg, sk, skb);
2901
2902	serr = SKB_EXT_ERR(skb);
2903	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2904
2905	msg->msg_flags |= MSG_ERRQUEUE;
2906	err = copied;
2907
2908out_free_skb:
2909	kfree_skb(skb);
2910out:
2911	return err;
2912}
2913EXPORT_SYMBOL(sock_recv_errqueue);
2914
2915/*
2916 *	Get a socket option on an socket.
2917 *
2918 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2919 *	asynchronous errors should be reported by getsockopt. We assume
2920 *	this means if you specify SO_ERROR (otherwise whats the point of it).
2921 */
2922int sock_common_getsockopt(struct socket *sock, int level, int optname,
2923			   char __user *optval, int __user *optlen)
2924{
2925	struct sock *sk = sock->sk;
2926
2927	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2928}
2929EXPORT_SYMBOL(sock_common_getsockopt);
2930
#ifdef CONFIG_COMPAT
/*
 * Compat getsockopt: use the protocol's compat_getsockopt handler when it
 * has one, otherwise fall back to its regular getsockopt handler.
 */
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt == NULL)
		return sk->sk_prot->getsockopt(sk, level, optname,
					       optval, optlen);

	return sk->sk_prot->compat_getsockopt(sk, level, optname,
					      optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif
2944
2945int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2946			int flags)
2947{
2948	struct sock *sk = sock->sk;
2949	int addr_len = 0;
2950	int err;
2951
2952	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2953				   flags & ~MSG_DONTWAIT, &addr_len);
2954	if (err >= 0)
2955		msg->msg_namelen = addr_len;
2956	return err;
2957}
2958EXPORT_SYMBOL(sock_common_recvmsg);
2959
2960/*
2961 *	Set socket options on an inet socket.
2962 */
2963int sock_common_setsockopt(struct socket *sock, int level, int optname,
2964			   char __user *optval, unsigned int optlen)
2965{
2966	struct sock *sk = sock->sk;
2967
2968	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2969}
2970EXPORT_SYMBOL(sock_common_setsockopt);
2971
#ifdef CONFIG_COMPAT
/*
 * Compat setsockopt: use the protocol's compat_setsockopt handler when it
 * has one, otherwise fall back to its regular setsockopt handler.
 */
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt == NULL)
		return sk->sk_prot->setsockopt(sk, level, optname,
					       optval, optlen);

	return sk->sk_prot->compat_setsockopt(sk, level, optname,
					      optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif
2985
2986void sk_common_release(struct sock *sk)
2987{
2988	if (sk->sk_prot->destroy)
2989		sk->sk_prot->destroy(sk);
2990
2991	/*
2992	 * Observation: when sock_common_release is called, processes have
2993	 * no access to socket. But net still has.
2994	 * Step one, detach it from networking:
2995	 *
2996	 * A. Remove from hash tables.
2997	 */
2998
2999	sk->sk_prot->unhash(sk);
3000
3001	/*
3002	 * In this point socket cannot receive new packets, but it is possible
3003	 * that some packets are in flight because some CPU runs receiver and
3004	 * did hash table lookup before we unhashed socket. They will achieve
3005	 * receive queue and will be purged by socket destructor.
3006	 *
3007	 * Also we still have packets pending on receive queue and probably,
3008	 * our own packets waiting in device queues. sock_destroy will drain
3009	 * receive queue, but transmitted packets will delay socket destruction
3010	 * until the last reference will be released.
3011	 */
3012
3013	sock_orphan(sk);
3014
3015	xfrm_sk_free_policy(sk);
3016
3017	sk_refcnt_debug_release(sk);
3018
3019	sock_put(sk);
3020}
3021EXPORT_SYMBOL(sk_common_release);
3022
3023void sk_get_meminfo(const struct sock *sk, u32 *mem)
3024{
3025	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3026
3027	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3028	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3029	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3030	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3031	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3032	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3033	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3034	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3035	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3036}
3037
3038#ifdef CONFIG_PROC_FS
3039#define PROTO_INUSE_NR	64	/* should be enough for the first time */
3040struct prot_inuse {
3041	int val[PROTO_INUSE_NR];
3042};
3043
3044static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3045
3046void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3047{
3048	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
3049}
3050EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3051
3052int sock_prot_inuse_get(struct net *net, struct proto *prot)
3053{
3054	int cpu, idx = prot->inuse_idx;
3055	int res = 0;
3056
3057	for_each_possible_cpu(cpu)
3058		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
3059
3060	return res >= 0 ? res : 0;
3061}
3062EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3063
3064static int __net_init sock_inuse_init_net(struct net *net)
3065{
3066	net->core.inuse = alloc_percpu(struct prot_inuse);
3067	return net->core.inuse ? 0 : -ENOMEM;
3068}
3069
3070static void __net_exit sock_inuse_exit_net(struct net *net)
3071{
3072	free_percpu(net->core.inuse);
3073}
3074
3075static struct pernet_operations net_inuse_ops = {
3076	.init = sock_inuse_init_net,
3077	.exit = sock_inuse_exit_net,
3078};
3079
3080static __init int net_inuse_init(void)
3081{
3082	if (register_pernet_subsys(&net_inuse_ops))
3083		panic("Cannot initialize net inuse counters");
3084
3085	return 0;
3086}
3087
3088core_initcall(net_inuse_init);
3089
3090static void assign_proto_idx(struct proto *prot)
3091{
3092	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3093
3094	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3095		pr_err("PROTO_INUSE_NR exhausted\n");
3096		return;
3097	}
3098
3099	set_bit(prot->inuse_idx, proto_inuse_idx);
3100}
3101
3102static void release_proto_idx(struct proto *prot)
3103{
3104	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3105		clear_bit(prot->inuse_idx, proto_inuse_idx);
3106}
3107#else
/* Without CONFIG_PROC_FS there are no in-use counters to index. */
static inline void assign_proto_idx(struct proto *prot)
{
	/* no-op */
}
3111
/* Without CONFIG_PROC_FS there is no counter slot to give back. */
static inline void release_proto_idx(struct proto *prot)
{
	/* no-op */
}
3115#endif
3116
3117static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3118{
3119	if (!rsk_prot)
3120		return;
3121	kfree(rsk_prot->slab_name);
3122	rsk_prot->slab_name = NULL;
3123	kmem_cache_destroy(rsk_prot->slab);
3124	rsk_prot->slab = NULL;
3125}
3126
3127static int req_prot_init(const struct proto *prot)
3128{
3129	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3130
3131	if (!rsk_prot)
3132		return 0;
3133
3134	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3135					prot->name);
3136	if (!rsk_prot->slab_name)
3137		return -ENOMEM;
3138
3139	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3140					   rsk_prot->obj_size, 0,
3141					   prot->slab_flags, NULL);
3142
3143	if (!rsk_prot->slab) {
3144		pr_crit("%s: Can't create request sock SLAB cache!\n",
3145			prot->name);
3146		return -ENOMEM;
3147	}
3148	return 0;
3149}
3150
3151int proto_register(struct proto *prot, int alloc_slab)
3152{
3153	if (alloc_slab) {
3154		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
3155					SLAB_HWCACHE_ALIGN | prot->slab_flags,
3156					NULL);
3157
3158		if (prot->slab == NULL) {
3159			pr_crit("%s: Can't create sock SLAB cache!\n",
3160				prot->name);
3161			goto out;
3162		}
3163
3164		if (req_prot_init(prot))
3165			goto out_free_request_sock_slab;
3166
3167		if (prot->twsk_prot != NULL) {
3168			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3169
3170			if (prot->twsk_prot->twsk_slab_name == NULL)
3171				goto out_free_request_sock_slab;
3172
3173			prot->twsk_prot->twsk_slab =
3174				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3175						  prot->twsk_prot->twsk_obj_size,
3176						  0,
3177						  prot->slab_flags,
3178						  NULL);
3179			if (prot->twsk_prot->twsk_slab == NULL)
3180				goto out_free_timewait_sock_slab_name;
3181		}
3182	}
3183
3184	mutex_lock(&proto_list_mutex);
3185	list_add(&prot->node, &proto_list);
3186	assign_proto_idx(prot);
3187	mutex_unlock(&proto_list_mutex);
3188	return 0;
3189
3190out_free_timewait_sock_slab_name:
3191	kfree(prot->twsk_prot->twsk_slab_name);
3192out_free_request_sock_slab:
3193	req_prot_cleanup(prot->rsk_prot);
3194
3195	kmem_cache_destroy(prot->slab);
3196	prot->slab = NULL;
3197out:
3198	return -ENOBUFS;
3199}
3200EXPORT_SYMBOL(proto_register);
3201
3202void proto_unregister(struct proto *prot)
3203{
3204	mutex_lock(&proto_list_mutex);
3205	release_proto_idx(prot);
3206	list_del(&prot->node);
3207	mutex_unlock(&proto_list_mutex);
3208
3209	kmem_cache_destroy(prot->slab);
3210	prot->slab = NULL;
3211
3212	req_prot_cleanup(prot->rsk_prot);
3213
3214	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3215		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3216		kfree(prot->twsk_prot->twsk_slab_name);
3217		prot->twsk_prot->twsk_slab = NULL;
3218	}
3219}
3220EXPORT_SYMBOL(proto_unregister);
3221
3222#ifdef CONFIG_PROC_FS
3223static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3224	__acquires(proto_list_mutex)
3225{
3226	mutex_lock(&proto_list_mutex);
3227	return seq_list_start_head(&proto_list, *pos);
3228}
3229
3230static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3231{
3232	return seq_list_next(v, &proto_list, pos);
3233}
3234
3235static void proto_seq_stop(struct seq_file *seq, void *v)
3236	__releases(proto_list_mutex)
3237{
3238	mutex_unlock(&proto_list_mutex);
3239}
3240
/* Map a method pointer to 'y' (non-NULL) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';
	return 'n';
}
3245static long sock_prot_memory_allocated(struct proto *proto)
3246{
3247	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3248}
3249
3250static char *sock_prot_memory_pressure(struct proto *proto)
3251{
3252	return proto->memory_pressure != NULL ?
3253	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3254}
3255
3256static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3257{
3258
3259	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3260			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3261		   proto->name,
3262		   proto->obj_size,
3263		   sock_prot_inuse_get(seq_file_net(seq), proto),
3264		   sock_prot_memory_allocated(proto),
3265		   sock_prot_memory_pressure(proto),
3266		   proto->max_header,
3267		   proto->slab == NULL ? "no" : "yes",
3268		   module_name(proto->owner),
3269		   proto_method_implemented(proto->close),
3270		   proto_method_implemented(proto->connect),
3271		   proto_method_implemented(proto->disconnect),
3272		   proto_method_implemented(proto->accept),
3273		   proto_method_implemented(proto->ioctl),
3274		   proto_method_implemented(proto->init),
3275		   proto_method_implemented(proto->destroy),
3276		   proto_method_implemented(proto->shutdown),
3277		   proto_method_implemented(proto->setsockopt),
3278		   proto_method_implemented(proto->getsockopt),
3279		   proto_method_implemented(proto->sendmsg),
3280		   proto_method_implemented(proto->recvmsg),
3281		   proto_method_implemented(proto->sendpage),
3282		   proto_method_implemented(proto->bind),
3283		   proto_method_implemented(proto->backlog_rcv),
3284		   proto_method_implemented(proto->hash),
3285		   proto_method_implemented(proto->unhash),
3286		   proto_method_implemented(proto->get_port),
3287		   proto_method_implemented(proto->enter_memory_pressure));
3288}
3289
3290static int proto_seq_show(struct seq_file *seq, void *v)
3291{
3292	if (v == &proto_list)
3293		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3294			   "protocol",
3295			   "size",
3296			   "sockets",
3297			   "memory",
3298			   "press",
3299			   "maxhdr",
3300			   "slab",
3301			   "module",
3302			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3303	else
3304		proto_seq_printf(seq, list_entry(v, struct proto, node));
3305	return 0;
3306}
3307
3308static const struct seq_operations proto_seq_ops = {
3309	.start  = proto_seq_start,
3310	.next   = proto_seq_next,
3311	.stop   = proto_seq_stop,
3312	.show   = proto_seq_show,
3313};
3314
3315static int proto_seq_open(struct inode *inode, struct file *file)
3316{
3317	return seq_open_net(inode, file, &proto_seq_ops,
3318			    sizeof(struct seq_net_private));
3319}
3320
3321static const struct file_operations proto_seq_fops = {
3322	.owner		= THIS_MODULE,
3323	.open		= proto_seq_open,
3324	.read		= seq_read,
3325	.llseek		= seq_lseek,
3326	.release	= seq_release_net,
3327};
3328
3329static __net_init int proto_init_net(struct net *net)
3330{
3331	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3332		return -ENOMEM;
3333
3334	return 0;
3335}
3336
3337static __net_exit void proto_exit_net(struct net *net)
3338{
3339	remove_proc_entry("protocols", net->proc_net);
3340}
3341
3342
3343static __net_initdata struct pernet_operations proto_net_ops = {
3344	.init = proto_init_net,
3345	.exit = proto_exit_net,
3346};
3347
3348static int __init proto_init(void)
3349{
3350	return register_pernet_subsys(&proto_net_ops);
3351}
3352
3353subsys_initcall(proto_init);
3354
3355#endif /* PROC_FS */
3356
3357#ifdef CONFIG_NET_RX_BUSY_POLL
3358bool sk_busy_loop_end(void *p, unsigned long start_time)
3359{
3360	struct sock *sk = p;
3361
3362	return !skb_queue_empty(&sk->sk_receive_queue) ||
3363	       sk_busy_loop_timeout(sk, start_time);
3364}
3365EXPORT_SYMBOL(sk_busy_loop_end);
3366#endif /* CONFIG_NET_RX_BUSY_POLL */