net/core/sock.c at 0f50a49e3008597abed0fff052d487f77db89093

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / net / core / sock.c
at 0f50a49e3008597abed0fff052d487f77db89093 3380 lines 85 kB view raw
wrap content
   1/*
   2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   3 *		operating system.  INET is implemented using the  BSD Socket
   4 *		interface as the means of communication with the user level.
   5 *
   6 *		Generic socket support routines. Memory allocators, socket lock/release
   7 *		handler for protocols to use and generic option handler.
   8 *
   9 *
  10 * Authors:	Ross Biro
  11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12 *		Florian La Roche, <flla@stud.uni-sb.de>
  13 *		Alan Cox, <A.Cox@swansea.ac.uk>
  14 *
  15 * Fixes:
  16 *		Alan Cox	: 	Numerous verify_area() problems
  17 *		Alan Cox	:	Connecting on a connecting socket
  18 *					now returns an error for tcp.
  19 *		Alan Cox	:	sock->protocol is set correctly.
  20 *					and is not sometimes left as 0.
  21 *		Alan Cox	:	connect handles icmp errors on a
  22 *					connect properly. Unfortunately there
  23 *					is a restart syscall nasty there. I
  24 *					can't match BSD without hacking the C
  25 *					library. Ideas urgently sought!
  26 *		Alan Cox	:	Disallow bind() to addresses that are
  27 *					not ours - especially broadcast ones!!
  28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
  29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
  30 *					instead they leave that for the DESTROY timer.
  31 *		Alan Cox	:	Clean up error flag in accept
  32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
  33 *					was buggy. Put a remove_sock() in the handler
  34 *					for memory when we hit 0. Also altered the timer
  35 *					code. The ACK stuff can wait and needs major
  36 *					TCP layer surgery.
  37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
  38 *					and fixed timer/inet_bh race.
  39 *		Alan Cox	:	Added zapped flag for TCP
  40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
  41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
  43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
  46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
  47 *	Pauline Middelink	:	identd support
  48 *		Alan Cox	:	Fixed connect() taking signals I think.
  49 *		Alan Cox	:	SO_LINGER supported
  50 *		Alan Cox	:	Error reporting fixes
  51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
  52 *		Alan Cox	:	inet sockets don't set sk->type!
  53 *		Alan Cox	:	Split socket option code
  54 *		Alan Cox	:	Callbacks
  55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
  56 *		Alex		:	Removed restriction on inet fioctl
  57 *		Alan Cox	:	Splitting INET from NET core
  58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
  59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
  60 *		Alan Cox	:	Split IP from generic code
  61 *		Alan Cox	:	New kfree_skbmem()
  62 *		Alan Cox	:	Make SO_DEBUG superuser only.
  63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
  64 *					(compatibility fix)
  65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
  66 *		Alan Cox	:	Allocator for a socket is settable.
  67 *		Alan Cox	:	SO_ERROR includes soft errors.
  68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
  69 *		Alan Cox	: 	Generic socket allocation to make hooks
  70 *					easier (suggested by Craig Metz).
  71 *		Michael Pall	:	SO_ERROR returns positive errno again
  72 *              Steve Whitehouse:       Added default destructor to free
  73 *                                      protocol private data.
  74 *              Steve Whitehouse:       Added various other default routines
  75 *                                      common to several socket families.
  76 *              Chris Evans     :       Call suser() check last on F_SETOWN
  77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
  79 *		Andi Kleen	:	Fix write_space callback
  80 *		Chris Evans	:	Security fixes - signedness again
  81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82 *
  83 * To Fix:
  84 *
  85 *
  86 *		This program is free software; you can redistribute it and/or
  87 *		modify it under the terms of the GNU General Public License
  88 *		as published by the Free Software Foundation; either version
  89 *		2 of the License, or (at your option) any later version.
  90 */
  91
  92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94#include <linux/capability.h>
  95#include <linux/errno.h>
  96#include <linux/errqueue.h>
  97#include <linux/types.h>
  98#include <linux/socket.h>
  99#include <linux/in.h>
 100#include <linux/kernel.h>
 101#include <linux/module.h>
 102#include <linux/proc_fs.h>
 103#include <linux/seq_file.h>
 104#include <linux/sched.h>
 105#include <linux/sched/mm.h>
 106#include <linux/timer.h>
 107#include <linux/string.h>
 108#include <linux/sockios.h>
 109#include <linux/net.h>
 110#include <linux/mm.h>
 111#include <linux/slab.h>
 112#include <linux/interrupt.h>
 113#include <linux/poll.h>
 114#include <linux/tcp.h>
 115#include <linux/init.h>
 116#include <linux/highmem.h>
 117#include <linux/user_namespace.h>
 118#include <linux/static_key.h>
 119#include <linux/memcontrol.h>
 120#include <linux/prefetch.h>
 121
 122#include <linux/uaccess.h>
 123
 124#include <linux/netdevice.h>
 125#include <net/protocol.h>
 126#include <linux/skbuff.h>
 127#include <net/net_namespace.h>
 128#include <net/request_sock.h>
 129#include <net/sock.h>
 130#include <linux/net_tstamp.h>
 131#include <net/xfrm.h>
 132#include <linux/ipsec.h>
 133#include <net/cls_cgroup.h>
 134#include <net/netprio_cgroup.h>
 135#include <linux/sock_diag.h>
 136
 137#include <linux/filter.h>
 138#include <net/sock_reuseport.h>
 139
 140#include <trace/events/sock.h>
 141
 142#include <net/tcp.h>
 143#include <net/busy_poll.h>
 144
 145static DEFINE_MUTEX(proto_list_mutex);
 146static LIST_HEAD(proto_list);
 147
 148/**
 149 * sk_ns_capable - General socket capability test
 150 * @sk: Socket to use a capability on or through
 151 * @user_ns: The user namespace of the capability to use
 152 * @cap: The capability to use
 153 *
 154 * Test to see if the opener of the socket had when the socket was
 155 * created and the current process has the capability @cap in the user
 156 * namespace @user_ns.
 157 */
 158bool sk_ns_capable(const struct sock *sk,
 159		   struct user_namespace *user_ns, int cap)
 160{
 161	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 162		ns_capable(user_ns, cap);
 163}
 164EXPORT_SYMBOL(sk_ns_capable);
 165
 166/**
 167 * sk_capable - Socket global capability test
 168 * @sk: Socket to use a capability on or through
 169 * @cap: The global capability to use
 170 *
 171 * Test to see if the opener of the socket had when the socket was
 172 * created and the current process has the capability @cap in all user
 173 * namespaces.
 174 */
 175bool sk_capable(const struct sock *sk, int cap)
 176{
 177	return sk_ns_capable(sk, &init_user_ns, cap);
 178}
 179EXPORT_SYMBOL(sk_capable);
 180
 181/**
 182 * sk_net_capable - Network namespace socket capability test
 183 * @sk: Socket to use a capability on or through
 184 * @cap: The capability to use
 185 *
 186 * Test to see if the opener of the socket had when the socket was created
 187 * and the current process has the capability @cap over the network namespace
 188 * the socket is a member of.
 189 */
 190bool sk_net_capable(const struct sock *sk, int cap)
 191{
 192	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 193}
 194EXPORT_SYMBOL(sk_net_capable);
 195
 196/*
 197 * Each address family might have different locking rules, so we have
 198 * one slock key per address family and separate keys for internal and
 199 * userspace sockets.
 200 */
 201static struct lock_class_key af_family_keys[AF_MAX];
 202static struct lock_class_key af_family_kern_keys[AF_MAX];
 203static struct lock_class_key af_family_slock_keys[AF_MAX];
 204static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 205
 206/*
 207 * Make lock validator output more readable. (we pre-construct these
 208 * strings build-time, so that runtime initialization of socket
 209 * locks is fast):
 210 */
 211
 212#define _sock_locks(x)						  \
 213  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
 214  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
 215  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
 216  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
 217  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
 218  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
 219  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
 220  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
 221  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
 222  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
 223  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
 224  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
 225  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
 226  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
 227  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_MAX"
 228
 229static const char *const af_family_key_strings[AF_MAX+1] = {
 230	_sock_locks("sk_lock-")
 231};
 232static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 233	_sock_locks("slock-")
 234};
 235static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 236	_sock_locks("clock-")
 237};
 238
 239static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 240	_sock_locks("k-sk_lock-")
 241};
 242static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 243	_sock_locks("k-slock-")
 244};
 245static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 246	_sock_locks("k-clock-")
 247};
 248static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 249  "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
 250  "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
 251  "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
 252  "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
 253  "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
 254  "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
 255  "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
 256  "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
 257  "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
 258  "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
 259  "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
 260  "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
 261  "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
 262  "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
 263  "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
 264};
 265static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 266  "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
 267  "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
 268  "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
 269  "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
 270  "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
 271  "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
 272  "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
 273  "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
 274  "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
 275  "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
 276  "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
 277  "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
 278  "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
 279  "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
 280  "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
 281};
 282static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 283  "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
 284  "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
 285  "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
 286  "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
 287  "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
 288  "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
 289  "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
 290  "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
 291  "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
 292  "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
 293  "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
 294  "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
 295  "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
 296  "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
 297  "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
 298};
 299
 300/*
 301 * sk_callback_lock and sk queues locking rules are per-address-family,
 302 * so split the lock classes by using a per-AF key:
 303 */
 304static struct lock_class_key af_callback_keys[AF_MAX];
 305static struct lock_class_key af_rlock_keys[AF_MAX];
 306static struct lock_class_key af_wlock_keys[AF_MAX];
 307static struct lock_class_key af_elock_keys[AF_MAX];
 308static struct lock_class_key af_kern_callback_keys[AF_MAX];
 309
 310/* Run time adjustable parameters. */
 311__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 312EXPORT_SYMBOL(sysctl_wmem_max);
 313__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 314EXPORT_SYMBOL(sysctl_rmem_max);
 315__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 316__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 317
 318/* Maximal space eaten by iovec or ancillary data plus some space */
 319int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 320EXPORT_SYMBOL(sysctl_optmem_max);
 321
 322int sysctl_tstamp_allow_data __read_mostly = 1;
 323
 324struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
 325EXPORT_SYMBOL_GPL(memalloc_socks);
 326
 327/**
 328 * sk_set_memalloc - sets %SOCK_MEMALLOC
 329 * @sk: socket to set it on
 330 *
 331 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 332 * It's the responsibility of the admin to adjust min_free_kbytes
 333 * to meet the requirements
 334 */
 335void sk_set_memalloc(struct sock *sk)
 336{
 337	sock_set_flag(sk, SOCK_MEMALLOC);
 338	sk->sk_allocation |= __GFP_MEMALLOC;
 339	static_key_slow_inc(&memalloc_socks);
 340}
 341EXPORT_SYMBOL_GPL(sk_set_memalloc);
 342
 343void sk_clear_memalloc(struct sock *sk)
 344{
 345	sock_reset_flag(sk, SOCK_MEMALLOC);
 346	sk->sk_allocation &= ~__GFP_MEMALLOC;
 347	static_key_slow_dec(&memalloc_socks);
 348
 349	/*
 350	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 351	 * progress of swapping. SOCK_MEMALLOC may be cleared while
 352	 * it has rmem allocations due to the last swapfile being deactivated
 353	 * but there is a risk that the socket is unusable due to exceeding
 354	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
 355	 */
 356	sk_mem_reclaim(sk);
 357}
 358EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 359
 360int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 361{
 362	int ret;
 363	unsigned int noreclaim_flag;
 364
 365	/* these should have been dropped before queueing */
 366	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 367
 368	noreclaim_flag = memalloc_noreclaim_save();
 369	ret = sk->sk_backlog_rcv(sk, skb);
 370	memalloc_noreclaim_restore(noreclaim_flag);
 371
 372	return ret;
 373}
 374EXPORT_SYMBOL(__sk_backlog_rcv);
 375
 376static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 377{
 378	struct timeval tv;
 379
 380	if (optlen < sizeof(tv))
 381		return -EINVAL;
 382	if (copy_from_user(&tv, optval, sizeof(tv)))
 383		return -EFAULT;
 384	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 385		return -EDOM;
 386
 387	if (tv.tv_sec < 0) {
 388		static int warned __read_mostly;
 389
 390		*timeo_p = 0;
 391		if (warned < 10 && net_ratelimit()) {
 392			warned++;
 393			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 394				__func__, current->comm, task_pid_nr(current));
 395		}
 396		return 0;
 397	}
 398	*timeo_p = MAX_SCHEDULE_TIMEOUT;
 399	if (tv.tv_sec == 0 && tv.tv_usec == 0)
 400		return 0;
 401	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 402		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 403	return 0;
 404}
 405
 406static void sock_warn_obsolete_bsdism(const char *name)
 407{
 408	static int warned;
 409	static char warncomm[TASK_COMM_LEN];
 410	if (strcmp(warncomm, current->comm) && warned < 5) {
 411		strcpy(warncomm,  current->comm);
 412		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 413			warncomm, name);
 414		warned++;
 415	}
 416}
 417
 418static bool sock_needs_netstamp(const struct sock *sk)
 419{
 420	switch (sk->sk_family) {
 421	case AF_UNSPEC:
 422	case AF_UNIX:
 423		return false;
 424	default:
 425		return true;
 426	}
 427}
 428
 429static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 430{
 431	if (sk->sk_flags & flags) {
 432		sk->sk_flags &= ~flags;
 433		if (sock_needs_netstamp(sk) &&
 434		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 435			net_disable_timestamp();
 436	}
 437}
 438
 439
 440int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 441{
 442	unsigned long flags;
 443	struct sk_buff_head *list = &sk->sk_receive_queue;
 444
 445	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 446		atomic_inc(&sk->sk_drops);
 447		trace_sock_rcvqueue_full(sk, skb);
 448		return -ENOMEM;
 449	}
 450
 451	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 452		atomic_inc(&sk->sk_drops);
 453		return -ENOBUFS;
 454	}
 455
 456	skb->dev = NULL;
 457	skb_set_owner_r(skb, sk);
 458
 459	/* we escape from rcu protected region, make sure we dont leak
 460	 * a norefcounted dst
 461	 */
 462	skb_dst_force(skb);
 463
 464	spin_lock_irqsave(&list->lock, flags);
 465	sock_skb_set_dropcount(sk, skb);
 466	__skb_queue_tail(list, skb);
 467	spin_unlock_irqrestore(&list->lock, flags);
 468
 469	if (!sock_flag(sk, SOCK_DEAD))
 470		sk->sk_data_ready(sk);
 471	return 0;
 472}
 473EXPORT_SYMBOL(__sock_queue_rcv_skb);
 474
 475int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 476{
 477	int err;
 478
 479	err = sk_filter(sk, skb);
 480	if (err)
 481		return err;
 482
 483	return __sock_queue_rcv_skb(sk, skb);
 484}
 485EXPORT_SYMBOL(sock_queue_rcv_skb);
 486
 487int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 488		     const int nested, unsigned int trim_cap, bool refcounted)
 489{
 490	int rc = NET_RX_SUCCESS;
 491
 492	if (sk_filter_trim_cap(sk, skb, trim_cap))
 493		goto discard_and_relse;
 494
 495	skb->dev = NULL;
 496
 497	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 498		atomic_inc(&sk->sk_drops);
 499		goto discard_and_relse;
 500	}
 501	if (nested)
 502		bh_lock_sock_nested(sk);
 503	else
 504		bh_lock_sock(sk);
 505	if (!sock_owned_by_user(sk)) {
 506		/*
 507		 * trylock + unlock semantics:
 508		 */
 509		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 510
 511		rc = sk_backlog_rcv(sk, skb);
 512
 513		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 514	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 515		bh_unlock_sock(sk);
 516		atomic_inc(&sk->sk_drops);
 517		goto discard_and_relse;
 518	}
 519
 520	bh_unlock_sock(sk);
 521out:
 522	if (refcounted)
 523		sock_put(sk);
 524	return rc;
 525discard_and_relse:
 526	kfree_skb(skb);
 527	goto out;
 528}
 529EXPORT_SYMBOL(__sk_receive_skb);
 530
 531struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 532{
 533	struct dst_entry *dst = __sk_dst_get(sk);
 534
 535	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 536		sk_tx_queue_clear(sk);
 537		sk->sk_dst_pending_confirm = 0;
 538		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 539		dst_release(dst);
 540		return NULL;
 541	}
 542
 543	return dst;
 544}
 545EXPORT_SYMBOL(__sk_dst_check);
 546
 547struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 548{
 549	struct dst_entry *dst = sk_dst_get(sk);
 550
 551	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 552		sk_dst_reset(sk);
 553		dst_release(dst);
 554		return NULL;
 555	}
 556
 557	return dst;
 558}
 559EXPORT_SYMBOL(sk_dst_check);
 560
 561static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 562				int optlen)
 563{
 564	int ret = -ENOPROTOOPT;
 565#ifdef CONFIG_NETDEVICES
 566	struct net *net = sock_net(sk);
 567	char devname[IFNAMSIZ];
 568	int index;
 569
 570	/* Sorry... */
 571	ret = -EPERM;
 572	if (!ns_capable(net->user_ns, CAP_NET_RAW))
 573		goto out;
 574
 575	ret = -EINVAL;
 576	if (optlen < 0)
 577		goto out;
 578
 579	/* Bind this socket to a particular device like "eth0",
 580	 * as specified in the passed interface name. If the
 581	 * name is "" or the option length is zero the socket
 582	 * is not bound.
 583	 */
 584	if (optlen > IFNAMSIZ - 1)
 585		optlen = IFNAMSIZ - 1;
 586	memset(devname, 0, sizeof(devname));
 587
 588	ret = -EFAULT;
 589	if (copy_from_user(devname, optval, optlen))
 590		goto out;
 591
 592	index = 0;
 593	if (devname[0] != '\0') {
 594		struct net_device *dev;
 595
 596		rcu_read_lock();
 597		dev = dev_get_by_name_rcu(net, devname);
 598		if (dev)
 599			index = dev->ifindex;
 600		rcu_read_unlock();
 601		ret = -ENODEV;
 602		if (!dev)
 603			goto out;
 604	}
 605
 606	lock_sock(sk);
 607	sk->sk_bound_dev_if = index;
 608	sk_dst_reset(sk);
 609	release_sock(sk);
 610
 611	ret = 0;
 612
 613out:
 614#endif
 615
 616	return ret;
 617}
 618
 619static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 620				int __user *optlen, int len)
 621{
 622	int ret = -ENOPROTOOPT;
 623#ifdef CONFIG_NETDEVICES
 624	struct net *net = sock_net(sk);
 625	char devname[IFNAMSIZ];
 626
 627	if (sk->sk_bound_dev_if == 0) {
 628		len = 0;
 629		goto zero;
 630	}
 631
 632	ret = -EINVAL;
 633	if (len < IFNAMSIZ)
 634		goto out;
 635
 636	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 637	if (ret)
 638		goto out;
 639
 640	len = strlen(devname) + 1;
 641
 642	ret = -EFAULT;
 643	if (copy_to_user(optval, devname, len))
 644		goto out;
 645
 646zero:
 647	ret = -EFAULT;
 648	if (put_user(len, optlen))
 649		goto out;
 650
 651	ret = 0;
 652
 653out:
 654#endif
 655
 656	return ret;
 657}
 658
 659static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 660{
 661	if (valbool)
 662		sock_set_flag(sk, bit);
 663	else
 664		sock_reset_flag(sk, bit);
 665}
 666
 667bool sk_mc_loop(struct sock *sk)
 668{
 669	if (dev_recursion_level())
 670		return false;
 671	if (!sk)
 672		return true;
 673	switch (sk->sk_family) {
 674	case AF_INET:
 675		return inet_sk(sk)->mc_loop;
 676#if IS_ENABLED(CONFIG_IPV6)
 677	case AF_INET6:
 678		return inet6_sk(sk)->mc_loop;
 679#endif
 680	}
 681	WARN_ON(1);
 682	return true;
 683}
 684EXPORT_SYMBOL(sk_mc_loop);
 685
 686/*
 687 *	This is meant for all protocols to use and covers goings on
 688 *	at the socket level. Everything here is generic.
 689 */
 690
 691int sock_setsockopt(struct socket *sock, int level, int optname,
 692		    char __user *optval, unsigned int optlen)
 693{
 694	struct sock *sk = sock->sk;
 695	int val;
 696	int valbool;
 697	struct linger ling;
 698	int ret = 0;
 699
 700	/*
 701	 *	Options without arguments
 702	 */
 703
 704	if (optname == SO_BINDTODEVICE)
 705		return sock_setbindtodevice(sk, optval, optlen);
 706
 707	if (optlen < sizeof(int))
 708		return -EINVAL;
 709
 710	if (get_user(val, (int __user *)optval))
 711		return -EFAULT;
 712
 713	valbool = val ? 1 : 0;
 714
 715	lock_sock(sk);
 716
 717	switch (optname) {
 718	case SO_DEBUG:
 719		if (val && !capable(CAP_NET_ADMIN))
 720			ret = -EACCES;
 721		else
 722			sock_valbool_flag(sk, SOCK_DBG, valbool);
 723		break;
 724	case SO_REUSEADDR:
 725		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 726		break;
 727	case SO_REUSEPORT:
 728		sk->sk_reuseport = valbool;
 729		break;
 730	case SO_TYPE:
 731	case SO_PROTOCOL:
 732	case SO_DOMAIN:
 733	case SO_ERROR:
 734		ret = -ENOPROTOOPT;
 735		break;
 736	case SO_DONTROUTE:
 737		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 738		break;
 739	case SO_BROADCAST:
 740		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 741		break;
 742	case SO_SNDBUF:
 743		/* Don't error on this BSD doesn't and if you think
 744		 * about it this is right. Otherwise apps have to
 745		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 746		 * are treated in BSD as hints
 747		 */
 748		val = min_t(u32, val, sysctl_wmem_max);
 749set_sndbuf:
 750		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 751		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 752		/* Wake up sending tasks if we upped the value. */
 753		sk->sk_write_space(sk);
 754		break;
 755
 756	case SO_SNDBUFFORCE:
 757		if (!capable(CAP_NET_ADMIN)) {
 758			ret = -EPERM;
 759			break;
 760		}
 761		goto set_sndbuf;
 762
 763	case SO_RCVBUF:
 764		/* Don't error on this BSD doesn't and if you think
 765		 * about it this is right. Otherwise apps have to
 766		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
 767		 * are treated in BSD as hints
 768		 */
 769		val = min_t(u32, val, sysctl_rmem_max);
 770set_rcvbuf:
 771		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 772		/*
 773		 * We double it on the way in to account for
 774		 * "struct sk_buff" etc. overhead.   Applications
 775		 * assume that the SO_RCVBUF setting they make will
 776		 * allow that much actual data to be received on that
 777		 * socket.
 778		 *
 779		 * Applications are unaware that "struct sk_buff" and
 780		 * other overheads allocate from the receive buffer
 781		 * during socket buffer allocation.
 782		 *
 783		 * And after considering the possible alternatives,
 784		 * returning the value we actually used in getsockopt
 785		 * is the most desirable behavior.
 786		 */
 787		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 788		break;
 789
 790	case SO_RCVBUFFORCE:
 791		if (!capable(CAP_NET_ADMIN)) {
 792			ret = -EPERM;
 793			break;
 794		}
 795		goto set_rcvbuf;
 796
 797	case SO_KEEPALIVE:
 798		if (sk->sk_prot->keepalive)
 799			sk->sk_prot->keepalive(sk, valbool);
 800		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 801		break;
 802
 803	case SO_OOBINLINE:
 804		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 805		break;
 806
 807	case SO_NO_CHECK:
 808		sk->sk_no_check_tx = valbool;
 809		break;
 810
 811	case SO_PRIORITY:
 812		if ((val >= 0 && val <= 6) ||
 813		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 814			sk->sk_priority = val;
 815		else
 816			ret = -EPERM;
 817		break;
 818
 819	case SO_LINGER:
 820		if (optlen < sizeof(ling)) {
 821			ret = -EINVAL;	/* 1003.1g */
 822			break;
 823		}
 824		if (copy_from_user(&ling, optval, sizeof(ling))) {
 825			ret = -EFAULT;
 826			break;
 827		}
 828		if (!ling.l_onoff)
 829			sock_reset_flag(sk, SOCK_LINGER);
 830		else {
 831#if (BITS_PER_LONG == 32)
 832			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 833				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 834			else
 835#endif
 836				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 837			sock_set_flag(sk, SOCK_LINGER);
 838		}
 839		break;
 840
 841	case SO_BSDCOMPAT:
 842		sock_warn_obsolete_bsdism("setsockopt");
 843		break;
 844
 845	case SO_PASSCRED:
 846		if (valbool)
 847			set_bit(SOCK_PASSCRED, &sock->flags);
 848		else
 849			clear_bit(SOCK_PASSCRED, &sock->flags);
 850		break;
 851
 852	case SO_TIMESTAMP:
 853	case SO_TIMESTAMPNS:
 854		if (valbool)  {
 855			if (optname == SO_TIMESTAMP)
 856				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 857			else
 858				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 859			sock_set_flag(sk, SOCK_RCVTSTAMP);
 860			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 861		} else {
 862			sock_reset_flag(sk, SOCK_RCVTSTAMP);
 863			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 864		}
 865		break;
 866
 867	case SO_TIMESTAMPING:
 868		if (val & ~SOF_TIMESTAMPING_MASK) {
 869			ret = -EINVAL;
 870			break;
 871		}
 872
 873		if (val & SOF_TIMESTAMPING_OPT_ID &&
 874		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 875			if (sk->sk_protocol == IPPROTO_TCP &&
 876			    sk->sk_type == SOCK_STREAM) {
 877				if ((1 << sk->sk_state) &
 878				    (TCPF_CLOSE | TCPF_LISTEN)) {
 879					ret = -EINVAL;
 880					break;
 881				}
 882				sk->sk_tskey = tcp_sk(sk)->snd_una;
 883			} else {
 884				sk->sk_tskey = 0;
 885			}
 886		}
 887
 888		if (val & SOF_TIMESTAMPING_OPT_STATS &&
 889		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 890			ret = -EINVAL;
 891			break;
 892		}
 893
 894		sk->sk_tsflags = val;
 895		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 896			sock_enable_timestamp(sk,
 897					      SOCK_TIMESTAMPING_RX_SOFTWARE);
 898		else
 899			sock_disable_timestamp(sk,
 900					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 901		break;
 902
 903	case SO_RCVLOWAT:
 904		if (val < 0)
 905			val = INT_MAX;
 906		sk->sk_rcvlowat = val ? : 1;
 907		break;
 908
 909	case SO_RCVTIMEO:
 910		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 911		break;
 912
 913	case SO_SNDTIMEO:
 914		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 915		break;
 916
 917	case SO_ATTACH_FILTER:
 918		ret = -EINVAL;
 919		if (optlen == sizeof(struct sock_fprog)) {
 920			struct sock_fprog fprog;
 921
 922			ret = -EFAULT;
 923			if (copy_from_user(&fprog, optval, sizeof(fprog)))
 924				break;
 925
 926			ret = sk_attach_filter(&fprog, sk);
 927		}
 928		break;
 929
 930	case SO_ATTACH_BPF:
 931		ret = -EINVAL;
 932		if (optlen == sizeof(u32)) {
 933			u32 ufd;
 934
 935			ret = -EFAULT;
 936			if (copy_from_user(&ufd, optval, sizeof(ufd)))
 937				break;
 938
 939			ret = sk_attach_bpf(ufd, sk);
 940		}
 941		break;
 942
 943	case SO_ATTACH_REUSEPORT_CBPF:
 944		ret = -EINVAL;
 945		if (optlen == sizeof(struct sock_fprog)) {
 946			struct sock_fprog fprog;
 947
 948			ret = -EFAULT;
 949			if (copy_from_user(&fprog, optval, sizeof(fprog)))
 950				break;
 951
 952			ret = sk_reuseport_attach_filter(&fprog, sk);
 953		}
 954		break;
 955
 956	case SO_ATTACH_REUSEPORT_EBPF:
 957		ret = -EINVAL;
 958		if (optlen == sizeof(u32)) {
 959			u32 ufd;
 960
 961			ret = -EFAULT;
 962			if (copy_from_user(&ufd, optval, sizeof(ufd)))
 963				break;
 964
 965			ret = sk_reuseport_attach_bpf(ufd, sk);
 966		}
 967		break;
 968
 969	case SO_DETACH_FILTER:
 970		ret = sk_detach_filter(sk);
 971		break;
 972
 973	case SO_LOCK_FILTER:
 974		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 975			ret = -EPERM;
 976		else
 977			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
 978		break;
 979
 980	case SO_PASSSEC:
 981		if (valbool)
 982			set_bit(SOCK_PASSSEC, &sock->flags);
 983		else
 984			clear_bit(SOCK_PASSSEC, &sock->flags);
 985		break;
 986	case SO_MARK:
 987		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 988			ret = -EPERM;
 989		else
 990			sk->sk_mark = val;
 991		break;
 992
 993	case SO_RXQ_OVFL:
 994		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 995		break;
 996
 997	case SO_WIFI_STATUS:
 998		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 999		break;
1000
1001	case SO_PEEK_OFF:
1002		if (sock->ops->set_peek_off)
1003			ret = sock->ops->set_peek_off(sk, val);
1004		else
1005			ret = -EOPNOTSUPP;
1006		break;
1007
1008	case SO_NOFCS:
1009		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1010		break;
1011
1012	case SO_SELECT_ERR_QUEUE:
1013		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1014		break;
1015
1016#ifdef CONFIG_NET_RX_BUSY_POLL
1017	case SO_BUSY_POLL:
1018		/* allow unprivileged users to decrease the value */
1019		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1020			ret = -EPERM;
1021		else {
1022			if (val < 0)
1023				ret = -EINVAL;
1024			else
1025				sk->sk_ll_usec = val;
1026		}
1027		break;
1028#endif
1029
1030	case SO_MAX_PACING_RATE:
1031		if (val != ~0U)
1032			cmpxchg(&sk->sk_pacing_status,
1033				SK_PACING_NONE,
1034				SK_PACING_NEEDED);
1035		sk->sk_max_pacing_rate = val;
1036		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1037					 sk->sk_max_pacing_rate);
1038		break;
1039
1040	case SO_INCOMING_CPU:
1041		sk->sk_incoming_cpu = val;
1042		break;
1043
1044	case SO_CNX_ADVICE:
1045		if (val == 1)
1046			dst_negative_advice(sk);
1047		break;
1048
1049	case SO_ZEROCOPY:
1050		if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
1051			ret = -ENOTSUPP;
1052		else if (sk->sk_protocol != IPPROTO_TCP)
1053			ret = -ENOTSUPP;
1054		else if (sk->sk_state != TCP_CLOSE)
1055			ret = -EBUSY;
1056		else if (val < 0 || val > 1)
1057			ret = -EINVAL;
1058		else
1059			sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1060		break;
1061
1062	default:
1063		ret = -ENOPROTOOPT;
1064		break;
1065	}
1066	release_sock(sk);
1067	return ret;
1068}
1069EXPORT_SYMBOL(sock_setsockopt);
1070
1071
1072static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1073			  struct ucred *ucred)
1074{
1075	ucred->pid = pid_vnr(pid);
1076	ucred->uid = ucred->gid = -1;
1077	if (cred) {
1078		struct user_namespace *current_ns = current_user_ns();
1079
1080		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1081		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1082	}
1083}
1084
1085static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1086{
1087	struct user_namespace *user_ns = current_user_ns();
1088	int i;
1089
1090	for (i = 0; i < src->ngroups; i++)
1091		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1092			return -EFAULT;
1093
1094	return 0;
1095}
1096
1097int sock_getsockopt(struct socket *sock, int level, int optname,
1098		    char __user *optval, int __user *optlen)
1099{
1100	struct sock *sk = sock->sk;
1101
1102	union {
1103		int val;
1104		u64 val64;
1105		struct linger ling;
1106		struct timeval tm;
1107	} v;
1108
1109	int lv = sizeof(int);
1110	int len;
1111
1112	if (get_user(len, optlen))
1113		return -EFAULT;
1114	if (len < 0)
1115		return -EINVAL;
1116
1117	memset(&v, 0, sizeof(v));
1118
1119	switch (optname) {
1120	case SO_DEBUG:
1121		v.val = sock_flag(sk, SOCK_DBG);
1122		break;
1123
1124	case SO_DONTROUTE:
1125		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1126		break;
1127
1128	case SO_BROADCAST:
1129		v.val = sock_flag(sk, SOCK_BROADCAST);
1130		break;
1131
1132	case SO_SNDBUF:
1133		v.val = sk->sk_sndbuf;
1134		break;
1135
1136	case SO_RCVBUF:
1137		v.val = sk->sk_rcvbuf;
1138		break;
1139
1140	case SO_REUSEADDR:
1141		v.val = sk->sk_reuse;
1142		break;
1143
1144	case SO_REUSEPORT:
1145		v.val = sk->sk_reuseport;
1146		break;
1147
1148	case SO_KEEPALIVE:
1149		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1150		break;
1151
1152	case SO_TYPE:
1153		v.val = sk->sk_type;
1154		break;
1155
1156	case SO_PROTOCOL:
1157		v.val = sk->sk_protocol;
1158		break;
1159
1160	case SO_DOMAIN:
1161		v.val = sk->sk_family;
1162		break;
1163
1164	case SO_ERROR:
1165		v.val = -sock_error(sk);
1166		if (v.val == 0)
1167			v.val = xchg(&sk->sk_err_soft, 0);
1168		break;
1169
1170	case SO_OOBINLINE:
1171		v.val = sock_flag(sk, SOCK_URGINLINE);
1172		break;
1173
1174	case SO_NO_CHECK:
1175		v.val = sk->sk_no_check_tx;
1176		break;
1177
1178	case SO_PRIORITY:
1179		v.val = sk->sk_priority;
1180		break;
1181
1182	case SO_LINGER:
1183		lv		= sizeof(v.ling);
1184		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1185		v.ling.l_linger	= sk->sk_lingertime / HZ;
1186		break;
1187
1188	case SO_BSDCOMPAT:
1189		sock_warn_obsolete_bsdism("getsockopt");
1190		break;
1191
1192	case SO_TIMESTAMP:
1193		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1194				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1195		break;
1196
1197	case SO_TIMESTAMPNS:
1198		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1199		break;
1200
1201	case SO_TIMESTAMPING:
1202		v.val = sk->sk_tsflags;
1203		break;
1204
1205	case SO_RCVTIMEO:
1206		lv = sizeof(struct timeval);
1207		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1208			v.tm.tv_sec = 0;
1209			v.tm.tv_usec = 0;
1210		} else {
1211			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1212			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1213		}
1214		break;
1215
1216	case SO_SNDTIMEO:
1217		lv = sizeof(struct timeval);
1218		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1219			v.tm.tv_sec = 0;
1220			v.tm.tv_usec = 0;
1221		} else {
1222			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1223			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1224		}
1225		break;
1226
1227	case SO_RCVLOWAT:
1228		v.val = sk->sk_rcvlowat;
1229		break;
1230
1231	case SO_SNDLOWAT:
1232		v.val = 1;
1233		break;
1234
1235	case SO_PASSCRED:
1236		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1237		break;
1238
1239	case SO_PEERCRED:
1240	{
1241		struct ucred peercred;
1242		if (len > sizeof(peercred))
1243			len = sizeof(peercred);
1244		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1245		if (copy_to_user(optval, &peercred, len))
1246			return -EFAULT;
1247		goto lenout;
1248	}
1249
1250	case SO_PEERGROUPS:
1251	{
1252		int ret, n;
1253
1254		if (!sk->sk_peer_cred)
1255			return -ENODATA;
1256
1257		n = sk->sk_peer_cred->group_info->ngroups;
1258		if (len < n * sizeof(gid_t)) {
1259			len = n * sizeof(gid_t);
1260			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1261		}
1262		len = n * sizeof(gid_t);
1263
1264		ret = groups_to_user((gid_t __user *)optval,
1265				     sk->sk_peer_cred->group_info);
1266		if (ret)
1267			return ret;
1268		goto lenout;
1269	}
1270
1271	case SO_PEERNAME:
1272	{
1273		char address[128];
1274
1275		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1276			return -ENOTCONN;
1277		if (lv < len)
1278			return -EINVAL;
1279		if (copy_to_user(optval, address, len))
1280			return -EFAULT;
1281		goto lenout;
1282	}
1283
1284	/* Dubious BSD thing... Probably nobody even uses it, but
1285	 * the UNIX standard wants it for whatever reason... -DaveM
1286	 */
1287	case SO_ACCEPTCONN:
1288		v.val = sk->sk_state == TCP_LISTEN;
1289		break;
1290
1291	case SO_PASSSEC:
1292		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1293		break;
1294
1295	case SO_PEERSEC:
1296		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1297
1298	case SO_MARK:
1299		v.val = sk->sk_mark;
1300		break;
1301
1302	case SO_RXQ_OVFL:
1303		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1304		break;
1305
1306	case SO_WIFI_STATUS:
1307		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1308		break;
1309
1310	case SO_PEEK_OFF:
1311		if (!sock->ops->set_peek_off)
1312			return -EOPNOTSUPP;
1313
1314		v.val = sk->sk_peek_off;
1315		break;
1316	case SO_NOFCS:
1317		v.val = sock_flag(sk, SOCK_NOFCS);
1318		break;
1319
1320	case SO_BINDTODEVICE:
1321		return sock_getbindtodevice(sk, optval, optlen, len);
1322
1323	case SO_GET_FILTER:
1324		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1325		if (len < 0)
1326			return len;
1327
1328		goto lenout;
1329
1330	case SO_LOCK_FILTER:
1331		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1332		break;
1333
1334	case SO_BPF_EXTENSIONS:
1335		v.val = bpf_tell_extensions();
1336		break;
1337
1338	case SO_SELECT_ERR_QUEUE:
1339		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1340		break;
1341
1342#ifdef CONFIG_NET_RX_BUSY_POLL
1343	case SO_BUSY_POLL:
1344		v.val = sk->sk_ll_usec;
1345		break;
1346#endif
1347
1348	case SO_MAX_PACING_RATE:
1349		v.val = sk->sk_max_pacing_rate;
1350		break;
1351
1352	case SO_INCOMING_CPU:
1353		v.val = sk->sk_incoming_cpu;
1354		break;
1355
1356	case SO_MEMINFO:
1357	{
1358		u32 meminfo[SK_MEMINFO_VARS];
1359
1360		if (get_user(len, optlen))
1361			return -EFAULT;
1362
1363		sk_get_meminfo(sk, meminfo);
1364
1365		len = min_t(unsigned int, len, sizeof(meminfo));
1366		if (copy_to_user(optval, &meminfo, len))
1367			return -EFAULT;
1368
1369		goto lenout;
1370	}
1371
1372#ifdef CONFIG_NET_RX_BUSY_POLL
1373	case SO_INCOMING_NAPI_ID:
1374		v.val = READ_ONCE(sk->sk_napi_id);
1375
1376		/* aggregate non-NAPI IDs down to 0 */
1377		if (v.val < MIN_NAPI_ID)
1378			v.val = 0;
1379
1380		break;
1381#endif
1382
1383	case SO_COOKIE:
1384		lv = sizeof(u64);
1385		if (len < lv)
1386			return -EINVAL;
1387		v.val64 = sock_gen_cookie(sk);
1388		break;
1389
1390	case SO_ZEROCOPY:
1391		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1392		break;
1393
1394	default:
1395		/* We implement the SO_SNDLOWAT etc to not be settable
1396		 * (1003.1g 7).
1397		 */
1398		return -ENOPROTOOPT;
1399	}
1400
1401	if (len > lv)
1402		len = lv;
1403	if (copy_to_user(optval, &v, len))
1404		return -EFAULT;
1405lenout:
1406	if (put_user(len, optlen))
1407		return -EFAULT;
1408	return 0;
1409}
1410
1411/*
1412 * Initialize an sk_lock.
1413 *
1414 * (We also register the sk_lock with the lock validator.)
1415 */
1416static inline void sock_lock_init(struct sock *sk)
1417{
1418	if (sk->sk_kern_sock)
1419		sock_lock_init_class_and_name(
1420			sk,
1421			af_family_kern_slock_key_strings[sk->sk_family],
1422			af_family_kern_slock_keys + sk->sk_family,
1423			af_family_kern_key_strings[sk->sk_family],
1424			af_family_kern_keys + sk->sk_family);
1425	else
1426		sock_lock_init_class_and_name(
1427			sk,
1428			af_family_slock_key_strings[sk->sk_family],
1429			af_family_slock_keys + sk->sk_family,
1430			af_family_key_strings[sk->sk_family],
1431			af_family_keys + sk->sk_family);
1432}
1433
1434/*
1435 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1436 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1437 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1438 */
1439static void sock_copy(struct sock *nsk, const struct sock *osk)
1440{
1441#ifdef CONFIG_SECURITY_NETWORK
1442	void *sptr = nsk->sk_security;
1443#endif
1444	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1445
1446	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1447	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1448
1449#ifdef CONFIG_SECURITY_NETWORK
1450	nsk->sk_security = sptr;
1451	security_sk_clone(osk, nsk);
1452#endif
1453}
1454
1455static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1456		int family)
1457{
1458	struct sock *sk;
1459	struct kmem_cache *slab;
1460
1461	slab = prot->slab;
1462	if (slab != NULL) {
1463		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1464		if (!sk)
1465			return sk;
1466		if (priority & __GFP_ZERO)
1467			sk_prot_clear_nulls(sk, prot->obj_size);
1468	} else
1469		sk = kmalloc(prot->obj_size, priority);
1470
1471	if (sk != NULL) {
1472		kmemcheck_annotate_bitfield(sk, flags);
1473
1474		if (security_sk_alloc(sk, family, priority))
1475			goto out_free;
1476
1477		if (!try_module_get(prot->owner))
1478			goto out_free_sec;
1479		sk_tx_queue_clear(sk);
1480	}
1481
1482	return sk;
1483
1484out_free_sec:
1485	security_sk_free(sk);
1486out_free:
1487	if (slab != NULL)
1488		kmem_cache_free(slab, sk);
1489	else
1490		kfree(sk);
1491	return NULL;
1492}
1493
1494static void sk_prot_free(struct proto *prot, struct sock *sk)
1495{
1496	struct kmem_cache *slab;
1497	struct module *owner;
1498
1499	owner = prot->owner;
1500	slab = prot->slab;
1501
1502	cgroup_sk_free(&sk->sk_cgrp_data);
1503	mem_cgroup_sk_free(sk);
1504	security_sk_free(sk);
1505	if (slab != NULL)
1506		kmem_cache_free(slab, sk);
1507	else
1508		kfree(sk);
1509	module_put(owner);
1510}
1511
1512/**
1513 *	sk_alloc - All socket objects are allocated here
1514 *	@net: the applicable net namespace
1515 *	@family: protocol family
1516 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1517 *	@prot: struct proto associated with this new sock instance
1518 *	@kern: is this to be a kernel socket?
1519 */
1520struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1521		      struct proto *prot, int kern)
1522{
1523	struct sock *sk;
1524
1525	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1526	if (sk) {
1527		sk->sk_family = family;
1528		/*
1529		 * See comment in struct sock definition to understand
1530		 * why we need sk_prot_creator -acme
1531		 */
1532		sk->sk_prot = sk->sk_prot_creator = prot;
1533		sk->sk_kern_sock = kern;
1534		sock_lock_init(sk);
1535		sk->sk_net_refcnt = kern ? 0 : 1;
1536		if (likely(sk->sk_net_refcnt))
1537			get_net(net);
1538		sock_net_set(sk, net);
1539		refcount_set(&sk->sk_wmem_alloc, 1);
1540
1541		mem_cgroup_sk_alloc(sk);
1542		cgroup_sk_alloc(&sk->sk_cgrp_data);
1543		sock_update_classid(&sk->sk_cgrp_data);
1544		sock_update_netprioidx(&sk->sk_cgrp_data);
1545	}
1546
1547	return sk;
1548}
1549EXPORT_SYMBOL(sk_alloc);
1550
1551/* Sockets having SOCK_RCU_FREE will call this function after one RCU
1552 * grace period. This is the case for UDP sockets and TCP listeners.
1553 */
1554static void __sk_destruct(struct rcu_head *head)
1555{
1556	struct sock *sk = container_of(head, struct sock, sk_rcu);
1557	struct sk_filter *filter;
1558
1559	if (sk->sk_destruct)
1560		sk->sk_destruct(sk);
1561
1562	filter = rcu_dereference_check(sk->sk_filter,
1563				       refcount_read(&sk->sk_wmem_alloc) == 0);
1564	if (filter) {
1565		sk_filter_uncharge(sk, filter);
1566		RCU_INIT_POINTER(sk->sk_filter, NULL);
1567	}
1568	if (rcu_access_pointer(sk->sk_reuseport_cb))
1569		reuseport_detach_sock(sk);
1570
1571	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1572
1573	if (atomic_read(&sk->sk_omem_alloc))
1574		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1575			 __func__, atomic_read(&sk->sk_omem_alloc));
1576
1577	if (sk->sk_frag.page) {
1578		put_page(sk->sk_frag.page);
1579		sk->sk_frag.page = NULL;
1580	}
1581
1582	if (sk->sk_peer_cred)
1583		put_cred(sk->sk_peer_cred);
1584	put_pid(sk->sk_peer_pid);
1585	if (likely(sk->sk_net_refcnt))
1586		put_net(sock_net(sk));
1587	sk_prot_free(sk->sk_prot_creator, sk);
1588}
1589
1590void sk_destruct(struct sock *sk)
1591{
1592	if (sock_flag(sk, SOCK_RCU_FREE))
1593		call_rcu(&sk->sk_rcu, __sk_destruct);
1594	else
1595		__sk_destruct(&sk->sk_rcu);
1596}
1597
1598static void __sk_free(struct sock *sk)
1599{
1600	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1601		sock_diag_broadcast_destroy(sk);
1602	else
1603		sk_destruct(sk);
1604}
1605
1606void sk_free(struct sock *sk)
1607{
1608	/*
1609	 * We subtract one from sk_wmem_alloc and can know if
1610	 * some packets are still in some tx queue.
1611	 * If not null, sock_wfree() will call __sk_free(sk) later
1612	 */
1613	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1614		__sk_free(sk);
1615}
1616EXPORT_SYMBOL(sk_free);
1617
1618static void sk_init_common(struct sock *sk)
1619{
1620	skb_queue_head_init(&sk->sk_receive_queue);
1621	skb_queue_head_init(&sk->sk_write_queue);
1622	skb_queue_head_init(&sk->sk_error_queue);
1623
1624	rwlock_init(&sk->sk_callback_lock);
1625	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1626			af_rlock_keys + sk->sk_family,
1627			af_family_rlock_key_strings[sk->sk_family]);
1628	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1629			af_wlock_keys + sk->sk_family,
1630			af_family_wlock_key_strings[sk->sk_family]);
1631	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1632			af_elock_keys + sk->sk_family,
1633			af_family_elock_key_strings[sk->sk_family]);
1634	lockdep_set_class_and_name(&sk->sk_callback_lock,
1635			af_callback_keys + sk->sk_family,
1636			af_family_clock_key_strings[sk->sk_family]);
1637}
1638
1639/**
1640 *	sk_clone_lock - clone a socket, and lock its clone
1641 *	@sk: the socket to clone
1642 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1643 *
1644 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1645 */
1646struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1647{
1648	struct sock *newsk;
1649	bool is_charged = true;
1650
1651	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1652	if (newsk != NULL) {
1653		struct sk_filter *filter;
1654
1655		sock_copy(newsk, sk);
1656
1657		/* SANITY */
1658		if (likely(newsk->sk_net_refcnt))
1659			get_net(sock_net(newsk));
1660		sk_node_init(&newsk->sk_node);
1661		sock_lock_init(newsk);
1662		bh_lock_sock(newsk);
1663		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1664		newsk->sk_backlog.len = 0;
1665
1666		atomic_set(&newsk->sk_rmem_alloc, 0);
1667		/*
1668		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1669		 */
1670		refcount_set(&newsk->sk_wmem_alloc, 1);
1671		atomic_set(&newsk->sk_omem_alloc, 0);
1672		sk_init_common(newsk);
1673
1674		newsk->sk_dst_cache	= NULL;
1675		newsk->sk_dst_pending_confirm = 0;
1676		newsk->sk_wmem_queued	= 0;
1677		newsk->sk_forward_alloc = 0;
1678		atomic_set(&newsk->sk_drops, 0);
1679		newsk->sk_send_head	= NULL;
1680		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1681		atomic_set(&newsk->sk_zckey, 0);
1682
1683		sock_reset_flag(newsk, SOCK_DONE);
1684
1685		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1686		if (filter != NULL)
1687			/* though it's an empty new sock, the charging may fail
1688			 * if sysctl_optmem_max was changed between creation of
1689			 * original socket and cloning
1690			 */
1691			is_charged = sk_filter_charge(newsk, filter);
1692
1693		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1694			/* We need to make sure that we don't uncharge the new
1695			 * socket if we couldn't charge it in the first place
1696			 * as otherwise we uncharge the parent's filter.
1697			 */
1698			if (!is_charged)
1699				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1700			sk_free_unlock_clone(newsk);
1701			newsk = NULL;
1702			goto out;
1703		}
1704		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1705
1706		newsk->sk_err	   = 0;
1707		newsk->sk_err_soft = 0;
1708		newsk->sk_priority = 0;
1709		newsk->sk_incoming_cpu = raw_smp_processor_id();
1710		atomic64_set(&newsk->sk_cookie, 0);
1711
1712		mem_cgroup_sk_alloc(newsk);
1713		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1714
1715		/*
1716		 * Before updating sk_refcnt, we must commit prior changes to memory
1717		 * (Documentation/RCU/rculist_nulls.txt for details)
1718		 */
1719		smp_wmb();
1720		refcount_set(&newsk->sk_refcnt, 2);
1721
1722		/*
1723		 * Increment the counter in the same struct proto as the master
1724		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1725		 * is the same as sk->sk_prot->socks, as this field was copied
1726		 * with memcpy).
1727		 *
1728		 * This _changes_ the previous behaviour, where
1729		 * tcp_create_openreq_child always was incrementing the
1730		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1731		 * to be taken into account in all callers. -acme
1732		 */
1733		sk_refcnt_debug_inc(newsk);
1734		sk_set_socket(newsk, NULL);
1735		newsk->sk_wq = NULL;
1736
1737		if (newsk->sk_prot->sockets_allocated)
1738			sk_sockets_allocated_inc(newsk);
1739
1740		if (sock_needs_netstamp(sk) &&
1741		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1742			net_enable_timestamp();
1743	}
1744out:
1745	return newsk;
1746}
1747EXPORT_SYMBOL_GPL(sk_clone_lock);
1748
1749void sk_free_unlock_clone(struct sock *sk)
1750{
1751	/* It is still raw copy of parent, so invalidate
1752	 * destructor and make plain sk_free() */
1753	sk->sk_destruct = NULL;
1754	bh_unlock_sock(sk);
1755	sk_free(sk);
1756}
1757EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1758
1759void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1760{
1761	u32 max_segs = 1;
1762
1763	sk_dst_set(sk, dst);
1764	sk->sk_route_caps = dst->dev->features;
1765	if (sk->sk_route_caps & NETIF_F_GSO)
1766		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1767	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1768	if (sk_can_gso(sk)) {
1769		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1770			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1771		} else {
1772			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1773			sk->sk_gso_max_size = dst->dev->gso_max_size;
1774			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1775		}
1776	}
1777	sk->sk_gso_max_segs = max_segs;
1778}
1779EXPORT_SYMBOL_GPL(sk_setup_caps);
1780
1781/*
1782 *	Simple resource managers for sockets.
1783 */
1784
1785
1786/*
1787 * Write buffer destructor automatically called from kfree_skb.
1788 */
1789void sock_wfree(struct sk_buff *skb)
1790{
1791	struct sock *sk = skb->sk;
1792	unsigned int len = skb->truesize;
1793
1794	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1795		/*
1796		 * Keep a reference on sk_wmem_alloc, this will be released
1797		 * after sk_write_space() call
1798		 */
1799		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1800		sk->sk_write_space(sk);
1801		len = 1;
1802	}
1803	/*
1804	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1805	 * could not do because of in-flight packets
1806	 */
1807	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1808		__sk_free(sk);
1809}
1810EXPORT_SYMBOL(sock_wfree);
1811
1812/* This variant of sock_wfree() is used by TCP,
1813 * since it sets SOCK_USE_WRITE_QUEUE.
1814 */
1815void __sock_wfree(struct sk_buff *skb)
1816{
1817	struct sock *sk = skb->sk;
1818
1819	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1820		__sk_free(sk);
1821}
1822
1823void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1824{
1825	skb_orphan(skb);
1826	skb->sk = sk;
1827#ifdef CONFIG_INET
1828	if (unlikely(!sk_fullsock(sk))) {
1829		skb->destructor = sock_edemux;
1830		sock_hold(sk);
1831		return;
1832	}
1833#endif
1834	skb->destructor = sock_wfree;
1835	skb_set_hash_from_sk(skb, sk);
1836	/*
1837	 * We used to take a refcount on sk, but following operation
1838	 * is enough to guarantee sk_free() wont free this sock until
1839	 * all in-flight packets are completed
1840	 */
1841	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1842}
1843EXPORT_SYMBOL(skb_set_owner_w);
1844
1845/* This helper is used by netem, as it can hold packets in its
1846 * delay queue. We want to allow the owner socket to send more
1847 * packets, as if they were already TX completed by a typical driver.
1848 * But we also want to keep skb->sk set because some packet schedulers
1849 * rely on it (sch_fq for example).
1850 */
1851void skb_orphan_partial(struct sk_buff *skb)
1852{
1853	if (skb_is_tcp_pure_ack(skb))
1854		return;
1855
1856	if (skb->destructor == sock_wfree
1857#ifdef CONFIG_INET
1858	    || skb->destructor == tcp_wfree
1859#endif
1860		) {
1861		struct sock *sk = skb->sk;
1862
1863		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1864			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1865			skb->destructor = sock_efree;
1866		}
1867	} else {
1868		skb_orphan(skb);
1869	}
1870}
1871EXPORT_SYMBOL(skb_orphan_partial);
1872
1873/*
1874 * Read buffer destructor automatically called from kfree_skb.
1875 */
1876void sock_rfree(struct sk_buff *skb)
1877{
1878	struct sock *sk = skb->sk;
1879	unsigned int len = skb->truesize;
1880
1881	atomic_sub(len, &sk->sk_rmem_alloc);
1882	sk_mem_uncharge(sk, len);
1883}
1884EXPORT_SYMBOL(sock_rfree);
1885
1886/*
1887 * Buffer destructor for skbs that are not used directly in read or write
1888 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1889 */
1890void sock_efree(struct sk_buff *skb)
1891{
1892	sock_put(skb->sk);
1893}
1894EXPORT_SYMBOL(sock_efree);
1895
1896kuid_t sock_i_uid(struct sock *sk)
1897{
1898	kuid_t uid;
1899
1900	read_lock_bh(&sk->sk_callback_lock);
1901	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1902	read_unlock_bh(&sk->sk_callback_lock);
1903	return uid;
1904}
1905EXPORT_SYMBOL(sock_i_uid);
1906
1907unsigned long sock_i_ino(struct sock *sk)
1908{
1909	unsigned long ino;
1910
1911	read_lock_bh(&sk->sk_callback_lock);
1912	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1913	read_unlock_bh(&sk->sk_callback_lock);
1914	return ino;
1915}
1916EXPORT_SYMBOL(sock_i_ino);
1917
1918/*
1919 * Allocate a skb from the socket's send buffer.
1920 */
1921struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1922			     gfp_t priority)
1923{
1924	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1925		struct sk_buff *skb = alloc_skb(size, priority);
1926		if (skb) {
1927			skb_set_owner_w(skb, sk);
1928			return skb;
1929		}
1930	}
1931	return NULL;
1932}
1933EXPORT_SYMBOL(sock_wmalloc);
1934
1935static void sock_ofree(struct sk_buff *skb)
1936{
1937	struct sock *sk = skb->sk;
1938
1939	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1940}
1941
1942struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1943			     gfp_t priority)
1944{
1945	struct sk_buff *skb;
1946
1947	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1948	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1949	    sysctl_optmem_max)
1950		return NULL;
1951
1952	skb = alloc_skb(size, priority);
1953	if (!skb)
1954		return NULL;
1955
1956	atomic_add(skb->truesize, &sk->sk_omem_alloc);
1957	skb->sk = sk;
1958	skb->destructor = sock_ofree;
1959	return skb;
1960}
1961
1962/*
1963 * Allocate a memory block from the socket's option memory buffer.
1964 */
1965void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1966{
1967	if ((unsigned int)size <= sysctl_optmem_max &&
1968	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1969		void *mem;
1970		/* First do the add, to avoid the race if kmalloc
1971		 * might sleep.
1972		 */
1973		atomic_add(size, &sk->sk_omem_alloc);
1974		mem = kmalloc(size, priority);
1975		if (mem)
1976			return mem;
1977		atomic_sub(size, &sk->sk_omem_alloc);
1978	}
1979	return NULL;
1980}
1981EXPORT_SYMBOL(sock_kmalloc);
1982
1983/* Free an option memory block. Note, we actually want the inline
1984 * here as this allows gcc to detect the nullify and fold away the
1985 * condition entirely.
1986 */
1987static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1988				  const bool nullify)
1989{
1990	if (WARN_ON_ONCE(!mem))
1991		return;
1992	if (nullify)
1993		kzfree(mem);
1994	else
1995		kfree(mem);
1996	atomic_sub(size, &sk->sk_omem_alloc);
1997}
1998
1999void sock_kfree_s(struct sock *sk, void *mem, int size)
2000{
2001	__sock_kfree_s(sk, mem, size, false);
2002}
2003EXPORT_SYMBOL(sock_kfree_s);
2004
2005void sock_kzfree_s(struct sock *sk, void *mem, int size)
2006{
2007	__sock_kfree_s(sk, mem, size, true);
2008}
2009EXPORT_SYMBOL(sock_kzfree_s);
2010
2011/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2012   I think, these locks should be removed for datagram sockets.
2013 */
2014static long sock_wait_for_wmem(struct sock *sk, long timeo)
2015{
2016	DEFINE_WAIT(wait);
2017
2018	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2019	for (;;) {
2020		if (!timeo)
2021			break;
2022		if (signal_pending(current))
2023			break;
2024		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2025		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2026		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2027			break;
2028		if (sk->sk_shutdown & SEND_SHUTDOWN)
2029			break;
2030		if (sk->sk_err)
2031			break;
2032		timeo = schedule_timeout(timeo);
2033	}
2034	finish_wait(sk_sleep(sk), &wait);
2035	return timeo;
2036}
2037
2038
2039/*
2040 *	Generic send/receive buffer handlers
2041 */
2042
2043struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2044				     unsigned long data_len, int noblock,
2045				     int *errcode, int max_page_order)
2046{
2047	struct sk_buff *skb;
2048	long timeo;
2049	int err;
2050
2051	timeo = sock_sndtimeo(sk, noblock);
2052	for (;;) {
2053		err = sock_error(sk);
2054		if (err != 0)
2055			goto failure;
2056
2057		err = -EPIPE;
2058		if (sk->sk_shutdown & SEND_SHUTDOWN)
2059			goto failure;
2060
2061		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2062			break;
2063
2064		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2065		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2066		err = -EAGAIN;
2067		if (!timeo)
2068			goto failure;
2069		if (signal_pending(current))
2070			goto interrupted;
2071		timeo = sock_wait_for_wmem(sk, timeo);
2072	}
2073	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2074				   errcode, sk->sk_allocation);
2075	if (skb)
2076		skb_set_owner_w(skb, sk);
2077	return skb;
2078
2079interrupted:
2080	err = sock_intr_errno(timeo);
2081failure:
2082	*errcode = err;
2083	return NULL;
2084}
2085EXPORT_SYMBOL(sock_alloc_send_pskb);
2086
/*
 * Linear-only variant of sock_alloc_send_pskb(): @size bytes of header,
 * no paged data and a max page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
2093
2094int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2095		     struct sockcm_cookie *sockc)
2096{
2097	u32 tsflags;
2098
2099	switch (cmsg->cmsg_type) {
2100	case SO_MARK:
2101		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2102			return -EPERM;
2103		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2104			return -EINVAL;
2105		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2106		break;
2107	case SO_TIMESTAMPING:
2108		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2109			return -EINVAL;
2110
2111		tsflags = *(u32 *)CMSG_DATA(cmsg);
2112		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2113			return -EINVAL;
2114
2115		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2116		sockc->tsflags |= tsflags;
2117		break;
2118	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2119	case SCM_RIGHTS:
2120	case SCM_CREDENTIALS:
2121		break;
2122	default:
2123		return -EINVAL;
2124	}
2125	return 0;
2126}
2127EXPORT_SYMBOL(__sock_cmsg_send);
2128
2129int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2130		   struct sockcm_cookie *sockc)
2131{
2132	struct cmsghdr *cmsg;
2133	int ret;
2134
2135	for_each_cmsghdr(cmsg, msg) {
2136		if (!CMSG_OK(msg, cmsg))
2137			return -EINVAL;
2138		if (cmsg->cmsg_level != SOL_SOCKET)
2139			continue;
2140		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2141		if (ret)
2142			return ret;
2143	}
2144	return 0;
2145}
2146EXPORT_SYMBOL(sock_cmsg_send);
2147
2148static void sk_enter_memory_pressure(struct sock *sk)
2149{
2150	if (!sk->sk_prot->enter_memory_pressure)
2151		return;
2152
2153	sk->sk_prot->enter_memory_pressure(sk);
2154}
2155
2156static void sk_leave_memory_pressure(struct sock *sk)
2157{
2158	if (sk->sk_prot->leave_memory_pressure) {
2159		sk->sk_prot->leave_memory_pressure(sk);
2160	} else {
2161		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2162
2163		if (memory_pressure && *memory_pressure)
2164			*memory_pressure = 0;
2165	}
2166}
2167
2168/* On 32bit arches, an skb frag is limited to 2^15 */
2169#define SKB_FRAG_PAGE_ORDER	get_order(32768)
2170
2171/**
2172 * skb_page_frag_refill - check that a page_frag contains enough room
2173 * @sz: minimum size of the fragment we want to get
2174 * @pfrag: pointer to page_frag
2175 * @gfp: priority for memory allocation
2176 *
2177 * Note: While this allocator tries to use high order pages, there is
2178 * no guarantee that allocations succeed. Therefore, @sz MUST be
2179 * less or equal than PAGE_SIZE.
2180 */
2181bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2182{
2183	if (pfrag->page) {
2184		if (page_ref_count(pfrag->page) == 1) {
2185			pfrag->offset = 0;
2186			return true;
2187		}
2188		if (pfrag->offset + sz <= pfrag->size)
2189			return true;
2190		put_page(pfrag->page);
2191	}
2192
2193	pfrag->offset = 0;
2194	if (SKB_FRAG_PAGE_ORDER) {
2195		/* Avoid direct reclaim but allow kswapd to wake */
2196		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2197					  __GFP_COMP | __GFP_NOWARN |
2198					  __GFP_NORETRY,
2199					  SKB_FRAG_PAGE_ORDER);
2200		if (likely(pfrag->page)) {
2201			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2202			return true;
2203		}
2204	}
2205	pfrag->page = alloc_page(gfp);
2206	if (likely(pfrag->page)) {
2207		pfrag->size = PAGE_SIZE;
2208		return true;
2209	}
2210	return false;
2211}
2212EXPORT_SYMBOL(skb_page_frag_refill);
2213
2214bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2215{
2216	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2217		return true;
2218
2219	sk_enter_memory_pressure(sk);
2220	sk_stream_moderate_sndbuf(sk);
2221	return false;
2222}
2223EXPORT_SYMBOL(sk_page_frag_refill);
2224
/*
 * Sleep uninterruptibly, as an exclusive waiter on sk->sk_lock.wq, until
 * sock_owned_by_user() reports that nobody owns the socket.
 *
 * Entered and left with sk->sk_lock.slock held (BH disabled); the spinlock
 * is dropped around every schedule() and re-taken before the ownership
 * check.
 */
static void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		/* Queue ourselves before dropping the lock so a wakeup is not missed. */
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}
2242
/*
 * Drain the socket backlog through sk_backlog_rcv().
 *
 * Entered and left with sk->sk_lock.slock held. Each pass detaches the
 * whole backlog list, then processes it with the spinlock dropped; the
 * outer loop re-checks the backlog head after re-taking the lock, so skbs
 * queued in the meantime are handled by a further pass.
 */
static void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		/* Take private ownership of the current list. */
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			/* Unlink the skb before handing it to the receive handler. */
			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}
2275
/*
 * Process the socket backlog now: take sk->sk_lock.slock (BH disabled),
 * run __release_sock(), and drop the spinlock again.
 */
void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}
2282
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: pointer to the timeout, handed to sk_wait_event()
 * @skb:   last skb seen on sk_receive_queue
 *
 * The wait condition is that the tail of sk_receive_queue differs
 * from @skb.
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 *
 * Return: the value produced by sk_wait_event().
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	/* SOCKWQ_ASYNC_WAITDATA is set only for the duration of the wait. */
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
2306EXPORT_SYMBOL(sk_wait_data);
2307
2308/**
2309 *	__sk_mem_raise_allocated - increase memory_allocated
2310 *	@sk: socket
2311 *	@size: memory size to allocate
2312 *	@amt: pages to allocate
2313 *	@kind: allocation type
2314 *
2315 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2316 */
2317int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2318{
2319	struct proto *prot = sk->sk_prot;
2320	long allocated = sk_memory_allocated_add(sk, amt);
2321
2322	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2323	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2324		goto suppress_allocation;
2325
2326	/* Under limit. */
2327	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2328		sk_leave_memory_pressure(sk);
2329		return 1;
2330	}
2331
2332	/* Under pressure. */
2333	if (allocated > sk_prot_mem_limits(sk, 1))
2334		sk_enter_memory_pressure(sk);
2335
2336	/* Over hard limit. */
2337	if (allocated > sk_prot_mem_limits(sk, 2))
2338		goto suppress_allocation;
2339
2340	/* guarantee minimum buffer size under pressure */
2341	if (kind == SK_MEM_RECV) {
2342		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2343			return 1;
2344
2345	} else { /* SK_MEM_SEND */
2346		if (sk->sk_type == SOCK_STREAM) {
2347			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2348				return 1;
2349		} else if (refcount_read(&sk->sk_wmem_alloc) <
2350			   prot->sysctl_wmem[0])
2351				return 1;
2352	}
2353
2354	if (sk_has_memory_pressure(sk)) {
2355		int alloc;
2356
2357		if (!sk_under_memory_pressure(sk))
2358			return 1;
2359		alloc = sk_sockets_allocated_read_positive(sk);
2360		if (sk_prot_mem_limits(sk, 2) > alloc *
2361		    sk_mem_pages(sk->sk_wmem_queued +
2362				 atomic_read(&sk->sk_rmem_alloc) +
2363				 sk->sk_forward_alloc))
2364			return 1;
2365	}
2366
2367suppress_allocation:
2368
2369	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2370		sk_stream_moderate_sndbuf(sk);
2371
2372		/* Fail only if socket is _under_ its sndbuf.
2373		 * In this case we cannot block, so that we have to fail.
2374		 */
2375		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2376			return 1;
2377	}
2378
2379	trace_sock_exceed_buf_limit(sk, prot, allocated);
2380
2381	sk_memory_allocated_sub(sk, amt);
2382
2383	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2384		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2385
2386	return 0;
2387}
2388EXPORT_SYMBOL(__sk_mem_raise_allocated);
2389
2390/**
2391 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2392 *	@sk: socket
2393 *	@size: memory size to allocate
2394 *	@kind: allocation type
2395 *
2396 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2397 *	rmem allocation. This function assumes that protocols which have
2398 *	memory_pressure use sk_wmem_queued as write buffer accounting.
2399 */
2400int __sk_mem_schedule(struct sock *sk, int size, int kind)
2401{
2402	int ret, amt = sk_mem_pages(size);
2403
2404	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2405	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2406	if (!ret)
2407		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2408	return ret;
2409}
2410EXPORT_SYMBOL(__sk_mem_schedule);
2411
2412/**
2413 *	__sk_mem_reduce_allocated - reclaim memory_allocated
2414 *	@sk: socket
2415 *	@amount: number of quanta
2416 *
2417 *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2418 */
2419void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2420{
2421	sk_memory_allocated_sub(sk, amount);
2422
2423	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2424		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2425
2426	if (sk_under_memory_pressure(sk) &&
2427	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2428		sk_leave_memory_pressure(sk);
2429}
2430EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2431
2432/**
2433 *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2434 *	@sk: socket
2435 *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2436 */
2437void __sk_mem_reclaim(struct sock *sk, int amount)
2438{
2439	amount >>= SK_MEM_QUANTUM_SHIFT;
2440	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2441	__sk_mem_reduce_allocated(sk, amount);
2442}
2443EXPORT_SYMBOL(__sk_mem_reclaim);
2444
2445int sk_set_peek_off(struct sock *sk, int val)
2446{
2447	sk->sk_peek_off = val;
2448	return 0;
2449}
2450EXPORT_SYMBOL_GPL(sk_set_peek_off);
2451
2452/*
2453 * Set of default routines for initialising struct proto_ops when
2454 * the protocol does not support a particular function. In certain
2455 * cases where it makes no sense for a protocol to have a "do nothing"
2456 * function, some default processing is provided.
2457 */
2458
/* bind stub for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
2463EXPORT_SYMBOL(sock_no_bind);
2464
/* connect stub for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
2470EXPORT_SYMBOL(sock_no_connect);
2471
/* socketpair stub for protocols that cannot pair sockets: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
2476EXPORT_SYMBOL(sock_no_socketpair);
2477
/* accept stub for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
2483EXPORT_SYMBOL(sock_no_accept);
2484
/* getname stub; @saddr and @len are left untouched: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
2490EXPORT_SYMBOL(sock_no_getname);
2491
2492unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2493{
2494	return 0;
2495}
2496EXPORT_SYMBOL(sock_no_poll);
2497
/* ioctl stub for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
2502EXPORT_SYMBOL(sock_no_ioctl);
2503
/* listen stub for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
2508EXPORT_SYMBOL(sock_no_listen);
2509
/* shutdown stub for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
2514EXPORT_SYMBOL(sock_no_shutdown);
2515
2516int sock_no_setsockopt(struct socket *sock, int level, int optname,
2517		    char __user *optval, unsigned int optlen)
2518{
2519	return -EOPNOTSUPP;
2520}
2521EXPORT_SYMBOL(sock_no_setsockopt);
2522
2523int sock_no_getsockopt(struct socket *sock, int level, int optname,
2524		    char __user *optval, int __user *optlen)
2525{
2526	return -EOPNOTSUPP;
2527}
2528EXPORT_SYMBOL(sock_no_getsockopt);
2529
/* sendmsg stub for protocols without sendmsg support: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
2534EXPORT_SYMBOL(sock_no_sendmsg);
2535
/* sendmsg_locked stub (takes a struct sock): always -EOPNOTSUPP. */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
2540EXPORT_SYMBOL(sock_no_sendmsg_locked);
2541
/* recvmsg stub for protocols without recvmsg support: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
2547EXPORT_SYMBOL(sock_no_recvmsg);
2548
/* mmap stub for protocols without mmap support: always -ENODEV. */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Same error code as a file that has no mmap method at all. */
	return -ENODEV;
}
2554EXPORT_SYMBOL(sock_no_mmap);
2555
2556ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2557{
2558	ssize_t res;
2559	struct msghdr msg = {.msg_flags = flags};
2560	struct kvec iov;
2561	char *kaddr = kmap(page);
2562	iov.iov_base = kaddr + offset;
2563	iov.iov_len = size;
2564	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2565	kunmap(page);
2566	return res;
2567}
2568EXPORT_SYMBOL(sock_no_sendpage);
2569
2570ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2571				int offset, size_t size, int flags)
2572{
2573	ssize_t res;
2574	struct msghdr msg = {.msg_flags = flags};
2575	struct kvec iov;
2576	char *kaddr = kmap(page);
2577
2578	iov.iov_base = kaddr + offset;
2579	iov.iov_len = size;
2580	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2581	kunmap(page);
2582	return res;
2583}
2584EXPORT_SYMBOL(sock_no_sendpage_locked);
2585
2586/*
2587 *	Default Socket Callbacks
2588 */
2589
/*
 * Default sk_state_change callback (installed by sock_init_data()):
 * wake all interruptible sleepers on the socket wait queue, if any.
 * sk->sk_wq is only dereferenced inside the RCU read-side section.
 */
static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}
2600
/*
 * Default sk_error_report callback (installed by sock_init_data()):
 * wake pollers with POLLERR if anyone sleeps on the wait queue, then
 * always send the async notification (SOCK_WAKE_IO / POLL_ERR).
 */
static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, POLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}
2612
/*
 * Default sk_data_ready callback (installed by sock_init_data()):
 * wake pollers waiting for readable events if anyone sleeps on the wait
 * queue, then always send the async notification
 * (SOCK_WAKE_WAITD / POLL_IN).
 */
static void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
						POLLRDNORM | POLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}
2625
/*
 * Default sk_write_space callback (installed by sock_init_data()).
 * Writers are only woken once twice the allocated write memory is no
 * larger than sk_sndbuf, i.e. at least half the send buffer is free.
 */
static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
						POLLWRNORM | POLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}
2648
/* Default sk_destruct callback (installed by sock_init_data()): nothing to do. */
static void sock_def_destruct(struct sock *sk)
{
}
2652
2653void sk_send_sigurg(struct sock *sk)
2654{
2655	if (sk->sk_socket && sk->sk_socket->file)
2656		if (send_sigurg(&sk->sk_socket->file->f_owner))
2657			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2658}
2659EXPORT_SYMBOL(sk_send_sigurg);
2660
/*
 * (Re)arm @timer to fire at @expires. A sock reference is taken only when
 * mod_timer() returns 0; per the timer API that means the timer was not
 * already pending, so the newly pending timer needs its own reference.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
2667EXPORT_SYMBOL(sk_reset_timer);
2668
/*
 * Cancel @timer. If del_timer() reports that it removed a pending timer,
 * drop a sock reference with __sock_put() (presumably the one taken by
 * sk_reset_timer() when the timer was armed).
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
2674EXPORT_SYMBOL(sk_stop_timer);
2675
/*
 * Initialise the generic part of @sk and, when @sock is non-NULL, tie the
 * two together (sk->sk_wq, sock->sk, sk_type and sk_uid come from @sock).
 * With a NULL @sock the sock gets no wait queue and uid 0 of its netns'
 * user namespace.
 *
 * Installs the default callbacks, default buffer sizes and timeouts, and
 * finishes by setting sk_refcnt to 1 after a write barrier.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		sk->sk_wq	=	sock->wq;
		sock->sk	=	sk;
		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
	} else {
		sk->sk_wq	=	NULL;
		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
	}

	rwlock_init(&sk->sk_callback_lock);
	/* Kernel and user sockets get separate lockdep classes per family. */
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	/* Default callbacks; protocols may override them afterwards. */
	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	/* No receive/send timeout by default. */
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0U;
	sk->sk_pacing_rate = ~0U;
	sk->sk_incoming_cpu = -1;
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
2749EXPORT_SYMBOL(sock_init_data);
2750
/*
 * Take ownership of the socket lock; may sleep. If the socket is already
 * owned, wait in __lock_sock() until it is released. @subclass is passed
 * to lockdep via mutex_acquire().
 *
 * The spinlock is released with plain spin_unlock() and BH is re-enabled
 * separately, after the lockdep annotation.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
2765EXPORT_SYMBOL(lock_sock_nested);
2766
/*
 * Release ownership of the socket lock. Under sk->sk_lock.slock this
 * first drains a non-empty backlog, then runs the protocol's release_cb
 * hook (if any), drops ownership and wakes waiters on sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
2784EXPORT_SYMBOL(release_sock);
2785
/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Return: false if the fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * true if the slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Fast path: return with slock still held and BH still
		 * disabled, as described above.
		 */
		return false;

	/* Slow path: wait for the owner, then take ownership ourselves. */
	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
2820EXPORT_SYMBOL(lock_sock_fast);
2821
2822int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2823{
2824	struct timeval tv;
2825	if (!sock_flag(sk, SOCK_TIMESTAMP))
2826		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2827	tv = ktime_to_timeval(sk->sk_stamp);
2828	if (tv.tv_sec == -1)
2829		return -ENOENT;
2830	if (tv.tv_sec == 0) {
2831		sk->sk_stamp = ktime_get_real();
2832		tv = ktime_to_timeval(sk->sk_stamp);
2833	}
2834	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2835}
2836EXPORT_SYMBOL(sock_get_timestamp);
2837
2838int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2839{
2840	struct timespec ts;
2841	if (!sock_flag(sk, SOCK_TIMESTAMP))
2842		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2843	ts = ktime_to_timespec(sk->sk_stamp);
2844	if (ts.tv_sec == -1)
2845		return -ENOENT;
2846	if (ts.tv_sec == 0) {
2847		sk->sk_stamp = ktime_get_real();
2848		ts = ktime_to_timespec(sk->sk_stamp);
2849	}
2850	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2851}
2852EXPORT_SYMBOL(sock_get_timestampns);
2853
2854void sock_enable_timestamp(struct sock *sk, int flag)
2855{
2856	if (!sock_flag(sk, flag)) {
2857		unsigned long previous_flags = sk->sk_flags;
2858
2859		sock_set_flag(sk, flag);
2860		/*
2861		 * we just set one of the two flags which require net
2862		 * time stamping, but time stamping might have been on
2863		 * already because of the other one
2864		 */
2865		if (sock_needs_netstamp(sk) &&
2866		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2867			net_enable_timestamp();
2868	}
2869}
2870
2871int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2872		       int level, int type)
2873{
2874	struct sock_exterr_skb *serr;
2875	struct sk_buff *skb;
2876	int copied, err;
2877
2878	err = -EAGAIN;
2879	skb = sock_dequeue_err_skb(sk);
2880	if (skb == NULL)
2881		goto out;
2882
2883	copied = skb->len;
2884	if (copied > len) {
2885		msg->msg_flags |= MSG_TRUNC;
2886		copied = len;
2887	}
2888	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2889	if (err)
2890		goto out_free_skb;
2891
2892	sock_recv_timestamp(msg, sk, skb);
2893
2894	serr = SKB_EXT_ERR(skb);
2895	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2896
2897	msg->msg_flags |= MSG_ERRQUEUE;
2898	err = copied;
2899
2900out_free_skb:
2901	kfree_skb(skb);
2902out:
2903	return err;
2904}
2905EXPORT_SYMBOL(sock_recv_errqueue);
2906
2907/*
2908 *	Get a socket option on an socket.
2909 *
2910 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2911 *	asynchronous errors should be reported by getsockopt. We assume
2912 *	this means if you specify SO_ERROR (otherwise whats the point of it).
2913 */
2914int sock_common_getsockopt(struct socket *sock, int level, int optname,
2915			   char __user *optval, int __user *optlen)
2916{
2917	struct sock *sk = sock->sk;
2918
2919	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2920}
2921EXPORT_SYMBOL(sock_common_getsockopt);
2922
2923#ifdef CONFIG_COMPAT
2924int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2925				  char __user *optval, int __user *optlen)
2926{
2927	struct sock *sk = sock->sk;
2928
2929	if (sk->sk_prot->compat_getsockopt != NULL)
2930		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2931						      optval, optlen);
2932	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2933}
2934EXPORT_SYMBOL(compat_sock_common_getsockopt);
2935#endif
2936
2937int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2938			int flags)
2939{
2940	struct sock *sk = sock->sk;
2941	int addr_len = 0;
2942	int err;
2943
2944	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2945				   flags & ~MSG_DONTWAIT, &addr_len);
2946	if (err >= 0)
2947		msg->msg_namelen = addr_len;
2948	return err;
2949}
2950EXPORT_SYMBOL(sock_common_recvmsg);
2951
2952/*
2953 *	Set socket options on an inet socket.
2954 */
2955int sock_common_setsockopt(struct socket *sock, int level, int optname,
2956			   char __user *optval, unsigned int optlen)
2957{
2958	struct sock *sk = sock->sk;
2959
2960	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2961}
2962EXPORT_SYMBOL(sock_common_setsockopt);
2963
2964#ifdef CONFIG_COMPAT
2965int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2966				  char __user *optval, unsigned int optlen)
2967{
2968	struct sock *sk = sock->sk;
2969
2970	if (sk->sk_prot->compat_setsockopt != NULL)
2971		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2972						      optval, optlen);
2973	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2974}
2975EXPORT_SYMBOL(compat_sock_common_setsockopt);
2976#endif
2977
/*
 * Common release path for a sock: run the protocol's destroy hook (if
 * any), unhash the sock, orphan it, free its xfrm policy and drop one
 * reference with sock_put().
 */
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to socket. But net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did the hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and will be purged by
	 * the socket destructor.
	 *
	 * Also we still have packets pending on receive queue and probably,
	 * our own packets waiting in device queues. sock_destroy will drain
	 * receive queue, but transmitted packets will delay socket destruction
	 * until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
3013EXPORT_SYMBOL(sk_common_release);
3014
3015void sk_get_meminfo(const struct sock *sk, u32 *mem)
3016{
3017	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3018
3019	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3020	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3021	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3022	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3023	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3024	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3025	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3026	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3027	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3028}
3029
3030#ifdef CONFIG_PROC_FS
/* Number of per-protocol "in use" counter slots. */
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
/* One counter per slot, indexed by proto->inuse_idx; used as per-cpu data. */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};
3035
3036static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3037
3038#ifdef CONFIG_NET_NS
/*
 * Add @val (which may be negative) to the current CPU's in-use counter
 * of @prot in namespace @net.
 */
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
}
3043EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3044
3045int sock_prot_inuse_get(struct net *net, struct proto *prot)
3046{
3047	int cpu, idx = prot->inuse_idx;
3048	int res = 0;
3049
3050	for_each_possible_cpu(cpu)
3051		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
3052
3053	return res >= 0 ? res : 0;
3054}
3055EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3056
3057static int __net_init sock_inuse_init_net(struct net *net)
3058{
3059	net->core.inuse = alloc_percpu(struct prot_inuse);
3060	return net->core.inuse ? 0 : -ENOMEM;
3061}
3062
/* Per-netns exit: free the per-cpu in-use counters. */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}
3067
/* Per-netns hooks that allocate and free net->core.inuse. */
static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};
3072
/*
 * Register the per-netns in-use counter hooks; failure to register is
 * fatal (panic).
 */
static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}
3080
3081core_initcall(net_inuse_init);
3082#else
3083static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
3084
/*
 * !CONFIG_NET_NS variant: add @val (which may be negative) to the current
 * CPU's in-use counter of @prot in the single global per-cpu table;
 * @net is unused.
 */
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
}
3089EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3090
3091int sock_prot_inuse_get(struct net *net, struct proto *prot)
3092{
3093	int cpu, idx = prot->inuse_idx;
3094	int res = 0;
3095
3096	for_each_possible_cpu(cpu)
3097		res += per_cpu(prot_inuse, cpu).val[idx];
3098
3099	return res >= 0 ? res : 0;
3100}
3101EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3102#endif
3103
3104static void assign_proto_idx(struct proto *prot)
3105{
3106	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3107
3108	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3109		pr_err("PROTO_INUSE_NR exhausted\n");
3110		return;
3111	}
3112
3113	set_bit(prot->inuse_idx, proto_inuse_idx);
3114}
3115
3116static void release_proto_idx(struct proto *prot)
3117{
3118	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3119		clear_bit(prot->inuse_idx, proto_inuse_idx);
3120}
3121#else
/* !CONFIG_PROC_FS: there are no in-use counters, so nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
}
3125
/* !CONFIG_PROC_FS: there are no in-use counters, so nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3129#endif
3130
3131static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3132{
3133	if (!rsk_prot)
3134		return;
3135	kfree(rsk_prot->slab_name);
3136	rsk_prot->slab_name = NULL;
3137	kmem_cache_destroy(rsk_prot->slab);
3138	rsk_prot->slab = NULL;
3139}
3140
3141static int req_prot_init(const struct proto *prot)
3142{
3143	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3144
3145	if (!rsk_prot)
3146		return 0;
3147
3148	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3149					prot->name);
3150	if (!rsk_prot->slab_name)
3151		return -ENOMEM;
3152
3153	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3154					   rsk_prot->obj_size, 0,
3155					   prot->slab_flags, NULL);
3156
3157	if (!rsk_prot->slab) {
3158		pr_crit("%s: Can't create request sock SLAB cache!\n",
3159			prot->name);
3160		return -ENOMEM;
3161	}
3162	return 0;
3163}
3164
3165int proto_register(struct proto *prot, int alloc_slab)
3166{
3167	if (alloc_slab) {
3168		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
3169					SLAB_HWCACHE_ALIGN | prot->slab_flags,
3170					NULL);
3171
3172		if (prot->slab == NULL) {
3173			pr_crit("%s: Can't create sock SLAB cache!\n",
3174				prot->name);
3175			goto out;
3176		}
3177
3178		if (req_prot_init(prot))
3179			goto out_free_request_sock_slab;
3180
3181		if (prot->twsk_prot != NULL) {
3182			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3183
3184			if (prot->twsk_prot->twsk_slab_name == NULL)
3185				goto out_free_request_sock_slab;
3186
3187			prot->twsk_prot->twsk_slab =
3188				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3189						  prot->twsk_prot->twsk_obj_size,
3190						  0,
3191						  prot->slab_flags,
3192						  NULL);
3193			if (prot->twsk_prot->twsk_slab == NULL)
3194				goto out_free_timewait_sock_slab_name;
3195		}
3196	}
3197
3198	mutex_lock(&proto_list_mutex);
3199	list_add(&prot->node, &proto_list);
3200	assign_proto_idx(prot);
3201	mutex_unlock(&proto_list_mutex);
3202	return 0;
3203
3204out_free_timewait_sock_slab_name:
3205	kfree(prot->twsk_prot->twsk_slab_name);
3206out_free_request_sock_slab:
3207	req_prot_cleanup(prot->rsk_prot);
3208
3209	kmem_cache_destroy(prot->slab);
3210	prot->slab = NULL;
3211out:
3212	return -ENOBUFS;
3213}
3214EXPORT_SYMBOL(proto_register);
3215
3216void proto_unregister(struct proto *prot)
3217{
3218	mutex_lock(&proto_list_mutex);
3219	release_proto_idx(prot);
3220	list_del(&prot->node);
3221	mutex_unlock(&proto_list_mutex);
3222
3223	kmem_cache_destroy(prot->slab);
3224	prot->slab = NULL;
3225
3226	req_prot_cleanup(prot->rsk_prot);
3227
3228	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3229		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3230		kfree(prot->twsk_prot->twsk_slab_name);
3231		prot->twsk_prot->twsk_slab = NULL;
3232	}
3233}
3234EXPORT_SYMBOL(proto_unregister);
3235
3236#ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take proto_list_mutex (released in proto_seq_stop())
 * and position the iterator at *pos in proto_list, with the list head
 * itself as the first element.
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}
3243
/* seq_file ->next: advance to the entry after @v in proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}
3248
/* seq_file ->stop: release the mutex taken in proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}
3254
/* Map a method pointer to 'y' (set) or 'n' (NULL) for /proc output. */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
3259static long sock_prot_memory_allocated(struct proto *proto)
3260{
3261	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3262}
3263
3264static char *sock_prot_memory_pressure(struct proto *proto)
3265{
3266	return proto->memory_pressure != NULL ?
3267	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3268}
3269
/*
 * Print one protocol line: name, object size, sockets in use in the
 * reader's netns, allocated memory, pressure state, max header, whether
 * it has a slab cache, owning module, and then one y/n column per method
 * in the same order as the header printed by proto_seq_show().
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
3303
3304static int proto_seq_show(struct seq_file *seq, void *v)
3305{
3306	if (v == &proto_list)
3307		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3308			   "protocol",
3309			   "size",
3310			   "sockets",
3311			   "memory",
3312			   "press",
3313			   "maxhdr",
3314			   "slab",
3315			   "module",
3316			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3317	else
3318		proto_seq_printf(seq, list_entry(v, struct proto, node));
3319	return 0;
3320}
3321
3322static const struct seq_operations proto_seq_ops = {
3323	.start  = proto_seq_start,
3324	.next   = proto_seq_next,
3325	.stop   = proto_seq_stop,
3326	.show   = proto_seq_show,
3327};
3328
3329static int proto_seq_open(struct inode *inode, struct file *file)
3330{
3331	return seq_open_net(inode, file, &proto_seq_ops,
3332			    sizeof(struct seq_net_private));
3333}
3334
3335static const struct file_operations proto_seq_fops = {
3336	.owner		= THIS_MODULE,
3337	.open		= proto_seq_open,
3338	.read		= seq_read,
3339	.llseek		= seq_lseek,
3340	.release	= seq_release_net,
3341};
3342
3343static __net_init int proto_init_net(struct net *net)
3344{
3345	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3346		return -ENOMEM;
3347
3348	return 0;
3349}
3350
3351static __net_exit void proto_exit_net(struct net *net)
3352{
3353	remove_proc_entry("protocols", net->proc_net);
3354}
3355
3356
3357static __net_initdata struct pernet_operations proto_net_ops = {
3358	.init = proto_init_net,
3359	.exit = proto_exit_net,
3360};
3361
3362static int __init proto_init(void)
3363{
3364	return register_pernet_subsys(&proto_net_ops);
3365}
3366
3367subsys_initcall(proto_init);
3368
3369#endif /* PROC_FS */
3370
3371#ifdef CONFIG_NET_RX_BUSY_POLL
3372bool sk_busy_loop_end(void *p, unsigned long start_time)
3373{
3374	struct sock *sk = p;
3375
3376	return !skb_queue_empty(&sk->sk_receive_queue) ||
3377	       sk_busy_loop_timeout(sk, start_time);
3378}
3379EXPORT_SYMBOL(sk_busy_loop_end);
3380#endif /* CONFIG_NET_RX_BUSY_POLL */
Configure Feed

Configure Feed